# Charting the Charts
## Sahil Chinoy

An analysis of how similar each summer's hit songs sound, for a [New York Times story](https://www.nytimes.com/interactive/2018/08/09/opinion/do-songs-of-the-summer-sound-the-same.html). Billboard chart data is scraped from Bob Borst's website; the acoustic data comes from the Spotify API.

[Here's an interactive visualization](https://observablehq.com/@sahilchinoy/charting-the-charts) (with audio!) of every summer's hits.

In [None]:
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the dataset: Billboard Hot 100 entries merged with Spotify acoustic characteristics.
# The path is relative, so 'spotify_data.csv' must sit in the notebook's working directory.
df = pd.read_csv('spotify_data.csv')

In [None]:
# Preview the first rows to sanity-check the loaded data
df.head()

In [None]:
# Spotify acoustic characteristics analyzed throughout this notebook
cols = [
    'acousticness', 'loudness', 'energy', 'danceability', 'tempo',
    'valence', 'instrumentalness', 'liveness', 'speechiness', 'duration_ms',
]

In [None]:
# Summary statistics for each acoustic characteristic
df[cols].describe()

In [None]:
# Draw the distribution of each acoustic characteristic in its own panel.
f, axs = plt.subplots(3, 4, figsize=(16, 10))
axs = np.ravel(axs)  # flatten the 3x4 grid so panels can be indexed 0..11
for i, col in enumerate(cols):
    # NOTE: distplot is deprecated in newer seaborn releases (0.11+);
    # histplot(..., kde=True) is the suggested replacement when upgrading.
    sns.distplot(df[col], ax=axs[i])
# The grid has more panels than there are columns; hide the leftovers so the
# figure does not end with empty axes.
for ax in axs[len(cols):]:
    ax.set_visible(False)

In [None]:
# Name of the standardized (z-score) counterpart of each acoustic column
z_cols = [name + '_z' for name in cols]

In [None]:
# Standardize every acoustic column (scipy's zscore, applied column by column)
# and store the results in the matching '*_z' columns.
# NOTE(review): with scipy's default settings a NaN in a column propagates to
# that whole z-score column -- confirm the data has no missing values.
df[z_cols] = df[cols].apply(zscore)

## Over time

In [None]:
# Plot of five-year rolling averages of the standardized acoustic characteristics.
# The z-score columns are selected *before* aggregating: calling .mean() on the
# whole grouped frame raises a TypeError on pandas >= 2.0 if the data contains
# non-numeric columns (presumably song/artist text fields -- confirm), and it
# also averages columns that are never plotted.
df.groupby('year')[z_cols].mean().rolling(window=5).mean().plot(figsize=(10,6))

## Distance metrics

In [None]:
# Features that make up each song's vector for the distance metrics below;
# chosen because these columns appear to vary the most.
keys = [
    'acousticness_z',
    'loudness_z',
    'energy_z',
    'danceability_z',
    'valence_z',
]

In [None]:
def get_vector(a):
    """Return the feature vector of record `a` as a NumPy array.

    `a` must be indexable by each name in the module-level `keys` list
    (for example a dict produced by `DataFrame.to_dict('records')`).
    The values appear in the same order as `keys`.
    """
    values = []
    for feature in keys:
        values.append(a[feature])
    return np.array(values)

In [None]:
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between the feature vectors of `a` and `b`.

    Both arguments are records accepted by `get_vector`.
    """
    difference = get_vector(a) - get_vector(b)
    return np.linalg.norm(difference)

In [None]:
def cosine_similarity(a, b):
    """Return the cosine similarity between the feature vectors of `a` and `b`.

    Computed as the inner product of the two vectors divided by the product
    of their norms. There is no guard against a zero-length vector, in which
    case the division is by zero.
    """
    first, second = get_vector(a), get_vector(b)
    dot_product = np.inner(first, second)
    norm_product = np.linalg.norm(first) * np.linalg.norm(second)
    return dot_product / norm_product

In [None]:
# For each year, average both metrics over every pair of songs from that year
records = []
for year in range(1970, 2019):
    # All unordered pairs of rows whose `year` matches, as plain dicts
    songs = df[df.year == year].to_dict('records')
    song_pairs = list(itertools.combinations(songs, 2))
    euclidean_dists = [euclidean_distance(first, second) for first, second in song_pairs]
    cosine_sims = [cosine_similarity(first, second) for first, second in song_pairs]
    records.append({
        'year': year,
        'euclidean_dist': np.mean(euclidean_dists),
        'cosine_similarity': np.mean(cosine_sims)
    })
# One row per year: year, mean pairwise distance, mean pairwise similarity
years = pd.DataFrame.from_records(records)

In [None]:
# Most similar years, by cosine similarity
# (highest mean pairwise similarity first; shows the top five rows)
years.sort_values('cosine_similarity', ascending=False).head()

In [None]:
# Most similar years, by euclidean distance
# (smallest mean pairwise distance first; shows the top five rows)
years.sort_values('euclidean_dist').head()

In [None]:
# Most different years, by cosine similarity
# (lowest mean pairwise similarity first; shows the top five rows)
years.sort_values('cosine_similarity').head()

In [None]:
# Most different years, by euclidean distance
# (largest mean pairwise distance first; shows the top five rows)
years.sort_values('euclidean_dist', ascending=False).head()

In [None]:
# Five-year rolling average of the yearly metrics.
# The rolling mean is applied to every column, including 'year' itself, so each
# row's 'year' becomes the mean of the five years in its window, and the first
# four rows (incomplete windows) are NaN.
rolling = years.rolling(window=5).mean()

In [None]:
# Rolling-average cosine similarity over time
rolling.plot(x='year', y='cosine_similarity')

In [None]:
# Rolling-average euclidean distance over time
rolling.plot(x='year', y='euclidean_dist')