In [None]:
import plotnine
import qgrid
from tidyspotify import get_artist_audio_features
from plotnine import *
import pandas as pd

import warnings
# Suppress all warnings for the rest of the session.
warnings.simplefilter(action="ignore")

# Plot defaults: figure size (8, 4.8) and the grey theme.
plotnine.options.figure_size = (8, 4.8)
plotnine.theme_set(theme_grey())

# Switch off qgrid's "forceFitColumns" grid option.
qgrid.set_grid_option("forceFitColumns", False)

Getting the data
-----------

In [None]:
# the code below gets spotify data for three artists
def get_unique_tracks(artist_name):
    """Fetch an artist's audio features, keeping one row per track name.

    Calls ``get_artist_audio_features(artist_name)`` and drops every row whose
    ``track_name`` already appeared earlier in the result, so the first
    occurrence of each name is the one that is kept.
    """
    tracks = get_artist_audio_features(artist_name)
    is_repeat = tracks["track_name"].duplicated()
    return tracks.loc[~is_repeat]

# One track table per artist; get_unique_tracks has already dropped
# repeated track names within each artist.
gaga = get_unique_tracks('Lady Gaga')
kanye = get_unique_tracks('Kanye West')
django = get_unique_tracks('Django Reinhardt')

# Stack the three tables (row indexes are kept as they are) and open the
# combined frame in an interactive grid.
data = pd.concat(objs = [gaga, kanye, django])
qgrid.show_grid(data)

Viewing the data
-------------

The code below plots the chosen target variable (`y_var`) against each feature for the selected artist.

### Exercise

1. Try to find an artist for whom the audio features predict track popularity.

In [None]:
import seaborn

# Regression settings ----

# Track table of the artist being modelled.
# Options: gaga, django, kanye
crnt_data = gaga

# Response (y) variable.
y_var = 'speechiness'
# Number of features to take from the stepwise regression.
k_features = 3

# Columns plotted against the response below and used by the regression cell.
feat_cols = [
    'track_popularity', 'acousticness', 'danceability',
    'energy', 'instrumentalness', 'liveness',
    'loudness', 'speechiness', 'valence',
]

# Plot ----
# One row of panels: y_var against every column in feat_cols, coloured by artist.
seaborn.pairplot(
    crnt_data[[*feat_cols, 'artist_name']],
    x_vars = feat_cols,
    y_vars = y_var,
    hue = 'artist_name',
)

In [None]:
import tidyspotify
from ols_utils import stepwise_ols

# run regression ----

# The response must not be one of its own predictors: feat_cols contains
# every audio feature, including whichever one y_var names. Leaving it in lets
# the model "predict" y from y, which makes R squared trivially 1.
x_cols = [col for col in feat_cols if col != y_var]

# Standardise predictors and response (subtract the mean, divide by the
# standard deviation) so fitted coefficients are on a comparable scale.
X = crnt_data[x_cols]
std_X = (X - X.mean()) / X.std()

y = crnt_data[y_var]
std_y = (y - y.mean()) / y.std()

# fits model using the statsmodel package
# NOTE(review): stepwise_ols comes from the project module ols_utils; its
# result is indexed by position below -- presumably fits[i] is the model with
# i + 1 features. Confirm against ols_utils.
fits = stepwise_ols(std_y, std_X, x_cols)

# k_features - 1 is used as an index; a value below 1 would silently wrap
# around to a negative index instead of failing.
if k_features < 1:
    raise ValueError("k_features must be at least 1, got %r" % (k_features,))

# print summary of fitted coefficients
fit = fits[k_features - 1]
summary = fit.summary()
print("R squared", round(fit.rsquared, 3))
summary.tables[1]

Looking at predictions
----

In [None]:
# Attach the model's fitted values to the track table and plot them against
# the observed response.
# NOTE: when `fit` comes from the stepwise regression cell it was fitted to the
# standardised response (std_y), so the x axis is in standard-deviation units
# while the y axis shows the raw column.
predictions = crnt_data.assign(predictions = fit.predict())

# x is passed by keyword: recent seaborn releases accept only `data`
# positionally, so a positional 'predictions' collides with data=... .
# y follows y_var so the plot stays in sync with the configured response.
# ci = None is the documented way to skip the confidence band.
seaborn.regplot(
    x = 'predictions',
    y = y_var,
    data = predictions,
    ci = None)

Appendix
========

Simple Regression
---------

In [None]:
# Loudness plotted against three candidate predictors, using the gaga table.
seaborn.pairplot(
    gaga,
    x_vars = ['energy', 'acousticness', 'liveness'],
    y_vars = ['loudness'],
)

In [None]:
import statsmodels.formula.api as smf

# Single-predictor variant, kept for reference:
#seaborn.regplot(x = 'energy', y = 'loudness', ci = None, data = gaga)
#smf.ols('loudness ~ 1 + energy', data = gaga).fit().summary()

# Regress loudness on three features with an explicit intercept.
# NOTE: this rebinds `fit`, replacing the model produced by the stepwise
# regression cell; re-run that cell before re-running the predictions plot.
fit = smf.ols('loudness ~ 1 + energy + acousticness + liveness', data = gaga).fit()

# Fitted vs. observed loudness. x is passed by keyword because recent seaborn
# releases accept only `data` positionally; ci = None skips the confidence band.
seaborn.regplot(
    x = 'predictions',
    y = 'loudness',
    data = gaga.assign(predictions = fit.predict()),
    ci = None)