### Perform EDA and analyze
**program:** 02b_eda_hit <br>
**author:** chris chan<br>
**date:** jan 27,2021<br>
**desc:** use spotify/bb data after cleaning and perform EDA<br>

**datasources:**<br>
- sb_analytic (balanced df thru 2010)
- billboard analytic (hot 100 thru 2019)
- spotify random (random thru 2020)

In [None]:
from importlib import reload
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
from collections import OrderedDict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Render inline figures as SVG.
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

# Notebook-wide plotting defaults: figure size and the seaborn theme.
plt.rcParams['figure.figsize'] = (9, 6)
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

In [None]:
from sqlalchemy import create_engine
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split

**1. Bring in data**

In [None]:
# Read the cleaned CSV into `sbdf` and preview the first three rows.
clean_csv = r'../data/clean/sbdf_clean.csv'
sbdf = pd.read_csv(clean_csv)
sbdf.head(3)

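**remove rows below 1960** (this filter is currently commented out; the `year >= 1990` cutoff further down is the one that applies)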

In [None]:
#sbdf = sbdf[sbdf['year'] >= 1960] 

In [None]:
# Normalise every column label to lower case.
sbdf.columns = [str.lower(label) for label in sbdf.columns]

In [None]:
# Derive the decade (year floored to a multiple of 10), then store year as int.
release_year = sbdf['year']
sbdf['decade'] = (release_year // 10 * 10).astype(int)
sbdf['year'] = release_year.astype(int)

In [None]:
# Number of rows per decade.
sbdf['decade'].value_counts()

In [None]:
# Keep songs from 1990 onward.
# `.copy()` gives an independent frame, so the column assignments made in later
# cells do not write into a slice of the original data.
sbdf = sbdf[sbdf['year'] >= 1990].copy()

In [None]:
# Column dtypes and non-null counts of the filtered frame.
sbdf.info()

In [None]:
# Track length in seconds, converted from the millisecond duration column.
sbdf['track_seconds'] = sbdf['duration_ms'].div(1000)

## EDA 

In [None]:
# This will help with plotting. Look up Pandas.Categorical for more methods ...
# Plain column assignment replaces the column, so the categorical dtype sticks.
# `sbdf.loc[:, 'is_hit'] = ...` sets values in place on pandas >= 2.0, which
# keeps the existing dtype and silently drops the conversion.
sbdf['is_hit'] = sbdf['is_hit'].astype('category')

Our goal is to predict whether or not a song is a hit. For the sake of simplicity, we'll split the data once, between a train set and a test set. *Of course, in practice, you'll want to cross validate with multiple splits of the data.*

**May not need to split for EDA**

In [None]:
# Reproducible 1000-row sample with the id and raw-duration columns removed;
# it feeds the pair plot below.
sample_input = sbdf.drop(columns=['spotifyid', 'duration_ms'])
df_sample = sample_input.sample(n=1000, random_state=44)

df_sample.shape


In [None]:
# Number of sampled rows per decade.
df_sample['decade'].value_counts()

In [None]:
# We want to pick one feature to start that separates the two cases.
# `s` is the marker-size keyword forwarded to the scatter panels; seaborn's
# `size` is a semantic-mapping variable and does not set the marker size.
sns.pairplot(df_sample, height=1.5, plot_kws={'s': 3}, hue='is_hit')
plt.savefig('pairplot_ishit.png')


**Histogram of hits**

In [None]:
# Count of rows in each target class.
target_ax = sns.countplot(data=sbdf, x='is_hit')
target_ax.set_title('Song is a hit (Target)')
plt.savefig('target_hist.pdf')
plt.show()

In [None]:
# Stem plot of the raw `is_hit` column, with the x-axis limited to [0, 1.2].
# NOTE(review): with that x-range only the first row or two can be visible, so
# this probably does not show the class distribution — confirm the intent.
plt.stem(sbdf['is_hit'])
plt.xlim(0, 1.2)
plt.show()

**Box: Time**

In [None]:
# Rows whose track length exceeds 3000 seconds, kept in `test` for inspection.
is_very_long = sbdf['track_seconds'] > 3000
test = sbdf[is_very_long]

In [None]:
# Box plot of track length (seconds) for each target class.
sns.catplot(data=sbdf, x='is_hit', y='track_seconds', kind='box')
plt.title('Track Seconds')
plt.xlabel('Is Hit')
plt.ylabel('Track Seconds')
plt.tight_layout()
# NOTE(review): dpi=20 is unusually low for a saved figure — confirm it is intended.
plt.savefig('time_box.pdf', dpi=20)

**Box: Loudness**

In [None]:
# Box plot of loudness for each target class.
sns.catplot(data=sbdf, x='is_hit', y='loudness', kind='box')
plt.title('Loudness')
plt.xlabel('Is Hit')
plt.ylabel('Loudness')
plt.tight_layout()
plt.savefig('loud_box.pdf')

**Box: Valence**

In [None]:
# Box plot of valence for each target class.
sns.catplot(data=sbdf, x='is_hit', y='valence', kind='box')
plt.title('Valence')
plt.xlabel('Is Hit')
plt.ylabel('Valence')
plt.tight_layout()
plt.savefig('valence_box.pdf')

**Box: Acousticness**

In [None]:
# Box plot of acousticness for each target class.
sns.catplot(data=sbdf, x='is_hit', y='acousticness', kind='box')
plt.title('Acousticness')
plt.xlabel('Is Hit')
plt.ylabel('Acousticness')
plt.tight_layout()
plt.savefig('acoustics_box.pdf')

**Box: Popularity**

In [None]:
# sns.color_palette("light:b", as_cmap=True)
# sns.catplot(data=sbdf,x='is_hit',y='spotify_track_popularity',kind='box')
# plt.xlabel('Is Hit')
# plt.ylabel('Song popularity')
# plt.savefig('popularity_box.png')

**Box: Danceability**

In [None]:
# Box plot of danceability for each target class.
sns.catplot(data=sbdf, x='is_hit', y='danceability', kind='box')
plt.title('Danceability')
plt.xlabel('Is Hit')
plt.ylabel('Danceability')
plt.tight_layout()
plt.savefig('dance_box.pdf')

**Box: Energy**

In [None]:
# Box plot of energy for each target class.
sns.catplot(data=sbdf, x='is_hit', y='energy', kind='box')
plt.title('Energy')
plt.xlabel('Is Hit')
plt.ylabel('Energy')
plt.tight_layout()
plt.savefig('energy_box.pdf')

**Box: mode**

In [None]:
# Bar plot of `mode` for each target class.
mode_ax = sns.barplot(data=sbdf, x='is_hit', y='mode')
mode_ax.set_title('Mode')
plt.tight_layout()
plt.savefig('mode_cat.pdf')

In [None]:
# Box plot of `mode` by decade.
# Reads from `sbdf`: no `bbdf` frame is created anywhere in the visible cells,
# so referring to it raises NameError.
sns.catplot(data=sbdf, x='decade', y='mode', kind='box')
plt.xlabel('Decade')
plt.ylabel('Mode')
plt.savefig('mode_box.png')

In [None]:
# Audio-feature columns plotted in the box/violin grids (12 entries).
features = [
    'speechiness',
    'liveness',
    'tempo',
    'energy',
    'acousticness',
    'loudness',
    'danceability',
    'mode',
    'key',
    'valence',
    'track_seconds',
    'instrumentalness',
]

In [None]:
# Grid of box plots: one panel per entry in `features`, split by target class.
fig = plt.figure(figsize=(12, 10))

for i, c in enumerate(features, 1):
    ax = fig.add_subplot(4, 3, i)  # 4 x 3 grid, one slot per feature
    sns.boxplot(x='is_hit', y=c, data=sbdf, ax=ax)
    ax.set_xlabel('Is Hit', fontsize=10)
    ax.set_ylabel(c, fontsize=10)
    ax.tick_params(labelsize=8)

# Save once, after every panel has been drawn, instead of on each iteration.
fig.savefig('edaboxplots.pdf', bbox_inches='tight')

**radar graph**

In [None]:
# Preview the hit rows straight from `sbdf`. `feathits` is only created in a
# later cell, so referencing it here fails on a fresh Restart & Run All.
sbdf.loc[sbdf['is_hit'] == 1].head()

In [None]:
# Summary statistics of `sbdf`, for reference when choosing the radar scaling.
sbdf.describe()

In [None]:
# Feature columns shown on the radar chart; their order fixes the spoke order.
radar_cols = ['acousticness', 'instrumentalness', 'tempo', 'speechiness', 'valence',
              'energy', 'liveness', 'danceability', 'track_seconds', 'loudness']

# Hits: print the class count, then keep only the radar features.
# `.copy()` yields an independent frame, so the loudness assignment below does
# not write into a slice of `sbdf`.
hit_rows = sbdf.loc[sbdf['is_hit'] == 1]
print(hit_rows.is_hit.value_counts())
feathits = hit_rows[radar_cols].copy()
# Use the magnitude of loudness (presumably because the raw values are
# negative — confirm) so it can be drawn on the radial axis.
feathits['loudness'] = feathits['loudness'].abs()

# Non-hits: same steps as for the hit rows.
nonhit_rows = sbdf.loc[sbdf['is_hit'] == 0]
print(nonhit_rows.is_hit.value_counts())
featnonhits = nonhit_rows[radar_cols].copy()
featnonhits['loudness'] = featnonhits['loudness'].abs()


In [None]:
# Summary statistics of the radar features for hit rows.
feathits.describe()

In [None]:
# Summary statistics of the radar features for non-hit rows.
featnonhits.describe()

In [None]:
import math

# Spoke labels: one per feature column, in column order.
categories = list(feathits.columns)
N = len(categories)

# Multipliers applied to the class means before plotting. They are keyed by
# column name, so they stay attached to the right feature if the column order
# ever changes.
# NOTE(review): presumably hand-picked to bring these features onto a scale
# comparable with the remaining columns — confirm the constants.
radar_scale = {
    'tempo': 1 / 220,
    'speechiness': 1.5,
    'instrumentalness': 1.5,
    'loudness': 1 / 25,
    'track_seconds': 1 / 400,
}


def _closed_scaled_means(frame):
    """Return the scaled column means of `frame`, one per entry of `categories`.

    The first value is repeated at the end so the polygon closes on the
    polar axes.
    """
    means = frame.mean()
    scaled = [means[name] * radar_scale.get(name, 1.0) for name in categories]
    return scaled + scaled[:1]


value = _closed_scaled_means(feathits)      # hits
value2 = _closed_scaled_means(featnonhits)  # non-hits

# One evenly spaced angle per category, closed in the same way as the values.
angles = [n / float(N) * 2 * math.pi for n in range(N)]
angles += angles[:1]

fig = plt.figure(figsize=(18, 18))
ax = fig.add_subplot(221, polar=True)

# Hits
ax.plot(angles, value, linewidth=2, label="Hit", color='slategray')
ax.fill(angles, value, alpha=0.35, facecolor='slategray')

# Non-hits
ax.plot(angles, value2, linewidth=2, label="Non-Hit", color='darkorange')
ax.fill(angles, value2, alpha=0.15, facecolor='darkorange')
ax.grid(True)

plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1), fontsize=16)

# Label each spoke with its feature name; the duplicated closing angle is skipped.
plt.xticks(angles[:-1], categories, size=14)
plt.yticks(color='grey', size=16)

plt.savefig('radar_hit_compare.pdf')
plt.show()


**Violin plots**

In [None]:
# Grid of violin plots: one panel per entry in `features`, split by target class.
fig = plt.figure(figsize=(12, 10))

for i, feature in enumerate(features, 1):
    ax = fig.add_subplot(4, 3, i)  # 4 x 3 grid, one slot per feature
    # Only documented violinplot arguments are passed; `as_cmap` belongs to
    # sns.color_palette, not to violinplot.
    sns.violinplot(x='is_hit', y=feature, data=sbdf,
                   palette=['darkorange', 'slategray'], ax=ax)
    ax.set_alpha(0.25)
    ax.set_xlabel('Is Hit', fontsize=10)
    ax.set_ylabel(feature, fontsize=10)
    ax.tick_params(labelsize=8)
    # None toggles the grid state. It is passed positionally because the `b`
    # keyword was renamed to `visible` in newer matplotlib.
    ax.grid(None)

# Save once, after every panel has been drawn, instead of on each iteration.
fig.savefig('edavioplots.pdf', bbox_inches='tight')

In [None]:
# Violin plot of energy for each target class.
# Only documented violinplot arguments are passed; `as_cmap` belongs to
# sns.color_palette, not to violinplot.
ax = sns.violinplot(x='is_hit', y='energy', data=sbdf, palette='RdYlBu')
ax.set_xlabel('Is Hit', fontsize=12)
ax.set_ylabel('Energy', fontsize=12)
ax.tick_params(labelsize=8)
plt.savefig('energy_violin.pdf')

In [None]:
# Violin plot of danceability for each target class.
# Only documented violinplot arguments are passed; `as_cmap` belongs to
# sns.color_palette, not to violinplot.
ax = sns.violinplot(x='is_hit', y='danceability', data=sbdf, palette='RdYlBu')
ax.set_xlabel('Is Hit', fontsize=16)
ax.set_ylabel('Danceability', fontsize=16)
ax.tick_params(labelsize=16)
# None toggles the grid state. It is passed positionally because the `b`
# keyword was renamed to `visible` in newer matplotlib.
plt.grid(None)
plt.tight_layout()
plt.savefig('dance_violin.pdf')

In [None]:
# Violin plot of speechiness for each target class.
# Only documented violinplot arguments are passed; `as_cmap` belongs to
# sns.color_palette, not to violinplot.
ax = sns.violinplot(x='is_hit', y='speechiness', data=sbdf, palette='RdYlBu')
ax.set_xlabel('Is Hit', fontsize=16)
ax.set_ylabel('Speechiness', fontsize=16)
ax.tick_params(labelsize=16)
# None toggles the grid state. It is passed positionally because the `b`
# keyword was renamed to `visible` in newer matplotlib.
plt.grid(None)
plt.tight_layout()
plt.savefig('speechiness_violin.pdf')

In [None]:
# Violin plot of loudness for each target class.
# Only documented violinplot arguments are passed; `as_cmap` belongs to
# sns.color_palette, not to violinplot.
ax = sns.violinplot(x='is_hit', y='loudness', data=sbdf,
                    palette=['darkorange', 'steelblue'])
ax.set_xlabel('Is Hit', fontsize=16)
ax.set_ylabel('Loudness', fontsize=16)
ax.tick_params(labelsize=16)
plt.tight_layout()
plt.savefig('loudness_violin.pdf')

In [None]:
# Violin plot of tempo for each target class.
# Only documented violinplot arguments are passed; `as_cmap` belongs to
# sns.color_palette, not to violinplot.
ax = sns.violinplot(x='is_hit', y='tempo', data=sbdf, palette='RdYlBu')
ax.set_xlabel('Is Hit', fontsize=10)
ax.set_ylabel('Tempo', fontsize=10)
ax.tick_params(labelsize=8)
# None toggles the grid state. It is passed positionally because the `b`
# keyword was renamed to `visible` in newer matplotlib.
plt.grid(None)
plt.savefig('tempo_violin.pdf')

In [None]:
# Violin plot of valence for each target class.
# Only documented violinplot arguments are passed; `as_cmap` belongs to
# sns.color_palette, not to violinplot.
ax = sns.violinplot(x='is_hit', y='valence', data=sbdf, palette='RdYlBu')
ax.set_xlabel('Is Hit', fontsize=10)
ax.set_ylabel('Valence', fontsize=10)
ax.tick_params(labelsize=8)
# None toggles the grid state. It is passed positionally because the `b`
# keyword was renamed to `visible` in newer matplotlib.
plt.grid(None)
plt.savefig('valence_violin.pdf')

In [None]:
# Violin plot of acousticness for each target class.
# Only documented violinplot arguments are passed; `as_cmap` belongs to
# sns.color_palette, not to violinplot.
ax = sns.violinplot(x='is_hit', y='acousticness', data=sbdf,
                    palette=['darkorange', 'steelblue'])
ax.set_xlabel('Is Hit', fontsize=16)
ax.set_ylabel('Acousticness', fontsize=16)
ax.tick_params(labelsize=16)
plt.tight_layout()
plt.savefig('acousticness_violin.pdf')

In [None]:
# Violin plot of track length (seconds) for each target class.
# Only documented violinplot arguments are passed; `as_cmap` belongs to
# sns.color_palette, not to violinplot.
ax = sns.violinplot(x='is_hit', y='track_seconds', data=sbdf, palette='RdYlBu')
ax.set_xlabel('Is Hit', fontsize=12)
ax.set_ylabel('Track Seconds', fontsize=12)
ax.tick_params(labelsize=8)
# None toggles the grid state. It is passed positionally because the `b`
# keyword was renamed to `visible` in newer matplotlib.
plt.grid(None)
plt.savefig('track_seconds_violin.pdf')

In [None]:
# Violin plot of key for each target class.
# Only documented violinplot arguments are passed; `as_cmap` belongs to
# sns.color_palette, not to violinplot.
ax = sns.violinplot(x='is_hit', y='key', data=sbdf, palette='RdYlBu')
ax.set_xlabel('Is Hit', fontsize=10)
# Explicit label: a loop variable left over from an earlier cell would name a
# different feature here.
ax.set_ylabel('Key', fontsize=10)
ax.tick_params(labelsize=8)
# None toggles the grid state. It is passed positionally because the `b`
# keyword was renamed to `visible` in newer matplotlib.
plt.grid(None)
plt.savefig('key_violin.pdf')

In [None]:
# Violin plot of mode for each target class.
# Only documented violinplot arguments are passed; `as_cmap` belongs to
# sns.color_palette, not to violinplot.
ax = sns.violinplot(x='is_hit', y='mode', data=sbdf, palette='RdYlBu')
ax.set_xlabel('Is Hit', fontsize=10)
ax.set_ylabel('Mode', fontsize=10)
ax.tick_params(labelsize=8)
# None toggles the grid state. It is passed positionally because the `b`
# keyword was renamed to `visible` in newer matplotlib.
plt.grid(None)
plt.savefig('mode_violin.pdf')

In [None]:
# Violin plot of instrumentalness for each target class.
# Only documented violinplot arguments are passed; `as_cmap` belongs to
# sns.color_palette, not to violinplot.
ax = sns.violinplot(x='is_hit', y='instrumentalness', data=sbdf,
                    palette=['darkorange', 'steelblue'])
ax.set_xlabel('Is Hit', fontsize=16)
ax.set_ylabel('Instrumentalness', fontsize=16)
ax.tick_params(labelsize=16)
plt.tight_layout()
plt.savefig('inst_violin.pdf')

**Check for correlated variables**

In [None]:
# Pairwise correlations of the numeric/boolean columns only.
# Selecting them explicitly keeps this working on pandas >= 2.0, where
# DataFrame.corr() no longer drops non-numeric columns by itself.
df_corr = sbdf.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Heatmap of the correlation matrix held in `df_corr`.
f, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(df_corr, ax=ax, annot=False, linewidths=.5)
plt.tight_layout()
plt.savefig('corrheatmap.png')

In [None]:
# Final summary statistics of `sbdf`.
sbdf.describe()