In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from matplotlib import pyplot as plt

In [2]:
# Load the per-track audio features; the `album` column says which album each track is on.
af = pd.read_csv('audio_features.csv')
# NOTE(review): the preview shows leftover `Unnamed: 0` / `Unnamed: 0.1` columns, presumably
# row indexes written out by earlier `to_csv` calls -- confirm they can be dropped.
af.head()

Unnamed: 0.1,Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,album
0,0,0.735,0.444,10,-10.519,1,0.0684,0.204,0.0012,0.17,0.0984,97.038,audio_features,4g2c7NoTWAOSYDy44l9nub,spotify:track:4g2c7NoTWAOSYDy44l9nub,https://api.spotify.com/v1/tracks/4g2c7NoTWAOS...,https://api.spotify.com/v1/audio-analysis/4g2c...,202396,4,midnights
1,1,0.658,0.378,7,-8.3,1,0.0379,0.0593,0.0,0.0976,0.0382,108.034,audio_features,199E1RRrVmVTQqBXih5qRC,spotify:track:199E1RRrVmVTQqBXih5qRC,https://api.spotify.com/v1/tracks/199E1RRrVmVT...,https://api.spotify.com/v1/audio-analysis/199E...,218271,4,midnights
2,2,0.638,0.634,4,-6.582,1,0.0457,0.133,1e-06,0.152,0.519,96.953,audio_features,02Zkkf2zMkwRGQjZ7T4p8f,spotify:track:02Zkkf2zMkwRGQjZ7T4p8f,https://api.spotify.com/v1/tracks/02Zkkf2zMkwR...,https://api.spotify.com/v1/audio-analysis/02Zk...,200690,4,midnights
3,3,0.659,0.323,9,-13.425,1,0.0436,0.735,0.00321,0.116,0.154,110.007,audio_features,6ADDIJxxqzM9LMpm78yzQG,spotify:track:6ADDIJxxqzM9LMpm78yzQG,https://api.spotify.com/v1/tracks/6ADDIJxxqzM9...,https://api.spotify.com/v1/audio-analysis/6ADD...,256124,4,midnights
4,4,0.694,0.38,2,-10.307,1,0.0614,0.416,8e-06,0.126,0.376,120.044,audio_features,7gVWKBcfIW93YxNBi3ApIE,spotify:track:7gVWKBcfIW93YxNBi3ApIE,https://api.spotify.com/v1/tracks/7gVWKBcfIW93...,https://api.spotify.com/v1/audio-analysis/7gVW...,194207,4,midnights


In [3]:
# Valence of every track on each album: the two independent samples to compare.
mid_val = af.loc[af['album'] == 'midnights', 'valence']
lon_val = af.loc[af['album'] == 'loneliest', 'valence']

In [4]:
# Independent two-sample t-test on valence. `equal_var=True` (SciPy's default) is the
# pooled-variance Student's t-test; the alternative hypothesis is two-sided by default.
result1 = stats.ttest_ind(mid_val, lon_val, equal_var=True)

In [5]:
# t statistic. Its sign follows (mean of first sample - mean of second sample), so a
# negative value means the `midnights` mean valence is below the `loneliest` mean.
result1.statistic

-4.205937943601481

In [6]:
# Two-sided p-value (the `ttest_ind` default), rounded to 5 decimals for display.
round(result1.pvalue, 5)

0.00018

In [7]:
# Danceability of every track on each album: the two independent samples to compare.
mid_dance = af.loc[af['album'] == 'midnights', 'danceability']
lon_dance = af.loc[af['album'] == 'loneliest', 'danceability']

In [8]:
# Independent two-sample t-test on danceability. `equal_var=True` (SciPy's default) is the
# pooled-variance Student's t-test; the alternative hypothesis is two-sided by default.
result2 = stats.ttest_ind(mid_dance, lon_dance, equal_var=True)

In [9]:
# t statistic. A negative value means the `midnights` mean danceability is below the
# `loneliest` mean, because `midnights` was passed as the first sample.
result2.statistic

-2.1273893857382227

In [10]:
# Two-sided p-value (the `ttest_ind` default), shown unrounded.
result2.pvalue

0.04072264959445446

In [11]:
# Number of tracks for each (album, mode) pair, as a Series with a two-level index.
# Counting `id` counts the non-null ids in each group.
modexalbum_freq = af.groupby(['album', 'mode'])['id'].count()

In [12]:
# One-way chi-square (goodness-of-fit) test: with no `f_exp`, SciPy compares the counts
# against equal expected frequencies, treating every album/mode cell as its own category.
# This is not a test of association between album and mode; the notebook revisits it below.
result3 = stats.chisquare(modexalbum_freq)

In [13]:
# Chi-square statistic of the one-way test on the four album/mode counts.
result3.statistic

18.22222222222222

In [14]:
# p-value of the one-way test on the four album/mode counts.
result3.pvalue

0.0003957850029937146

In [15]:
# Number of tracks for each (album, key) pair; pairs that never occur are simply absent.
af.groupby(['album', 'key'])['id'].count()

album      key
loneliest  0      2
           1      1
           2      2
           4      1
           5      1
           6      3
           7      2
           8      2
           9      2
midnights  0      4
           2      1
           4      3
           5      1
           6      1
           7      5
           8      1
           9      1
           10     2
           11     1
Name: id, dtype: int64

In [16]:
# One-way chi-square test over the flat list of (album, key) counts: every observed pair is
# treated as a separate category with equal expected frequency, not as a two-way table.
stats.chisquare(af.groupby(['album', 'key'])['id'].count())

Power_divergenceResult(statistic=12.555555555555557, pvalue=0.8172706096483118)

#### A slight correction to the chi-square test example
The `stats.chisquare()` function takes a single column (an array) of counts and treats each cell in the column as a separate category of one variable (a goodness-of-fit test). However, in our data there are actually two variables (the album and the mode), and each cell represents a combination of one album and one mode.
So, instead of `stats.chisquare()` we need to use `stats.chi2_contingency()` which accepts a multi-dimensional array (rows and columns).
A contingency table like the one we drew on the board can be created using the `unstack()` method.
If we pass this contingency table to `stats.chisquare()`, we get a pair of results, one for each column in the table, instead of a single result for the whole table.

In [17]:
# Pivot the `album` index level into columns: rows are modes, columns are albums,
# which gives the 2x2 contingency table.
modexalbum_freq.unstack('album')

album,loneliest,midnights
mode,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4,2
1,12,18


In [18]:
# Given a 2-D table, `chisquare` works along axis 0 by default, so this runs a separate
# one-way test for each album column -- hence two statistics and two p-values.
stats.chisquare(modexalbum_freq.unstack('album'))

Power_divergenceResult(statistic=array([ 4. , 12.8]), pvalue=array([0.04550026, 0.00034662]))

In [19]:
# Same call on the transposed layout (modes as columns): again one separate one-way test
# per column, this time one per mode.
stats.chisquare(modexalbum_freq.unstack('mode'))

Power_divergenceResult(statistic=array([0.66666667, 1.2       ]), pvalue=array([0.41421618, 0.27332168]))

#### This is how these tests should be run

In [20]:
# Chi-square test of independence between mode and album on the 2x2 contingency table.
# `correction=True` is SciPy's default and applies Yates' continuity correction, which
# only takes effect when the table has one degree of freedom, as this 2x2 table does.
corrected_result3 = stats.chi2_contingency(modexalbum_freq.unstack('album'), correction=True)

In [21]:
# Chi-square statistic of the independence test (includes the continuity correction).
corrected_result3.statistic

0.5625

In [22]:
# p-value of the independence test, rounded to 3 decimals for display.
round(corrected_result3.pvalue, 3)

0.453

In [23]:
# Album-by-key contingency table. Album/key pairs that never occur have no entry in the
# grouped counts, so `unstack` leaves those cells as NaN rather than 0.
keyxalbum_freq = af.groupby(['album', 'key'])['id'].count().unstack('key')
keyxalbum_freq

key,0,1,2,4,5,6,7,8,9,10,11
album,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
loneliest,2.0,1.0,2.0,1.0,1.0,3.0,2.0,2.0,2.0,,
midnights,4.0,,1.0,3.0,1.0,1.0,5.0,1.0,1.0,2.0,1.0


In [24]:
# Run the independence test on the table as-is; the NaN cells carry through to the result.
result4 = stats.chi2_contingency(keyxalbum_freq)

In [25]:
# Show the statistic and p-value together as a tuple.
result4.statistic, result4.pvalue

(nan, nan)

The `nan` result occurs because the contingency table contains missing values (`NaN`) for the album–key combinations that have no songs. These missing values need to be replaced with 0s, since 0 is the actual number of observations in those cells.

In [26]:
# Empty album/key cells mean zero tracks were observed, so fill the NaNs with 0.
# This returns a new frame; `keyxalbum_freq` itself is left unchanged.
keyxalbum_freq.fillna(0)

key,0,1,2,4,5,6,7,8,9,10,11
album,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
loneliest,2.0,1.0,2.0,1.0,1.0,3.0,2.0,2.0,2.0,0.0,0.0
midnights,4.0,0.0,1.0,3.0,1.0,1.0,5.0,1.0,1.0,2.0,1.0


In [27]:
# Independence test between album and key, with the empty cells counted as 0.
# NOTE(review): every cell count in this table is 5 or less, so the chi-square
# approximation may be unreliable here -- confirm before relying on the p-value.
corrected_result4 = stats.chi2_contingency(keyxalbum_freq.fillna(0))

In [28]:
# Chi-square statistic of the album-by-key independence test.
corrected_result4.statistic

8.614285714285716

In [29]:
 # p-value of the album-by-key independence test, rounded to 3 decimals for display.
 round(corrected_result4.pvalue, 3)

0.569