# Predict Eurovision using Audio Features

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
import os.path
import json
import pandas as pd
from random import randint
import numpy as np

from patsy import dmatrices
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split

In [4]:
# read songs info from file
tracks = []
track_ids_read = []
if os.path.isfile('available_tracks_info.json'):
    json_data=open('available_tracks_info.json').read()
    tracks = json.loads(json_data)
    track_ids_read = [t['id'] for t in tracks]
    print("{} tracks read".format(len(track_ids_read)))

# Create pandas dataframe
tracks_df = pd.DataFrame.from_dict(tracks).drop(['analysis_url', 'track_href', 'uri', 'id'], 1)
tracks_df[['Rank','Points']] = tracks_df[['Rank','Points']].apply(pd.to_numeric)

491 tracks read


## Linear Regression predicting Rank

In [20]:
# set the features to analyze in the model
features = ['acousticness', 'danceability', 'energy',\
           'instrumentalness', 'key', 'liveness', 'loudness', 'mode',\
           'speechiness', 'tempo', 'time_signature', 'valence']
features_string = ' + '.join(features)

# create input matrix and outut array
y, X = dmatrices('Rank ~ {}'.format(features_string), tracks_df, return_type = 'dataframe')

# sklearn split
X_train, X_test, y_train, y_test = train_test_split(StandardScaler().fit_transform(X), y, test_size=0.3, random_state=randint(0,1000))
print('X_train: {}\nX_test:  {}\ny_train: {}\ny_test:  {}\n'.format(len(X_train),len(X_test),len(y_train),len(y_test)))

# Linear Regression model with sklearn
model = LinearRegression(fit_intercept = True, normalize = False, copy_X=True)
regressor = model.fit(X_train, y_train.values.ravel())

# print results
print("Score train: {}".format(regressor.score(X_train, y_train)))
print("Score test:  {}\n".format(regressor.score(X_test, y_test)))

# print feature relationship
features_tmp = np.insert(features,0,'intercept')
print(pd.DataFrame(list(zip(features_tmp, model.coef_.ravel())), columns=['features','coefs']))

# predict test
pd.DataFrame(list(zip(model.predict(X_test), y_test.values.ravel())), columns=['predicted','real']).sort_values('predicted', ascending=True)

X_train: 343
X_test:  148
y_train: 343
y_test:  148

Score train: 0.11324834858373589
Score test:  -0.02123805796681366

            features     coefs
0          intercept  0.000000
1       acousticness -1.456328
2       danceability  1.088850
3             energy -0.538561
4   instrumentalness  0.533639
5                key -0.076811
6           liveness -0.119143
7           loudness  0.931903
8               mode -0.576098
9        speechiness -0.203410
10             tempo -0.053443
11    time_signature -0.053582
12           valence -1.731178


Unnamed: 0,predicted,real
44,4.426211,15.0
29,5.267160,8.0
92,5.877629,1.0
0,5.888678,9.0
28,5.915975,3.0
27,6.152553,7.0
69,6.165789,10.0
56,6.251600,1.0
17,6.484445,14.0
4,6.801565,1.0


## Logistic Regression Classification of TopN

In [5]:
# create label 'isTopN'
tracks_df['isTop5'] = tracks_df.apply(lambda r: 1 if r['Rank']<=5 else 0, axis=1)
tracks_df['isTop10'] = tracks_df.apply(lambda r: 1 if r['Rank']<=10 else 0, axis=1)

In [41]:
isTopN = 'isTop10'

# set the features to analyze in the model
features = ['acousticness', 'danceability', 'energy',\
           'instrumentalness', 'key', 'liveness', 'loudness', 'mode',\
           'speechiness', 'tempo', 'time_signature', 'valence']
features_string = ' + '.join(features)

# create the standardscaler object
scaler = StandardScaler()

# create input matrix and outut array
y, X = dmatrices('{} ~ {}'.format(isTopN, features_string), tracks_df, return_type = 'dataframe')
X = scaler.fit_transform(X)

# sklearn split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=randint(0,1000))
print('X_train: {}\nX_test:  {}\ny_train: {}\ny_test:  {}\n'.format(len(X_train),len(X_test),len(y_train),len(y_test)))

# Linear Regression model with sklearn
regularization = 0.001
model = LogisticRegression(fit_intercept = False, C = 1/regularization)
regressor = model.fit(X_train, y_train.values.ravel())

# predict test
y_test_pred = regressor.predict(X_test)

# print scores
print('Classification performance metrics')
print('Accuracy: {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print('F1 Score: {}'.format(metrics.f1_score(y_test, y_test_pred)))
print('ROC AUC:  {}\n'.format(metrics.roc_auc_score(y_test, y_test_pred)))

# print feature relationship
features_tmp = np.insert(features,0,'intercept')
print(pd.DataFrame(list(zip(features_tmp, model.coef_.ravel())), columns=['features','coefs']))

# compute the new predicted score using the feature weights modeled in Logistic Regression
tracks_df['euroscore'] = np.dot(X, model.coef_.T)

# Ranking
print('\nEuroscore performance metrics')
print("Accuracy Top 5: {}%".format(5*tracks_df.sort_values(by = 'euroscore', ascending = False)[0:20]['isTop5'].sum()))
print("Accuracy Top 10: {}%".format(5*tracks_df.sort_values(by = 'euroscore', ascending = False)[0:20]['isTop10'].sum()))

X_train: 392
X_test:  99
y_train: 392
y_test:  99

Classification performance metrics
Accuracy: 0.5555555555555556
F1 Score: 0.5686274509803921
ROC AUC:  0.5561936013125514

            features     coefs
0          intercept  0.000000
1       acousticness  0.514583
2       danceability -0.141615
3             energy  0.340924
4   instrumentalness -0.063540
5                key  0.084567
6           liveness -0.001373
7           loudness -0.408644
8               mode  0.152556
9        speechiness -0.035908
10             tempo  0.050192
11    time_signature  0.002133
12           valence  0.212019

Euroscore performance metrics
Accuracy Top 5: 55%
Accuracy Top 10: 75%


## Predict topN of artists songs

In [39]:
def ComputeEuroscore(song, features_coefs, scaler):

    euroscore = features_coefs['intercept']
    for c in coefs:
        euroscore = euroscore + song[c]*features_coefs[c]
    return euroscore

In [120]:
# Create spotify wrapper object
from spotify_wrapper import SpotifyWrapper
sp = SpotifyWrapper(client_id = '9b23e599c83f45d9a93e6559d79b3f4a', 
                    client_secret = '710a69f7a7984a6c9c71c1682c46d7b5')

In [44]:
# create a dict with features coreficients from the trained model
for f in features:
    

#features_coefs = dict((k,v) for k, v in list(zip(features,model.coef_[0][1:].T)))
#features_coefs['intercept'] = model.intercept_

acousticness
danceability
energy
instrumentalness
key
liveness
loudness
mode
speechiness
tempo
time_signature
valence


In [43]:
scaler.mean_
scaler.var_

array([0.00000000e+00, 9.40465148e-02, 2.20953709e-02, 4.38375892e-02,
       9.00347970e-03, 1.25573645e+01, 3.17780519e-02, 1.05106211e+01,
       2.26355457e-01, 3.32442315e-03, 7.60070073e+02, 1.21569099e-01,
       5.45487919e-02])

In [40]:
features = ['acousticness', 'danceability', 'energy',\
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',\
       'speechiness', 'tempo', 'time_signature', 'valence']

song = tracks_df.head(1)

StandardScaler().fit_transform(np.array(song[features]))

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [130]:
np.array(song[features])

array([[ 8.51000e-01,  3.74000e-01,  2.55000e-01,  1.01000e-06,
         1.10000e+01,  2.72000e-01, -8.28400e+00,  1.00000e+00,
         3.91000e-02,  1.81982e+02,  3.00000e+00,  4.19000e-01]])

In [124]:
tracks_df.head(1).apply(ComputeEuroscore, axis=1)

0    32.731279
dtype: float64

In [99]:
artists_tracks_df[artists_tracks_df['artist_name']=='Pau Vallvé'].apply(ComputeEuroscore, axis=1)

483    27.780717
484    25.790164
485    27.392792
486    28.955497
487    30.266698
488    26.562862
489    24.946844
490    33.875635
491    26.405100
492    27.583402
493    18.022944
494    23.824488
495    26.686464
496    19.746158
497    24.417429
498    25.346401
499    20.042499
500    23.613063
501    19.798637
502    20.422381
503    20.971725
504    19.967394
505    24.436617
506    24.151689
507    16.315135
508    21.803931
509    20.587734
510    19.442168
511    20.936063
512    18.496287
         ...    
564    27.202829
565    29.798003
566    30.903621
567    25.406760
568    22.192558
569    35.692877
570    25.115792
571    18.324071
572    28.551344
573    24.991248
574    19.038795
575    21.859161
576    24.904849
577    18.751769
578    23.068575
579    27.297919
580    20.799614
581    24.201924
582    21.275910
583    26.636888
584    29.359970
585    22.455182
586    22.462045
587    27.877808
588    31.272191
589    22.150145
590    31.100850
591    25.4899

In [121]:
# read artists info to compare to eurovision songs
artists_tracks_full_info = []
artists_read = []
if os.path.isfile('../festivals2018/tracks_full_info.json'):
    json_data=open('../festivals2018/tracks_full_info.json').read()
    artists_tracks_full_info = json.loads(json_data)
    artists_read = set([sp.remove_accents(a['artist_name'].lower()) for a in artists_tracks_full_info])
    print("{} artists read - {} tracks read".format(len(artists_read),len(artists_tracks_full_info)))

# Create pandas dataframe
artists_tracks_df = pd.DataFrame.from_dict(artists_tracks_full_info).drop(['analysis_url', 'album_id', 'artist_id', 'track_href', 'uri', 'type', 'id'], 1)

59 artists read - 9646 tracks read


## Predict 2018 topN

In [None]:
# download 2018 songs info (create list, ids, audio features)

# predict score using trained model