# Predict Eurovision using Audio Features

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
import os.path
import json
import pandas as pd
from random import randint
import numpy as np

from patsy import dmatrices
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split

In [4]:
# read songs info from file
tracks = []
track_ids_read = []
if os.path.isfile('available_tracks_info.json'):
    json_data=open('available_tracks_info.json').read()
    tracks = json.loads(json_data)
    track_ids_read = [t['id'] for t in tracks]
    print("{} tracks read".format(len(track_ids_read)))

# Create pandas dataframe
tracks_df = pd.DataFrame.from_dict(tracks).drop(['analysis_url', 'track_href', 'uri', 'id'], 1)
tracks_df[['Rank','Points']] = tracks_df[['Rank','Points']].apply(pd.to_numeric)

491 tracks read


## Linear Regression predicting Rank

In [20]:
# set the features to analyze in the model
features = ['acousticness', 'danceability', 'energy',\
           'instrumentalness', 'key', 'liveness', 'loudness', 'mode',\
           'speechiness', 'tempo', 'time_signature', 'valence']
features_string = ' + '.join(features)

# create input matrix and outut array
y, X = dmatrices('Rank ~ {}'.format(features_string), tracks_df, return_type = 'dataframe')

# sklearn split
X_train, X_test, y_train, y_test = train_test_split(StandardScaler().fit_transform(X), y, test_size=0.3, random_state=randint(0,1000))
print('X_train: {}\nX_test:  {}\ny_train: {}\ny_test:  {}\n'.format(len(X_train),len(X_test),len(y_train),len(y_test)))

# Linear Regression model with sklearn
model = LinearRegression(fit_intercept = True, normalize = False, copy_X=True)
regressor = model.fit(X_train, y_train.values.ravel())

# print results
print("Score train: {}".format(regressor.score(X_train, y_train)))
print("Score test:  {}\n".format(regressor.score(X_test, y_test)))

# print feature relationship
features_tmp = np.insert(features,0,'intercept')
print(pd.DataFrame(list(zip(features_tmp, model.coef_.ravel())), columns=['features','coefs']))

# predict test
pd.DataFrame(list(zip(model.predict(X_test), y_test.values.ravel())), columns=['predicted','real']).sort_values('predicted', ascending=True)

X_train: 343
X_test:  148
y_train: 343
y_test:  148

Score train: 0.11324834858373589
Score test:  -0.02123805796681366

            features     coefs
0          intercept  0.000000
1       acousticness -1.456328
2       danceability  1.088850
3             energy -0.538561
4   instrumentalness  0.533639
5                key -0.076811
6           liveness -0.119143
7           loudness  0.931903
8               mode -0.576098
9        speechiness -0.203410
10             tempo -0.053443
11    time_signature -0.053582
12           valence -1.731178


Unnamed: 0,predicted,real
44,4.426211,15.0
29,5.267160,8.0
92,5.877629,1.0
0,5.888678,9.0
28,5.915975,3.0
27,6.152553,7.0
69,6.165789,10.0
56,6.251600,1.0
17,6.484445,14.0
4,6.801565,1.0


## Logistic Regression Classification of TopN

In [101]:
# create label 'isTopN'
tracks_df['isTop5'] = tracks_df.apply(lambda r: 1 if r['Rank']<=5 else 0, axis=1)
tracks_df['isTop10'] = tracks_df.apply(lambda r: 1 if r['Rank']<=10 else 0, axis=1)

In [158]:
isTopN = 'isTop10'

# set the features to analyze in the model
features = ['acousticness']#, 'danceability']#, 'energy',\
           #'instrumentalness', 'key', 'liveness', 'loudness', 'mode',\
           #'speechiness', 'tempo', 'time_signature', 'valence']
features_string = ' + '.join(features)

# create the standardscaler object
scaler = StandardScaler()

# create input matrix and outut array
y, X = dmatrices('{} ~ {}'.format(isTopN, features_string), tracks_df, return_type = 'dataframe')

# normalize features
X_norm = pd.DataFrame(scaler.fit_transform(X))
X_norm[0] = 1 # set intercept back to 1 (scaler sets it to 0 because of 0 variance)

# sklearn split
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.2, random_state=randint(0,1000))
print('X_train: {}\nX_test:  {}\ny_train: {}\ny_test:  {}\n'.format(len(X_train),len(X_test),len(y_train),len(y_test)))

# Linear Regression model with sklearn
regularization = 0.001
model = LogisticRegression(fit_intercept = True, C = 1/regularization)
regressor = model.fit(X_train, y_train.values.ravel())

# predict test
y_test_pred = regressor.predict(X_test)

# print scores
print('Classification performance metrics')
print('Accuracy: {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print('F1 Score: {}'.format(metrics.f1_score(y_test, y_test_pred)))
print('ROC AUC:  {}\n'.format(metrics.roc_auc_score(y_test, y_test_pred)))

# print feature relationship
features_tmp = np.insert(features,0,'intercept')
print(pd.DataFrame(list(zip(features_tmp, model.coef_.ravel())), columns=['features','coefs']))

# compute the new predicted score using the feature weights modeled in Logistic Regression
tracks_df['euroscore'] = np.dot(X_norm, model.coef_.T)

# Ranking
print('\nEuroscore performance metrics')
print("Accuracy Top 5: {}%".format(5*tracks_df.sort_values(by = 'euroscore', ascending = False)[0:20]['isTop5'].sum()))
print("Accuracy Top 10: {}%".format(5*tracks_df.sort_values(by = 'euroscore', ascending = False)[0:20]['isTop10'].sum()))

X_train: 392
X_test:  99
y_train: 392
y_test:  99

Classification performance metrics
Accuracy: 0.5656565656565656
F1 Score: 0.6446280991735538
ROC AUC:  0.5430084745762712

       features     coefs
0     intercept  0.022324
1  acousticness  0.425618

Euroscore performance metrics
Accuracy Top 5: 50%
Accuracy Top 10: 90%


## Predict topN of artists songs

In [176]:
# Create spotify wrapper object
from spotify_wrapper import SpotifyWrapper
sp = SpotifyWrapper(client_id = '9b23e599c83f45d9a93e6559d79b3f4a', 
                    client_secret = '710a69f7a7984a6c9c71c1682c46d7b5')

In [178]:
# read artists info to compare to eurovision songs
artists_tracks_full_info = []
artists_read = []
if os.path.isfile('../festivals2018/tracks_full_info.json'):
    json_data=open('../festivals2018/tracks_full_info.json').read()
    artists_tracks_full_info = json.loads(json_data)
    artists_read = set([sp.remove_accents(a['artist_name'].lower()) for a in artists_tracks_full_info])
    print("{} artists read - {} tracks read".format(len(artists_read),len(artists_tracks_full_info)))

# Create pandas dataframe
artists_tracks_df = pd.DataFrame.from_dict(artists_tracks_full_info).drop(['analysis_url', 'album_id', 'artist_id', 'track_href', 'uri', 'type', 'id'], 1)

59 artists read - 9646 tracks read


In [183]:
songs_to_predict = artists_tracks_df[artists_tracks_df['artist_name']=='Pau Vallvé'][features]
songs_to_predict['intercept'] = 1

In [184]:
scaler.transform(songs_to_predict)

array([[-0.118     ,  2.03422733],
       [-0.022     ,  2.03422733],
       [-0.02      ,  2.03422733],
       [-0.014     ,  2.03422733],
       [-0.199     ,  2.03422733],
       [-0.015     ,  2.03422733],
       [-0.005     ,  2.03422733],
       [-0.024     ,  2.03422733],
       [-0.145     ,  2.03422733],
       [-0.04      ,  2.03422733],
       [-0.138     ,  2.03422733],
       [-0.407     ,  2.03422733],
       [-0.302     ,  2.03422733],
       [-0.885     ,  2.03422733],
       [-0.9744    ,  2.03422733],
       [-0.589     ,  2.03422733],
       [-0.392     ,  2.03422733],
       [-0.703     ,  2.03422733],
       [-0.624     ,  2.03422733],
       [-0.136     ,  2.03422733],
       [-0.232     ,  2.03422733],
       [-0.9009    ,  2.03422733],
       [-0.096     ,  2.03422733],
       [-0.9405    ,  2.03422733],
       [-0.423     ,  2.03422733],
       [-0.303     ,  2.03422733],
       [-0.415     ,  2.03422733],
       [-0.602     ,  2.03422733],
       [-0.141     ,

## Predict 2018 topN

In [None]:
# download 2018 songs info (create list, ids, audio features)

# predict score using trained model