# Predict Eurovision using Audio Features

In [76]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [77]:
import os.path
import json
import pandas as pd
from random import randint
import numpy as np

from patsy import dmatrices, dmatrix
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split

In [78]:
# read songs info from file
tracks = []
track_ids_read = []
if os.path.isfile('available_tracks_info.json'):
    json_data=open('available_tracks_info.json').read()
    tracks = json.loads(json_data)
    track_ids_read = [t['id'] for t in tracks]
    print("{} tracks read".format(len(track_ids_read)))

# Create pandas dataframe
tracks_df = pd.DataFrame.from_dict(tracks).drop(['analysis_url', 'track_href', 'uri', 'id'], 1)
tracks_df[['Rank','Points']] = tracks_df[['Rank','Points']].apply(pd.to_numeric)

491 tracks read


## Linear Regression predicting Rank

In [20]:
# set the features to analyze in the model
features = ['acousticness', 'danceability', 'energy',\
           'instrumentalness', 'key', 'liveness', 'loudness', 'mode',\
           'speechiness', 'tempo', 'time_signature', 'valence']
features_string = ' + '.join(features)

# create input matrix and outut array
y, X = dmatrices('Rank ~ {}'.format(features_string), tracks_df, return_type = 'dataframe')

# sklearn split
X_train, X_test, y_train, y_test = train_test_split(StandardScaler().fit_transform(X), y, test_size=0.3, random_state=randint(0,1000))
print('X_train: {}\nX_test:  {}\ny_train: {}\ny_test:  {}\n'.format(len(X_train),len(X_test),len(y_train),len(y_test)))

# Linear Regression model with sklearn
model = LinearRegression(fit_intercept = True, normalize = False, copy_X=True)
regressor = model.fit(X_train, y_train.values.ravel())

# print results
print("Score train: {}".format(regressor.score(X_train, y_train)))
print("Score test:  {}\n".format(regressor.score(X_test, y_test)))

# print feature relationship
features_tmp = np.insert(features,0,'intercept')
print(pd.DataFrame(list(zip(features_tmp, model.coef_.ravel())), columns=['features','coefs']))

# predict test
pd.DataFrame(list(zip(model.predict(X_test), y_test.values.ravel())), columns=['predicted','real']).sort_values('predicted', ascending=True)

X_train: 343
X_test:  148
y_train: 343
y_test:  148

Score train: 0.11324834858373589
Score test:  -0.02123805796681366

            features     coefs
0          intercept  0.000000
1       acousticness -1.456328
2       danceability  1.088850
3             energy -0.538561
4   instrumentalness  0.533639
5                key -0.076811
6           liveness -0.119143
7           loudness  0.931903
8               mode -0.576098
9        speechiness -0.203410
10             tempo -0.053443
11    time_signature -0.053582
12           valence -1.731178


Unnamed: 0,predicted,real
44,4.426211,15.0
29,5.267160,8.0
92,5.877629,1.0
0,5.888678,9.0
28,5.915975,3.0
27,6.152553,7.0
69,6.165789,10.0
56,6.251600,1.0
17,6.484445,14.0
4,6.801565,1.0


## Logistic Regression Classification of TopN

In [79]:
# create label 'isTopN'
tracks_df['isTop5'] = tracks_df.apply(lambda r: 1 if r['Rank']<=5 else 0, axis=1)
tracks_df['isTop10'] = tracks_df.apply(lambda r: 1 if r['Rank']<=10 else 0, axis=1)

In [207]:
isTopN = 'isTop5'

# set the features to analyze in the model
features = ['acousticness', 'danceability', 'energy',\
           'instrumentalness', 'key', 'liveness', 'loudness', 'mode',\
           'speechiness', 'tempo', 'time_signature', 'valence']
features_string = ' + '.join(features)

# create the standardscaler object
scaler = StandardScaler()

# create input matrix and outut array
y, X = dmatrices('{} ~ {}'.format(isTopN, features_string), tracks_df, return_type = 'dataframe')

# normalize features
X_norm = pd.DataFrame(scaler.fit_transform(X))
X_norm[0] = 1 # set intercept back to 1 (scaler sets it to 0 because of 0 variance)

# sklearn split
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.2, random_state=randint(0,1000))
print('X_train: {}\nX_test:  {}\ny_train: {}\ny_test:  {}\n'.format(len(X_train),len(X_test),len(y_train),len(y_test)))

# Linear Regression model with sklearn
regularization = 0.001
model = LogisticRegression(fit_intercept = True, C = 1/regularization)
regressor = model.fit(X_train, y_train.values.ravel())

# predict test
y_test_pred = regressor.predict(X_test)

# print scores
print('Classification performance metrics')
print('Accuracy: {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print('F1 Score: {}'.format(metrics.f1_score(y_test, y_test_pred)))
print('ROC AUC:  {}\n'.format(metrics.roc_auc_score(y_test, y_test_pred)))

# print feature relationship
features_tmp = np.insert(features,0,'intercept')
print(pd.DataFrame(list(zip(features_tmp, model.coef_.ravel())), columns=['features','coefs']))

# compute the new predicted score using the feature weights modeled in Logistic Regression
tracks_df['euroscore'] = np.dot(X_norm, model.coef_.T)

# Ranking
print('\nEuroscore performance metrics')
print("Accuracy Top 5: {}%".format(5*tracks_df.sort_values(by = 'euroscore', ascending = False)[0:20]['isTop5'].sum()))
print("Accuracy Top 10: {}%".format(5*tracks_df.sort_values(by = 'euroscore', ascending = False)[0:20]['isTop10'].sum()))

X_train: 392
X_test:  99
y_train: 392
y_test:  99

Classification performance metrics
Accuracy: 0.6565656565656566
F1 Score: 0.15000000000000002
ROC AUC:  0.5151515151515151

            features     coefs
0          intercept -0.452529
1       acousticness  0.356438
2       danceability -0.008212
3             energy  0.191979
4   instrumentalness -0.134044
5                key  0.021536
6           liveness -0.134357
7           loudness -0.275767
8               mode  0.094918
9        speechiness  0.084129
10             tempo  0.136408
11    time_signature -0.029344
12           valence  0.111954

Euroscore performance metrics
Accuracy Top 5: 55%
Accuracy Top 10: 75%


## Predict how artists songs would perform in Eurovision

In [166]:
# Create spotify wrapper object
from spotify_wrapper import SpotifyWrapper
sp = SpotifyWrapper(client_id = '9b23e599c83f45d9a93e6559d79b3f4a', 
                    client_secret = '710a69f7a7984a6c9c71c1682c46d7b5')

In [140]:
# read artists info to compare to eurovision songs
artists_tracks_full_info = []
artists_read = []
if os.path.isfile('../festivals2018/tracks_full_info.json'):
    json_data=open('../festivals2018/tracks_full_info.json').read()
    artists_tracks_full_info = json.loads(json_data)
    artists_read = set([sp.remove_accents(a['artist_name'].lower()) for a in artists_tracks_full_info])
    print("{} artists read - {} tracks read".format(len(artists_read),len(artists_tracks_full_info)))

# Create pandas dataframe
artists_tracks_df = pd.DataFrame.from_dict(artists_tracks_full_info).drop(['analysis_url', 'album_id', 'artist_id', 'track_href', 'uri', 'type', 'id'], 1)

59 artists read - 9646 tracks read


In [154]:
# create a dataframe with the songs to predict
#songs_to_predict = artists_tracks_df[artists_tracks_df['artist_name']=='Pau Vallvé']
songs_to_predict = artists_tracks_df[artists_tracks_df['artist_name']=='Bon Iver']

# create features matrix
X_new = dmatrix('{}'.format(features_string), songs_to_predict, return_type = 'dataframe')

# normalize features
X_new_norm = pd.DataFrame(scaler.transform(X_new))
X_new_norm[0] = 1 # set intercept back to 1 (scaler sets it to 0 because of 0 variance)

# preict topN
songs_to_predict[isTopN] = regressor.predict(X_new_norm)

# compute the new predicted score using the feature weights modeled in Logistic Regression
songs_to_predict['euroscore'] = np.dot(X_new_norm, model.coef_.T)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [155]:
songs_to_predict[['album_name', 'name', isTopN, 'euroscore']].sort_values(by=[isTopN, 'euroscore'], ascending=False).head(10)

Unnamed: 0,album_name,name,isTop10,euroscore
1912,"22, A Million",00000 Million,1.0,1.675253
1919,Bon Iver,Wash.,1.0,1.334506
1924,"For Emma, Forever Ago",Lump Sum,1.0,1.332698
1923,"For Emma, Forever Ago",Flume,1.0,1.189181
1917,Bon Iver,Michicant,1.0,1.062729
1903,"22, A Million",22 (OVER S∞∞N),1.0,0.978877
1928,"For Emma, Forever Ago",Creature Fear,1.0,0.954269
1927,"For Emma, Forever Ago",Blindsided,1.0,0.938964
1931,"For Emma, Forever Ago",re:stacks,1.0,0.903483
1911,"22, A Million",____45_____,1.0,0.899456


## Predict Eurovision 2018 topN

In [186]:
# read csv with all songs in finals 
import csv
songs = []
with open('2018_songs_with_ids_orig.csv') as csv_file:
    songs = [{k: v for k, v in row.items()} for row in csv.DictReader(csv_file, skipinitialspace=True)]

In [187]:
# count songs with id
count_songs_with_id = sum([ (1 if s['id']!='' else 0) for s in songs])
print('Count of songs with id:',count_songs_with_id)

# get spotify song ids
for i,s in enumerate(songs):
        
    # only search those songs without id
    if s['id']=='':
        
        # search song info
        song_info = sp.search_song(s['Artist'], s['Song'])
        
        # process received info
        if song_info == None:
            print('{} NOT FOUND: {} - {}'.format(i, s['Artist'], s['Song']))

        else:
            
            # sanity checks
            check_artist = 1 if sp.remove_accents(s['Artist'].lower()) == sp.remove_accents(song_info['artists'][0]['name'].lower()) else 0
            check_title = 1 if sp.remove_accents(s['Song'].lower()) == sp.remove_accents(song_info['name'].lower()) else 0
            
            print('{} FOUND {} {}: {} - {} | {} - {}'.format(i, check_artist, check_title, 
                                                            s['Artist'], s['Song'], 
                                                            song_info['artists'][0]['name'], song_info['name']))

            # store id in original object dictionary
            s['id'] = song_info['id']

            # store song with id in csv file
            keys = s.keys()
            with open('2018_songs_with_ids.csv', 'w') as output_file:
                w = csv.DictWriter(output_file, keys)
                w.writeheader()
                w.writerows(songs)

Count of songs with id: 0
0 FOUND 1 1: Aisel - X My Heart | Aisel - X My Heart
1 FOUND 1 1: Ari Ólafsson - Our Choice | Ari Ólafsson - Our Choice
2 FOUND 1 1: Eugent Bushpepa - Mall | Eugent Bushpepa - Mall
3 FOUND 1 1: Sennek - A Matter of Time | Sennek - A Matter Of Time
4 FOUND 1 1: Mikolas Josef - Lie to Me | Mikolas Josef - Lie to Me
5 FOUND 1 1: Ieva Zasimauskaitė - When We're Old | Ieva Zasimauskaitė - When We're Old
6 FOUND 1 1: Netta - Toy | Netta - Toy
7 FOUND 1 0: Alekseev - Forever | Alekseev - Forever - Eurovision Version
8 FOUND 1 1: Elina Nechayeva - La forza | Elina Nechayeva - La Forza
9 FOUND 1 0: Equinox - Bones | Equinox - Bones - Studio Version
10 FOUND 1 0: Eye Cue - Lost and Found | Eye Cue - Lost & Found
11 FOUND 1 1: Franka - Crazy | Franka - Crazy
12 FOUND 1 1: Cesár Sampson - Nobody but You | Cesar Sampson - Nobody But You
13 FOUND 1 1: Yianna Terzi - Oniro mou | Yianna Terzi - Oniro Mou
14 FOUND 1 1: Saara Aalto - Monsters | Saara Aalto - Monsters
15 FOUND 1

In [208]:
# read csv with all songs from 2018 (already with ids)
import csv
songs = []
with open('2018_songs_with_ids.csv') as csv_file:
    songs = [{k: v for k, v in row.items()} for row in csv.DictReader(csv_file, skipinitialspace=True)]
    
# create dataframe
songs_df = pd.DataFrame(songs)
available_songs = songs_df[songs_df['id'] != '']

In [209]:
# get audio features of songs
songs_full_info = list(available_songs.T.to_dict().values()) # list of dicts - dicts must have field 'id'
songs_full_info = sp.get_audio_features_of_lots_of_tracks(songs_full_info)

# store songs full info into file
with open('2018_songs_info.json', 'w') as fp:
    json.dump(songs_full_info, fp)

Api Call 0 from 0 to 19
Api Call 1 from 20 to 39
Api Call 2 from 40 to 42


In [210]:
# read from file if already downloaded
songs_full_info = []
track_ids_read = []
if os.path.isfile('2018_songs_info.json'):
    json_data=open('2018_songs_info.json').read()
    songs_full_info = json.loads(json_data)
    track_ids_read = [t['id'] for t in songs_full_info]
    print("{} songs read".format(len(track_ids_read)))
    
# Create pandas dataframe
songs_df = pd.DataFrame.from_dict(songs_full_info).drop(['analysis_url', 'track_href', 'uri', 'id'], 1)
songs_df[['Rank','Points']] = songs_df[['Rank','Points']].apply(pd.to_numeric)

43 songs read


In [211]:
# create a dataframe with the songs to predict
songs_to_predict = songs_df

# create features matrix
X_new = dmatrix('{}'.format(features_string), songs_to_predict, return_type = 'dataframe')

# normalize features
X_new_norm = pd.DataFrame(scaler.transform(X_new))
X_new_norm[0] = 1 # set intercept back to 1 (scaler sets it to 0 because of 0 variance)

# preict topN
songs_to_predict[isTopN] = regressor.predict(X_new_norm)

# compute the new predicted score using the feature weights modeled in Logistic Regression
songs_to_predict['euroscore'] = np.dot(X_new_norm, model.coef_.T)

In [212]:
songs_to_predict[['Country', isTopN, 'euroscore']].sort_values(by=[isTopN, 'euroscore'], ascending=False)

Unnamed: 0,Country,isTop5,euroscore
5,Lithuania,0.0,0.410783
39,Germany,0.0,0.168722
17,Ireland,0.0,0.002767
41,Spain,0.0,-0.086067
37,Portugal,0.0,-0.12758
19,Norway,0.0,-0.372494
2,Albania,0.0,-0.476215
40,Italy,0.0,-0.549892
33,Sweden,0.0,-0.650181
28,Georgia,0.0,-0.653562
