# Predict Eurovision using Audio Features

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [32]:
import os.path
import json
import pandas as pd
from random import randint
import numpy as np

from patsy import dmatrices
from sklearn.linear_model import LinearRegression, LogisticRegression
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split

In [33]:
# read songs info from file
tracks = []
track_ids_read = []
if os.path.isfile('available_tracks_info.json'):
    json_data=open('available_tracks_info.json').read()
    tracks = json.loads(json_data)
    track_ids_read = [t['id'] for t in tracks]
    print("{} tracks read".format(len(track_ids_read)))

# Create pandas dataframe
tracks_df = pd.DataFrame.from_dict(tracks).drop(['analysis_url', 'track_href', 'uri', 'id'], 1)
tracks_df[['Rank','Points']] = tracks_df[['Rank','Points']].apply(pd.to_numeric)

491 tracks read


In [35]:
# set the features to analyze in the model
features = ['acousticness', 'danceability', 'energy',\
           'instrumentalness', 'key', 'liveness', 'loudness', 'mode',\
           'speechiness', 'tempo', 'time_signature', 'valence']
features_string = ' + '.join(features)

# create input matrix and outut array
y, X = dmatrices('Rank ~ {}'.format(features_string), tracks_df, return_type = 'dataframe')

# sklearn split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=randint(0,1000))
print('X_train: {}\nX_test:  {}\ny_train: {}\ny_test:  {}\n'.format(len(X_train),len(X_test),len(y_train),len(y_test)))

# Linear Regression model with sklearn
model = LinearRegression(fit_intercept = True, normalize = False, copy_X=True)
regressor = model.fit(X_train, y_train.values.ravel())

# print results
print("Score: {}\n".format(regressor.score(X_test, y_test)))

# print feature relationship
features_tmp = np.insert(features,0,'intercept')
print(pd.DataFrame(list(zip(features_tmp, model.coef_.ravel())), columns=['features','coefs']))

# predict test
pd.DataFrame(list(zip(model.predict(X_test), y_test.values.ravel())), columns=['predicted','real']).sort_values('predicted', ascending=True)

X_train: 343
X_test:  148
y_train: 343
y_test:  148

Score: 0.03811761332885688

            features     coefs
0          intercept  0.000000
1       acousticness -4.894810
2       danceability  3.462243
3             energy -6.594796
4   instrumentalness  5.828009
5                key -0.007914
6           liveness  1.056405
7           loudness  0.470123
8               mode -1.124471
9        speechiness  9.883422
10             tempo  0.002920
11    time_signature  0.909413
12           valence -2.771554


Unnamed: 0,predicted,real
141,5.927683,1.0
78,6.828107,10.0
64,6.847310,4.0
1,6.872977,3.0
31,7.282745,10.0
6,7.381044,14.0
5,7.569342,7.0
117,7.733540,6.0
138,7.767730,1.0
73,7.866931,4.0
