In [95]:
import pandas as pd

In [96]:
# Read the feature table and discard the 'Unnamed: 0' column in one chained
# expression (presumably a stale index column written by an earlier export --
# TODO confirm).
data = pd.read_csv('features.csv').drop(columns=['Unnamed: 0'])

## Split data into features and targets

In [97]:
# Separate the regression target ('Peak Position') from the predictor columns.
# `drop` returns a new frame, so `data` keeps all of its columns and this cell
# can be re-run safely. Aliasing `data` and deleting the column in place would
# strip it from `data` as well and make a second run raise KeyError.
targets = data['Peak Position']
features = data.drop(columns=['Peak Position'])

In [98]:
# Display the feature matrix; the rendered table elides the middle rows.
features

Unnamed: 0,Weeks on Chart,spotify_track_duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,explicit,YearID
0,1.0,226706.0,0.440,0.976,10.0,-3.170,1.0,0.1410,0.00177,0.000003,0.1600,0.412,150.005,4.0,0,1958
1,1.0,222333.0,0.646,0.527,11.0,-7.356,1.0,0.0478,0.01680,0.000000,0.2040,0.287,68.501,4.0,0,1958
2,1.0,141426.0,0.511,0.236,0.0,-14.496,1.0,0.0430,0.67000,0.000000,0.0953,0.196,141.222,4.0,0,1958
3,1.0,230320.0,0.565,0.245,5.0,-9.132,1.0,0.0262,0.78100,0.000000,0.0998,0.307,75.055,4.0,0,1958
4,1.0,191266.0,0.738,0.768,11.0,-7.045,0.0,0.0348,0.38100,0.145000,0.1000,0.603,120.013,4.0,0,1958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19609,5.0,183800.0,0.617,0.488,7.0,-9.567,0.0,0.0350,0.26600,0.000000,0.1100,0.685,136.236,4.0,0,2019
19610,14.0,179613.0,0.665,0.498,5.0,-8.185,1.0,0.0831,0.22800,0.000000,0.0757,0.127,81.967,4.0,1,2019
19611,34.0,272080.0,0.448,0.420,8.0,-9.948,1.0,0.0797,0.52600,0.000550,0.2230,0.325,183.337,3.0,0,2019
19612,8.0,191066.0,0.685,0.718,0.0,-7.724,1.0,0.0521,0.12200,0.000000,0.0556,0.958,131.463,4.0,0,2019


## Scale features and targets

In [99]:
from sklearn.preprocessing import scale

# Standardize the feature columns and the target with sklearn's `scale`.
# An alternative -- dividing each column by its maximum absolute value -- is
# not used.
# NOTE(review): this runs before the train/test split, so the scaling
# statistics are computed over every row, including rows that later end up in
# the test set. Consider fitting a scaler on the training rows only.
features, targets = scale(features), scale(targets)

## Train/test split

In [100]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
split_options = {'test_size': 0.33, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    **split_options,
)

## Create and train model

In [101]:
from sklearn.linear_model import LinearRegression

In [102]:
# Baseline: fit a default LinearRegression on the training rows (`fit` returns
# the estimator itself, so it can be chained), then predict the held-out
# targets.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

## Evaluate

In [103]:
from sklearn import metrics

In [104]:
# Score the held-out predictions with each metric in turn and report them on
# a single line. The targets were passed through `scale` earlier, so the error
# values are in scaled units rather than raw chart positions.
mse, mae, r2 = (
    score(y_test, y_pred)
    for score in (
        metrics.mean_squared_error,
        metrics.mean_absolute_error,
        metrics.r2_score,
    )
)
print(f'mse: {mse}, mae: {mae}, r2: {r2}')

mse: 0.4449599472984792, mae: 0.543198400006274, r2: 0.5548219644813057


## Try more models

In [130]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.feature_selection import SelectFromModel

In [131]:
# Candidate linear estimators, each constructed with its default
# hyperparameters, in the order they are evaluated below.
models = [
    estimator_cls()
    for estimator_cls in (LinearRegression, Ridge, Lasso, ElasticNet)
]

In [132]:
# Build one SelectFromModel feature selector per candidate estimator and fit
# each on the training split; the list order matches `models`.
selectors = [
    SelectFromModel(estimator).fit(X_train, y_train)
    for estimator in models
]

In [133]:
# Fit every candidate on the training split and print its R^2 score on the
# test split, one line per estimator in the order of `models`.
# The loop variable is deliberately not named `model`: reusing that name would
# rebind the notebook-level baseline estimator to the last candidate in the
# list once this cell has run.
for candidate in models:
    candidate.fit(X_train, y_train)
    print(metrics.r2_score(y_test, candidate.predict(X_test)))

0.5548219644813057
0.5548200328237567
-0.0005709357379117819
0.17105005423008635
