## Import packages and data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

# Lift the column-display cap so wide frames are rendered in full.
pd.set_option('display.max_columns', None)

# Read the dataset from the working directory and preview the first rows.
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


## Preparing data for models

In [2]:
# Display the column labels of the loaded frame.
df.columns

Index(['valence', 'year', 'acousticness', 'artists', 'danceability',
       'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'name', 'popularity', 'release_date',
       'speechiness', 'tempo'],
      dtype='object')

In [3]:
from sklearn.model_selection import train_test_split

# Re-select the columns with the target ('popularity') placed last, then drop
# every row that has a missing value in any of them.
kept_columns = [
    'valence', 'year', 'acousticness', 'artists', 'danceability',
    'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness', 'key',
    'liveness', 'loudness', 'mode', 'name', 'release_date',
    'speechiness', 'tempo', 'popularity',
]
df = df.loc[:, kept_columns].dropna()

# Columns used as model inputs; 'popularity' is the regression target.
feature_columns = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness', 'speechiness',
    'tempo', 'valence', 'key', 'mode',
]
X = df[feature_columns]
y = df['popularity']

# Fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=126)

In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
from sklearn.preprocessing import PolynomialFeatures

# Fit the feature expansion on the training split, then expand both splits.
# Note: the expansion is applied to the unscaled X_train / X_test.
pf = PolynomialFeatures().fit(X_train)
X_train_expanded = pf.transform(X_train)
X_test_expanded = pf.transform(X_test)

## Dummy (Baseline)

In [6]:
import warnings
from sklearn.dummy import DummyRegressor

# Ignore FutureWarning messages from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Baseline regressor: its test score is the reference point for the models
# fitted in the following sections.
dr = DummyRegressor().fit(X_train, y_train)
print(
    'R-squared score for dummy model with raw values: ',
    dr.score(X_test, y_test),
)

R-squared score for dummy model with raw values:  -0.00013067055975835018


## Linear Regression

### Raw Values

In [7]:
from sklearn.linear_model import LinearRegression

# Linear regression fitted on the unscaled training features and scored on
# the held-out test split.
lr = LinearRegression().fit(X_train, y_train)
print(
    'R-squared score for linear regression with raw values: ',
    lr.score(X_test, y_test),
)

R-squared score for linear regression with raw values:  0.43875217918343734


### Scaled Values

In [8]:
from sklearn.linear_model import LinearRegression

# Same model as before, this time fitted and scored on the standardized
# features.
lr = LinearRegression().fit(X_train_scaled, y_train)
print(
    'R-squared score for linear regression with scaled values: ',
    lr.score(X_test_scaled, y_test),
)

R-squared score for linear regression with scaled values:  0.4387521791834357


### Polynomial Features

In [9]:
from sklearn.linear_model import LinearRegression

# Linear regression fitted and scored on the polynomial-expanded features.
lr = LinearRegression().fit(X_train_expanded, y_train)
print(
    'R-squared score for linear regression with polynomial values: ',
    lr.score(X_test_expanded, y_test),
)

R-squared score for linear regression with polynomial values:  0.5249086678815692


## Ridge

### Raw Values

In [10]:
from sklearn.linear_model import Ridge

# Ridge regression with a fixed penalty strength (alpha=1) on the unscaled
# features.
ridge = Ridge(alpha=1)
ridge.fit(X_train, y_train)
print(
    'R-squared score for ridge regression with raw values: ',
    ridge.score(X_test, y_test),
)

R-squared score for ridge regression with raw values:  0.43875105579242


In [11]:
from sklearn.model_selection import GridSearchCV

# Candidate penalty strengths, one per decade from 0.001 to 1000.
params = dict(alpha=[0.001, 0.01, 0.1, 1, 10, 100, 1000])

# Cross-validated search over alpha on the unscaled training features,
# ranked by R-squared; n_jobs=-1 runs the fits in parallel.
gs = GridSearchCV(
    estimator=Ridge(),
    param_grid=params,
    scoring='r2',
    n_jobs=-1,
).fit(X_train, y_train)

print('Optimal alpha for ridge regression with raw values: ', gs.best_params_)
print(
    'R-squared score for ridge regression with raw values: ',
    gs.score(X_test, y_test),
)

Optimal alpha for ridge regression with raw values:  {'alpha': 0.1}
R-squared score for ridge regression with raw values:  0.43875206832529934


### Scaled Values

In [12]:
# Ridge regression with a fixed penalty strength (alpha=1) on the
# standardized features.
ridge = Ridge(alpha=1)
ridge.fit(X_train_scaled, y_train)
print(
    'R-squared score for ridge regression with scaled values: ',
    ridge.score(X_test_scaled, y_test),
)

R-squared score for ridge regression with scaled values:  0.4387521562752168


In [13]:
from sklearn.model_selection import GridSearchCV

# Candidate penalty strengths, one per decade from 0.001 to 1000.
params = dict(alpha=[0.001, 0.01, 0.1, 1, 10, 100, 1000])

# Cross-validated search over alpha on the standardized training features,
# ranked by R-squared; n_jobs=-1 runs the fits in parallel.
gs = GridSearchCV(
    estimator=Ridge(),
    param_grid=params,
    scoring='r2',
    n_jobs=-1,
).fit(X_train_scaled, y_train)

print('Optimal alpha for ridge regression with scaled values: ', gs.best_params_)
print(
    'R-squared score for ridge regression with scaled values: ',
    gs.score(X_test_scaled, y_test),
)

Optimal alpha for ridge regression with scaled values:  {'alpha': 10}
R-squared score for ridge regression with scaled values:  0.4387519473283663


### Polynomial Features

In [14]:
# Ridge regression with a fixed penalty strength (alpha=1) on the
# polynomial-expanded features.
ridge = Ridge(alpha=1)
ridge.fit(X_train_expanded, y_train)
print(
    'R-squared score for ridge regression with expanded values: ',
    ridge.score(X_test_expanded, y_test),
)

R-squared score for ridge regression with expanded values:  0.5248140366382139


In [15]:
from sklearn.model_selection import GridSearchCV

# Candidate penalty strengths, one per decade from 0.001 to 1000.
params = dict(alpha=[0.001, 0.01, 0.1, 1, 10, 100, 1000])

# Cross-validated search over alpha on the polynomial-expanded training
# features, ranked by R-squared; n_jobs=-1 runs the fits in parallel.
gs = GridSearchCV(
    estimator=Ridge(),
    param_grid=params,
    scoring='r2',
    n_jobs=-1,
).fit(X_train_expanded, y_train)

print('Optimal alpha for ridge regression with expanded values: ', gs.best_params_)
print(
    'R-squared score for ridge regression with expanded values: ',
    gs.score(X_test_expanded, y_test),
)

Optimal alpha for ridge regression with expanded values:  {'alpha': 0.1}
R-squared score for ridge regression with expanded values:  0.5248972018295721


## Lasso

### Raw Values

In [16]:
from sklearn.linear_model import Lasso

# Lasso regression with a fixed penalty strength (alpha=1) on the unscaled
# features.
lasso = Lasso(alpha=1)
lasso.fit(X_train, y_train)
print(
    'R-squared score for lasso regression with raw values: ',
    lasso.score(X_test, y_test),
)

R-squared score for lasso regression with raw values:  0.3333124820021308


In [17]:
# Candidate penalty strengths, one per decade from 0.001 to 1000.
params = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Search over a fresh, unfitted Lasso instead of reusing a model object that
# was fitted in another cell: the grid supplies alpha, so the search does not
# depend on whichever model the name `lasso` currently refers to. This also
# matches how the Ridge searches are set up.
gs = GridSearchCV(estimator=Lasso(), param_grid=params, scoring='r2', n_jobs=-1)
gs.fit(X_train, y_train)

print('Optimal alpha for lasso regression with raw values: ', gs.best_params_)
print('R-squared score for lasso regression with raw values: ', gs.score(X_test, y_test))

Optimal alpha for lasso regression with raw values:  {'alpha': 0.001}
R-squared score for lasso regression with raw values:  0.4387408322380546


### Scaled Values

In [18]:
# Lasso regression with a fixed penalty strength (alpha=1) on the
# standardized features.
lasso = Lasso(alpha=1)
lasso.fit(X_train_scaled, y_train)
print(
    'R-squared score for lasso regression with scaled values: ',
    lasso.score(X_test_scaled, y_test),
)

R-squared score for lasso regression with scaled values:  0.41561446129230206


In [19]:
# Candidate penalty strengths, one per decade from 0.001 to 1000.
params = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Search over a fresh, unfitted Lasso instead of reusing a model object that
# was fitted in another cell: the grid supplies alpha, so the search does not
# depend on whichever model the name `lasso` currently refers to. This also
# matches how the Ridge searches are set up.
gs = GridSearchCV(estimator=Lasso(), param_grid=params, scoring='r2', n_jobs=-1)
gs.fit(X_train_scaled, y_train)

print('Optimal alpha for lasso regression with scaled values: ', gs.best_params_)
print('R-squared score for lasso regression with scaled values: ', gs.score(X_test_scaled, y_test))

Optimal alpha for lasso regression with scaled values:  {'alpha': 0.01}
R-squared score for lasso regression with scaled values:  0.4387301331831652


### Polynomial Features

In [20]:
from sklearn.linear_model import Lasso

# Lasso regression with a fixed penalty strength (alpha=1) on the
# polynomial-expanded features.
lasso = Lasso(alpha=1)
lasso.fit(X_train_expanded, y_train)
print(
    'R-squared score for lasso regression with expanded values: ',
    lasso.score(X_test_expanded, y_test),
)

R-squared score for lasso regression with expanded values:  0.4498770376878116


In [21]:
# Candidate penalty strengths, one per decade from 0.001 to 1000.
params = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Search over a fresh, unfitted Lasso instead of reusing a model object that
# was fitted in another cell: the grid supplies alpha, so the search does not
# depend on whichever model the name `lasso` currently refers to. This also
# matches how the Ridge searches are set up.
gs = GridSearchCV(estimator=Lasso(), param_grid=params, scoring='r2', n_jobs=-1)
gs.fit(X_train_expanded, y_train)

print('Optimal alpha for lasso regression with expanded values: ', gs.best_params_)
print('R-squared score for lasso regression with expanded values: ', gs.score(X_test_expanded, y_test))

Optimal alpha for lasso regression with expanded values:  {'alpha': 0.001}
R-squared score for lasso regression with expanded values:  0.5244216680795837


## ElasticNet

### Raw Values

In [22]:
from sklearn.linear_model import ElasticNet

# ElasticNet with fixed alpha=1 and an even L1/L2 mix (l1_ratio=0.5) on the
# unscaled features.
elastic_net = ElasticNet(alpha=1, l1_ratio=0.5)
elastic_net.fit(X_train, y_train)
print(
    'R-squared score for ElasticNet regression with raw values: ',
    elastic_net.score(X_test, y_test),
)

R-squared score for ElasticNet regression with raw values:  0.2524313258224342


In [23]:
# Grid of penalty strengths (one per decade) crossed with L1/L2 mixing ratios
# from 0 to 1 in steps of 0.1.
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
l1_ratio = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
param_grid = dict(alpha=alpha, l1_ratio=l1_ratio)

# Search over a fresh, unfitted ElasticNet instead of reusing a model object
# that was fitted in another cell: the grid supplies both alpha and l1_ratio,
# so the search does not depend on whichever model the name `elastic_net`
# currently refers to. This also matches how the Ridge searches are set up.
gs = GridSearchCV(estimator=ElasticNet(), param_grid=param_grid, scoring='r2', n_jobs=-1)
gs.fit(X_train, y_train)

print('Optimal alpha and L1 ratio for ElasticNet regression with raw values: ', gs.best_params_)
print('R-squared score for ElasticNet regression with raw values: ', gs.score(X_test, y_test))

Optimal alpha and L1 ratio for ElasticNet regression with raw values:  {'alpha': 0.001, 'l1_ratio': 1}
R-squared score for ElasticNet regression with raw values:  0.4387408322380546


### Scaled Values

In [24]:
from sklearn.linear_model import ElasticNet

# ElasticNet with fixed alpha=1 and an even L1/L2 mix (l1_ratio=0.5) on the
# standardized features.
elastic_net = ElasticNet(alpha=1, l1_ratio=0.5)
elastic_net.fit(X_train_scaled, y_train)
print(
    'R-squared score for ElasticNet regression with scaled values: ',
    elastic_net.score(X_test_scaled, y_test),
)

R-squared score for ElasticNet regression with scaled values:  0.3879998649551908


In [25]:
# Grid of penalty strengths (one per decade) crossed with L1/L2 mixing ratios
# from 0 to 1 in steps of 0.1.
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
l1_ratio = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
param_grid = dict(alpha=alpha, l1_ratio=l1_ratio)

# Search over a fresh, unfitted ElasticNet instead of reusing a model object
# that was fitted in another cell: the grid supplies both alpha and l1_ratio,
# so the search does not depend on whichever model the name `elastic_net`
# currently refers to. This also matches how the Ridge searches are set up.
gs = GridSearchCV(estimator=ElasticNet(), param_grid=param_grid, scoring='r2', n_jobs=-1)
gs.fit(X_train_scaled, y_train)

print('Optimal alpha and L1 ratio for ElasticNet regression with scaled values: ', gs.best_params_)
print('R-squared score for ElasticNet regression with scaled values: ', gs.score(X_test_scaled, y_test))

Optimal alpha and L1 ratio for ElasticNet regression with scaled values:  {'alpha': 0.01, 'l1_ratio': 1}
R-squared score for ElasticNet regression with scaled values:  0.4387301331831652


### Polynomial Features

In [26]:
from sklearn.linear_model import ElasticNet

# ElasticNet with fixed alpha=1 and an even L1/L2 mix (l1_ratio=0.5) on the
# polynomial-expanded features.
elastic_net = ElasticNet(alpha=1, l1_ratio=0.5)
elastic_net.fit(X_train_expanded, y_train)
print(
    'R-squared score for ElasticNet regression with expanded values: ',
    elastic_net.score(X_test_expanded, y_test),
)

R-squared score for ElasticNet regression with expanded values:  0.4517123835525958


In [None]:
# Grid of penalty strengths (one per decade) crossed with L1/L2 mixing ratios
# from 0 to 1 in steps of 0.1.
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
l1_ratio = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
param_grid = dict(alpha=alpha, l1_ratio=l1_ratio)

# Search over a fresh, unfitted ElasticNet instead of reusing a model object
# that was fitted in another cell: the grid supplies both alpha and l1_ratio,
# so the search does not depend on whichever model the name `elastic_net`
# currently refers to. This also matches how the Ridge searches are set up.
gs = GridSearchCV(estimator=ElasticNet(), param_grid=param_grid, scoring='r2', n_jobs=-1)
gs.fit(X_train_expanded, y_train)

print('Optimal alpha and L1 ratio for ElasticNet regression with expanded values: ', gs.best_params_)
print('R-squared score for ElasticNet regression with expanded values: ', gs.score(X_test_expanded, y_test))