# Regression Approach

This notebook is dedicated to modelling the problem as a regression problem. Different regression techniques are evaluated, and then the best-performing one is optimized in terms of hyper-parameters

In [1]:
# Change directory for cleaner path handling
%cd ..

C:\Users\georg\Documents\msc-project


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn import linear_model
from sklearn.decomposition import PCA

from sklearn.model_selection import cross_validate,train_test_split
from sklearn import tree, linear_model, dummy, kernel_ridge, gaussian_process
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [3]:
df = pd.read_csv("data/interim/tracks.csv")

## Feature normalization
Initially, the target variable is constrained to the 0-100 range so as to account for the large differences between values, as well as the large values themselves.

The other features are also scaled to be in the same range, as they are measurements with different initial scales.

In [9]:
targets = df['views']
features = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'time_signature',
    'duration_ms',
    'popularity'
]

data = df[features]

## Model comparison
Different regression models are compared with their default parameters to establish a baseline

In [24]:
def evaluate_model(model):
    results = cross_validate(model, data, targets, scoring=[
        'r2',
        'neg_mean_absolute_error',
        'neg_root_mean_squared_error',
    ])
    return results

In [None]:
def compare_models(data, targets):
    models = [
        ("Baseline", dummy.DummyRegressor(strategy='mean')),
        ("Linear Regression", linear_model.LinearRegression()),
        ("Polynomial Regression", Pipeline([
            ('poly', PolynomialFeatures(degree=3)),
            ('linear', linear_model.LinearRegression())
        ])),
        ("Decision Tree", tree.DecisionTreeRegressor()),
        ("Kernel Ridge", kernel_ridge.KernelRidge(kernel='rbf')),
        ("Gaussian Process", gaussian_process.GaussianProcessRegressor()),
    ]
    
    for name, model in models:
        scores = evaluate_model(model)
        table = pd.DataFrame(scores.values(),columns=scores.keys())
        print(table)

compare_models(data,targets)

       fit_time    score_time       test_r2  test_neg_mean_absolute_error  \
0  1.983643e-03  1.484871e-03  9.591579e-04                  9.908676e-04   
1  4.973412e-04  4.963875e-04  9.930134e-04                  9.918213e-04   
2 -5.492043e-02 -2.080874e-02 -3.950145e-04                 -4.663148e-02   
3 -9.606872e+06 -6.950660e+06 -6.965370e+06                 -6.336668e+06   
4 -2.059365e+07 -1.144987e+07 -1.465914e+07                 -1.100182e+07   

   test_neg_root_mean_squared_error  
0                      9.920597e-04  
1                      9.915829e-04  
2                     -2.230797e-01  
3                     -5.667681e+06  
4                     -1.007470e+07  
       fit_time    score_time       test_r2  test_neg_mean_absolute_error  \
0  5.458355e-03  5.952120e-03  5.431652e-03                  3.967285e-03   
1  2.973557e-03  2.008438e-03  1.983881e-03                  1.983881e-03   
2 -3.030485e-02 -1.333974e-02  2.986672e-02                 -2.206331e-02   
3