In [None]:
!pip install spotipy --ignore-installed

In [None]:
!pip install  --use-deprecated=legacy-resolver pycaret[full]

In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

import glob
import re
from datetime import datetime
import time
import os
import itertools
from itertools import combinations

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

from pycaret.regression import *

%matplotlib inline

  import pandas.util.testing as tm


In [4]:
# Train DF
# Each source CSV is paired with the status message printed once it has been read.
train_sources = [
    ('./audio_features_extra.csv', 'Reading Train DataFrame from file'),
    ('./audio_features_spotify_search.csv', 'Reading Train (Extra) DataFrame from file'),
    ('./audio_features_spotify_scrap.csv', 'Reading Train (Scrap) DataFrame from file'),
]
try:
    train_frames = []
    for csv_path, status in train_sources:
        train_frames.append(pd.read_csv(csv_path))
        print(status)
    # pd.concat replaces the deprecated DataFrame.append (removed in pandas 2.0).
    # df_ml is only assigned once every file has been read, so a missing file
    # can no longer leave a partially assembled frame behind.
    df_ml = pd.concat(train_frames)
except FileNotFoundError as err:
    # Fall back to a df_ml already present in the kernel (built by an earlier
    # part of the notebook). globals().get avoids a NameError on a fresh kernel.
    if globals().get('df_ml') is not None:
        print('Train DataFrame Exists')
    else:
        raise RuntimeError('Please run the first part to generate the DataFrame') from err
# df_ml.reset_index(inplace=True)
# drop_duplicates returns a new frame, so its result has to be assigned;
# the previous bare call discarded it and left the duplicates in place.
df_ml = df_ml.dropna().drop_duplicates()
df_ml.shape

Reading Train DataFrame from file
Reading Train (Extra) DataFrame from file
Reading Train (Scrap) DataFrame from file


(100401, 52)

In [8]:
# Test DF
try:
    test_df = pd.read_csv('./test_df.csv')
    print('Reading Test DataFrame from file')
except FileNotFoundError as err:
    # Fall back to a test_df already present in the kernel (built by an earlier
    # part of the notebook). globals().get avoids a NameError on a fresh kernel.
    if globals().get('test_df') is not None:
        print('Test DataFrame Exists')
    else:
        raise RuntimeError('Please run the first part to generate the DataFrame') from err

Reading Test DataFrame from file


In [5]:
def pre_processing(df_raw, reduced=False, norm=False, dummy=False):
    """Select and order the modelling columns of an audio-features DataFrame.

    The input frame is not modified; a new DataFrame is returned, so the
    function can safely be called again on the same (or an already
    processed) frame.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Frame holding the feature columns listed in ``order`` below. Every
        other column (for example 'song_id', 'id', 'track_href',
        'analysis_url', 'type', 'uri') is discarded by the selection.
    reduced : bool, default False
        When True, the columns listed in ``not_needed`` (confidence values,
        loudness extremes, fades, pitch_* and timbre_*) are left out.
    norm : bool, default False
        Currently unused; accepted so that existing calls keep working.
    dummy : bool, default False
        When True, 'key', 'mode' and 'time_signature' are one-hot encoded
        with ``pd.get_dummies``.

    Returns
    -------
    pandas.DataFrame
        The selected columns with the dependent variable 'valence' last.

    Raises
    ------
    KeyError
        If a column that has to be kept is missing from ``df_raw``.
    """
    # Feature order, with the dependent variable ('valence') at the end.
    order = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
             'liveness', 'tempo', 'tempo_confidence', 'duration_ms', 'time_signature_confidence',
             'key_confidence', 'mode_confidence', 'max_loundness', 'min_loundness', 'mode_seg_0',
             'fade_in', 'fade_out',
             'pitch_0', 'pitch_1', 'pitch_2', 'pitch_3', 'pitch_4', 'pitch_5',
             'pitch_6', 'pitch_7', 'pitch_8', 'pitch_9', 'pitch_10', 'pitch_11',
             'timbre_0', 'timbre_1', 'timbre_2', 'timbre_3', 'timbre_4', 'timbre_5',
             'timbre_6', 'timbre_7', 'timbre_8', 'timbre_9', 'timbre_10',
             'timbre_11', 'key', 'mode', 'time_signature', 'valence']

    # Columns left out when a reduced feature set is requested.
    not_needed = {'tempo_confidence', 'time_signature_confidence',
                  'key_confidence', 'mode_confidence', 'max_loundness', 'min_loundness',
                  'mode_seg_0', 'fade_in', 'fade_out',
                  'pitch_0', 'pitch_1', 'pitch_2', 'pitch_3', 'pitch_4', 'pitch_5',
                  'pitch_6', 'pitch_7', 'pitch_8', 'pitch_9', 'pitch_10', 'pitch_11',
                  'timbre_0', 'timbre_1', 'timbre_2', 'timbre_3', 'timbre_4', 'timbre_5',
                  'timbre_6', 'timbre_7', 'timbre_8', 'timbre_9', 'timbre_10',
                  'timbre_11'}

    if reduced:
        selected = [col for col in order if col not in not_needed]
    else:
        selected = order

    # Selecting by column list builds a new frame: the id/url columns vanish
    # without an in-place drop, so the caller's DataFrame stays intact.
    df_out = df_raw[selected]

    # One hot encode the categorical variables
    if dummy:
        df_out = pd.get_dummies(df_out, columns=['key', 'mode', 'time_signature'])

    # get_dummies appends the indicator columns after 'valence', so move the
    # dependent variable back to the last position.
    final_order = [col for col in df_out.columns if col != 'valence']
    final_order.append('valence')

    return df_out[final_order]


In [6]:
# Keep the full feature set (no reduction, no one-hot encoding) for the
# training frame, then preview the first rows of the result.
df_ml = pre_processing(df_ml, reduced=False, norm=False, dummy=False)
df_ml.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,tempo,tempo_confidence,duration_ms,time_signature_confidence,key_confidence,mode_confidence,max_loundness,min_loundness,mode_seg_0,fade_in,fade_out,pitch_0,pitch_1,pitch_2,pitch_3,pitch_4,pitch_5,pitch_6,pitch_7,pitch_8,pitch_9,pitch_10,pitch_11,timbre_0,timbre_1,timbre_2,timbre_3,timbre_4,timbre_5,timbre_6,timbre_7,timbre_8,timbre_9,timbre_10,timbre_11,key,mode,time_signature,valence
0,0.681,0.594,-7.028,0.282,0.165,3e-06,0.134,186.054,0.165,230453,1.0,0.647,0.633,-4.558,-10.447,0.272727,2.85025,221.73315,0.565518,0.480286,0.331422,0.226427,0.344564,0.232814,0.346735,0.411364,0.525577,0.562847,0.361318,0.291801,47.634602,2.557585,-23.780334,-6.578245,10.876114,-13.711015,1.514876,-2.376809,0.656761,5.700232,-18.592179,-1.092454,7,1,4,0.535
1,0.72,0.763,-4.068,0.0523,0.406,0.0,0.18,101.965,0.46,251088,1.0,0.396,0.525,-2.493,-8.846,0.666667,0.0,245.78322,0.494438,0.296443,0.310171,0.235609,0.431947,0.349232,0.28515,0.390142,0.26216,0.434979,0.227367,0.409523,52.477917,65.732926,50.89295,16.079133,41.991985,-15.83802,-1.474704,1.482797,-7.830749,3.769064,-12.888112,0.881602,9,0,4,0.742
2,0.748,0.524,-5.599,0.0338,0.414,0.0,0.111,95.01,0.314,244960,1.0,0.643,0.618,-1.905,-9.361,0.090909,0.17914,231.2011,0.364283,0.374847,0.196077,0.480253,0.236659,0.323469,0.15392,0.253767,0.454585,0.151896,0.390396,0.159136,49.261417,46.574495,-5.647053,-4.726581,48.401789,-16.350884,-13.422538,-4.113714,-10.971919,-1.160414,-14.684009,1.395501,8,1,4,0.661
3,0.735,0.451,-8.374,0.0585,0.0631,1.3e-05,0.325,117.973,0.75,245200,1.0,0.501,0.525,-5.836,-16.587,0.111111,0.0,241.48753,0.727198,0.478344,0.238704,0.194878,0.229576,0.278269,0.271341,0.288212,0.232823,0.294456,0.186011,0.260813,45.433614,10.516649,-7.445903,-9.215478,0.369456,-15.219158,20.243924,8.294884,-1.146117,3.245916,-17.540552,0.038808,0,1,4,0.0862
4,0.67,0.838,-4.031,0.0362,0.0604,0.000611,0.159,104.998,0.768,222041,1.0,0.131,0.369,-2.467,-5.772,0.222222,0.0,217.80898,0.575714,0.285637,0.36196,0.220904,0.349388,0.305209,0.317854,0.556199,0.321825,0.455768,0.267613,0.270743,52.274995,62.349399,-1.866774,-4.668169,36.712077,-33.158121,-15.529416,1.743342,-4.321017,7.561803,-12.431474,-5.487772,0,1,4,0.717


In [13]:

# Configure the PyCaret regression experiment on the training frame with
# 'valence' as the target column.
exp_reg101 = setup(
    data=df_ml,
    target='valence',
    session_id=123,
    train_size=0.8,
    # remove_outliers=True, create_clusters=True, cluster_iter=30,
    silent=True,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,valence
2,Original Data,"(100401, 46)"
3,Missing Values,False
4,Numeric Features,42
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(80320, 59)"


In [14]:
# Run PyCaret's model comparison on the experiment configured above and keep
# the returned object in `best` (presumably the top-ranked model — confirm
# against the PyCaret documentation for the installed version).
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.1106,0.0197,0.1404,0.6771,0.0959,0.3931,32.172
xgboost,Extreme Gradient Boosting,0.1154,0.0216,0.1471,0.6457,0.1005,0.4116,69.554
et,Extra Trees Regressor,0.1143,0.0229,0.1512,0.6257,0.1033,0.4417,106.323
lightgbm,Light Gradient Boosting Machine,0.1199,0.0229,0.1512,0.6255,0.1033,0.4357,2.858
rf,Random Forest Regressor,0.1197,0.0236,0.1536,0.6137,0.1049,0.4556,285.222
gbr,Gradient Boosting Regressor,0.1326,0.0272,0.1648,0.5552,0.1125,0.5049,88.395
lr,Linear Regression,0.137,0.0293,0.1712,0.52,0.1168,0.558,0.483
ridge,Ridge Regression,0.137,0.0293,0.1712,0.52,0.1168,0.558,0.063
br,Bayesian Ridge,0.137,0.0293,0.1712,0.52,0.1168,0.558,0.531
lar,Least Angle Regression,0.1372,0.0295,0.1718,0.5165,0.1172,0.5586,0.079
