## 0. Import Packages and Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.linear_model import LogisticRegression
# import pycaret below <- automates the expiramentation process (test KNN, test logistic, etc...)
from pycaret.regression import setup, compare_models, create_model

# from pycaret.classification import setup
# from pycaret.classification import compare_models

In [2]:
# Import data that was output from previous notebook "2. Exploratory Data Analysis"
df = pd.read_csv(r'/Users/harrisonchaseweber/Desktop/Springboard/Billboard Top 10 Prediction/DataFiles/df_emo_full_eda.csv', index_col = 0)

In [3]:
# Check that the details of this dataset make sense.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 327895 entries, 0 to 327894
Data columns (total 54 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   WeekID                       327895 non-null  object 
 1   Week Position                327895 non-null  int64  
 2   Song                         327895 non-null  object 
 3   SongID                       327895 non-null  object 
 4   Instance                     327895 non-null  int64  
 5   Previous Week Position       327895 non-null  float64
 6   Peak Position                327895 non-null  int64  
 7   Weeks on Chart               327895 non-null  int64  
 8   Artist                       269355 non-null  object 
 9   wordCount                    327895 non-null  float64
 10  languages                    269355 non-null  object 
 11  allWordCount                 327895 non-null  float64
 12  year                         327895 non-null  float64
 13 

From the above, it appears that "decade" is the only object variable that needs to be transformed for modeling. All others objects, such as Week, SongID, and Artist are index related.

In [4]:
# Convert Decade Object Variable to Dummies
decade_dummies = pd.get_dummies(df['decade'])
df.drop("decade", axis = 1, inplace = True)
df = df.join(decade_dummies)

In [5]:
#Check that the Decade dummies are as expected
df.dtypes

WeekID                          object
Week Position                    int64
Song                            object
SongID                          object
Instance                         int64
Previous Week Position         float64
Peak Position                    int64
Weeks on Chart                   int64
Artist                          object
wordCount                      float64
languages                       object
allWordCount                   float64
year                           float64
MTLD                           float64
TTR                            float64
CTTR                           float64
sentimentScore                 float64
sentimentScore_pos             float64
sentimentScore_neg             float64
emoWordCount                   float64
joy_normalized                 float64
sadness_normalized             float64
anger_normalized               float64
disgust_normalized             float64
trust_normalized               float64
anticipation_normalized  

In [6]:
# Remove unnecessary columns, particularly the columns we are not interested in using.
df.drop(columns = ['joy_normalized', 'sadness_normalized', 'anger_normalized', 'disgust_normalized', 'trust_normalized', 'anticipation_normalized', 'fear_normalized', 'surprise_normalized', 'happy_normalized', 'sorrow_normalized', 'Week_dt', 'Week_dt_next', 'YearID_next', 'languages'], inplace = True)
# Check that these columns are removed.
df.dtypes

WeekID                          object
Week Position                    int64
Song                            object
SongID                          object
Instance                         int64
Previous Week Position         float64
Peak Position                    int64
Weeks on Chart                   int64
Artist                          object
wordCount                      float64
allWordCount                   float64
year                           float64
MTLD                           float64
TTR                            float64
CTTR                           float64
sentimentScore                 float64
sentimentScore_pos             float64
sentimentScore_neg             float64
emoWordCount                   float64
emo_score                      float64
YearID                           int64
DecadeID                        object
SuccessInd                       int64
PrimaryKey                      object
Artist_Count_Total             float64
Artist_Count_Year        

## 1. Train-Test Split

In [24]:
# Create a Train and Test dataset. 70% will be used for training.
## Week position will be our observed column.
len(df) * .7, len(df) * .3

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='Week Position'), 
                                                    df['Week Position'], test_size=0.3, 
                                                    random_state=123)

In [26]:
# Check our training set.
X_train.shape, X_test.shape

((229526, 46), (98369, 46))

In [27]:
# Check our test set.
y_train.shape, y_test.shape

((229526,), (98369,))

In [28]:
# Do the ratios of 'Week Position' match across Train and Test sets?
print(y_train.sum()/(len(X_train) + len(y_train)))
print(y_test.sum()/(len(X_test) + len(y_test)))

25.216724902625412
25.326490052760523


## 2. Remove PrimaryKey columns from Train & Test data

In [11]:
#Save the 'Week Position', 'Song', 'SongID', 'Artist', 'languages', 'year', 'PrimaryKey', 'DecadeID' columns from the train/test data into names_train and names_test
#Then drop those columns from `X_train` and `X_test`. Use 'inplace=True'
names_list = ['SuccessInd', 'Song', 'SongID', 'Artist', 'year', 'PrimaryKey', 'WeekID', 'DecadeID', 'YearID']
names_train = X_train[names_list]
names_test = X_test[names_list]
X_train.drop(columns=names_list, inplace=True)
X_test.drop(columns=names_list, inplace=True)
X_train.shape, X_test.shape

((229526, 37), (98369, 37))

## 3. Utilize pycaret

Pycaret is an open source library for automating machine learning work flows. We use the functionality below to test for the optimal machine learning algorithm in our dataset.
For more information: https://pycaret.gitbook.io/docs/get-started/tutorials

In [12]:
# Create dataset for pycaret analysis. We will use our entire dataset for this analysis.
df_2 = df.drop(columns = names_list)

In [14]:
# Set up the dataset.
grid = setup(data = df_2, target = 'Week Position')

Unnamed: 0,Description,Value
0,session_id,3988
1,Target,Week Position
2,Original Data,"(327895, 38)"
3,Missing Values,True
4,Numeric Features,24
5,Categorical Features,13
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(229526, 86)"


In [15]:
# Evaluate and compare models
best = compare_models(fold = 5, n_select = 5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,3.4965,36.9743,6.0806,0.9556,0.19,0.1341,5.28
rf,Random Forest Regressor,3.3383,37.5665,6.129,0.9549,0.1849,0.1209,255.502
et,Extra Trees Regressor,3.3555,37.8741,6.1541,0.9545,0.1878,0.1232,292.058
gbr,Gradient Boosting Regressor,4.4436,51.7918,7.1965,0.9378,0.2295,0.1811,88.366
dt,Decision Tree Regressor,4.2486,72.905,8.5384,0.9125,0.2475,0.1464,16.472
knn,K Neighbors Regressor,6.6152,103.5933,10.178,0.8757,0.3244,0.2621,30.068
ada,AdaBoost Regressor,9.6397,141.2494,11.8691,0.8305,0.4341,0.5298,48.506
ridge,Ridge Regression,9.2514,202.8921,14.244,0.7565,0.4783,0.5179,0.534
br,Bayesian Ridge,9.2512,202.8968,14.2441,0.7565,0.4783,0.518,3.068
lr,Linear Regression,9.2512,202.8944,14.244,0.7565,0.4783,0.5179,5.106


The top 5 models based on MSE, RMSE, and R2 are Light Gradient Boosting Machine, Random Forest Regressor, and Extra Trees Regressor, Gradient Boosting Regressor, and Gradient Boosting Regressor. 

## 4. Test Top Algorithms

We will test the top performing algorithms to check metrics on 10-fold and analyze whether or not they perform consistently.

### 4a. Light Gradient Boosting Machine

In [16]:
lightgbm = create_model("lightgbm")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.4968,36.836,6.0693,0.9556,0.1913,0.1344
1,3.452,35.8612,5.9884,0.9573,0.1861,0.1324
2,3.4777,37.58,6.1303,0.9548,0.1874,0.13
3,3.499,37.3604,6.1123,0.9555,0.1926,0.1376
4,3.4716,36.0861,6.0072,0.9564,0.186,0.1312
5,3.5065,36.6258,6.0519,0.9562,0.1902,0.1352
6,3.4924,36.0995,6.0083,0.9567,0.1917,0.1373
7,3.5279,38.0255,6.1665,0.954,0.1881,0.1312
8,3.4631,35.655,5.9712,0.9572,0.191,0.1341
9,3.5611,38.2457,6.1843,0.9542,0.1946,0.1377


### 4b. Random Forest

In [17]:
rf = create_model("rf")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.344,37.6142,6.133,0.9547,0.1871,0.1216
1,3.2659,36.2393,6.0199,0.9568,0.1805,0.1182
2,3.2959,38.0855,6.1713,0.9541,0.1828,0.1174
3,3.3439,37.8853,6.1551,0.9549,0.1873,0.1234
4,3.3307,37.2858,6.1062,0.9549,0.1819,0.1192
5,3.3306,36.6825,6.0566,0.9562,0.1852,0.1219
6,3.3013,36.518,6.043,0.9562,0.1837,0.1198
7,3.3459,38.3249,6.1907,0.9536,0.1808,0.1166
8,3.2968,36.5501,6.0457,0.9562,0.1851,0.1196
9,3.3935,39.273,6.2668,0.9529,0.1892,0.123


### 4c. Gradient Boosting Regressor

In [18]:
gb = create_model("gbr")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,4.4222,50.7048,7.1207,0.9389,0.2289,0.1805
1,4.4204,51.7984,7.1971,0.9383,0.2301,0.1836
2,4.3743,51.2136,7.1564,0.9383,0.2251,0.174
3,4.4135,52.089,7.2173,0.938,0.2319,0.187
4,4.4383,51.3881,7.1685,0.9379,0.2263,0.1799
5,4.4546,52.0937,7.2176,0.9378,0.2307,0.1827
6,4.4637,51.7388,7.193,0.9379,0.2325,0.1842
7,4.4064,50.9597,7.1386,0.9383,0.2231,0.1742
8,4.4171,50.959,7.1386,0.9389,0.2299,0.1804
9,4.5057,52.7087,7.2601,0.9368,0.2318,0.183


### 4d. K-nearest neighbors

In [19]:
knn = create_model("knn")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6.3927,97.5684,9.8777,0.8825,0.3106,0.2448
1,6.4692,100.9115,10.0455,0.8797,0.3149,0.2509
2,6.4972,102.1397,10.1064,0.877,0.3161,0.2443
3,6.478,101.6405,10.0817,0.879,0.3138,0.251
4,6.4232,99.9467,9.9973,0.8791,0.3092,0.2389
5,6.4732,100.8688,10.0433,0.8795,0.3141,0.25
6,6.4655,99.9776,9.9989,0.88,0.3176,0.2541
7,6.4553,99.3806,9.969,0.8798,0.3091,0.242
8,6.5002,102.2723,10.113,0.8774,0.3221,0.2519
9,6.4508,99.493,9.9746,0.8808,0.3114,0.2443


### 4e. Ridge Regression 

In [20]:
ridge = create_model("ridge")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,9.3252,206.8801,14.3833,0.7508,0.4813,0.5271
1,9.235,201.4963,14.1949,0.7599,0.4827,0.5248
2,9.2324,202.8831,14.2437,0.7557,0.4741,0.5096
3,9.2174,199.8987,14.1386,0.762,0.4797,0.5205
4,9.2063,199.7813,14.1344,0.7584,0.469,0.4978
5,9.3585,208.482,14.4389,0.751,0.4814,0.5248
6,9.2516,203.9646,14.2816,0.7553,0.4885,0.5392
7,9.2159,202.3793,14.226,0.7552,0.4731,0.503
8,9.1908,200.5581,14.1619,0.7595,0.4745,0.5075
9,9.2739,202.3365,14.2245,0.7575,0.4784,0.5243


## 5. Output Data

In [31]:
# Output Full dataset
df.to_csv(r'/Users/harrisonchaseweber/Desktop/Springboard/Billboard Top 10 Prediction/DataFiles/df_emo_full_pat.csv')

In [32]:
# Output Train and Test datasets
X_train.to_csv(r'/Users/harrisonchaseweber/Desktop/Springboard/Billboard Top 10 Prediction/DataFiles/X_train.csv')
X_test.to_csv(r'/Users/harrisonchaseweber/Desktop/Springboard/Billboard Top 10 Prediction/DataFiles/X_test.csv')
y_train.to_csv(r'/Users/harrisonchaseweber/Desktop/Springboard/Billboard Top 10 Prediction/DataFiles/y_train.csv')
y_test.to_csv(r'/Users/harrisonchaseweber/Desktop/Springboard/Billboard Top 10 Prediction/DataFiles/y_test.csv')

## 6. Summary

In this precprocessing and training step, we have created dummy variables for our categorical data (the song's decade label) and split our data into a train and test set. We will use 70% of the data for training and 30% for testing. 
As an initial exploration, we use PyCaret to analyze which models perform best. We run each of these models individually to check that their metrics are consistant across a 10-fold analysis. 

In our next step, we will dig further into these top performing algorithms to create a finalized model that can help us best predict a Song's week position on the Billboard Top 100 chart. 