## 0. Import Packages and Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.linear_model import LogisticRegression
# import pycaret below <- automates the expiramentation process (test KNN, test logistic, etc...)
from pycaret.regression import setup, compare_models, create_model

In [2]:
# Import data that was output from previous notebook "2. Exploratory Data Analysis"
df = pd.read_csv(r'/Users/chaseweber/Desktop/Springboard/Billboard Top 10 Prediction/DataFiles/df_emo_full_eda.csv', index_col = 0)

In [3]:
# Check that the details of this dataset make sense.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 327895 entries, 0 to 327894
Data columns (total 54 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   WeekID                       327895 non-null  object 
 1   Week Position                327895 non-null  int64  
 2   Song                         327895 non-null  object 
 3   SongID                       327895 non-null  object 
 4   Instance                     327895 non-null  int64  
 5   Previous Week Position       327895 non-null  float64
 6   Peak Position                327895 non-null  int64  
 7   Weeks on Chart               327895 non-null  int64  
 8   Artist                       327895 non-null  object 
 9   wordCount                    327895 non-null  float64
 10  languages                    269355 non-null  object 
 11  allWordCount                 327895 non-null  float64
 12  year                         327895 non-null  float64
 13 

From the above, it appears that "decade" is the only object variable that needs to be transformed for modeling. All others objects, such as Week, SongID, and Artist are index related.

In [4]:
# Convert Decade Object Variable to Dummies
decade_dummies = pd.get_dummies(df['decade'])
df.drop("decade", axis = 1, inplace = True)
df = df.join(decade_dummies)

In [5]:
#Check that the Decade dummies are as expected
df.dtypes

WeekID           object
Week Position     int64
Song             object
SongID           object
Instance          int64
                  ...  
50s               uint8
60s               uint8
70s               uint8
80s               uint8
90s               uint8
Length: 61, dtype: object

In [6]:
# Remove unnecessary columns, particularly the columns we are not interested in using.
df.drop(columns = ['joy_normalized', 'sadness_normalized', 'anger_normalized', 'disgust_normalized', 'trust_normalized', 'anticipation_normalized', 'fear_normalized', 'surprise_normalized', 'happy_normalized', 'sorrow_normalized', 'Week_dt', 'Week_dt_next', 'YearID_next', 'languages'], inplace = True)
# Check that these columns are removed.
df.dtypes

WeekID                          object
Week Position                    int64
Song                            object
SongID                          object
Instance                         int64
Previous Week Position         float64
Peak Position                    int64
Weeks on Chart                   int64
Artist                          object
wordCount                      float64
allWordCount                   float64
year                           float64
MTLD                           float64
TTR                            float64
CTTR                           float64
sentimentScore                 float64
sentimentScore_pos             float64
sentimentScore_neg             float64
emoWordCount                   float64
emo_score                      float64
YearID                           int64
DecadeID                        object
SuccessInd                       int64
PrimaryKey                      object
Artist_Count_Total               int64
Artist_Count_Year        

## 1. Train-Test Split

In [7]:
# Create a Train and Test dataset. 70% will be used for training.
## Week position will be our observed column.
len(df) * .7, len(df) * .3

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='Week Position'), 
                                                    df['Week Position'], test_size=0.3, 
                                                    random_state=123)

In [8]:
# Check our training set.
X_train.shape, X_test.shape

((229526, 46), (98369, 46))

In [9]:
# Check our test set.
y_train.shape, y_test.shape

((229526,), (98369,))

In [10]:
# Do the ratios of 'Week Position' match across Train and Test sets?
print(y_train.sum()/(len(X_train) + len(y_train)))
print(y_test.sum()/(len(X_test) + len(y_test)))

25.216724902625412
25.326490052760523


## 2. Remove PrimaryKey columns from Train & Test data

In [11]:
#Save the 'Week Position', 'Song', 'SongID', 'Artist', 'languages', 'year', 'PrimaryKey', 'DecadeID' columns from the train/test data into names_train and names_test
#Then drop those columns from `X_train` and `X_test`. Use 'inplace=True'
names_list = ['SuccessInd', 'Song', 'SongID', 'Artist', 'year', 'PrimaryKey', 'WeekID', 'DecadeID', 'YearID', 'Peak Position']
names_train = X_train[names_list]
names_test = X_test[names_list]
X_train.drop(columns=names_list, inplace=True)
X_test.drop(columns=names_list, inplace=True)
X_train.shape, X_test.shape

((229526, 36), (98369, 36))

## 3. Utilize pycaret

### 3a. Run pycaret on all features

In [12]:
# Create dataset for pycaret analysis. We will use our entire dataset for this analysis.
df_2 = df.drop(columns = names_list)

In [13]:
# Set up the dataset.
grid = setup(data = df_2, target = 'Week Position')

Unnamed: 0,Description,Value
0,Session id,261
1,Target,Week Position
2,Target type,Regression
3,Original data shape,"(327895, 37)"
4,Transformed data shape,"(327895, 37)"
5,Transformed train set shape,"(229526, 37)"
6,Transformed test set shape,"(98369, 37)"
7,Numeric features,36
8,Preprocess,True
9,Imputation type,simple


In [14]:
# Evaluate and compare models
best = compare_models(fold = 5, n_select = 5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,5.7055,76.8578,8.7667,0.9079,0.2413,0.2027,7.57
lightgbm,Light Gradient Boosting Machine,5.7869,78.7547,8.8743,0.9056,0.2406,0.2064,1.356
rf,Random Forest Regressor,5.9214,82.1829,9.0654,0.9015,0.2434,0.2038,54.546
et,Extra Trees Regressor,6.0936,86.1485,9.2816,0.8967,0.249,0.2075,66.048
gbr,Gradient Boosting Regressor,6.1121,87.5695,9.3577,0.895,0.2565,0.2267,14.602
knn,K Neighbors Regressor,8.2671,143.3877,11.9744,0.8281,0.3494,0.3186,662.212
dt,Decision Tree Regressor,8.2524,160.4391,12.6664,0.8077,0.3348,0.2608,2.322
ada,AdaBoost Regressor,9.8303,164.1767,12.8118,0.8032,0.3604,0.3929,5.966
br,Bayesian Ridge,19.6665,575.5213,23.99,0.3101,0.7868,1.4486,0.216
ridge,Ridge Regression,19.6656,575.5136,23.9898,0.3101,0.7867,1.4483,0.294


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

The top 5 models for all factors based on MSE, RMSE, and R2 are CatBoost, Light Gradient Boosting Machine, Random Forest, Extra Trees, and Gradient Boosting.

### 3b. Run pycaret on controlable features

In [15]:
# Create dataset for pycaret analysis. We will use ONLY CONTROLABLE dataset for this analysis.
df_2 = df.drop(columns = names_list)
df_2 = df_2.drop(columns = {'Instance', 'Previous Week Position', 'Weeks on Chart', 'Artist_Count_Total', 'Artist_Count_Year', 'Artist_Count_Year_prior', 'Artist_Count_Week', 'Artist_Count_Week_prior', '00s', '10s', '20s', '50s', '60s', '70s', '80s', '90s'})
print(df_2.columns)

Index(['Week Position', 'wordCount', 'allWordCount', 'MTLD', 'TTR', 'CTTR',
       'sentimentScore', 'sentimentScore_pos', 'sentimentScore_neg',
       'emoWordCount', 'emo_score', 'joy_normalized_ind',
       'sadness_normalized_ind', 'anger_normalized_ind',
       'disgust_normalized_ind', 'trust_normalized_ind',
       'anticipation_normalized_ind', 'fear_normalized_ind',
       'surprise_normalized_ind', 'happy_normalized_ind',
       'sorrow_normalized_ind'],
      dtype='object')


In [16]:
# Set up the dataset.
grid = setup(data = df_2, target = 'Week Position')

Unnamed: 0,Description,Value
0,Session id,6645
1,Target,Week Position
2,Target type,Regression
3,Original data shape,"(327895, 21)"
4,Transformed data shape,"(327895, 21)"
5,Transformed train set shape,"(229526, 21)"
6,Transformed test set shape,"(98369, 21)"
7,Numeric features,20
8,Preprocess,True
9,Imputation type,simple


In [17]:
# Evaluate and compare models
best = compare_models(fold = 5, n_select = 5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,18.179,523.5849,22.8819,0.3712,0.7147,1.1297,9.322
rf,Random Forest Regressor,18.2429,523.7808,22.8862,0.371,0.7144,1.1314,18.148
dt,Decision Tree Regressor,18.2369,530.7099,23.037,0.3626,0.7173,1.1306,1.336
knn,K Neighbors Regressor,19.4559,607.4218,24.6449,0.2705,0.7427,1.1601,641.536
catboost,CatBoost Regressor,22.3702,689.69,26.2619,0.1717,0.8279,1.6403,5.086
lightgbm,Light Gradient Boosting Machine,23.6068,754.8686,27.4748,0.0934,0.859,1.7776,0.518
gbr,Gradient Boosting Regressor,24.3786,800.0069,28.2843,0.0392,0.877,1.8558,6.282
ada,AdaBoost Regressor,24.6599,814.9266,28.5469,0.0213,0.8832,1.8857,1.33
br,Bayesian Ridge,24.7056,818.9067,28.6165,0.0165,0.884,1.886,0.222
ridge,Ridge Regression,24.7046,818.9008,28.6164,0.0165,0.884,1.8858,0.156


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

The top 5 models for controlable factors are Extra Trees Regressor, Random Forest, Decision Tree, KNN, and CatBoost.

## 4. Test Top Algorithms

Test the top performing algorithms to check metrics on 10-fold and analyze whether or not they perform consistently.

### 4a. CatBoost Gradient Boosting

In [18]:
catboost = create_model("catboost")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,22.1964,681.9013,26.1132,0.1768,0.824,1.6193
1,22.2893,684.4733,26.1624,0.1805,0.8265,1.6457
2,22.3927,690.4362,26.2762,0.1702,0.8224,1.6184
3,22.2942,683.9966,26.1533,0.1781,0.8248,1.6455
4,22.3064,685.0102,26.1727,0.1787,0.8329,1.6907
5,22.4475,694.768,26.3585,0.1692,0.8375,1.679
6,22.225,681.6823,26.109,0.1791,0.8221,1.6093
7,22.1825,678.6252,26.0504,0.1818,0.8239,1.6362
8,22.3374,686.5517,26.2021,0.1768,0.8263,1.6055
9,22.3768,689.9119,26.2662,0.1733,0.8225,1.5922


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

### 4b. Light Gradient Boosting Machine

In [19]:
lightgbm = create_model("lightgbm")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,23.4811,749.8421,27.3832,0.0948,0.8568,1.7626
1,23.6068,753.7035,27.4537,0.0976,0.8587,1.7875
2,23.6862,759.8163,27.5648,0.0868,0.8546,1.7554
3,23.6297,754.31,27.4647,0.0936,0.8578,1.793
4,23.5952,754.3539,27.4655,0.0955,0.8656,1.8381
5,23.6951,760.2628,27.5729,0.0909,0.8686,1.8196
6,23.4875,748.9943,27.3678,0.0981,0.8546,1.7497
7,23.4725,747.445,27.3394,0.0988,0.857,1.7817
8,23.653,754.6425,27.4708,0.0952,0.8583,1.7432
9,23.6116,754.8901,27.4753,0.0954,0.8538,1.7278


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

### 4c. Random Forest

In [20]:
rf = create_model("rf")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,18.1016,514.9967,22.6935,0.3783,0.7097,1.124
1,18.105,513.6264,22.6633,0.385,0.7105,1.1303
2,18.2975,526.6915,22.9498,0.367,0.7134,1.1386
3,18.0093,513.7961,22.6671,0.3826,0.7081,1.1289
4,18.0451,513.404,22.6584,0.3844,0.7129,1.1515
5,18.2528,523.8741,22.8883,0.3736,0.7222,1.1616
6,18.0911,515.8024,22.7113,0.3789,0.71,1.1195
7,17.9955,510.4134,22.5923,0.3846,0.7079,1.1172
8,18.1593,520.5524,22.8156,0.3759,0.7146,1.1195
9,18.1587,518.8127,22.7775,0.3783,0.7095,1.0941


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

### 4d. Gradient Boosting Regressor

In [21]:
gb = create_model("gbr")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,24.261,795.1144,28.1978,0.0402,0.8751,1.8419
1,24.4029,801.3621,28.3083,0.0405,0.8772,1.8693
2,24.4132,802.6223,28.3306,0.0354,0.8714,1.8257
3,24.422,801.0979,28.3037,0.0374,0.8762,1.873
4,24.3931,800.3425,28.2903,0.0404,0.8838,1.9182
5,24.4857,806.4077,28.3973,0.0357,0.8868,1.9022
6,24.2768,794.7458,28.1912,0.043,0.8735,1.8314
7,24.2789,795.4018,28.2029,0.041,0.8762,1.8642
8,24.4188,799.4081,28.2738,0.0415,0.8762,1.8198
9,24.4008,800.4954,28.293,0.0407,0.872,1.8082


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

### 4e. K-nearest neighbors

In [22]:
knn = create_model("knn")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,19.2212,592.9958,24.3515,0.2841,0.7348,1.1405
1,19.6793,621.6616,24.9331,0.2557,0.7432,1.1552
2,19.4864,605.3452,24.6038,0.2725,0.7379,1.1431
3,19.1498,586.1033,24.2096,0.2958,0.7307,1.1377
4,19.1858,602.4879,24.5456,0.2776,0.7488,1.2438
5,19.3355,598.6123,24.4666,0.2842,0.7506,1.185
6,19.1443,595.6386,24.4057,0.2828,0.7417,1.1715
7,19.1882,602.0243,24.5362,0.2742,0.7395,1.1612
8,19.5385,619.3561,24.8869,0.2574,0.7421,1.1243
9,19.1654,581.5975,24.1163,0.3031,0.734,1.1211


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

### 4f. Ridge Regression 

In [23]:
ridge = create_model("ridge")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,24.6021,814.7156,28.5432,0.0165,0.8824,1.8743
1,24.7365,821.1596,28.6559,0.0168,0.8845,1.8999
2,24.7048,820.1656,28.6385,0.0143,0.8783,1.8557
3,24.7343,819.2574,28.6227,0.0156,0.8831,1.9009
4,24.7012,818.8854,28.6162,0.0182,0.8906,1.9482
5,24.8191,825.4859,28.7313,0.0129,0.8939,1.9343
6,24.6179,813.8433,28.5279,0.02,0.8805,1.8622
7,24.622,815.0161,28.5485,0.0174,0.8832,1.8946
8,24.7693,820.0955,28.6373,0.0167,0.8838,1.8505
9,24.7401,820.3885,28.6424,0.0169,0.8792,1.8377


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

## 5. Output Data

In [24]:
# Output Full dataset
df.to_csv(r'/Users/chaseweber/Desktop/Springboard/Billboard Top 10 Prediction/DataFiles/df_emo_full_pat.csv')

In [25]:
# Output Train and Test datasets
X_train.to_csv(r'/Users/chaseweber/Desktop/Springboard/Billboard Top 10 Prediction/DataFiles/X_train.csv')
X_test.to_csv(r'/Users/chaseweber/Desktop/Springboard/Billboard Top 10 Prediction/DataFiles/X_test.csv')
y_train.to_csv(r'/Users/chaseweber/Desktop/Springboard/Billboard Top 10 Prediction/DataFiles/y_train.csv')
y_test.to_csv(r'/Users/chaseweber/Desktop/Springboard/Billboard Top 10 Prediction/DataFiles/y_test.csv')

## 6. Summary

In this pre-processing and training step, we have created dummy variables for our categorical data (the song's decade label) and split our data into a train and test set. We will use 70% of the data for training and 30% for testing. 
As an initial exploration, we use PyCaret to analyze which models perform best. We check optimal algorithms using all factors as well as only controlable factors. We run each of these models individually to check that their metrics are consistant across a 10-fold analysis. 

In our next step, we will dig further into these top performing algorithms to create a finalized model that can help us best predict a Song's week position on the Billboard Top 100 chart. 

In [None]:
# Last Update: 2023-03-15