# Popularity Prediction

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import sklearn.metrics as sm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

## Train and Test Data

In [2]:
X_train = pd.read_csv('x_train_popularity.csv')
X_test = pd.read_csv('x_test_popularity.csv')
y_train = pd.read_csv('y_train_popularity.csv')
y_test = pd.read_csv('y_test_popularity.csv')

In [3]:
y_train = X_train['popularity_target']
y_test = X_test['popularity_target']
X_train = X_train.drop(columns = ['popularity_target'],axis = 1)
X_test = X_test.drop(columns = ['popularity_target'],axis = 1)
[X_train.shape, y_train.shape, X_test.shape, y_test.shape]

[(175770, 49), (175770,), (58590, 49), (58590,)]

In [14]:
X_train.shape

(175770, 49)

In [13]:
X_train.columns

Index(['acousticness', 'danceability', 'duration_ms', 'energy', 'explicit',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'valence', 'songname_noneng',
       'num_words_in_songname', 'acousticness_year', 'danceability_year',
       'duration_ms_year', 'energy_year', 'instrumentalness_year',
       'liveness_year', 'loudness_year', 'speechiness_year', 'tempo_year',
       'valence_year', 'key_year', 'mode_year', 'acousticness_artist',
       'danceability_artist', 'duration_ms_artist', 'energy_artist',
       'instrumentalness_artist', 'liveness_artist', 'loudness_artist',
       'speechiness_artist', 'tempo_artist', 'valence_artist', 'key_artist',
       'mode_artist', 'count_artist', 'period_1920_to_1950',
       'period_1950_to_1980', 'period_1980_to_2000', 'period_2000_to_2010',
       'period_2010_to_2020', 'num_artists_grp_numart_1',
       'num_artists_grp_numart_2', 'num_artists_grp_numart_3',
       'num_artists_grp_numart_4_a

## Regression

### 1. Linear Regression

In [4]:
#X = data_reg.drop(columns = ['popularity_target'],axis = 1)
#y = data_reg['popularity_target']
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_y_predict = lr_model.predict(X_test)
training_accuracy = lr_model.score(X_train,y_train)
test_accuracy = lr_model.score(X_test,y_test)
print('lr_y_predict:', lr_y_predict)

lr_y_predict: [3.63848433 1.04207685 1.04602914 ... 3.21853119 3.72945463 3.13514288]


In [5]:
training_accuracy = lr_model.score(X_train,y_train)
print('training accuracy:', training_accuracy)

training accuracy: 0.5930892194611539


In [6]:
test_accuracy = lr_model.score(X_test,y_test)
print('testing accuracy:', test_accuracy)

testing accuracy: 0.5921086591418128


In [7]:
r2_score(y_test, lr_y_predict)

0.5921086591418128

### 1.1 Linear Regression -- Grid Search

In [8]:
parameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
grid = GridSearchCV(lr_model,parameters, cv=None)
grid.fit(X_train, y_train)
print ("r2 / variance : ", grid.best_score_)
print("Residual sum of squares: %.2f"
              % np.mean((grid.predict(X_test) - y_test) ** 2))

r2 / variance :  0.5928209552956973
Residual sum of squares: 0.83


In [9]:
grid.predict(X_train)
print('training accuracy:', r2_score(y_train, grid.predict(X_train)))

training accuracy: 0.5930892194611539


In [10]:
grid.predict(X_test)
print("testing accuracy:", r2_score(y_test, grid.predict(X_test)))

testing accuracy: 0.592108659141712


### 2. Gradient Boosting

In [11]:
gb_model = GradientBoostingRegressor(max_depth=4, n_estimators=200, random_state=42)
gb_model.fit(X_train, y_train)
gb_y_predict = gb_model.predict(X_test)
gb_model.score(X_train, y_train)
gb_model.score(X_test, y_test)
print('gb_y_predict:', gb_y_predict)

gb_y_predict: [3.72868502 0.84829503 1.16975416 ... 3.70536779 5.04286808 3.31985748]


### 3. XGB

In [None]:
xgbr = xgb.XGBRegressor(verbosity=0) 

xgbr.fit(X_train, y_train)
score = xgbr.score(X_train, y_train)  
ypred = xgbr.predict(X_test)
mse = mean_squared_error(y_test, ypred)
r2_score(ypred, y_test, multioutput='variance_weighted')
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % (mse**(1/2.0)))

### 4. SVM -- Took forever to run, didn't get the result

In [None]:
from sklearn import svm
svm_model = svm.SVC(kernel = 'rbf', C = 30, gamma = 'auto')
svm_model.fit(X_train, y_train)
svm_model.score(X_test, y_test)

# Application

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import sklearn.metrics as sm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

#### 1. Load master dataset

In [21]:
data = pd.read_csv('data_final_popularity.csv')

In [9]:
data.columns

Index(['acousticness', 'artists', 'danceability', 'duration_ms', 'energy',
       'explicit', 'id', 'instrumentalness', 'key', 'liveness', 'loudness',
       'mode', 'name', 'speechiness', 'tempo', 'valence', 'year',
       'artist_list', 'num_artists_grp', 'songname_noneng',
       'num_words_in_songname', 'period', 'artist_song_id',
       'acousticness_year', 'danceability_year', 'duration_ms_year',
       'energy_year', 'instrumentalness_year', 'liveness_year',
       'loudness_year', 'speechiness_year', 'tempo_year', 'valence_year',
       'key_year', 'mode_year', 'acousticness_artist', 'danceability_artist',
       'duration_ms_artist', 'energy_artist', 'instrumentalness_artist',
       'liveness_artist', 'loudness_artist', 'speechiness_artist',
       'tempo_artist', 'valence_artist', 'key_artist', 'mode_artist',
       'count_artist', 'popularity_target'],
      dtype='object')

In [22]:
#col_to_dummy = ['period', 'num_artists_grp']
data = pd.get_dummies(data = data, columns=['period', 'num_artists_grp'])
data.columns

Index(['acousticness', 'artists', 'danceability', 'duration_ms', 'energy',
       'explicit', 'id', 'instrumentalness', 'key', 'liveness', 'loudness',
       'mode', 'name', 'speechiness', 'tempo', 'valence', 'year',
       'artist_list', 'songname_noneng', 'num_words_in_songname',
       'artist_song_id', 'acousticness_year', 'danceability_year',
       'duration_ms_year', 'energy_year', 'instrumentalness_year',
       'liveness_year', 'loudness_year', 'speechiness_year', 'tempo_year',
       'valence_year', 'key_year', 'mode_year', 'acousticness_artist',
       'danceability_artist', 'duration_ms_artist', 'energy_artist',
       'instrumentalness_artist', 'liveness_artist', 'loudness_artist',
       'speechiness_artist', 'tempo_artist', 'valence_artist', 'key_artist',
       'mode_artist', 'count_artist', 'popularity_target',
       'period_1920_to_1950', 'period_1950_to_1980', 'period_1980_to_2000',
       'period_2000_to_2010', 'period_2010_to_2020',
       'num_artists_grp_numart_

In [23]:
data.shape

(234372, 56)

#### 2. Load training and testing datasets

In [24]:
X_train = pd.read_csv('x_train_popularity.csv')
X_test = pd.read_csv('x_test_popularity.csv')
y_train = pd.read_csv('y_train_popularity.csv')
y_test = pd.read_csv('y_test_popularity.csv')

In [5]:
print("X_train info:")
print(X_train.shape)
print(X_train.columns)
print()

print("y_train info:")
print(y_train.shape)
print(y_train.columns)
print()

print("X_test info:")
print(X_test.shape)
print(X_test.columns)
print()

print("y_test info:")
print(y_test.shape)
print(y_test.columns)

X_train info:
(175770, 50)
Index(['acousticness', 'danceability', 'duration_ms', 'energy', 'explicit',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'valence', 'songname_noneng',
       'num_words_in_songname', 'acousticness_year', 'danceability_year',
       'duration_ms_year', 'energy_year', 'instrumentalness_year',
       'liveness_year', 'loudness_year', 'speechiness_year', 'tempo_year',
       'valence_year', 'key_year', 'mode_year', 'acousticness_artist',
       'danceability_artist', 'duration_ms_artist', 'energy_artist',
       'instrumentalness_artist', 'liveness_artist', 'loudness_artist',
       'speechiness_artist', 'tempo_artist', 'valence_artist', 'key_artist',
       'mode_artist', 'count_artist', 'popularity_target',
       'period_1920_to_1950', 'period_1950_to_1980', 'period_1980_to_2000',
       'period_2000_to_2010', 'period_2010_to_2020',
       'num_artists_grp_numart_1', 'num_artists_grp_numart_2',
       'num_ar

#### 3. Create training and testing dataframes for modeling

In [25]:
#col_to_exclude = ['artist_song_id', 'artists', 'artist_list', 'id', 'name', 'year']
#target_var = 'popularity_target

#X = data.drop(columns = ['popularity_target', 'artist_song_id', 'artists', 'artist_list', 'id', 'name', 'year'],axis = 1)
X = data.drop(columns = ['popularity_target'],axis = 1)
y = data['popularity_target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [26]:
print(X.shape)
print(y.shape)

(234372, 55)
(234372,)


In [27]:
X_real_train = X_train.drop(columns = ['artist_song_id', 'artists', 'artist_list', 'id', 'name', 'year'],axis = 1)
X_real_train

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,...,count_artist,period_1920_to_1950,period_1950_to_1980,period_1980_to_2000,period_2000_to_2010,period_2010_to_2020,num_artists_grp_numart_1,num_artists_grp_numart_2,num_artists_grp_numart_3,num_artists_grp_numart_4_above
62309,0.84300,0.148,237293,0.3040,0,0.827000,8,0.0959,-16.175,1,...,286.0,0,1,0,0,0,1,0,0,0
32016,0.97000,0.559,191707,0.1420,0,0.927000,5,0.1210,-15.550,1,...,48.0,1,0,0,0,0,1,0,0,0
43654,0.66700,0.463,247409,0.5120,0,0.000129,8,0.0815,-7.305,1,...,111.0,0,0,1,0,0,0,1,0,0
32400,0.99500,0.789,177408,0.1060,0,0.441000,7,0.1860,-14.662,0,...,9.0,1,0,0,0,0,1,0,0,0
90798,0.25600,0.513,290400,0.5440,0,0.000000,0,0.0862,-7.300,0,...,192.0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176963,0.91300,0.528,235773,0.3590,0,0.850000,5,0.2010,-10.415,1,...,18.0,0,1,0,0,0,0,1,0,0
117952,0.10600,0.569,123587,0.4250,0,0.000000,9,0.2750,-12.670,1,...,26.0,0,0,1,0,0,1,0,0,0
173685,0.77700,0.722,176040,0.6890,0,0.000000,3,0.6040,-7.516,1,...,8.0,0,1,0,0,0,0,1,0,0
43567,0.00185,0.533,253960,0.9240,0,0.185000,6,0.3730,-11.679,0,...,94.0,0,0,1,0,0,1,0,0,0


In [28]:
X_real_test = X_test.drop(columns = ['artist_song_id', 'artists', 'artist_list', 'id', 'name', 'year'],axis = 1)
X_real_test

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,...,count_artist,period_1920_to_1950,period_1950_to_1980,period_1980_to_2000,period_2000_to_2010,period_2010_to_2020,num_artists_grp_numart_1,num_artists_grp_numart_2,num_artists_grp_numart_3,num_artists_grp_numart_4_above
104429,0.963000,0.202,219613,0.0445,0,0.780000,2,0.3470,-30.649,0,...,30.0,1,0,0,0,0,0,1,0,0
227545,0.846000,0.594,256307,0.0626,0,0.000000,7,0.1060,-16.298,1,...,7.0,0,0,1,0,0,1,0,0,0
126434,0.975000,0.528,182987,0.2880,0,0.011300,5,0.0972,-12.759,1,...,3179.0,1,0,0,0,0,0,1,0,0
35186,0.987000,0.198,444880,0.2260,0,0.155000,5,0.3620,-12.531,1,...,250.0,0,1,0,0,0,0,0,0,1
84647,0.994000,0.359,171818,0.0364,0,0.914000,0,0.0627,-24.188,1,...,62.0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178945,0.801000,0.290,243640,0.3300,0,0.000724,3,0.1140,-11.307,1,...,1003.0,0,1,0,0,0,1,0,0,0
204364,0.000034,0.264,315667,0.9280,0,0.571000,11,0.1750,-13.599,1,...,34.0,0,0,1,0,0,1,0,0,0
11455,0.582000,0.228,141840,0.3580,0,0.000003,5,0.1290,-7.161,1,...,34.0,0,1,0,0,0,1,0,0,0
1120,0.975000,0.622,202760,0.1550,0,0.000000,4,0.2600,-16.196,1,...,4.0,1,0,0,0,0,1,0,0,0


In [None]:
print(X_real_train.shape)
print(X_real_test.shape)

#### 4. Run gradient boosting model and find the predicted values

In [29]:
gb_model = GradientBoostingRegressor(max_depth=4, n_estimators=200, random_state=42)
gb_model.fit(X_real_train, y_train)

gb_y_test = gb_model.predict(X_real_test)
gb_y_train = gb_model.predict(X_real_train)

print('gb_y_test:', gb_y_test)
print('gb_y_train:', gb_y_train)

gb_y_test: [1.1828935  4.37726246 1.01026774 ... 2.63365833 1.32789927 1.03590927]
gb_y_train: [2.7733698  1.57484786 4.05264807 ... 1.38076138 3.85250491 3.5592157 ]


In [31]:
print('train accuracy:', gb_model.score(X_real_train, y_train))
print('test accuracy:', gb_model.score(X_real_test, y_test))

train accuracy: 0.7620910154040665
test accuracy: 0.7571456187325651


#### 5. Attach results back to the master dataset

In [32]:
# Round popularity predicted
round_test = gb_y_test.round().astype(int)
roudn_train = gb_y_train.round().astype(int)

# Create dataframes and save popularity into them
test_output = pd.DataFrame(data = gb_y_test, columns = ['predicted_popularity'])
test_output['popularity_group'] = pd.DataFrame(data = round_test)

train_output = pd.DataFrame(data = gb_y_train, columns = ['predicted_popularity'])
train_output['popularity_group'] = pd.DataFrame(data = roudn_train)

In [33]:
# Combine with the datasets
X_test['predicted_popularity'] = test_output['predicted_popularity']
X_test['popularity_group'] = test_output['popularity_group']

X_train['predicted_popularity'] = train_output['predicted_popularity']
X_train['popularity_group'] = train_output['popularity_group']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inste

In [38]:
X_train.shape

(175779, 57)

In [37]:
# combine two dataframes
all_data = pd.concat([X_test, X_train], axis=0)
# Output results
all_data.to_csv('all_popularity.csv', index = False)

In [40]:
all_data.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,...,period_1950_to_1980,period_1980_to_2000,period_2000_to_2010,period_2010_to_2020,num_artists_grp_numart_1,num_artists_grp_numart_2,num_artists_grp_numart_3,num_artists_grp_numart_4_above,predicted_popularity,popularity_group
104429,0.963,"['Richard Strauss', 'Bavarian State Orchestra']",0.202,219613,0.0445,0,7xU1KumXHBCXABmkYg722C,0.78,2,0.347,...,0,0,0,0,0,1,0,0,,
227545,0.846,['Tsai Chin'],0.594,256307,0.0626,0,2KjgqLUhkmEkXCXicj7iFP,0.0,7,0.106,...,0,1,0,0,1,0,0,0,,
126434,0.975,"['Francisco Canaro', 'Eduardo Adrian']",0.528,182987,0.288,0,5OmHEO46I13khn9Kf1g4Hu,0.0113,5,0.0972,...,0,0,0,0,0,1,0,0,,
35186,0.987,"['Giacomo Puccini', 'Maria Callas', 'Nicolai G...",0.198,444880,0.226,0,2pqaM7qZVySU9q1I58kXhp,0.155,5,0.362,...,1,0,0,0,0,0,0,1,1.341113,1.0
84647,0.994,"['Claude Debussy', 'Walter Gieseking']",0.359,171818,0.0364,0,07O6boPfQDWRzYmVR0N5o7,0.914,0,0.0627,...,1,0,0,0,0,1,0,0,,


# Recommendation

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
input_data = pd.read_csv('pre.csv')

In [67]:
input_data['cluster_labels'][input_data.artists == '2NE1']

29    5
Name: cluster_labels, dtype: int64

In [11]:
input_data.shape

(8301, 15)

In [6]:
input_data.artists

0       "Joseph And The Amazing Technicolor Dreamcoat"...
1                                     "Weird Al" Yankovic
2                                            'Til Tuesday
3                  (Con La Participación de Marc Anthony)
4                                              (Hed) P.E.
                              ...                        
8296                                                  陈昭伶
8297                                                  陳秀男
8298                                                  颜薇恩
8299                                                  麥志誠
8300                                                  龔詩嘉
Name: artists, Length: 8301, dtype: object

In [7]:
all_data = pd.read_csv('all_popularity.csv')

In [15]:
all_data.shape

(234372, 57)

In [45]:
all_data.name

0         Don Quixote Op.35: VI Variation V: The Knight’...
1                                       恰似你的溫柔 - Remastered
2                        Amando en Silencio - Remasterizado
3         Puccini: Madama Butterfly, Act 1: "Vogliatemi ...
4         Debussy: 12 Études, CD 143, L. 136: No. 1, Pou...
                                ...                        
234367                                          Night Rider
234368                                 No Hay Dedos Iguales
234369                                            Jambalaya
234370                                        Tommy The Cat
234371                                       All Or Nothing
Name: name, Length: 234372, dtype: object

In [9]:
all_data.artist_list

0                        Bavarian State Orchestra
1                                       Tsai Chin
2                                Francisco Canaro
3         Orchestra Del Teatro Alla Scala, Milano
4                                Walter Gieseking
                           ...                   
234367                               Eddie Sauter
234368                           Beto Quintanilla
234369                            Sally Sweetland
234370                                     Primus
234371                              Frank Sinatra
Name: artist_list, Length: 234372, dtype: object

In [69]:
combined = input_data.merge(all_data, left_on='artists', right_on='artist_list')[['artists_x', 'artist_list', 'name', 'predicted_popularity', 'popularity_group']]

In [47]:
combined = input_data.merge(all_data, 
                            left_on='artists', 
                            right_on='artist_list')[['artists_x', 'artist_list', 'name', 
                                                     'predicted_popularity', 'popularity_group']]
combined

Unnamed: 0,artists_x,artist_list,name,predicted_popularity,popularity_group
0,"""Joseph And The Amazing Technicolor Dreamcoat""...","""Joseph And The Amazing Technicolor Dreamcoat""...","Go, Go, Go Joseph",,
1,"""Joseph And The Amazing Technicolor Dreamcoat""...","""Joseph And The Amazing Technicolor Dreamcoat""...",Any Dream Will Do,3.603946,4.0
2,"""Joseph And The Amazing Technicolor Dreamcoat""...","""Joseph And The Amazing Technicolor Dreamcoat""...",Close Every Door To Me,,
3,"""Joseph And The Amazing Technicolor Dreamcoat""...","""Joseph And The Amazing Technicolor Dreamcoat""...",Jacob And Sons / Joseph's Coat,,
4,"""Weird Al"" Yankovic","""Weird Al"" Yankovic",Happy Birthday,,
...,...,...,...,...,...
68188,陳秀男,陳秀男,選擇,,
68189,陳秀男,陳秀男,瀟灑走一回,,
68190,颜薇恩,颜薇恩,祝你 Happy New Year,,
68191,麥志誠,麥志誠,難為正邪定分界,3.715483,4.0


In [48]:
combined.artists_x.nunique()

8301

In [57]:
# Only based on artists
#artist_popularity = combined.groupby('artists_x').mean('predicted_popularity')
#artist_popularity

In [61]:
artist_popularity = combined.groupby(['artists_x', 'name']).mean('predicted_popularity')
artist_popularity.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted_popularity,popularity_group
artists_x,name,Unnamed: 2_level_1,Unnamed: 3_level_1
"""Joseph And The Amazing Technicolor Dreamcoat"" 1991 London Cast",Any Dream Will Do,3.603946,4.0
"""Joseph And The Amazing Technicolor Dreamcoat"" 1991 London Cast",Close Every Door To Me,,
"""Joseph And The Amazing Technicolor Dreamcoat"" 1991 London Cast","Go, Go, Go Joseph",,
"""Joseph And The Amazing Technicolor Dreamcoat"" 1991 London Cast",Jacob And Sons / Joseph's Coat,,
"""Weird Al"" Yankovic",(This Song's Just) Six Words Long,5.201951,5.0
"""Weird Al"" Yankovic",Addicted to Spuds,,
"""Weird Al"" Yankovic",Albuquerque,,
"""Weird Al"" Yankovic","Amish Paradise (Parody of ""Gangsta's Paradise"" by Coolio)",2.455375,2.0
"""Weird Al"" Yankovic",Another One Rides the Bus,0.571289,1.0
"""Weird Al"" Yankovic",Bohemian Polka,1.099822,1.0


In [56]:
artist_popularity.sort_values(by=['predicted_popularity'], ascending = False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted_popularity,popularity_group
artists_x,name,Unnamed: 2_level_1,Unnamed: 3_level_1
Dailey & Vincent,Cumberland River,5.747432,6.0
Funkadelic,Maggot Brain,5.649359,6.0
Jwonder,Dirty S Hwy,5.592359,6.0
Interpol,C'mere,5.563474,6.0
Michael Jackson,Liberian Girl - 2012 Remastered Version,5.560343,6.0


# Output

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### 1. Read test and train files

In [2]:
X_train = pd.read_csv('x_train_popularity.csv')
X_test = pd.read_csv('x_test_popularity.csv')
y_train = pd.read_csv('y_train_popularity.csv')
y_test = pd.read_csv('y_test_popularity.csv')

In [3]:
y_train = X_train['popularity_target']
y_test = X_test['popularity_target']
X_train = X_train.drop(columns = ['popularity_target'],axis = 1)
X_test = X_test.drop(columns = ['popularity_target'],axis = 1)
[X_train.shape, y_train.shape, X_test.shape, y_test.shape]

[(175770, 49), (175770,), (58590, 49), (58590,)]

#### 2. Fit in the gradient boosting model

In [4]:
gb_model = GradientBoostingRegressor(max_depth=4, n_estimators=200, random_state=42)
gb_model.fit(X_train, y_train)

# Get popularity predicted
gb_y_test = gb_model.predict(X_test)
gb_y_train = gb_model.predict(X_train)
print('gb_y_test:', gb_y_test)
print('gb_y_train:', gb_y_train)

gb_y_test: [3.72868502 0.84829503 1.16975416 ... 3.70536779 5.04286808 3.31985748]
gb_y_train: [3.61372389 1.58944808 3.20849484 ... 3.36884802 1.11285303 0.67874693]


In [5]:
# Round popularity predicted
round_test = gb_y_test.round().astype(int)
roudn_train = gb_y_train.round().astype(int)

In [6]:
# Create dataframes and save popularity into them
test_output = pd.DataFrame(data = gb_y_test, columns = ['predicted_popularity'])
test_output['popularity_group'] = pd.DataFrame(data = round_test)

train_output = pd.DataFrame(data = gb_y_train, columns = ['predicted_popularity'])
train_output['popularity_group'] = pd.DataFrame(data = roudn_train)

In [7]:
# Combine with the datasets
X_test['predicted_popularity'] = test_output['predicted_popularity']
X_test['popularity_group'] = test_output['popularity_group']

X_train['predicted_popularity'] = train_output['predicted_popularity']
X_train['popularity_group'] = train_output['popularity_group']

In [8]:
X_test

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,...,period_1950_to_1980,period_1980_to_2000,period_2000_to_2010,period_2010_to_2020,num_artists_grp_numart_1,num_artists_grp_numart_2,num_artists_grp_numart_3,num_artists_grp_numart_4_above,predicted_popularity,popularity_group
0,0.249000,0.561,225480,0.636,0,0.003600,0,0.0588,-9.525,1,...,1,0,0,0,1,0,0,0,3.728685,4
1,0.000456,0.506,320014,0.974,0,0.958000,10,0.9440,-6.599,0,...,0,0,0,1,1,0,0,0,0.848295,1
2,0.739000,0.434,36040,0.438,0,0.879000,4,0.3560,-18.721,1,...,0,0,0,0,0,0,0,1,1.169754,1
3,0.796000,0.627,167960,0.317,0,0.000000,8,0.1840,-12.787,1,...,1,0,0,0,0,1,0,0,2.740016,3
4,0.000138,0.416,194520,0.819,0,0.001080,9,0.1220,-3.974,1,...,0,0,1,0,1,0,0,0,4.297614,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58585,0.987000,0.461,79400,0.214,0,0.000000,2,0.5190,-13.182,1,...,0,0,0,0,0,0,0,1,1.046192,1
58586,0.020300,0.740,232440,0.849,0,0.000004,10,0.3460,-3.205,1,...,0,0,1,0,1,0,0,0,4.318287,4
58587,0.840000,0.725,231907,0.362,0,0.181000,7,0.1060,-12.424,1,...,0,0,0,1,1,0,0,0,3.705368,4
58588,0.067300,0.569,214363,0.865,1,0.000000,4,0.3120,-3.844,0,...,0,0,0,1,0,1,0,0,5.042868,5


In [9]:
X_train

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,...,period_1950_to_1980,period_1980_to_2000,period_2000_to_2010,period_2010_to_2020,num_artists_grp_numart_1,num_artists_grp_numart_2,num_artists_grp_numart_3,num_artists_grp_numart_4_above,predicted_popularity,popularity_group
0,0.743000,0.558,281134,0.282,0,0.000000,8,0.0651,-17.901,1,...,1,0,0,0,0,0,1,0,3.613724,4
1,0.799000,0.204,294800,0.227,0,0.879000,0,0.2140,-12.959,0,...,1,0,0,0,0,1,0,0,1.589448,2
2,0.614000,0.366,151133,0.788,0,0.000036,9,0.1230,-7.001,1,...,1,0,0,0,1,0,0,0,3.208495,3
3,0.194000,0.805,321649,0.858,1,0.000008,0,0.2560,-3.067,1,...,0,0,0,1,0,0,0,1,2.785609,3
4,0.168000,0.579,266173,0.874,0,0.001060,1,0.0749,-7.248,0,...,0,1,0,0,1,0,0,0,3.799443,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175765,0.901000,0.565,180000,0.309,0,0.735000,0,0.1370,-19.260,0,...,1,0,0,0,1,0,0,0,3.475318,3
175766,0.961000,0.174,429227,0.092,0,0.801000,5,0.1340,-20.070,1,...,0,0,0,0,0,1,0,0,1.158789,1
175767,0.000149,0.177,226280,0.962,0,0.068000,9,0.9530,-6.298,1,...,1,0,0,0,1,0,0,0,3.368848,3
175768,0.926000,0.500,170400,0.299,0,0.838000,2,0.3680,-9.251,1,...,0,0,0,0,0,1,0,0,1.112853,1


In [10]:
# Output results
X_test.to_csv('test_output.csv', index = False)
X_train.to_csv('train_output.csv', index = False)