###  Importing Libraries

In [35]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings   
warnings.filterwarnings("ignore")

### Data Inspection

In [36]:
# Reading data
train = pd.read_csv("train_0OECtn8.csv")
test = pd.read_csv("test_1zqHu22.csv")

In [37]:
# Shape of data
train.shape,test.shape

((89197, 10), (11121, 9))

In [38]:
train.head()

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
0,1,19990,37,128,24,Male,Student,180,1000,4.33
1,2,5304,32,132,14,Female,Student,330,714,1.79
2,3,1840,12,24,19,Male,Student,180,138,4.35
3,4,12597,23,112,19,Male,Student,220,613,3.77
4,5,13626,23,112,27,Male,Working Professional,220,613,3.13


In [39]:
test.head()

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views
0,89198,7986,12,42,14,Male,Student,180,138
1,89199,11278,34,115,14,Male,Student,230,840
2,89200,17245,8,110,44,Female,Working Professional,280,628
3,89201,9851,16,137,18,Male,Student,270,462
4,89202,16008,34,96,47,Female,Other,230,840


In [40]:
# Checking for missing values
train.isnull().sum()


row_id              0
user_id             0
category_id         0
video_id            0
age                 0
gender              0
profession          0
followers           0
views               0
engagement_score    0
dtype: int64

In [41]:
# Checking for missing values
test.isnull().sum()

row_id         0
user_id        0
category_id    0
video_id       0
age            0
gender         0
profession     0
followers      0
views          0
dtype: int64

In [42]:
# Checking datatype
train.dtypes

row_id                int64
user_id               int64
category_id           int64
video_id              int64
age                   int64
gender               object
profession           object
followers             int64
views                 int64
engagement_score    float64
dtype: object

In [43]:
# Checking datatype
test.dtypes

row_id          int64
user_id         int64
category_id     int64
video_id        int64
age             int64
gender         object
profession     object
followers       int64
views           int64
dtype: object

### Typecasting

In [44]:
# Typecasting variables
train['gender'] = train['gender'].astype('category')
train['profession'] = train['profession'].astype('category')
test['gender'] = test['gender'].astype('category')
test['profession'] = test['profession'].astype('category')

In [45]:
train.dtypes

row_id                 int64
user_id                int64
category_id            int64
video_id               int64
age                    int64
gender              category
profession          category
followers              int64
views                  int64
engagement_score     float64
dtype: object

In [46]:
test.dtypes

row_id            int64
user_id           int64
category_id       int64
video_id          int64
age               int64
gender         category
profession     category
followers         int64
views             int64
dtype: object

### Feature Engineering

In [47]:
train['age'].value_counts()

18    4870
19    4528
20    4399
17    4356
16    4014
15    3875
21    3722
22    3576
14    3086
23    2971
24    2915
31    2796
30    2781
33    2756
32    2687
28    2687
29    2602
13    2588
34    2541
26    2507
25    2463
35    2422
27    2301
36    2003
37    1816
12    1776
38    1480
39    1238
11    1171
40    1066
41     834
10     752
42     675
43     592
44     449
46     343
45     340
47     264
48     197
50     172
49     131
51      97
53      81
52      66
54      51
55      48
57      40
56      24
58       8
63       8
68       6
66       5
64       5
59       4
60       3
67       3
61       3
62       3
Name: age, dtype: int64

In [48]:
train['age_range'] = 'str'
train['age_range'][(train['age']<= 40)] = 'Young Adults'
train['age_range'][(train['age']>40) & (train['age']<=57) ] = 'Middle-aged Adults'
train['age_range'][(train['age']>57)] = 'Old Adults'
train['age_range'] = train['age_range'].astype('category')

test['age_range'] = 'str'
test['age_range'][(test['age']<= 40)] = 'Young Adults'
test['age_range'][(test['age']>40) & (test['age']<=57) ] = 'Middle-aged Adults'
test['age_range'][(test['age']>57)] = 'Old Adults'
test['age_range'] = test['age_range'].astype('category')

In [49]:
train['followers'].describe()

count    89197.000000
mean       252.460172
std         46.094468
min        160.000000
25%        230.000000
50%        240.000000
75%        280.000000
max        360.000000
Name: followers, dtype: float64

In [50]:
test['followers'].describe()

count    11121.000000
mean       249.691574
std         38.511469
min        160.000000
25%        230.000000
50%        240.000000
75%        270.000000
max        360.000000
Name: followers, dtype: float64

In [51]:
train['followers_range'] = 'str'
train['followers_range'][(train['followers']<= 240)] = 'Low Followers'
train['followers_range'][(train['followers']>240)] = 'High Followers'
train['followers_range'] = train['followers_range'].astype('category')

test['followers_range'] = 'str'
test['followers_range'][(test['followers']<= 240)] = 'Low Followers'
test['followers_range'][(test['followers']>240)] = 'High Followers'
test['followers_range'] = test['followers_range'].astype('category')

In [52]:
train['views'].describe()

count    89197.000000
mean       502.980268
std        268.569482
min         30.000000
25%        229.000000
50%        467.000000
75%        714.000000
max       1000.000000
Name: views, dtype: float64

In [53]:
test['views'].describe()

count    11121.000000
mean       454.959986
std        249.759038
min         44.000000
25%        229.000000
50%        369.000000
75%        662.000000
max        990.000000
Name: views, dtype: float64

In [54]:
train['views_range'] = 'str'
train['views_range'][(train['views']<= 467)] = 'Low Views'
train['views_range'][(train['views']>467)] = 'High Views'
train['views_range'] = train['views_range'].astype('category')

test['views_range'] = 'str'
test['views_range'][(test['views']<= 369)] = 'Low Views'
test['views_range'][(test['views']>369)] = 'High Views'
test['views_range'] = test['views_range'].astype('category')

In [55]:
train['category_id'].value_counts()[:].describe()

count      47.000000
mean     1897.808511
std      1713.480651
min        47.000000
25%       546.500000
50%      1309.000000
75%      2691.500000
max      8104.000000
Name: category_id, dtype: float64

In [56]:
# getting category ids which have less than 1309 records
tmp = train['category_id'].value_counts()[:]
categories = tmp[tmp<1309].index

In [57]:
train['category_range'] = 'str'
train['category_range'][train['category_id'].isin(categories[:])] = 'UnPopular Category'
train['category_range'][~train['category_id'].isin(categories[:])] = 'Popular Category'
train['category_range'] = train['category_range'].astype('category')

In [58]:
test['category_id'].value_counts()[:].describe()

count      23.000000
mean      483.521739
std       447.375973
min        12.000000
25%       159.000000
50%       187.000000
75%       762.500000
max      1489.000000
Name: category_id, dtype: float64

In [59]:
# getting category ids which have less than 187 records
tmp = test['category_id'].value_counts()[:]
categories = tmp[tmp<187].index

In [60]:
test['category_range'] = 'str'
test['category_range'][test['category_id'].isin(categories[:])] = 'UnPopular Category'
test['category_range'][~test['category_id'].isin(categories[:])] = 'Popular Category'
test['category_range'] = test['category_range'].astype('category')

In [61]:
train['video_id'].value_counts()[:].describe()

count     175.000000
mean      509.697143
std       259.594699
min        47.000000
25%       271.500000
50%       586.000000
75%       664.500000
max      1337.000000
Name: video_id, dtype: float64

In [62]:
# getting video ids which have less than 586 records
tmp = train['video_id'].value_counts()[:]
videos = tmp[tmp<586].index

In [63]:
train['video_range'] = 'str'
train['video_range'][train['video_id'].isin(videos[:])] = 'UnPopular Video'
train['video_range'][~train['video_id'].isin(videos[:])] = 'Popular Video'
train['video_range'] = train['video_range'].astype('category')

In [64]:
test['video_id'].value_counts()[:].describe()

count    128.000000
mean      86.882812
std       59.886588
min        5.000000
25%       35.000000
50%       61.500000
75%      148.000000
max      232.000000
Name: video_id, dtype: float64

In [65]:
# getting video ids which have less than 61 records
tmp = test['video_id'].value_counts()[:]
videos = tmp[tmp<61].index

In [66]:
test['video_range'] = 'str'
test['video_range'][test['video_id'].isin(videos[:])] = 'UnPopular Video'
test['video_range'][~test['video_id'].isin(videos[:])] = 'Popular Video'
test['video_range'] = test['video_range'].astype('category')

In [67]:
train.head()

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score,age_range,followers_range,views_range,category_range,video_range
0,1,19990,37,128,24,Male,Student,180,1000,4.33,Young Adults,Low Followers,High Views,UnPopular Category,UnPopular Video
1,2,5304,32,132,14,Female,Student,330,714,1.79,Young Adults,High Followers,High Views,Popular Category,Popular Video
2,3,1840,12,24,19,Male,Student,180,138,4.35,Young Adults,Low Followers,Low Views,Popular Category,UnPopular Video
3,4,12597,23,112,19,Male,Student,220,613,3.77,Young Adults,Low Followers,High Views,Popular Category,Popular Video
4,5,13626,23,112,27,Male,Working Professional,220,613,3.13,Young Adults,Low Followers,High Views,Popular Category,Popular Video


In [68]:
test.head()

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,age_range,followers_range,views_range,category_range,video_range
0,89198,7986,12,42,14,Male,Student,180,138,Young Adults,Low Followers,Low Views,Popular Category,Popular Video
1,89199,11278,34,115,14,Male,Student,230,840,Young Adults,Low Followers,High Views,Popular Category,Popular Video
2,89200,17245,8,110,44,Female,Working Professional,280,628,Middle-aged Adults,High Followers,High Views,Popular Category,Popular Video
3,89201,9851,16,137,18,Male,Student,270,462,Young Adults,High Followers,High Views,Popular Category,Popular Video
4,89202,16008,34,96,47,Female,Other,230,840,Middle-aged Adults,Low Followers,High Views,Popular Category,Popular Video


### Building Model

In [69]:
# Handling categorical variables
train = pd.get_dummies(train)
train.head()

Unnamed: 0,row_id,user_id,category_id,video_id,age,followers,views,engagement_score,gender_Female,gender_Male,profession_Other,profession_Student,profession_Working Professional,age_range_Middle-aged Adults,age_range_Old Adults,age_range_Young Adults,followers_range_High Followers,followers_range_Low Followers,views_range_High Views,views_range_Low Views,category_range_Popular Category,category_range_UnPopular Category,video_range_Popular Video,video_range_UnPopular Video
0,1,19990,37,128,24,180,1000,4.33,0,1,0,1,0,0,0,1,0,1,1,0,0,1,0,1
1,2,5304,32,132,14,330,714,1.79,1,0,0,1,0,0,0,1,1,0,1,0,1,0,1,0
2,3,1840,12,24,19,180,138,4.35,0,1,0,1,0,0,0,1,0,1,0,1,1,0,0,1
3,4,12597,23,112,19,220,613,3.77,0,1,0,1,0,0,0,1,0,1,1,0,1,0,1,0
4,5,13626,23,112,27,220,613,3.13,0,1,0,0,1,0,0,1,0,1,1,0,1,0,1,0


In [70]:
# Handling categorical variables
test = pd.get_dummies(test)
test.head()

Unnamed: 0,row_id,user_id,category_id,video_id,age,followers,views,gender_Female,gender_Male,profession_Other,profession_Student,profession_Working Professional,age_range_Middle-aged Adults,age_range_Old Adults,age_range_Young Adults,followers_range_High Followers,followers_range_Low Followers,views_range_High Views,views_range_Low Views,category_range_Popular Category,category_range_UnPopular Category,video_range_Popular Video,video_range_UnPopular Video
0,89198,7986,12,42,14,180,138,0,1,0,1,0,0,0,1,0,1,0,1,1,0,1,0
1,89199,11278,34,115,14,230,840,0,1,0,1,0,0,0,1,0,1,1,0,1,0,1,0
2,89200,17245,8,110,44,280,628,1,0,0,0,1,1,0,0,1,0,1,0,1,0,1,0
3,89201,9851,16,137,18,270,462,0,1,0,1,0,0,0,1,1,0,1,0,1,0,1,0
4,89202,16008,34,96,47,230,840,1,0,1,0,0,1,0,0,0,1,1,0,1,0,1,0


In [71]:
train_cleaned=train.drop('row_id',axis=1) 
test_cleaned=test.drop('row_id',axis=1)

In [72]:
train_cleaned.head()

Unnamed: 0,user_id,category_id,video_id,age,followers,views,engagement_score,gender_Female,gender_Male,profession_Other,profession_Student,profession_Working Professional,age_range_Middle-aged Adults,age_range_Old Adults,age_range_Young Adults,followers_range_High Followers,followers_range_Low Followers,views_range_High Views,views_range_Low Views,category_range_Popular Category,category_range_UnPopular Category,video_range_Popular Video,video_range_UnPopular Video
0,19990,37,128,24,180,1000,4.33,0,1,0,1,0,0,0,1,0,1,1,0,0,1,0,1
1,5304,32,132,14,330,714,1.79,1,0,0,1,0,0,0,1,1,0,1,0,1,0,1,0
2,1840,12,24,19,180,138,4.35,0,1,0,1,0,0,0,1,0,1,0,1,1,0,0,1
3,12597,23,112,19,220,613,3.77,0,1,0,1,0,0,0,1,0,1,1,0,1,0,1,0
4,13626,23,112,27,220,613,3.13,0,1,0,0,1,0,0,1,0,1,1,0,1,0,1,0


In [73]:
# Seperating independent and dependent variables
y = train_cleaned['engagement_score']
X = train_cleaned.drop(['engagement_score'], axis=1)

In [74]:
# Creating the train and validation set
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state = 101, stratify=y, test_size=0.25)

In [75]:
# Import required libraries
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression as LR
from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.metrics import mean_absolute_error,r2_score

In [76]:
# Creating the Gradient Boosting function
gbm = GradientBoostingRegressor(random_state=10)

In [77]:
# Fitting the model
gbm.fit(X_train, y_train)

GradientBoostingRegressor(random_state=10)

In [78]:
# Predictions 
train_predict = gbm.predict(X_train)
val_predict = gbm.predict(X_valid)
test_predict = gbm.predict(test_cleaned)

In [79]:
# MAE
mae_train=mean_absolute_error(y_train,train_predict)
mae_val=mean_absolute_error(y_valid,val_predict)
print('Training MAE: ', mae_train)
print('Validation MAE: ', mae_val)

Training MAE:  0.5316487808047178
Validation MAE:  0.529964570167084


In [80]:
# R2 score
r2_train=r2_score(y_train,train_predict)
r2_val=r2_score(y_valid, val_predict)
print('Training R2 score: ', r2_train)
print('Validation R2 score: ', r2_val)

Training R2 score:  0.34457682244527243
Validation R2 score:  0.34580502909824906


## Tuning

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
gbm = GradientBoostingRegressor(random_state=10)

gbm_params = {'max_depth': [2, 3, 5, 7], 
          'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.5],
          'n_estimators': [100, 200, 300, 400, 500]}

random_search = RandomizedSearchCV(gbm, 
                                   param_distributions = gbm_params, 
                                   scoring = r2_score, cv = 3, 
                                   verbose = 3, n_iter = 10, n_jobs = -1)

In [None]:
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3, estimator=GradientBoostingRegressor(random_state=10),
                   n_jobs=-1,
                   param_distributions={'learning_rate': [0.05, 0.1, 0.2, 0.3,
                                                          0.5],
                                        'max_depth': [2, 3, 5, 7],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500]},
                   scoring=<function r2_score at 0x7fad1f309b00>, verbose=3)

In [None]:
random_search.best_params_

{'learning_rate': 0.5, 'max_depth': 5, 'n_estimators': 500}

In [None]:
random_search.best_estimator_

GradientBoostingRegressor(learning_rate=0.5, max_depth=5, n_estimators=500,
                          random_state=10)

## Creating a GBM instance with new hyperparameter values

In [81]:
# Creating the Gradient Boosting function
gbm = GradientBoostingRegressor(learning_rate=0.5, max_depth=5, n_estimators=1500,
                          random_state=10)

In [82]:
# Fitting the model
gbm.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.5, max_depth=5, n_estimators=1500,
                          random_state=10)

In [83]:
# Predictions 
train_predict = gbm.predict(X_train)
val_predict = gbm.predict(X_valid)
test_predict = gbm.predict(test_cleaned)

In [84]:
# MAE
mae_train=mean_absolute_error(y_train,train_predict)
mae_val=mean_absolute_error(y_valid,val_predict)
print('Training MAE: ', mae_train)
print('Validation MAE: ', mae_val)

Training MAE:  0.24903694060309253
Validation MAE:  0.5013291110451759


In [85]:
# R2 score
r2_train=r2_score(y_train,train_predict)
r2_val=r2_score(y_valid, val_predict)
print('Training R2 score: ', r2_train)
print('Validation R2 score: ', r2_val)

Training R2 score:  0.8500656957286451
Validation R2 score:  0.38389531118206244


## Submission

In [86]:
TEST=pd.read_csv("sample_submission_JPlpRcN.csv")
TEST['engagement_score'] = test_predict
TEST.head(10)

Unnamed: 0,row_id,engagement_score
0,89198,3.942616
1,89199,3.682638
2,89200,2.480802
3,89201,3.194431
4,89202,2.207043
5,89203,3.400901
6,89204,3.885111
7,89205,4.186017
8,89206,2.418137
9,89207,4.104301


In [87]:
# Converting to csv format
TEST.to_csv('DanielThommanajobathonfeb2022final.csv',index=False)