In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import statsmodels.api as sm
# Notebook-wide random seed: pass it as `random_state` to every splitter and
# estimator so that results are reproducible under Restart & Run All.
seed = 123

In [3]:
# Load the feature table from CSV (path is relative to the notebook's working directory)
features_csv_path = 'data/features_df.csv'
df = pd.read_csv(features_csv_path)

### Feature Selection, Scaling and Train/Test Split

In [4]:
# Select the target and predictors, hold out 30% of the rows for testing, and
# min-max scale the predictors.
#
# The scaler is fitted on the TRAINING rows only. Fitting it on the full data set
# before splitting lets the test set's min/max leak into the training features.
feature_cols = ['titleLen', 'subscriberCount', 'avgViewCount', 'humanCount',
                'HOW TO & STYLE', 'SPORTS', 'TRAVEL', 'Negative', 'titleINTJ']

y = df.loc[:, ['viewCount']]      # dependent variable (one-column DataFrame)
X_raw = df.loc[:, feature_cols]   # selected independent variables, unscaled

# Split first (30% held out for testing); `seed` is the notebook-wide seed (123).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.30, random_state=seed)

# Feature scaling: learn min/max from the training rows, then apply everywhere.
# Test-set values may fall slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler()
scaler.fit(X_train_raw)

# Rebuild DataFrames with the original row labels so X_* stay aligned with y_*.
X_train = pd.DataFrame(scaler.transform(X_train_raw), columns=feature_cols, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=feature_cols, index=X_test_raw.index)

# `X` remains the scaled version of every row (now using the train-fitted scaler)
# for any later cell that still refers to it.
X = pd.DataFrame(scaler.transform(X_raw), columns=feature_cols, index=X_raw.index)

In [5]:
from sklearn.ensemble import RandomForestRegressor

In [6]:
### Grid Search
from sklearn.model_selection import GridSearchCV

# Parameter grid (per the original note, narrowed from an earlier random search).
# 1 * 3 * 3 * 3 * 1 = 27 candidate combinations.
param_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100]
}
# Create a base model. It is seeded with the notebook-wide `seed` so that the
# bootstrap sampling inside each forest, and therefore the selected parameters,
# are reproducible from one run to the next.
rf = RandomForestRegressor(random_state=seed)
# Instantiate the grid search model: 3-fold CV over every candidate, using all
# available cores. With no `scoring` given, candidates are ranked by the
# estimator's own score method.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)

In [7]:
# Run the search on the training split. y_train is a one-column DataFrame, so it
# is flattened to a 1-D array before being handed to the estimator.
grid_search.fit(X_train, y_train.to_numpy().ravel())
# Show the winning hyper-parameter combination
grid_search.best_params_

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed: 36.2min finished


{'bootstrap': True,
 'max_depth': 5,
 'min_samples_leaf': 3,
 'min_samples_split': 10,
 'n_estimators': 100}

In [8]:
# Fit the final forest on the training split using the hyper-parameters selected
# by the grid search. They are read from `grid_search.best_params_` rather than
# copied by hand, so this cell cannot go stale when the search is re-run.
# Requires the grid-search cell to have been executed first.
regr = RandomForestRegressor(**grid_search.best_params_, random_state=0)
regr.fit(X_train, y_train.values.ravel())

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=3,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [9]:
# Predict the target for the held-out test rows with the fitted forest
rf_pred = regr.predict(X_test)

In [10]:
# Model evaluation on the held-out test split.
# Each metric is computed once and reused (RMSE is derived from the stored MSE).
mae = mean_absolute_error(y_test, rf_pred)
mse = mean_squared_error(y_test, rf_pred)
r2 = r2_score(y_test, rf_pred)

print('MAE', mae)
print('RMSE', np.sqrt(mse))
print('MSE', mse)
print('R^2: %.2f' % r2)

MAE 2369201.468021728
RMSE 29491372.798912834
MSE 869741069564455.9
R^2: 0.09


In [11]:
import copy  # no longer needed by this cell; kept so `copy` stays available to later cells

# Actual vs. predicted table for the test rows. `assign` returns a new DataFrame,
# so y_test itself is left untouched (no deepcopy required).
res = y_test.assign(Predicted=rf_pred)

In [12]:
# Display actual (`viewCount`) next to `Predicted` for the test rows
res

Unnamed: 0,viewCount,Predicted
53357,72813.0,1.537525e+06
353821,4631639.0,8.719152e+05
176077,1562.0,1.168055e+05
400735,6328.0,1.568150e+05
398987,5883.0,1.568150e+05
...,...,...
83889,590744.0,2.323929e+06
49880,38124.0,1.537525e+06
286915,781.0,1.168055e+05
393254,93338.0,2.756973e+05


In [13]:
# How often each distinct predicted value occurs, most frequent first
res["Predicted"].value_counts()

1.168055e+05    59686
8.719152e+05     6000
3.786658e+06     5975
1.537525e+06     5966
1.503723e+05     5388
                ...  
1.885899e+08        1
1.040817e+08        1
1.244506e+07        1
1.543571e+08        1
1.220589e+07        1
Name: Predicted, Length: 713, dtype: int64