## Hyperparameter Tuning

### Loading Dataset

In [2]:
import pandas as pd
import numpy as np

online_df = pd.read_csv(r'https://raw.githubusercontent.com/doryaswi/Data-Science/master/cleaned_onlinepopularity.csv')
online_df.drop('Unnamed: 0',axis=1,inplace=True)
online_df.head()

Unnamed: 0,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,...,data_channel_Technology,data_channel_World,data_channel_missing,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday
0,731,12,219.0,0.663594,1.0,0.815385,4,2,1,0,...,0,0,0,0,1,0,0,0,0,0
1,731,9,255.0,0.604743,1.0,0.791946,3,1,1,0,...,0,0,0,0,1,0,0,0,0,0
2,731,9,211.0,0.57513,1.0,0.663866,3,1,1,0,...,0,0,0,0,1,0,0,0,0,0
3,731,9,531.0,0.503788,1.0,0.665635,9,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,731,13,1072.0,0.415646,1.0,0.54089,19,19,20,0,...,1,0,0,0,1,0,0,0,0,0


In [3]:
copied_online_df = online_df.copy()

In [8]:
y = online_df.loc[:,'shares']
y[:5]

0     593
1     711
2    1500
3    1200
4     505
Name: shares, dtype: int64

In [9]:
X = online_df.drop('shares',axis=1)
X[:5]

Unnamed: 0,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,...,data_channel_Technology,data_channel_World,data_channel_missing,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday
0,731,12,219.0,0.663594,1.0,0.815385,4,2,1,0,...,0,0,0,0,1,0,0,0,0,0
1,731,9,255.0,0.604743,1.0,0.791946,3,1,1,0,...,0,0,0,0,1,0,0,0,0,0
2,731,9,211.0,0.57513,1.0,0.663866,3,1,1,0,...,0,0,0,0,1,0,0,0,0,0
3,731,9,531.0,0.503788,1.0,0.665635,9,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,731,13,1072.0,0.415646,1.0,0.54089,19,19,20,0,...,1,0,0,0,1,0,0,0,0,0


### Feature Sampling

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=13)

In [11]:
from sklearn.ensemble import RandomForestRegressor

In [14]:
def feature_sampling_scores (max_features_list):
    feature_sampling_scores_list = []
    for max_feature in max_features_list:
        rfr = RandomForestRegressor(max_features=max_feature, random_state=1,n_estimators=50)
        rfr.fit(X_train, y_train)
        score = (max_feature, rfr.score(X_val,y_val))
        feature_sampling_scores_list.append(score)
    return feature_sampling_scores_list

In [15]:
%%time
feature_sampling_scores([0.5,'log2','sqrt',None])

Wall time: 5min 58s


[(0.5, -0.017188724437829483),
 ('log2', 0.030714467971431958),
 ('sqrt', 0.012181168695198696),
 (None, -0.05584395836689571)]

"log2" will be the best option to use when sampling my features.

### Number of Trees

In [16]:
rfr_n_trees = RandomForestRegressor(n_estimators=50, random_state=1, max_features='log2')

In [17]:
%%time
rfr_n_trees.fit(X_train, y_train)

Wall time: 16.5 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='log2', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [18]:
rfr_n_trees.score(X_val,y_val)

0.030714467971431958

In [20]:
tree_predictions = np.vstack([estimator.predict(X_val) for estimator in rfr_n_trees.estimators_])

In [21]:
from sklearn.metrics import r2_score
r2_scores = [r2_score(y_val, np.mean(tree_predictions[:i + 1], axis = 0)) for i in range(0, len(rfr_n_trees.estimators_)) ]

In [22]:
from graph import trace_values, plot
x_vals = list(range(1, len(rfr_n_trees.estimators_) + 1))
trace = trace_values(x_vals, r2_scores)
plot([trace])

For this dataset, it looks like by using at least 40 trees/estimators, we will get a higher score which is about 0.03. For this reason, number of trees that will be used is 42.

### Number of Leaves

In [23]:
min_samples = np.arange(2,51,1)
min_samples

array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])

In [24]:
def num_leaves_wrapped_scores (min_samples):
    num_leaves_wrapped_scores_list = []
    for min_sample in min_samples:
        rfr = RandomForestRegressor(n_estimators=42, min_samples_leaf=min_sample, random_state=13)
        rfr.fit(X_train, y_train)
        score = rfr.score(X_val,y_val)
        num_leaves_wrapped_scores_list.append(score)
    return num_leaves_wrapped_scores_list

In [25]:
%%time
scores = num_leaves_wrapped_scores(min_samples)

Wall time: 1h 4min 10s


In [26]:
scores[:5]

[-0.000623431696067156,
 0.01013850646823733,
 0.017375281306550527,
 0.015715178648257777,
 0.014380746240613917]

In [27]:
wrapped_scores = np.column_stack((min_samples, scores))

In [28]:
trace_1 = trace_values(wrapped_scores[:, 0], wrapped_scores[:, 1])

In [29]:
plot([trace_1])

By having more samples in a leaf, the model seems to become more stable. For this reason, we will be using at least 46 as minimum sample for each leaf when training our model.