<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Reading-the-datasets" data-toc-modified-id="Reading-the-datasets-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Reading the datasets</a></span></li><li><span><a href="#One-hot-encoding-the-different-attributes-as-a-separate-feature:" data-toc-modified-id="One-hot-encoding-the-different-attributes-as-a-separate-feature:-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>One-hot encoding the different attributes as a separate feature:</a></span></li><li><span><a href="#Defining-the-X-and-y-parameters-for-the-random-forest-classifier" data-toc-modified-id="Defining-the-X-and-y-parameters-for-the-random-forest-classifier-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Defining the X and y parameters for the random forest classifier</a></span></li><li><span><a href="#Running-a-random-search-over-the-random-forest-classifier" data-toc-modified-id="Running-a-random-search-over-the-random-forest-classifier-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Running a random search over the random forest classifier</a></span></li></ul></div>

In [29]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

### Reading the datasets 

In [13]:
df = pd.read_csv("input/raw_data.csv", parse_dates=['date'])

In [2]:
df_exploded = pd.read_csv("outputs/preprocessed/review_attribute_exploded_tagging.csv")

In [3]:
df_exploded.head()

Unnamed: 0,modified_id,keyword,cluster
0,6254,soft carb full bod,taste
1,6254,dark roast malts,style
2,6254,dark roast coffee,ingredients
3,6254,dark brown,appearance
4,6347,hazy golden body large,appearance


### One-hot encoding the different attributes as a separate feature:
The aim is to to test the features (in this case the attributes) with the highest feature relevance after fitting a random forest model on top of it.

In [5]:
one_hot_encoded_data = pd.get_dummies(df_exploded, columns = ['cluster'])

In [6]:
one_hot_encoded_data.head()

Unnamed: 0,modified_id,keyword,cluster_alcohol,cluster_appearance,cluster_aroma,cluster_carbonation,cluster_finish,cluster_ingredients,cluster_packaging,cluster_palate,cluster_place_of_service,cluster_price,cluster_season,cluster_style,cluster_taste
0,6254,soft carb full bod,0,0,0,0,0,0,0,0,0,0,0,0,1
1,6254,dark roast malts,0,0,0,0,0,0,0,0,0,0,0,1,0
2,6254,dark roast coffee,0,0,0,0,0,1,0,0,0,0,0,0,0
3,6254,dark brown,0,1,0,0,0,0,0,0,0,0,0,0,0
4,6347,hazy golden body large,0,1,0,0,0,0,0,0,0,0,0,0,0


In [7]:
one_hot_encoded_data.drop(columns = ['keyword'], inplace=True)

In [9]:
one_hot_encoded_data.columns.unique()
cluster_agg_dict = {}
for column in one_hot_encoded_data.columns.unique():
    if('cluster_' in column):
        cluster_agg_dict[column] = 'sum'

In [10]:
cluster_agg_dict

{'cluster_alcohol': 'sum',
 'cluster_appearance': 'sum',
 'cluster_aroma': 'sum',
 'cluster_carbonation': 'sum',
 'cluster_finish': 'sum',
 'cluster_ingredients': 'sum',
 'cluster_packaging': 'sum',
 'cluster_palate': 'sum',
 'cluster_place_of_service': 'sum',
 'cluster_price': 'sum',
 'cluster_season': 'sum',
 'cluster_style': 'sum',
 'cluster_taste': 'sum'}

In [11]:
# doing the aggregation because, in case for an attribute there's the mention of an attribute multiple times, it would get added up and contain more weight than the other features
one_hot_encoded_data_grouped = one_hot_encoded_data.groupby(['modified_id']).agg(cluster_agg_dict).reset_index()

In [14]:
merged_df = one_hot_encoded_data_grouped.merge(df[['id', 'rating']], left_on='modified_id', right_on='id', how='inner')

In [15]:
merged_df.head()

Unnamed: 0,modified_id,cluster_alcohol,cluster_appearance,cluster_aroma,cluster_carbonation,cluster_finish,cluster_ingredients,cluster_packaging,cluster_palate,cluster_place_of_service,cluster_price,cluster_season,cluster_style,cluster_taste,id,rating
0,1,0,0,0,0,0,0,0,0,0,0,0,0,3,1,2.0
1,2,0,0,0,0,0,1,0,0,0,1,0,2,1,2,4.0
2,3,0,0,0,0,0,0,0,0,0,0,1,0,2,3,5.0
3,4,0,0,0,0,0,1,0,0,0,0,1,0,2,4,5.0
4,6,0,0,0,0,0,0,0,0,0,0,1,0,1,6,4.0


### Defining the X and y parameters for the random forest classifier 

In [17]:
features = [i for i in one_hot_encoded_data.columns.unique() if 'cluster_' in i]
features

['cluster_alcohol',
 'cluster_appearance',
 'cluster_aroma',
 'cluster_carbonation',
 'cluster_finish',
 'cluster_ingredients',
 'cluster_packaging',
 'cluster_palate',
 'cluster_place_of_service',
 'cluster_price',
 'cluster_season',
 'cluster_style',
 'cluster_taste']

In [18]:
X = merged_df[features].to_numpy()
target_ar = merged_df['rating'].to_numpy()
y = [str(i) for i in target_ar]

### Running a random search over the random forest classifier

In [26]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid,
                               n_iter = 100, 
                               cv = 3,
                               verbose=2,
                               random_state=42,
                               n_jobs = -1)
# Fit the random search model
rf_random.fit(X, y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


