# Hyperparameter tuning

### Importing the libraries

In [1]:
# Importing the libraries
import swifter
import pandas as pd

### Downloading NLTK resources

In [2]:
import nltk

# NLTK data packages this notebook downloads, in the order they are fetched.
NLTK_PACKAGES = ('punkt', 'wordnet', 'stopwords')

# Download each package in turn and remember what every call returned.
download_status = {package: nltk.download(package) for package in NLTK_PACKAGES}

# Leave the final download's return value as the cell result, as before.
download_status[NLTK_PACKAGES[-1]]

[nltk_data] Downloading package punkt to /Users/dathd6/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dathd6/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Importing data

In [3]:
from constants import DATASET_FOLDER, REVIEW_DATASET
from utils import stream_json

# Path of the review dataset, assembled from the project-level constants.
review_path = f'{DATASET_FOLDER}/{REVIEW_DATASET}'

# Read the reviews through the project's stream_json helper
# (chunk_size is forwarded unchanged) and load them into a DataFrame.
reviews = stream_json(review_path, chunk_size=10000)
df = pd.DataFrame(reviews)

# Preview the first rows to sanity-check the columns.
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


### Preprocessing

In [None]:
from utils import preprocess_text, assign_sentiment

# Label column: run assign_sentiment over every star rating.
star_ratings = df['stars']
df['sentiment'] = star_ratings.apply(assign_sentiment)

# Text column: run preprocess_text over every review body via the swifter accessor.
review_texts = df['text']
df['processed_review'] = review_texts.swifter.apply(preprocess_text)

### Model Training

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from models.model_tuning import ModelTuning

# Text vectorizers to compare; each one gets its own complete tuning run below.
VECTORIZERS = [CountVectorizer(), TfidfVectorizer()]
# Hyperparameter grid handed to ModelTuning for RandomForestClassifier.
# The grid spans 1 x 3 x 4 x 3 x 3 = 108 parameter combinations per vectorizer.
PARAM_GRID = {
    'random_state': [50],  # single value, so the seed is the same for every candidate
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# One result dict per vectorizer; printed by the following cell.
scores = []

for vectorizer in VECTORIZERS:
    # Progress marker: prints the classifier class and the vectorizer being tuned.
    print(RandomForestClassifier, vectorizer)
    # ModelTuning is a project class; presumably it wraps a cross-validated grid
    # search (cv=5, accuracy scoring) -- TODO confirm in models/model_tuning.py.
    grid_search = ModelTuning(
        classifier=RandomForestClassifier,
        params=PARAM_GRID,
        cv=5,
        scoring='accuracy',
    )
    # NOTE(review): the vectorizer is fitted on the full corpus before tuning. If
    # ModelTuning cross-validates internally, every validation fold has already
    # shaped the vocabulary / IDF weights (data leakage), so the reported scores
    # may be optimistic. Consider tuning a vectorizer+classifier Pipeline -- confirm
    # whether ModelTuning supports that.
    vec_X = vectorizer.fit_transform(df['processed_review'])
    grid_search.fit(vec_X, df['sentiment'])
    # calculate_metrics() is called before .metrics is read; presumably it is what
    # populates that attribute -- TODO confirm.
    grid_search.calculate_metrics()
    scores.append({
        # The printed results show a configured RandomForestClassifier instance here.
        'classifier': grid_search.classifier,
        'vectorizer': vectorizer,
        # The printed results show a single float here.
        'score': grid_search.metrics
    })

<class 'sklearn.ensemble._forest.RandomForestClassifier'> CountVectorizer()
<class 'sklearn.ensemble._forest.RandomForestClassifier'> TfidfVectorizer()


In [6]:
# Print each tuning result (classifier, vectorizer, score) on its own line.
for result in scores:
    print(result)

{'classifier': RandomForestClassifier(n_estimators=200, random_state=50), 'vectorizer': CountVectorizer(), 'score': 0.8108000000000001}
{'classifier': RandomForestClassifier(min_samples_split=10, n_estimators=200, random_state=50), 'vectorizer': TfidfVectorizer(), 'score': 0.8097}
