### 1. import libraries

In [1]:
import re
import string
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, make_scorer, confusion_matrix, ConfusionMatrixDisplay

In [2]:
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import warnings
# Notebook-wide settings: silence every warning category and lift pandas'
# row/column display limits so frames are rendered without truncation.
# NOTE(review): a blanket "ignore" also hides library deprecation notices.
warnings.filterwarnings(action="ignore")
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

### 2. import dataset

In [3]:
# Load the review data and keep only the rows whose 'Review' value is present.
df_raw = pd.read_csv('input.csv', encoding='latin1')
# .copy() makes `df` an independent frame rather than a boolean-mask slice of
# the raw one, so columns can be added to it later without pandas treating
# the write as an assignment into a slice (SettingWithCopyWarning).
df = df_raw[df_raw['Review'].notnull()].copy()
df.shape

(54, 2)

In [4]:
# Frequency of each distinct value in the 'Review' column.
label_counts = df['Review'].value_counts()
label_counts

Review
0.0    36
1.0    18
Name: count, dtype: int64

### 3. clean data

In [5]:
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def clean_text(text):
    """Normalise one piece of review text before vectorisation.

    Steps, in order: lowercase, remove digit runs, remove ASCII punctuation,
    trim surrounding whitespace, split on whitespace, drop English stopwords,
    and Porter-stem the remaining words.

    Parameters
    ----------
    text : str, None or NaN
        Raw text. Null values yield an empty string; any other non-string
        value is converted with ``str()`` instead of raising on ``.lower()``.

    Returns
    -------
    str
        The surviving stems joined by single spaces ('' when nothing is left).
    """
    if pd.isnull(text):
        return ''
    text = str(text).lower()  # lowercase (str() guards against numeric cells)
    text = re.sub(r'\d+', '', text)  # remove digits
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    text = text.strip()  # remove leading/trailing spaces

    # The stopword list is fetched once and held as a set: previously
    # stopwords.words('english') was re-evaluated for every single word and
    # membership was tested against a list.
    stop_words = _english_stopwords()
    stemmer = PorterStemmer()  # reduces a word to its root form by removing suffixes
    return ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_words)

# Store the normalised version of 'Review Text' next to the original column,
# then narrow the frame to the cleaned text and the 'Review' column.
df['Clean Text'] = df['Review Text'].apply(clean_text)
df_clean = df.loc[:, ['Clean Text', 'Review']]
df_clean.shape

(54, 2)

In [6]:
# Turn the cleaned text into a dense TF-IDF table, one column per term.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df_clean['Clean Text'])

# The column name 'review' is reserved for the label added below. If the
# vocabulary itself contains the term 'review', assigning the label would
# silently overwrite that feature, so a colliding term is renamed first.
feature_names = ['review_tfidf' if term == 'review' else term
                 for term in tfidf.get_feature_names_out()]

df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
# .values attaches the labels by position; df_clean keeps the filtered
# index while df_tfidf has a fresh 0..n-1 index.
df_tfidf['review'] = df_clean['Review'].values
df_tfidf.shape

(54, 104)

### 4. grid search

In [7]:
# Features are every column except the label; the target is the 'review' column.
x = df_tfidf.drop('review', axis=1)
y = df_tfidf[['review']]

rf = RandomForestClassifier()
# NOTE(review): 'random_state' is searched like a hyperparameter here, so the
# best score partly reflects the luckiest seed — consider fixing one seed.
param_grid = {'max_depth': [2, 5, 10],
              'max_features': ['sqrt', None, 'log2'],
              'min_samples_leaf': [3, 5, 10],
              'n_estimators': [5, 10, 20],
              'random_state': [0, 7, 42]}

# Use scikit-learn's built-in ROC-AUC scorer. The previous
# make_scorer(roc_auc_score, needs_proba=True) relies on `needs_proba`, which
# is deprecated in scikit-learn 1.4 and removed in 1.6.
scorer = 'roc_auc'
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring=scorer, cv=3, n_jobs=-1, verbose=2)
# y is a one-column frame; flatten it to the 1-D target scikit-learn expects
# (a column vector otherwise triggers a DataConversionWarning).
grid_search.fit(x, np.ravel(y))

print("Best Parameters:", grid_search.best_params_)
print("Best AUC-ROC Score:", grid_search.best_score_)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best Parameters: {'max_depth': 5, 'max_features': None, 'min_samples_leaf': 3, 'n_estimators': 10, 'random_state': 0}
Best AUC-ROC Score: 0.9236111111111112


In [8]:
# Rank the features by the importance assigned by the best estimator found
# in the grid search, highest first.
rf = grid_search.best_estimator_
fname, imp = x.columns, rf.feature_importances_

df_imp = (
    pd.DataFrame({'feature': fname, 'importance': imp})
    .sort_values('importance', ascending=False)
)
df_imp

Unnamed: 0,feature,importance
57,love,0.301363
76,read,0.200885
38,good,0.140564
39,great,0.135108
17,book,0.132368
87,terribl,0.027814
14,best,0.024782
19,charact,0.015669
85,struggl,0.009593
10,bad,0.005302


### 5. model fit

In [9]:
# Fit a random forest with the parameter values reported by the grid search.
rf = RandomForestClassifier(max_depth=5, max_features=None, min_samples_leaf=3, n_estimators=10, random_state=0)
# Flatten y to 1-D: scikit-learn expects a 1-D target and warns on a column vector.
rf = rf.fit(x, np.ravel(y))
p = rf.predict_proba(x)[:, 1]  # second probability column, used as the ROC-AUC score
# NOTE: scored on the same rows the model was fitted on, so this is a
# training-set AUC, not an estimate of out-of-sample performance.
print("AUC-ROC Score:", np.round(roc_auc_score(y, p),3))

AUC-ROC Score: 0.961


In [10]:
# Hard class predictions on x, cross-tabulated against the true labels
# (rows = true class, columns = predicted class).
p = rf.predict(x)
cm = confusion_matrix(y_true=y, y_pred=p)
cm

array([[35,  1],
       [ 4, 14]], dtype=int64)