In [None]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import *
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import matplotlib.pyplot as plt

# Fetch the NLTK data packages this script relies on
# (stopword lists, punkt tokenizer data, WordNet).
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# Initialize lemmatizer and stemmer (shared by preprocess_reviews)
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_reviews(reviews):
    """Normalize raw review texts into space-joined token strings.

    Each review is lowercased, tokenized with ``word_tokenize``, filtered
    against the English stopword list, stemmed with the module-level Porter
    stemmer and finally passed through the module-level WordNet lemmatizer.

    Args:
        reviews: Iterable of review strings.

    Returns:
        A list with one processed string per input review, in input order.
    """
    # Build the stopword set once per call: it does not change between
    # tokens, and constructing it inside the token loop would repeat the
    # same work for every single word of every review.
    english_stopwords = set(stopwords.words('english'))

    corpus = []
    for review in reviews:
        # Lowercase first so stopword matching is case-insensitive.
        tokens = word_tokenize(review.lower())

        # Stopwords are filtered on the raw tokens, before stemming.
        stems = [stemmer.stem(token) for token in tokens
                 if token not in english_stopwords]

        # The text was lowercased above, so no further case folding is needed.
        lemmas = [lemmatizer.lemmatize(stem) for stem in stems]

        corpus.append(' '.join(lemmas))

    return corpus

# Load the tab-separated review file
dataset = pd.read_csv('./musical1.tsv', sep='\t', encoding='utf-8')

# Show how many reviews fall under each score, followed by a blank line
print(dataset['Score'].value_counts(), end='\n\n')

# Clean the raw review texts and keep the result alongside the originals
processed_reviews = preprocess_reviews(dataset['Review'])
dataset['Processed_Review'] = processed_reviews

# Bag-of-words features and target labels
# NOTE(review): the vocabulary is learned from all rows, before the
# train/test split below — confirm this is intended.
bow_vectorizer = CountVectorizer()
X = bow_vectorizer.fit_transform(dataset['Processed_Review'])
y = dataset['Score']

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyperparameter search space for the random forest
param_grid = {
    # Number of trees in the forest
    'n_estimators': [100, 200, 300],
    # Depth of each tree; None means nodes keep being split until all
    # leaves are pure
    'max_depth': [None, 10, 20, 30],
}

# 5-fold cross-validated grid search over the space above, using all cores
rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Keep the estimator the search selected and predict the held-out rows
best_rf_classifier = grid_search.best_estimator_
y_pred = best_rf_classifier.predict(X_test)

print(f"Best Parameters: {grid_search.best_params_}\n")

# Confusion Matrix
# Pin the label order so the matrix is always 2x2 (rows = actual class,
# columns = predicted class, both ordered [0, 1]), even when one class is
# absent from the test split; otherwise the indexing below would fail.
# NOTE(review): assumes 'Score' only holds the labels 0 and 1 — confirm.
mat = confusion_matrix(y_test, y_pred, labels=[0, 1])
truNeg, falPos, falNeg, truPos = mat.ravel()

print("Confusion matrix:")
print(mat)
print()

# Evaluation (label 1 is treated as the positive class)
total = truPos + falPos + falNeg + truNeg
predPos = truPos + falPos   # everything predicted as positive
actPos = truPos + falNeg    # everything actually positive

# A zero denominator (e.g. the model never predicts the positive class)
# would otherwise yield NaN here and propagate into F1; report 0.0 instead.
accuracy = (truPos + truNeg) / total if total else 0.0
precision = truPos / predPos if predPos else 0.0
recall = truPos / actPos if actPos else 0.0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mosha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mosha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mosha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Score
1    533
0    467
Name: count, dtype: int64

Best Parameters: {'max_depth': None, 'n_estimators': 200}

Confusion matrix:
[[60 26]
 [19 95]]

Accuracy: 0.775
Precision: 0.7851239669421488
Recall: 0.8333333333333334
F1 Score: 0.8085106382978725
