## Model Evaluation

In [None]:
# Importing libraries

import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
# Libraries to help with machine learning
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Load the full dataset from CSV (it is split into train/test sets later on)
DATA_FILE = "victor_clean.csv"
df = pd.read_csv(DATA_FILE)


In [None]:
# Pie chart showing what share of the rows belongs to each genre
genre_counts = df["genre"].value_counts()
genre_counts.plot.pie(autopct="%1.1f%%")

# Report the dimensions of the dataset
n_rows, n_cols = df.shape
print("Number of rows: ", n_rows)
print("Number of columns: ", n_cols)

# Last expression of the cell: number of missing values in each column
df.isnull().sum()

In [None]:
# Remove every row that contains at least one missing value
df = df.dropna(axis=0, how="any")

# Verify the result: per-column count of remaining missing values
df.isna().sum()

In [None]:
# Split Dataset
from sklearn.model_selection import train_test_split

X = df['description']  # input text fed to the vectorizer
Y = df['genre']        # target label to predict

# Hold out 20% of the rows for testing.
# - random_state pins the shuffle so that the reported scores can be reproduced
#   from one run of the notebook to the next.
# - stratify=Y keeps the genre proportions the same in both splits, so that an
#   imbalanced genre is not over- or under-represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)
print('The size of the original dataset is', X.shape)
print('The size of the training dataset is', X_train.shape)
print('The size of the test dataset is', X_test.shape)


In [None]:
# Turn the raw text into TF-IDF features, keeping at most this many terms
MAX_TFIDF_FEATURES = 5000
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)

# The vocabulary is learned from the training text only; the test text is then
# projected onto that same vocabulary so no test information leaks into training
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [None]:
# Define models
# A fixed seed makes the randomized estimators (tree/ensemble models and the
# stochastic logistic-regression solvers) give the same result on every run,
# so the comparison between models can be reproduced.
RANDOM_STATE = 42
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),  # Lan
    "Support Vector Machine": SVC(kernel='linear'),  # Gabriel
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),  # Gabriel
    # max_iter is raised from the default of 100 so the solver has room to
    # converge on the high-dimensional TF-IDF features.
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),  # Victor
    "Naive Bayes": MultinomialNB(),  # Lan
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),  # Victor
}

# Fit every model on the training features and score it on the held-out test set
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    # Overall accuracy plus per-genre precision / recall / F1
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Accuracy of {model_name}: {accuracy}")
    print(f"Classification Report of {model_name}:\n{report}\n")

From the results above, the best model for predicting anime genres is the Naive Bayes model.

However, for all the models the accuracy is fairly low, ranging from 48% to 52%. This may be due to the following reasons:

1. Small dataset size: If there are only ~1000 samples total to train and evaluate on, that may not be enough data for the models to learn the patterns and relationships within each class. More training data could help improve accuracy.

2. Imbalanced classes: From the support counts, it looks like the classes are imbalanced, with Comedy, Romance and Adventure having fewer samples than Action and Slice of Life. This makes it harder for models to properly learn the minority classes. Resampling or oversampling techniques could help address class imbalance.

3. Noisy labels: There may be some inaccuracies or noise in the assigned labels of the training data that make it hard for the models to capture the correct signals. Cleaning up the labeling could help, as could standard-scaling the data and working around its other limitations.

4. Features need engineering: The current features being used may not be fully representative of the patterns needed to distinguish classes. Creating, transforming and selecting more predictive features could potentially improve separability.

5. Overfitting on train data: Some complex models like Random Forest may be overfitting too closely to the training data, hurting generalizability. More regularization, cross-validation and tuning could help reduce overfitting.

6. Class ambiguity: The classes may inherently have significant overlap or ambiguities that make them hard to separate precisely. For example, some Romance anime may also be Slice of Life. Disambiguating the classes more cleanly could help, for example by using a resampling library such as imbalanced-learn (which provides SMOTE) and by removing rows of very similar data.

## Model tuning

In [None]:
# Search spaces for GridSearchCV, one per model.
# A plain dict means "try the full cross product of these values". A list of
# dicts means "try the cross product of each dict separately", which is used
# below wherever some parameter values are only valid (or only meaningful)
# together with certain others.

# Random Forest hyperparameters
rf_hyperparameters = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 5, 10, 15],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

# Support Vector Machine hyperparameters
# `degree` is only used by the poly kernel and `gamma` is not used by the
# linear kernel, so crossing them with every kernel would refit identical
# models many times. Each kernel is paired only with the parameters it uses.
svm_hyperparameters = [
    {
        "kernel": ["linear"],
        "C": [0.1, 1, 10]
    },
    {
        "kernel": ["poly"],
        "C": [0.1, 1, 10],
        "degree": [3, 4, 5],
        "gamma": ["scale", "auto"]
    },
    {
        "kernel": ["rbf", "sigmoid"],
        "C": [0.1, 1, 10],
        "gamma": ["scale", "auto"]
    }
]

# Gradient Boosting hyperparameters
gb_hyperparameters = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.1, 1.0],
    "max_depth": [3, 4, 5]
}

# Logistic Regression hyperparameters
# Not every solver supports every penalty, so the grid is split into the
# combinations scikit-learn accepts; an invalid pair would only produce a
# failed fit for every fold.
lr_hyperparameters = [
    {
        # L2 is supported by all of the solvers
        "penalty": ["l2"],
        "C": [0.1, 1, 10],
        "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
    },
    {
        # L1 is only supported by liblinear and saga
        "penalty": ["l1"],
        "C": [0.1, 1, 10],
        "solver": ["liblinear", "saga"]
    },
    {
        # Elastic-net is only supported by saga and requires an l1_ratio
        "penalty": ["elasticnet"],
        "C": [0.1, 1, 10],
        "solver": ["saga"],
        "l1_ratio": [0.5]
    },
    {
        # No regularization: spelled None (the string "none" is no longer
        # accepted by recent scikit-learn releases), not supported by
        # liblinear, and C has no effect so it is not searched here
        "penalty": [None],
        "solver": ["newton-cg", "lbfgs", "sag", "saga"]
    }
]

# Naive Bayes hyperparameters
nb_hyperparameters = {
    "alpha": [0.1, 0.5, 1.0],
    "fit_prior": [True, False]
}

# Decision Tree hyperparameters
dt_hyperparameters = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": [None, 5, 10, 15],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

In [None]:
# Tune model
# Search space to use for each entry of `models` (same keys)
hyperparameters = {
    "Random Forest": rf_hyperparameters,
    "Support Vector Machine": svm_hyperparameters,
    "Gradient Boosting": gb_hyperparameters,
    "Logistic Regression": lr_hyperparameters,
    "Naive Bayes": nb_hyperparameters,
    "Decision Tree": dt_hyperparameters
}

# Iterate over a snapshot of the items: `models` is updated inside the loop,
# and a dict should not be modified while it is being iterated directly.
for model_name, model in list(models.items()):
    print(f"Tuning {model_name}...")
    # 5-fold cross-validated grid search on the training data only.
    # n_jobs=-1 evaluates candidates on all CPU cores; the search is otherwise
    # fully serial, which is slow for the larger grids.
    grid = GridSearchCV(model, hyperparameters[model_name], cv=5, n_jobs=-1)
    grid.fit(X_train_tfidf, y_train)

    print(f"Best hyperparameters for {model_name}:")
    print(grid.best_params_)
    print("="*50)

    # Update model with best hyperparameters
    # (best_estimator_ has been refit on the whole training set)
    models[model_name] = grid.best_estimator_

# Evaluate model
# Score each tuned model on the held-out test set, which the search never saw
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Accuracy of {model_name}: {accuracy}")
    print(f"Classification Report of {model_name}:\n{report}\n")