## Baseline with Simple Model

In this notebook we try simple models, namely logistic regression and naive Bayes, on bag-of-words and TF-IDF features. We use stratified k-fold cross-validation for training and GridSearchCV (grid search with cross-validation) to find the hyperparameters.

In [None]:
# Order GPUs by PCI bus id and expose only device 1 to this kernel's process.
# NOTE(review): no cell in this notebook visibly uses CUDA (the models are
# scikit-learn estimators) — confirm these settings are still needed.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

### Import libraries
We use pandas to read the datasets, pandarallel to process them in parallel, and scikit-learn to split the dataset, build the Naive Bayes and Logistic Regression models, and compute the metrics.

In [None]:
import pandas as pd
from pandarallel import pandarallel
import ast
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [None]:
# Set up pandarallel so that `parallel_apply` is available on pandas objects;
# progress_bar=True asks it to show progress while the parallel jobs run.
pandarallel.initialize(progress_bar=True)

## Read the dataset
The function **ast.literal_eval** converts strings into Python objects; we need it because, when the cleaned dataset is loaded from CSV, the tokenized columns are read back as strings. We use the preprocessed dataset created by the first notebook (data exploration and preprocessing).

In [None]:
# Load the preprocessed reviews (dataRew) and the preprocessed movie data (dataMovie).
# Paths are relative to the notebook's working directory.
dataRew=pd.read_csv('../Dataset/datiClean.csv')
dataMovie=pd.read_csv('../Dataset/movieclean.csv')

In [None]:
# Parse the string stored in each `clean_review` cell back into a Python object, in parallel.
# NOTE(review): this overwrites the column in place, so re-running the cell will
# call literal_eval on already-parsed objects — presumably that fails; reload the CSV first.
dataRew["clean_review"]=dataRew.loc[:,"clean_review"].parallel_apply(ast.literal_eval)

In [None]:
# Parse the string stored in each `plot_clean` cell back into a Python object, in parallel.
# NOTE(review): overwrites the column in place — reload the CSV before re-running this cell.
dataMovie["plot_clean"]=dataMovie.loc[:,"plot_clean"].parallel_apply(ast.literal_eval)

In [None]:
# Remove the two plot text columns from the movie frame (modifies dataMovie in place).
dataMovie.drop(columns=['plot_synopsis', 'plot_summary'], inplace=True)

### Split the Dataset

Let's divide the dataset into train and test sets, ensuring that both keep the original class distribution. We set the same random state in every notebook so that the split is always the same, which makes the results easier to compare.

Drop the columns that are not needed.

In [None]:
# Remove the review columns that are not used below (modifies dataRew in place).
dataRew.drop(
    columns=[
        'review_date',
        'movie_id',
        'user_id',
        'rating',
        'review_summary',
        'review_text',
    ],
    inplace=True,
)

In [None]:
# Features are the cleaned reviews; the target is the spoiler flag.
x, y = dataRew['clean_review'], dataRew['is_spoiler']

In [None]:
# Hold out 20% of the reviews for testing. Stratifying on the label keeps the
# class proportions of `y` in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

The train and test sets follow the spoiler distribution of the original dataset.

In [None]:
# Class counts in the training labels (displayed as the cell output).
y_train.value_counts()

In [None]:
# Class counts in the test labels (displayed as the cell output).
y_test.value_counts()

### Apply Logistic Regression and Naive Bayes, with k fold

### Function for computing the result of the model

In [None]:
def print_mean(results=None):
    """Print the mean of each stored cross-validation metric.

    Parameters
    ----------
    results : dict, optional
        Mapping with the keys ``'accuracy'``, ``'precision'``, ``'recall'``
        and ``'f1_score'``, each holding a sequence of per-fold scores.
        When omitted, the notebook-level ``metrics`` dictionary is used,
        which is what a bare ``print_mean()`` call has always done.
    """
    if results is None:
        # Resolved at call time, so whatever `metrics` currently holds is reported.
        results = metrics

    # Compute every average before printing anything.
    mean_accuracy = np.mean(results['accuracy'])
    mean_precision = np.mean(results['precision'])
    mean_recall = np.mean(results['recall'])
    mean_f1_score = np.mean(results['f1_score'])

    print("Mean Accuracy:", mean_accuracy)
    print("Mean Precision:", mean_precision)
    print("Mean Recall:", mean_recall)
    print("Mean F1 Score:", mean_f1_score)

In [None]:
def print_test(y_pred_test):
    """Print accuracy, precision, recall and F1 of test-set predictions.

    The predictions are compared against the notebook-level ``y_test`` labels.

    Parameters
    ----------
    y_pred_test : array-like
        Predicted labels, passed as the second argument to each scikit-learn
        scoring function (with ``y_test`` as the first).
    """
    # Evaluate every metric first, then print, so nothing is printed if a scorer raises.
    report = [
        ("Test Accuracy:", accuracy_score(y_test, y_pred_test)),
        ("Test Precision:", precision_score(y_test, y_pred_test)),
        ("Test Recall:", recall_score(y_test, y_pred_test)),
        ("Test F1 Score:", f1_score(y_test, y_pred_test)),
    ]
    for label, value in report:
        print(label, value)

Function to train the model, which takes as input the number of folds and the number of iterations to perform.

In [None]:
# Shared accumulator for the per-fold validation scores; print_mean() reads it.
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1_score': []
}
def computeLogistic(folds,iter,X,y_train,penalty,C,class_weight):
    """Cross-validate a logistic regression and record the per-fold scores.

    The data is split with a shuffled ``StratifiedKFold`` (seed 42). For each
    fold the estimator is fitted on the training part and scored on the
    validation part; the scores are stored in the notebook-level ``metrics``
    dictionary.

    Parameters
    ----------
    folds : int
        Number of stratified folds.
    iter : int
        Passed to ``LogisticRegression`` as ``max_iter``.
    X : matrix
        Feature matrix; rows are selected with integer index arrays.
    y_train : array-like
        Labels aligned row-by-row with ``X``.
    penalty, C, class_weight
        Forwarded unchanged to ``LogisticRegression``.

    Returns
    -------
    LogisticRegression
        The estimator as left by the last ``fit`` call, i.e. fitted on the
        training part of the final fold.
    """
    # Start from empty lists: without this, scores from earlier runs (of this
    # or any other model) stay in `metrics` and distort the reported means.
    for fold_scores in metrics.values():
        fold_scores.clear()

    # Positional indexing below needs an array; a pandas Series would be
    # indexed by label instead.
    y_train = np.asarray(y_train)

    # The default 'lbfgs' solver does not support the L1 penalty.
    solver = 'liblinear' if penalty == 'l1' else 'lbfgs'
    logistic_reg = LogisticRegression(
        max_iter=iter,
        penalty=penalty,
        C=C,
        class_weight=class_weight,
        solver=solver,
    )

    # Stratified k-fold keeps the class proportions in every fold.
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    for train_index, val_index in kf.split(X, y_train):
        X_fold_train, X_fold_val = X[train_index], X[val_index]
        y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

        # Training
        logistic_reg.fit(X_fold_train, y_fold_train)

        # Validation
        y_pred = logistic_reg.predict(X_fold_val)

        # Compute metrics
        metrics['accuracy'].append(accuracy_score(y_fold_val, y_pred))
        metrics['precision'].append(precision_score(y_fold_val, y_pred))
        metrics['recall'].append(recall_score(y_fold_val, y_pred))
        metrics['f1_score'].append(f1_score(y_fold_val, y_pred))
    return logistic_reg

Function to train the model, which takes as input the number of folds.

In [None]:
# Shared accumulator for the per-fold validation scores; print_mean() reads it.
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1_score': []
}
def compute_naive(folds,X,y_train,alpha,fit_prior):
    """Cross-validate a multinomial Naive Bayes model and record the per-fold scores.

    The data is split with a shuffled ``StratifiedKFold`` (seed 42). For each
    fold the estimator is fitted on the training part and scored on the
    validation part; the scores are stored in the notebook-level ``metrics``
    dictionary.

    Parameters
    ----------
    folds : int
        Number of stratified folds.
    X : matrix
        Feature matrix; rows are selected with integer index arrays.
    y_train : array-like
        Labels aligned row-by-row with ``X``.
    alpha, fit_prior
        Forwarded unchanged to ``MultinomialNB``.

    Returns
    -------
    MultinomialNB
        The estimator as left by the last ``fit`` call, i.e. fitted on the
        training part of the final fold.
    """
    # Start from empty lists: without this, scores from earlier runs (of this
    # or any other model) stay in `metrics` and distort the reported means.
    for fold_scores in metrics.values():
        fold_scores.clear()

    # Positional indexing below needs an array; a pandas Series would be
    # indexed by label instead.
    y_train = np.asarray(y_train)

    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    naive_bayes = MultinomialNB(alpha=alpha,fit_prior=fit_prior)

    for train_index, val_index in kf.split(X, y_train):
        X_fold_train, X_fold_val = X[train_index], X[val_index]
        y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

        # Training
        naive_bayes.fit(X_fold_train, y_fold_train)

        # Validation
        y_pred = naive_bayes.predict(X_fold_val)

        # Compute metrics
        metrics['accuracy'].append(accuracy_score(y_fold_val, y_pred))
        metrics['precision'].append(precision_score(y_fold_val, y_pred))
        metrics['recall'].append(recall_score(y_fold_val, y_pred))
        metrics['f1_score'].append(f1_score(y_fold_val, y_pred))
    return naive_bayes

## Using Bag of Words
To apply Bag of Words, first reconstruct a dummy text from tokens, then apply the function. As a result we have a matrix where each row corresponds to a document and each column corresponds to a token.

In [None]:
# Rebuild one space-separated string per training review from its tokens.
text = list(map(" ".join, X_train))

In [None]:
# Rebuild one space-separated string per test review from its tokens.
textT = list(map(" ".join, X_test))

In [None]:
# Bag of Words for the training set: fit the vectorizer on the training texts
# and transform them into the matrix X in one step.
vect=CountVectorizer()
X=vect.fit_transform(text)

In [None]:
# Bag of Words for the test set: only transform, reusing the vectorizer that
# was fitted on the training texts.
X_t=vect.transform(textT)

In [None]:
# Convert the labels to plain NumPy arrays so that the fold index arrays used
# later select rows by position. np.asarray also accepts an array, so this
# cell can be re-run safely (`.values` fails on the second run).
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

### Logistic Regression

### Try Grid Search 
Grid Search to find the regularization parameters for logistic regression and the class weight parameter.

In [None]:

# Search space for logistic regression. The default 'lbfgs' solver cannot fit
# an L1 penalty, so each penalty is paired with a solver that supports it;
# otherwise every 'l1' candidate fails to fit.
_C_VALUES = [0.001, 0.01, 0.1, 1, 10, 100]        # Regularization strength
_CLASS_WEIGHTS = [None, 'balanced']               # Class weighting
param_grid = [
    {
        'penalty': ['l2'],
        'solver': ['lbfgs'],
        'C': _C_VALUES,
        'class_weight': _CLASS_WEIGHTS,
    },
    {
        'penalty': ['l1'],
        'solver': ['liblinear'],
        'C': _C_VALUES,
        'class_weight': _CLASS_WEIGHTS,
    },
]



In [None]:
# 5-fold grid search over `param_grid`, selecting by accuracy, on the bag-of-words matrix.
model = LogisticRegression(max_iter=1000)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X, y_train)

In [None]:
# Parameter combination selected by the grid search above.
best_params = grid_search.best_params_

In [None]:
# Show the selected hyper-parameters.
print(best_params)

{'C': 0.01, 'class_weight': None, 'penalty': 'l2'}

In [None]:
# Append the selected logistic-regression parameters to the grid-search log file.
with open("../Output/outputGridLog.txt", "a") as log_file:
    log_file.write(f" Best Parameters:{best_params}" + "\n")

### Result Train and Validation

In [None]:
# Hyper-parameters for the bag-of-words logistic regression; the values match
# the grid-search output printed above.
C, class_weight, penalty = 0.01, None, 'l2'

In [None]:
# 5-fold stratified cross-validation on the bag-of-words matrix, then report
# the mean validation scores.
logistic_reg = computeLogistic(
    folds=5,
    iter=1500,
    X=X,
    y_train=y_train,
    penalty=penalty,
    C=C,
    class_weight=class_weight,
)

print_mean()

### Result Test

In [None]:
# Predict on the bag-of-words test matrix and print the test metrics.
y_pred_test = logistic_reg.predict(X_t)
print_test(y_pred_test)


### Naive Bayes

### Grid Search for parameters
For Naive Bayes, we look for the smoothing parameter and the parameter that indicates whether to calculate the prior probability or not



In [None]:
# Search space for Naive Bayes: smoothing strength and whether to fit class priors.
param_grid = dict(
    alpha=[0.1, 0.5, 1.0, 2.0, 5.0],
    fit_prior=[True, False],
)


In [None]:
# Run the 5-fold grid search for Naive Bayes on the bag-of-words matrix,
# selecting by accuracy.
naive_bayes = MultinomialNB()
grid_search = GridSearchCV(
    estimator=naive_bayes,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X, y_train)

In [None]:
# Parameter combination selected by the Naive Bayes grid search above.
best_params = grid_search.best_params_

In [None]:
# Show the selected hyper-parameters.
print(best_params)

### Result Validation, Training

In [None]:
# 5-fold stratified cross-validation with the selected parameters, then report
# the mean validation scores.
naive_bayes = compute_naive(
    folds=5,
    X=X,
    y_train=y_train,
    alpha=best_params['alpha'],
    fit_prior=best_params['fit_prior'],
)
print_mean()

### Result Test

In [None]:
# Predict on the bag-of-words test matrix and print the test metrics.
y_pred_test = naive_bayes.predict(X_t)

print_test(y_pred_test)

## Using Tf-idf

In [None]:

# TF-IDF features: fit the vectorizer on the training texts only, then reuse
# the fitted vectorizer to transform the test texts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(text)
X_test_tfidf = tfidf_vectorizer.transform(textT)

In [None]:
# Display the repr of the training TF-IDF matrix as the cell output.
X_train_tfidf

### Logistic Regression

### for tf-idf try different regularization parameters

For TF-IDF, we keep the results of the previous grid search and only tune the parameter C.

In [None]:

# Only the regularization strength C is searched for the TF-IDF features.
param_grid = dict(C=[0.001, 0.01, 0.1, 1, 10, 100])


In [None]:
# 5-fold grid search over C for an L2-penalized logistic regression on the
# TF-IDF matrix, selecting by accuracy; verbose=3 logs progress.
model = LogisticRegression(penalty='l2', max_iter=1500)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=3,
)
grid_search.fit(X_train_tfidf, y_train)

In [None]:
# Parameter combination selected by the TF-IDF grid search above.
best_params = grid_search.best_params_

In [None]:
# Show the selected hyper-parameters.
print(best_params)

### Apply the model

In [None]:
# Hyper-parameters for the TF-IDF logistic regression.
# NOTE(review): these are hard-coded; the grid-search output for this section
# is not shown in the notebook — confirm that C=1 is the value it selected.
C, penalty, class_weight = 1, "l2", None

In [None]:
# 5-fold stratified cross-validation on the TF-IDF matrix, then report the
# mean validation scores.
logistic_reg = computeLogistic(
    folds=5,
    iter=1500,
    X=X_train_tfidf,
    y_train=y_train,
    penalty=penalty,
    C=C,
    class_weight=class_weight,
)
print_mean()

In [None]:
# Predict on the TF-IDF test matrix and print the test metrics.
y_pred_test = logistic_reg.predict(X_test_tfidf)
print_test(y_pred_test)

### Naive Bayes

In [None]:
# Search space for Naive Bayes on TF-IDF: smoothing strength and whether to fit class priors.
param_grid = dict(
    alpha=[0.1, 0.5, 1.0, 2.0, 5.0],
    fit_prior=[True, False],
)

In [None]:
# Run the 5-fold grid search for Naive Bayes on the TF-IDF matrix, selecting by accuracy.
naive_bayes = MultinomialNB()
grid_search = GridSearchCV(
    estimator=naive_bayes,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_tfidf, y_train)

In [None]:
# Parameter combination selected by the Naive Bayes grid search above.
best_params = grid_search.best_params_

In [None]:
# Show the selected hyper-parameters.
print(best_params)

In [None]:
# 5-fold stratified cross-validation on the TF-IDF matrix with the selected
# parameters, then report the mean validation scores.
naive_bayes = compute_naive(
    folds=5,
    X=X_train_tfidf,
    y_train=y_train,
    alpha=best_params['alpha'],
    fit_prior=best_params['fit_prior'],
)
print_mean()

In [None]:
# Predict on the TF-IDF test matrix and print the test metrics.
y_pred_test = naive_bayes.predict(X_test_tfidf)

print_test(y_pred_test)