In [None]:
import pandas as pd
from pandarallel import pandarallel
import ast
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
from gensim.models import Word2Vec

In [None]:
# Initialise pandarallel so that `parallel_apply` can be used on pandas objects
# in the cells below; progress_bar=True asks it to display progress bars.
pandarallel.initialize(progress_bar=True)

In [None]:
# Load the two CSV inputs; the paths are relative to the notebook's working directory.
# presumably the pre-cleaned reviews and the pre-cleaned movie table — TODO confirm
dataRew=pd.read_csv('../Dataset/datiClean.csv')
dataMovie=pd.read_csv('../Dataset/movieclean.csv')

In [None]:
# Parse every `clean_review` cell with ast.literal_eval (in parallel) so the
# column holds Python objects instead of their textual representation.
dataRew["clean_review"] = dataRew["clean_review"].parallel_apply(ast.literal_eval)

In [None]:
# Parse every `plot_clean` cell with ast.literal_eval (in parallel) so the
# column holds Python objects instead of their textual representation.
dataMovie["plot_clean"] = dataMovie["plot_clean"].parallel_apply(ast.literal_eval)

In [None]:
# The raw review text is removed in place; `clean_review` is the column used below.
dataRew.drop(columns=['review_text'], inplace=True)

In [None]:
# Remove the raw plot columns from the movie table in place.
dataMovie.drop(columns=['plot_synopsis', 'plot_summary'], inplace=True)

### Split the Dataset

It is important that the labels are well distributed across the train and test datasets.

In [None]:
# Remove, in place, the review columns that are not used by the models below.
dataRew.drop(
    columns=['review_date', 'movie_id', 'user_id', 'rating', 'review_summary'],
    inplace=True,
)

In [None]:
# Model input: the parsed `clean_review` column.
x=dataRew['clean_review']
# Target: the `is_spoiler` column.
y=dataRew['is_spoiler']

In [None]:
## stratify=y keeps the class proportions the same in the train and test sets;
## 20% of the rows are held out for testing, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y,random_state=42)

In [None]:
# Class counts in the training labels.
y_train.value_counts()

In [None]:
# Class counts in the test labels.
y_test.value_counts()

### Apply Logistic Regression and Naive Bayes, with k fold

### Function for computing the result of the model

In [None]:
def print_mean():
    """Print the mean of each score list held in the global ``metrics`` dict.

    Reads the keys 'accuracy', 'precision', 'recall' and 'f1_score' and
    prints one line per metric. Nothing is returned.
    """
    # Compute every average first, then print them in a fixed order.
    averages = {
        key: np.mean(metrics[key])
        for key in ('accuracy', 'precision', 'recall', 'f1_score')
    }

    print("Mean Accuracy:", averages['accuracy'])
    print("Mean Precision:", averages['precision'])
    print("Mean Recall:", averages['recall'])
    print("Mean F1 Score:", averages['f1_score'])

In [None]:
def print_test(y_pred_test):
    """Print accuracy, precision, recall and F1 of test-set predictions.

    Parameters
    ----------
    y_pred_test : predicted labels, compared against the global ``y_test``.
    """
    # (label, scoring function) pairs, in the order they are reported.
    scorers = (
        ("Test Accuracy:", accuracy_score),
        ("Test Precision:", precision_score),
        ("Test Recall:", recall_score),
        ("Test F1 Score:", f1_score),
    )

    # Evaluate all metrics on the test set before printing anything.
    results = [(label, scorer(y_test, y_pred_test)) for label, scorer in scorers]

    for label, value in results:
        print(label, value)

In [None]:
# Per-fold validation scores of the most recent cross-validation run.
# Kept at module level because print_mean() reads it.
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1_score': []
}
def computeLogistic(folds,iter,X,y_train):
    """Cross-validate a logistic regression with stratified k-fold splits.

    Parameters
    ----------
    folds : number of stratified folds (``n_splits``).
    iter : value passed as ``max_iter`` to ``LogisticRegression``.
    X : feature matrix; rows are selected with integer index arrays
        (``X[train_index]``).
    y_train : labels aligned with the rows of ``X``.

    Returns
    -------
    The ``LogisticRegression`` estimator as left by the final fold, i.e.
    fitted on that fold's training split only (it is not refitted on all of X).

    Side effects
    ------------
    Empties the score lists of the global ``metrics`` dict, then appends one
    accuracy / precision / recall / F1 value per fold.
    """
    # Reset the shared score lists: without this, scores from earlier runs
    # (other models or other feature sets) stay in the lists and are averaged
    # together with this run's scores by print_mean().
    for fold_scores in metrics.values():
        fold_scores.clear()

    # The fold indices are positional, so index a plain array rather than a
    # pandas Series (whose [] lookup is label-based).
    y_train = np.asarray(y_train)

    logistic_reg=LogisticRegression(max_iter=iter)

    # Stratified k-fold keeps the class proportions balanced in every fold.
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    for train_index, val_index in kf.split(X, y_train):
        X_fold_train, X_fold_val = X[train_index], X[val_index]
        y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

        # Training
        logistic_reg.fit(X_fold_train, y_fold_train)

        # Validation
        y_pred = logistic_reg.predict(X_fold_val)

        # Compute metrics
        metrics['accuracy'].append(accuracy_score(y_fold_val, y_pred))
        metrics['precision'].append(precision_score(y_fold_val, y_pred))
        metrics['recall'].append(recall_score(y_fold_val, y_pred))
        metrics['f1_score'].append(f1_score(y_fold_val, y_pred))
    return logistic_reg

In [None]:
# Per-fold validation scores of the most recent cross-validation run.
# Kept at module level because print_mean() reads it.
metrics = {
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1_score': []
}
def compute_naive(folds,X,y_train):
    """Cross-validate a Multinomial Naive Bayes model with stratified k-fold splits.

    Parameters
    ----------
    folds : number of stratified folds (``n_splits``).
    X : feature matrix; rows are selected with integer index arrays
        (``X[train_index]``).
    y_train : labels aligned with the rows of ``X``.

    Returns
    -------
    The ``MultinomialNB`` estimator as left by the final fold, i.e. fitted on
    that fold's training split only (it is not refitted on all of X).

    Side effects
    ------------
    Empties the score lists of the global ``metrics`` dict, then appends one
    accuracy / precision / recall / F1 value per fold.
    """
    # Reset the shared score lists: without this, scores from earlier runs
    # (other models or other feature sets) stay in the lists and are averaged
    # together with this run's scores by print_mean().
    for fold_scores in metrics.values():
        fold_scores.clear()

    # The fold indices are positional, so index a plain array rather than a
    # pandas Series (whose [] lookup is label-based).
    y_train = np.asarray(y_train)

    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    naive_bayes = MultinomialNB()

    for train_index, val_index in kf.split(X, y_train):
        X_fold_train, X_fold_val = X[train_index], X[val_index]
        y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

        # Training
        naive_bayes.fit(X_fold_train, y_fold_train)

        # Validation
        y_pred = naive_bayes.predict(X_fold_val)

        # Compute metrics
        metrics['accuracy'].append(accuracy_score(y_fold_val, y_pred))
        metrics['precision'].append(precision_score(y_fold_val, y_pred))
        metrics['recall'].append(recall_score(y_fold_val, y_pred))
        metrics['f1_score'].append(f1_score(y_fold_val, y_pred))
    return naive_bayes

### Using Bag of Words

First, rebuild each review as a single text string by joining its list of tokens with spaces.

In [None]:
# One space-joined string per training review.
text = list(map(" ".join, X_train))

In [None]:
# One space-joined string per test review.
textT = list(map(" ".join, X_test))

In [None]:
## Bag of Words for train: the vocabulary is learned from the training texts
## only, and those texts are turned into a count matrix in the same call.
vect=CountVectorizer()
X=vect.fit_transform(text)

In [None]:
## Bag of Words for test: reuse the vectorizer fitted on the training texts
## (transform only, no re-fitting).
X_t=vect.transform(textT)

In [None]:
# Convert the label containers to plain NumPy arrays so they can be indexed
# positionally with the fold indices produced by StratifiedKFold.
# np.asarray is used instead of `.values` because it also accepts an ndarray,
# so re-running this cell does not raise AttributeError.
y_train=np.asarray(y_train)
y_test=np.asarray(y_test)

## Logistic Regression

### Result Train and Validation

In [None]:
# 5-fold stratified cross-validation of logistic regression (max_iter=1000)
# on the bag-of-words matrix, then print the mean of the scores stored in
# the global `metrics`.
logistic_reg=computeLogistic(5,1000,X,y_train)
print_mean()

### Result Test

In [None]:
# Predict the held-out bag-of-words test matrix and print the test metrics.
y_pred_test = logistic_reg.predict(X_t)
print_test(y_pred_test)


## Naive Bayes

### Result Validation, Training

In [None]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on the
# bag-of-words matrix, then print the mean of the scores stored in the
# global `metrics`.
naive_bayes=compute_naive(5,X,y_train)
print_mean()

### Result Test

In [None]:
# Predict the held-out bag-of-words test matrix and print the test metrics.
y_pred_test = naive_bayes.predict(X_t)

print_test(y_pred_test)

## Tf-idf

In [None]:

# TF-IDF features: fit the vectorizer on the training texts only, then apply
# the fitted vectorizer to the test texts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(text)
X_test_tfidf = tfidf_vectorizer.transform(textT)

In [None]:
# Display the TF-IDF training matrix object.
X_train_tfidf

### Logistic Regression

In [None]:
# 5-fold stratified cross-validation of logistic regression (max_iter=1000)
# on the TF-IDF matrix, then print the mean of the scores stored in the
# global `metrics`.
logistic_reg=computeLogistic(5,1000,X_train_tfidf,y_train)
print_mean()

In [None]:
# Predict the held-out TF-IDF test matrix and print the test metrics.
y_pred_test = logistic_reg.predict(X_test_tfidf)
print_test(y_pred_test)

### Naive Bayes

In [None]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on the TF-IDF
# matrix, then print the mean of the scores stored in the global `metrics`.
naive_bayes=compute_naive(5,X_train_tfidf,y_train)
print_mean()

In [None]:
# Predict the held-out TF-IDF test matrix and print the test metrics.
y_pred_test = naive_bayes.predict(X_test_tfidf)

print_test(y_pred_test)

### Word2Vec

In [None]:
# Word2Vec input: the `clean_review` column of the whole review table
# (all rows, not only the training split).
X_preWord=dataRew["clean_review"]

In [None]:
# Display the Word2Vec input column.
X_preWord

In [None]:
# Train Word2Vec on the tokenised reviews: 100-dimensional vectors, context
# window of 5, sg=1 (skip-gram), and min_count=1 so no token is dropped for
# being rare.
# NOTE(review): X_preWord holds every review in dataRew, so the embedding is
# also trained on rows that the train_test_split further below assigns to the
# test set — confirm this is acceptable for the evaluation.
word2vec_model = Word2Vec(X_preWord, vector_size=100, window=5, min_count=1, sg=1)

In [None]:
# Sanity check of the embedding: look up the tokens most similar to "spoiler".
# NOTE(review): presumably fails if "spoiler" is not a token of the cleaned
# corpus — confirm against the preprocessing.
words=word2vec_model.wv.most_similar("spoiler")

In [None]:
# Display the result of the most_similar lookup.
words

In [None]:
# Represent each review by the mean of the embeddings of its tokens.
# A review with no tokens gets an all-zero vector of the embedding size:
# np.mean over an empty list would return a scalar NaN, which makes the
# feature list ragged and puts NaN into the model input.
X_word2vectrain = [
    np.mean([word2vec_model.wv[token] for token in tokens], axis=0)
    if len(tokens) > 0
    else np.zeros(word2vec_model.wv.vector_size, dtype=np.float32)
    for tokens in X_preWord
]

In [None]:
# Split the averaged Word2Vec features with the same test_size, random_state
# and stratify=y arguments as the first split.
# Note: this overwrites X_train, X_test, y_train and y_test from the
# bag-of-words / TF-IDF sections above.
X_train, X_test,y_train, y_test  = train_test_split(X_word2vectrain,y, test_size=0.2, random_state=42,stratify=y)

In [None]:
# Convert the training labels to a NumPy array so they can be indexed
# positionally with the fold indices inside the cross-validation helpers.
y_train=np.array(y_train)

In [None]:
# y_train is a NumPy array at this point, and ndarrays have no value_counts();
# wrap it in a Series to display the class counts.
pd.Series(y_train).value_counts()

In [None]:
# train_test_split returned lists of per-review vectors; convert them to
# NumPy arrays so rows can be selected with index arrays.
X_train=np.array(X_train)
X_test=np.array(X_test)

In [None]:
# 5-fold stratified cross-validation of logistic regression (max_iter=1000)
# on the averaged Word2Vec features, then print the mean of the scores stored
# in the global `metrics`.
logistic_reg=computeLogistic(5,1000,X_train,y_train)
print_mean()

In [None]:
# Predict the held-out Word2Vec test features and print the test metrics.
y_pred_test = logistic_reg.predict(X_test)
print_test(y_pred_test)

## Naive Bayes

### Transform with normalization
The data must be normalized: the Word2Vec features contain negative values, so Naive Bayes cannot be applied to them directly.

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale the Word2Vec features with a MinMaxScaler fitted on the training
# split only; the fitted scaler is then applied unchanged to the test split.
# NOTE(review): test values outside the range seen in training presumably
# fall outside the scaled range as well — confirm this is acceptable for
# MultinomialNB.
scaler = MinMaxScaler()
X_train_word2vec_normalized = scaler.fit_transform(X_train)
X_test_word2vec_normalized = scaler.transform(X_test)

In [None]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on the
# rescaled Word2Vec features, then print the mean of the scores stored in the
# global `metrics`.
naive_bayes=compute_naive(5,X_train_word2vec_normalized,y_train)
print_mean()

In [None]:
# Predict the rescaled Word2Vec test features and print the test metrics.
y_pred_test = naive_bayes.predict(X_test_word2vec_normalized)

print_test(y_pred_test)