In [1]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import sklearn
import warnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import utils
import ml_functions

red_code = '\033[91m'
blue_code = '\033[94m'
green_code = '\033[92m'
yellow_code = '\033[93m'
reset_code = '\033[0m'

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# word2vec_model_path = "models/modele_word2vec.bin"
# word2vec_model = Word2Vec.load_word2vec_format(word2vec_model_path, binary=True)

glove_vectors_path = "models/vecteurs_glove.txt"
glove_vectors = KeyedVectors.load_word2vec_format(glove_vectors_path, binary=False)

ValueError: not enough values to unpack (expected 2, got 0)

## Load the data & Préprocessing

In [3]:
path = "./datasets/movies/movies1000/"
alltxts,alllabs = utils.load_movies(path)

In [4]:
movies_df = pd.DataFrame()
movies_df['text'] = alltxts
movies_df['label'] = alllabs

In [6]:
preprocessed_alltxts = [utils.preprocess(alltxt) for alltxt in movies_df.text]

preprocessed_movies_df = pd.DataFrame()
preprocessed_movies_df['text'] = preprocessed_alltxts
preprocessed_movies_df['label'] = alllabs


# Tests & Evaluations

In [7]:
def evaluation(analyze_function, **vectorizer_args):

    print(f'{yellow_code}count{reset_code}')
    ml_functions.count_analyze(movies_df, analyze_function, **vectorizer_args)

    print(f'\n{yellow_code}tfidf{reset_code}')
    ml_functions.tfidf_analyze(movies_df, analyze_function, **vectorizer_args)

    print(f'\n{yellow_code}hasing{reset_code}')
    ml_functions.hashing_analyze(movies_df, analyze_function, **vectorizer_args)

    # print(f'\n{yellow_code}word2vec{reset_code}')
    # ml_functions.word2vec_analyze(movies_df, analyze_function, word2vec_model_path)

    # print(f'\n{yellow_code}glove{reset_code}')
    # ml_functions.glove_analyze(movies_df, analyze_function, glove_vectors_path)

In [8]:
def all_evaluations(analyze_function):

    print(f'{blue_code}Non binary{reset_code}')
    evaluation(analyze_function)

    print(f'{blue_code}Binary{reset_code}')
    evaluation(analyze_function, binary=True)

    print(f'{blue_code}Stop word{reset_code}')
    evaluation(analyze_function, stop_words='english')

    print(f'{blue_code}Réduction du dictionnaire{reset_code}')
    evaluation(analyze_function, max_df=.75)

    print(f'{blue_code}Bigram{reset_code}')
    evaluation(analyze_function,ngram_range=(1, 2))

    print(f'{blue_code}Trigram{reset_code}')
    evaluation(analyze_function, ngram_range=(1, 3))




# Régresionn linéaire

In [15]:
analyze_function = ml_functions.logistic_regression_analyze

all_evaluations(analyze_function)

[93mcount[0m
[92mAccuracy :	0.8525[0m
[92mF1 score :	0.8513853904282116[0m
[92mAUC :		0.9217230430760769[0m

[93mtfidf[0m
[92mAccuracy :	0.835[0m
[92mF1 score :	0.8413461538461539[0m
[92mAUC :		0.915972899322483[0m

[93mhasing[0m
[92mAccuracy :	0.7775[0m
[92mF1 score :	0.789598108747045[0m
[92mAUC :		0.8528963224080602[0m


# SVM linéaire

In [16]:
analyze_function = ml_functions.svm_analyze

evaluation(analyze_function)

[93mcount[0m
[92mAccuracy :	0.7475[0m
[92mF1 score :	0.7292225201072386[0m
[92mAUC :		0.8470961774044351[0m

[93mtfidf[0m
[92mAccuracy :	0.8525[0m
[92mF1 score :	0.855036855036855[0m
[92mAUC :		0.9228980724518114[0m

[93mhasing[0m
[92mAccuracy :	0.81[0m
[92mF1 score :	0.8173076923076923[0m
[92mAUC :		0.8954723868096702[0m


# Arbres

In [17]:
analyze_function = ml_functions.decision_tree_analyze

evaluation(analyze_function)

[93mcount[0m
[92mAccuracy :	0.665[0m
[92mF1 score :	0.6666666666666666[0m
[92mAUC :		0.6649916247906197[0m

[93mtfidf[0m
[92mAccuracy :	0.63[0m
[92mF1 score :	0.6390243902439025[0m
[92mAUC :		0.6298907472686818[0m

[93mhasing[0m
[92mAccuracy :	0.6525[0m
[92mF1 score :	0.6445012787723785[0m
[92mAUC :		0.652628815720393[0m


# Random Forest

In [18]:
analyze_function = ml_functions.random_forest_analyze

evaluation(analyze_function)

[93mcount[0m
[92mAccuracy :	0.7875[0m
[92mF1 score :	0.7858942065491183[0m
[92mAUC :		0.8660341508537714[0m

[93mtfidf[0m
[92mAccuracy :	0.8[0m
[92mF1 score :	0.7905759162303665[0m
[92mAUC :		0.8813220330508262[0m

[93mhasing[0m
[92mAccuracy :	0.77[0m
[92mF1 score :	0.7688442211055276[0m
[92mAUC :		0.8677716942923572[0m


In [10]:
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Régresionn linéaire

# Evaluation

# No Préprocessing

In [39]:
print(f'{yellow_code}without preprocessing vectorizer Count{reset_code}')
ml_functions.count_analyze_logistic_regression(movies_df)

[93mwithout preprocessing vectorizer Count[0m


AttributeError: module 'ml_functions' has no attribute 'count_analyze_logistic_regression'

In [None]:
print(f'{yellow_code}without preprocessing vectorizer TfIdf{reset_code}')
ml_functions.tfidf_analyze_logistic_regression(movies_df)

[93mwithout preprocessing vectorizer TfIdf[0m
[92mAccuracy :	0.835[0m
[92mF1 score :	0.8413461538461539[0m
[92mAUC :		0.915972899322483[0m


In [None]:
print(f'{yellow_code}without preprocessing vectorizer Count Bi-gram{reset_code}')
ml_functions.count_analyze_logistic_regression(movies_df,ngram_range=(1,2))

[93mwithout preprocessing vectorizer Count Bi-gram[0m
[92mAccuracy :	0.8625[0m
[92mF1 score :	0.8635235732009926[0m
[92mAUC :		0.9302482562064052[0m


In [None]:
print(f'{yellow_code}without preprocessing vectorizer Count Tri-gram{reset_code}')
ml_functions.count_analyze_logistic_regression(movies_df,ngram_range=(3,3))

[93mwithout preprocessing vectorizer Count Tri-gram[0m
[92mAccuracy :	0.75[0m
[92mF1 score :	0.7237569060773481[0m
[92mAUC :		0.8317207930198256[0m


## With préprocessing

In [None]:
print(f'{yellow_code}with preprocessing vectorizer TfIdf{reset_code}')
ml_functions.count_analyze_logistic_regression(preprocessed_movies_df)

[93mwith preprocessing vectorizer TfIdf[0m
[92mAccuracy :	0.835[0m
[92mF1 score :	0.835[0m
[92mAUC :		0.9159978999474987[0m


In [None]:
print(f'{yellow_code}with preprocessing vectorizer TfIdf{reset_code}')
ml_functions.tfidf_analyze_logistic_regression(preprocessed_movies_df)

[93mwith preprocessing vectorizer TfIdf[0m
[92mAccuracy :	0.8325[0m
[92mF1 score :	0.8385542168674699[0m
[92mAUC :		0.9093977349433736[0m


In [None]:
print(f'{yellow_code}with preprocessing vectorizer Count Bi-gram{reset_code}')
ml_functions.count_analyze_logistic_regression(preprocessed_movies_df,ngram_range=(2,2))

[93mwith preprocessing vectorizer Count Bi-gram[0m
[92mAccuracy :	0.81[0m
[92mF1 score :	0.8118811881188119[0m
[92mAUC :		0.8959973999349984[0m


In [None]:
print(f'{yellow_code}with preprocessing vectorizer Count Tri-gram{reset_code}')
ml_functions.count_analyze_logistic_regression(preprocessed_movies_df,ngram_range=(3,3))

[93mwith preprocessing vectorizer Count Tri-gram[0m
[92mAccuracy :	0.7625[0m
[92mF1 score :	0.7425474254742548[0m
[92mAUC :		0.8332208305207631[0m


### PCA

In [7]:
from sklearn import (
    linear_model, 
    ensemble,
    tree,
    decomposition, 
    naive_bayes, 
    neural_network,
    svm,
    metrics,
    preprocessing, 
    model_selection, 
    pipeline,
)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, auc, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import PCA

import numpy as np

In [8]:
# Diviser les données en ensembles d'entraînement et de test
X_text_train, X_text_test, y_train, y_test = model_selection.train_test_split(movies_df['text'], movies_df['label'], test_size=0.2)
#vectorizer = CountVectorizer(ngram_range=(2,2))   #Mieux  en gardant les stopwords bizzare |bigram marche VRAIMENT bien, trigram impossible a faire pas assez de memoire :c ) 
vectorizer = TfidfVectorizer(ngram_range=(1,2))
X_text_train = vectorizer.fit_transform(X_text_train)
X_text_test = vectorizer.transform(X_text_test)



In [9]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
#scaler = StandardScaler() moins efficace que MinMaxScaler

scaler = MinMaxScaler()
# Fit on training set only.
scaler.fit(X_text_train.toarray())

# Apply transform to both the training set and the test set.
X_text_train = scaler.transform(X_text_train.toarray())
X_text_test = scaler.transform(X_text_test.toarray())

In [10]:
n_c = 100
pca = PCA(.99)

X_train_pca = pca.fit_transform(X_text_train)
X_test_pca = pca.transform(X_text_test)



In [15]:
model = clf = linear_model.LogisticRegression(solver='lbfgs', max_iter=1000)
# Entraîner le modèle sur les données d'entraînement
model.fit(X_train_pca, y_train)

# Prédire les étiquettes des données de test
y_pred = model.predict(X_test_pca)

# Prédire les probabilités des classes positives pour les données de test
# Prédire les probabilités des classes positives pour les données de test
if hasattr(model, "predict_proba"):
    y_prob = model.predict_proba(X_test_pca)[:, 1]
else:
    # Utiliser la décision de fonction de décision si le modèle ne prend pas en charge predict_proba
    y_prob = model.decision_function(X_test_pca)

# Calcul des métriques de performance
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc = metrics.auc(fpr, tpr)
f1 = f1_score(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

# Affichage du rapport de classification
report = metrics.classification_report(y_test, y_pred)
# print(report)

print(f'{green_code}Accuracy :\t{acc}{reset_code}')
print(f'{green_code}F1 score :\t{f1}{reset_code}')
print(f'{green_code}AUC :\t\t{auc}{reset_code}')

[92mAccuracy :	0.89[0m
[92mF1 score :	0.8916256157635468[0m
[92mAUC :		0.9558238955973899[0m


### SVD :

In [16]:
from sklearn.decomposition import TruncatedSVD
# Diviser les données en ensembles d'entraînement et de test
X_text_train, X_text_test, y_train, y_test = model_selection.train_test_split(movies_df['text'], movies_df['label'], test_size=0.2, random_state=42)
vectorizer = CountVectorizer(ngram_range=(1,2))   #Mieux  en gardant les stopwords bizzare
#vectorizer = TfidfVectorizer(stop_words="english")
X_text_train = vectorizer.fit_transform(X_text_train)
X_text_test = vectorizer.transform(X_text_test)

# Standardize the data
scaler = MinMaxScaler()
X_text_train = scaler.fit_transform(X_text_train.toarray())
X_text_test = scaler.transform(X_text_test.toarray())
print("xshape  = ", X_text_train.shape)
# Create a TruncatedSVD object and fit the data
svd = TruncatedSVD(n_components=1600)
X_svd_train = svd.fit_transform(X_text_train)
X_svd_test = svd.transform(X_text_test)



xshape  =  (1600, 457235)


In [17]:
model = clf = linear_model.LogisticRegression(solver='lbfgs', max_iter=1000)
# Entraîner le modèle sur les données d'entraînement
model.fit(X_svd_train, y_train)

# Prédire les étiquettes des données de test
y_pred = model.predict(X_svd_test)

# Prédire les probabilités des classes positives pour les données de test
# Prédire les probabilités des classes positives pour les données de test
if hasattr(model, "predict_proba"):
    y_prob = model.predict_proba(X_svd_test)[:, 1]
else:
    # Utiliser la décision de fonction de décision si le modèle ne prend pas en charge predict_proba
    y_prob = model.decision_function(X_svd_test)

# Calcul des métriques de performance
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc = metrics.auc(fpr, tpr)
f1 = f1_score(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

# Affichage du rapport de classification
report = metrics.classification_report(y_test, y_pred)
# print(report)

print(f'{green_code}Accuracy :\t{acc}{reset_code}')
print(f'{green_code}F1 score :\t{f1}{reset_code}')
print(f'{green_code}AUC :\t\t{auc}{reset_code}')

[92mAccuracy :	0.9075[0m
[92mF1 score :	0.9053708439897699[0m
[92mAUC :		0.9551238780969525[0m
