In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn import (
    linear_model, 
    ensemble,
    tree,
    decomposition, 
    naive_bayes, 
    neural_network,
    svm,
    metrics,
    preprocessing, 
    model_selection, 
    pipeline,
)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, auc, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
import pandas as pd

import utils


red_code = '\033[91m'
blue_code = '\033[94m'
green_code = '\033[92m'
yellow_code = '\033[93m'
reset_code = '\033[0m'

## Load Data 

In [3]:
def load_movies(path2data): # 1 classe par répertoire
    alltxts = [] # init vide
    labs = []
    cpt = 0
    for cl in os.listdir(path2data): # parcours des fichiers d'un répertoire
        for f in os.listdir(path2data+cl):
            txt = open(path2data+cl+'/'+f).read()
            alltxts.append(txt)
            labs.append(cpt)
        cpt+=1 # chg répertoire = cht classe
        
    return alltxts,labs

In [5]:
path = "./datasets/movies/movies1000/"
alltxts,alllabs = utils.load_movies(path)

movies_df = pd.DataFrame()
movies_df['text'] = alltxts
movies_df['label'] = alllabs


In [6]:
preprocessed_alltxts = [utils.preprocess(alltxt) for alltxt in movies_df.text]

preprocessed_movies_df = pd.DataFrame()
preprocessed_movies_df['text'] = preprocessed_alltxts
preprocessed_movies_df['label'] = alllabs

In [40]:
X_text_train, X_text_test, y_train, y_test = model_selection.train_test_split(preprocessed_movies_df['text'], preprocessed_movies_df['label'], test_size=0.2)

### GRID SEARCH

### Logistique regression :

In [41]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizers = [
    ('CountVectorizer', CountVectorizer()),
    ('TfidfVectorizer', TfidfVectorizer(smooth_idf=True))
]

param_grid = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vectorizer__stop_words': [None, 'english'],
    'vectorizer__max_features': [1000, 5000, 10000, None],

}

for vectorizer_name, vectorizer in vectorizers:
    pipe = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', LogisticRegression(max_iter=1000))
    ])

    grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='f1_micro', n_jobs = -1, refit = True)  # Modifier les paramètres CV et scoring
    grid_search.fit(X_text_train, y_train)


    print(f"Classifier: LogisticRegression, Vectorizer: {vectorizer_name}")
    print("Best parameters found:")
    print(grid_search.best_params_)
    print("Best cross-validation f1:")
    print(grid_search.best_score_)
    print("")

    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_text_test)
    report = metrics.classification_report(y_test, y_pred)
    print("Classification Report:")
    print(report)
    print("")


Classifier: LogisticRegression, Vectorizer: CountVectorizer
Best parameters found:
{'vectorizer__max_features': 10000, 'vectorizer__ngram_range': (1, 2), 'vectorizer__stop_words': None}
Best cross-validation f1:
0.8387499999999999

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.84      0.84       193
           1       0.85      0.84      0.84       207

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400


Classifier: LogisticRegression, Vectorizer: TfidfVectorizer
Best parameters found:
{'vectorizer__max_features': 5000, 'vectorizer__ngram_range': (1, 1), 'vectorizer__stop_words': 'english'}
Best cross-validation f1:
0.835625

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       193
           1       0.83      0.81      0.82       207

    

### SVM

In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizers = [
    ('CountVectorizer', CountVectorizer()),
    ('TfidfVectorizer', TfidfVectorizer())
]

param_grid = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vectorizer__stop_words': [None, 'english'],
    'vectorizer__max_features': [1000, 5000, 10000, None],
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__gamma': [0.1, 0.01],
    'classifier__degree': [2, 3],
    'classifier__class_weight': [None, 'balanced'],
    'classifier__probability': [False, True]
}

for vectorizer_name, vectorizer in vectorizers:
    pipe = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', SVC())
    ])

    grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='f1_micro', n_jobs = -1, refit = True)  # Modifier les paramètres CV et scoring
    grid_search.fit(X_text_train, y_train)


    print(f"Classifier: LogisticRegression, Vectorizer: {vectorizer_name}")
    print("Best parameters found:")
    print(grid_search.best_params_)
    print("Best cross-validation f1:")
    print(grid_search.best_score_)
    print("")

    best_model_svm = grid_search.best_estimator_

    y_pred = best_model.predict(X_text_test)
    report = metrics.classification_report(y_test, y_pred)
    print("Classification Report:")
    print(report)
    print("")


Classifier: LogisticRegression, Vectorizer: CountVectorizer
Best parameters found:
{'classifier__C': 0.1, 'classifier__class_weight': None, 'classifier__degree': 2, 'classifier__gamma': 0.1, 'classifier__kernel': 'linear', 'classifier__probability': False, 'vectorizer__max_features': None, 'vectorizer__ngram_range': (1, 2), 'vectorizer__stop_words': None}
Best cross-validation f1:
0.828125

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       193
           1       0.83      0.81      0.82       207

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400


Classifier: LogisticRegression, Vectorizer: TfidfVectorizer
Best parameters found:
{'classifier__C': 10, 'classifier__class_weight': None, 'classifier__degree': 2, 'classifier__gamma': 0.1, 'classifier__kernel': 'rbf', 'classifier__probability': False, 'v

In [46]:
y_pred = best_model_svm.predict(X_text_test)
report = metrics.classification_report(y_test, y_pred)
print("Classification Report:")
print(report)
print("")

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       193
           1       0.87      0.83      0.85       207

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400




In [47]:
from sklearn import svm
from joblib import dump

dump(best_model_svm, 'svm_model_tfidf.joblib')

['svm_model_tfidf.joblib']

### DecisionTree

### Random forest


In [43]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizers = [
    ('CountVectorizer', CountVectorizer()),
    ('TfidfVectorizer', TfidfVectorizer())
]

param_grid = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vectorizer__stop_words': [None, 'english'],
    'vectorizer__max_features': [1000, 5000, 10000, None],
    'classifier__n_estimators': [100, 200, 500],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

for vectorizer_name, vectorizer in vectorizers:
    pipe = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', RandomForestClassifier())
    ])

    grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='f1_micro', n_jobs = -1, refit = True)  # Modifier les paramètres CV et scoring
    grid_search.fit(X_text_train, y_train)


    print(f"Classifier: LogisticRegression, Vectorizer: {vectorizer_name}")
    print("Best parameters found:")
    print(grid_search.best_params_)
    print("Best cross-validation f1:")
    print(grid_search.best_score_)
    print("")

    best_model_rf = grid_search.best_estimator_

    y_pred = best_model.predict(X_text_test)
    report = metrics.classification_report(y_test, y_pred)
    print("Classification Report:")
    print(report)
    print("")


Classifier: LogisticRegression, Vectorizer: CountVectorizer
Best parameters found:
{'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 500, 'vectorizer__max_features': 10000, 'vectorizer__ngram_range': (1, 2), 'vectorizer__stop_words': 'english'}
Best cross-validation f1:
0.86

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       193
           1       0.83      0.81      0.82       207

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400


Classifier: LogisticRegression, Vectorizer: TfidfVectorizer
Best parameters found:
{'classifier__max_depth': 10, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 500, 'vectorizer__max_features': 5000, 'vectorizer__ngram_range': (1, 2), 'vecto

In [62]:
pipe = Pipeline([
    ('vectorizer', CountVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english')),
    ('classifier', RandomForestClassifier(max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=500))
])

pipe.fit(X_text_train, y_train)

y_pred = pipe.predict(X_text_test)
report = metrics.classification_report(y_test, y_pred)
print("Classification Report:")
print(report)
print("")

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.84      0.81       193
           1       0.84      0.77      0.80       207

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.80       400
weighted avg       0.81      0.81      0.80       400




### Naive bayes bernoulli

In [44]:

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
vectorizers = [
    ('CountVectorizer', CountVectorizer()),
    ('TfidfVectorizer', TfidfVectorizer())
]

param_grid = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vectorizer__stop_words': [None, 'english'],
    'vectorizer__max_features': [1000, 5000, 10000, None],
    'classifier__alpha': [0.1, 1.0, 10.0],
    'classifier__binarize': [0.0, 0.5, 1.0]
}

for vectorizer_name, vectorizer in vectorizers:
    pipe = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', BernoulliNB())
    ])

    grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='f1_micro', n_jobs = -1, refit = True)  # Modifier les paramètres CV et scoring
    grid_search.fit(X_text_train, y_train)


    print(f"Classifier: LogisticRegression, Vectorizer: {vectorizer_name}")
    print("Best parameters found:")
    print(grid_search.best_params_)
    print("Best cross-validation f1:")
    print(grid_search.best_score_)
    print("")

    best_model_nb = grid_search.best_estimator_

    y_pred = best_model.predict(X_text_test)
    report = metrics.classification_report(y_test, y_pred)
    print("Classification Report:")
    print(report)
    print("")


Classifier: LogisticRegression, Vectorizer: CountVectorizer
Best parameters found:
{'classifier__alpha': 0.1, 'classifier__binarize': 0.0, 'vectorizer__max_features': 5000, 'vectorizer__ngram_range': (1, 3), 'vectorizer__stop_words': 'english'}
Best cross-validation f1:
0.819375

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       193
           1       0.83      0.81      0.82       207

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400


Classifier: LogisticRegression, Vectorizer: TfidfVectorizer
Best parameters found:
{'classifier__alpha': 0.1, 'classifier__binarize': 0.0, 'vectorizer__max_features': 5000, 'vectorizer__ngram_range': (1, 3), 'vectorizer__stop_words': 'english'}
Best cross-validation f1:
0.819375

Classification Report:
              precision    recall  f1-score   support

         

###  Logitic regression

In [64]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizers = [
    ('CountVectorizer', CountVectorizer()),
    ('TfidfVectorizer', TfidfVectorizer(smooth_idf=True))
]

param_grid = {
    'vectorizer__ngram_range': [(1, 2), (1, 3)],
    'vectorizer__max_features': [5000, 10000, None],
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}

for vectorizer_name, vectorizer in vectorizers:
    pipe = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', LogisticRegression(max_iter=1000))
    ])

    grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='f1_micro', n_jobs = -1, refit = True)  # Modifier les paramètres CV et scoring
    grid_search.fit(X_text_train, y_train)


    print(f"Classifier: LogisticRegression, Vectorizer: {vectorizer_name}")
    print("Best parameters found:")
    print(grid_search.best_params_)
    print("Best cross-validation f1:")
    print(grid_search.best_score_)
    print("")

    best_model_lr = grid_search.best_estimator_

    y_pred = best_model_lr.predict(X_text_test)
    report = metrics.classification_report(y_test, y_pred)
    print("Classification Report:")
    print(report)
    print("")


KeyboardInterrupt: 

In [None]:
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression())
])

pipe.fit(X_text_train, y_train)


y_pred = pipe.predict(X_text_test)
report = metrics.classification_report(y_test, y_pred)
print("Classification Report:")
print(report)
print("")