**1. IMPORTER LES LIBRAIRIES**

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns

# Train_Test_Split
from sklearn.model_selection import train_test_split

# Ré-échantillonnage
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Modélisation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV

# Performances
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import f1_score

# Enregistrer le modèle
from sklearn import model_selection
import joblib

import warnings
warnings.filterwarnings('ignore')

**2. IMPORTER LE FICHIER**

*Stemming / TF-IDF / Réduction de dimension*

In [35]:
df_stem = pd.read_csv("4. Stemming_TF-IDF(Avec_Reduction).csv").astype('float16')
df_stem['y'] = df_stem['y'].astype('uint8')
df_stem = df_stem.rename(columns={'1099' : 'ponctuation', '1100' : 'sentiment'})

df_stem.info()
display(df_stem.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69243 entries, 0 to 69242
Columns: 1102 entries, 0 to y
dtypes: float16(1101), uint8(1)
memory usage: 145.5 MB


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1092,1093,1094,1095,1096,1097,1098,ponctuation,sentiment,y
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.147949,-4.183594,3
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.147949,-3.720703,5
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.147949,5.550781,5
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.147949,4.390625,5
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.147949,0.625488,5


# **3. MODELISATION_1 (5 CLASSES)**

*1) Préparation des données*

In [36]:
# Target / Features
X_stem = df_stem.drop('y', axis = 1).to_numpy()
y_stem = df_stem['y'].to_numpy()

In [37]:
# Train_Test_Split
X_train_stem, X_test_stem, y_train_stem, y_test_stem = model_selection.train_test_split(X_stem, y_stem, test_size=0.2, random_state = 42)

In [38]:
# Vérification shape
print ("X_train_stem =", X_train_stem.shape)
print ("y_train_stem =", y_train_stem.shape)
print ("X_test_stem =", X_test_stem.shape)
print ("y_test_stem =", y_test_stem.shape)

X_train_stem = (55394, 1101)
y_train_stem = (55394,)
X_test_stem = (13849, 1101)
y_test_stem = (13849,)


In [39]:
# Undersampling
rUs = RandomUnderSampler(random_state = 42)
X_ru_stem, y_ru_stem = rUs.fit_resample(X_train_stem, y_train_stem)
print('Classes échantillon undersampled stemming :', dict(pd.Series(y_ru_stem).value_counts()))

Classes échantillon undersampled stemming : {1: 1406, 2: 1406, 3: 1406, 4: 1406, 5: 1406}


*2) Modélisation*

In [40]:
adaboost_model = AdaBoostClassifier(random_state = 42)
adaboost_params = {'n_estimators': [10, 50, 100], # Nombre d'estimateurs dans l'ensemble
                   'learning_rate': [0.01, 0.1, 0.5, 1]} # Taux d'apprentissage du modèle

# Création de l'objet RandomizedSearchCV pour trouver les meilleurs hyperparamètres
adaboost_random_1 = RandomizedSearchCV(adaboost_model, adaboost_params, n_iter=4, cv=5, n_jobs=-1)
adaboost_random_1.fit(X_ru_stem, y_ru_stem)
# Sélection du meilleur modèle trouvé par RandomizedSearchCV
best_adaboost_model_1 = adaboost_random_1.best_estimator_

*3) Performances*

In [41]:
y_pred_stem = best_adaboost_model_1.predict(X_test_stem)

print("Score du modèle :", best_adaboost_model_1.score(X_test_stem, y_test_stem))

print(classification_report_imbalanced(y_test_stem, y_pred_stem))
      
confusion_matrix = pd.crosstab(y_test_stem, y_pred_stem, rownames=['Classe réelle'], colnames=['Classe prédite'])
display(confusion_matrix)

Score du modèle : 0.503718680049101
                   pre       rec       spe        f1       geo       iba       sup

          1       0.22      0.69      0.67      0.33      0.68      0.46      1647
          2       0.00      0.00      1.00      0.00      0.00      0.00       405
          3       0.05      0.03      0.97      0.04      0.16      0.02       764
          4       0.00      0.00      1.00      0.00      0.00      0.00      2688
          5       0.71      0.70      0.56      0.70      0.62      0.40      8345

avg / total       0.45      0.50      0.69      0.46      0.47      0.29     13849



Classe prédite,1,3,4,5
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1129,49,17,452
2,262,13,0,130
3,451,21,0,292
4,1052,81,0,1555
5,2277,234,8,5826


In [42]:
# f1_score du modèle AdaBoost_1
print('f1_score AdaBoost_1 =', f1_score(y_test_stem, y_pred_stem, average='weighted'))

# f1_score par classe du modèle AdaBoost_1
AD_f1_stem_1 = f1_score(y_test_stem, y_pred_stem, average=None)
print('f1_score par classes AdaBoost_1 =', AD_f1_stem_1)

f1_score AdaBoost_1 = 0.4643409798786885
f1_score par classes AdaBoost_1 = [0.33118216 0.         0.03614458 0.         0.70192771]


*4) Enregistrement du modèle*

In [43]:
joblib.dump(best_adaboost_model_1, '6.b) adaboost_1.pkl')

['6.b) adaboost_1.pkl']

# **4. MODELISATION_2 (BINAIRE)**

*1) Préparation des données*

In [44]:
### Classification binaire
# Classe 0 : [1, 2, 3]
# Classe 1 : [4, 5]
df_stem['y'] = df_stem['y'].replace((1, 2, 3, 4, 5), (0, 0, 0, 1, 1))

In [45]:
# Target / Features
X_stem = df_stem.drop('y', axis = 1).to_numpy()
y_stem = df_stem['y'].to_numpy()

In [46]:
# Train_Test_Split
X_train_stem, X_test_stem, y_train_stem, y_test_stem = model_selection.train_test_split(X_stem, y_stem, test_size=0.2, random_state = 42)

In [47]:
# Undersampling
rUs = RandomUnderSampler(random_state = 42)
X_ru_stem, y_ru_stem = rUs.fit_resample(X_train_stem, y_train_stem)
print('Classes échantillon undersampled stemming :', dict(pd.Series(y_ru_stem).value_counts()))

Classes échantillon undersampled stemming : {0: 11376, 1: 11376}


*2) Modélisation*

In [48]:
adaboost_model = AdaBoostClassifier(random_state = 42)
adaboost_params = {'n_estimators': [10, 50, 100], # Nombre d'estimateurs dans l'ensemble
                   'learning_rate': [0.01, 0.1, 0.5, 1]} # Taux d'apprentissage du modèle

# Création de l'objet RandomizedSearchCV pour trouver les meilleurs hyperparamètres
adaboost_random_2 = RandomizedSearchCV(adaboost_model, adaboost_params, n_iter=4, cv=5, n_jobs=-1)
adaboost_random_2.fit(X_ru_stem, y_ru_stem)
# Sélection du meilleur modèle trouvé par RandomizedSearchCV
best_adaboost_model_2 = adaboost_random_2.best_estimator_

*3) Performances*

In [49]:
y_pred_stem = best_adaboost_model_2.predict(X_test_stem)

print("Score du modèle :", best_adaboost_model_2.score(X_test_stem, y_test_stem))

print(classification_report_imbalanced(y_test_stem, y_pred_stem))
      
confusion_matrix = pd.crosstab(y_test_stem, y_pred_stem, rownames=['Classe réelle'], colnames=['Classe prédite'])
display(confusion_matrix)

Score du modèle : 0.7571665824247238
                   pre       rec       spe        f1       geo       iba       sup

          0       0.44      0.73      0.76      0.55      0.75      0.56      2816
          1       0.92      0.76      0.73      0.83      0.75      0.56     11033

avg / total       0.82      0.76      0.74      0.78      0.75      0.56     13849



Classe prédite,0,1
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2052,764
1,2599,8434


In [50]:
# f1_score du modèle AdaBoost_2
print('f1_score AdaBoost_2 =', f1_score(y_test_stem, y_pred_stem, average='weighted'))

# f1_score par classe du modèle AdaBoost_2
AD_f1_stem_2 = f1_score(y_test_stem, y_pred_stem, average=None)
print('f1_score par classes AdaBoost_2 =', AD_f1_stem_2)

f1_score AdaBoost_2 = 0.7759917052772154
f1_score par classes AdaBoost_2 = [0.54961832 0.83376996]


*4) Enregistrement du modèle*

In [51]:
joblib.dump(best_adaboost_model_2, '6.b) adaboost_2.pkl')

['6.b) adaboost_2.pkl']