**1. IMPORTER LES LIBRAIRIES**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns

# Train_Test_Split
from sklearn.model_selection import train_test_split

# Ré-échantillonnage
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Modélisation
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import RandomizedSearchCV

# Performances
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import f1_score

# Enregistrer le modèle
from sklearn import model_selection
import joblib

import warnings
warnings.filterwarnings('ignore')

**2. IMPORTER LE FICHIER**

*Stemming / TF-IDF / Réduction de dimension*

In [2]:
df_stem = pd.read_csv("4. Stemming_TF-IDF(Avec_Reduction).csv").astype('float16')
df_stem['y'] = df_stem['y'].astype('uint8')
df_stem = df_stem.rename(columns={'1099' : 'ponctuation', '1100' : 'sentiment'})

df_stem.info()
display(df_stem.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69243 entries, 0 to 69242
Columns: 1102 entries, 0 to y
dtypes: float16(1101), uint8(1)
memory usage: 145.5 MB


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1092,1093,1094,1095,1096,1097,1098,ponctuation,sentiment,y
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.147949,-4.183594,3
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.147949,-3.720703,5
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.147949,5.550781,5
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.147949,4.390625,5
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.147949,0.625488,5


# **3. MODELISATION_1 (5 CLASSES)**

*1) Préparation des données*

In [3]:
# Target / Features
X_stem = df_stem.drop('y', axis = 1).to_numpy()
y_stem = df_stem['y'].to_numpy()

In [4]:
# Train_Test_Split
X_train_stem, X_test_stem, y_train_stem, y_test_stem = model_selection.train_test_split(X_stem, y_stem, test_size=0.2, random_state = 42)

In [5]:
# Vérification shape
print ("X_train_stem =", X_train_stem.shape)
print ("y_train_stem =", y_train_stem.shape)
print ("X_test_stem =", X_test_stem.shape)
print ("y_test_stem =", y_test_stem.shape)

X_train_stem = (55394, 1101)
y_train_stem = (55394,)
X_test_stem = (13849, 1101)
y_test_stem = (13849,)


In [6]:
# Undersampling
rUs = RandomUnderSampler(random_state = 42)
X_ru_stem, y_ru_stem = rUs.fit_resample(X_train_stem, y_train_stem)
print('Classes échantillon undersampled stemming :', dict(pd.Series(y_ru_stem).value_counts()))

Classes échantillon undersampled stemming : {1: 1406, 2: 1406, 3: 1406, 4: 1406, 5: 1406}


*2) Modélisation*

In [7]:
bagging_model = BaggingClassifier(random_state=42)
bagging_params = {'n_estimators': [10, 50, 100],
                  'max_samples': [0.5, 1.0],
                  'max_features': [0.5, 1.0]}

# Initialisation de RandomizedSearchCV pour trouver les meilleurs hyperparamètres
bagging_random_1 = RandomizedSearchCV(bagging_model, bagging_params, n_iter=4, cv=5, n_jobs=-1)
bagging_random_1.fit(X_ru_stem, y_ru_stem)
# Sélection du meilleur modèle obtenu après la recherche aléatoire
best_bagging_model_1 = bagging_random_1.best_estimator_

*3) Performances*

In [8]:
y_pred_stem = best_bagging_model_1.predict(X_test_stem)

print("Score du modèle :", best_bagging_model_1.score(X_test_stem, y_test_stem))

print(classification_report_imbalanced(y_test_stem, y_pred_stem))
      
confusion_matrix = pd.crosstab(y_test_stem, y_pred_stem, rownames=['Classe réelle'], colnames=['Classe prédite'])
display(confusion_matrix)

Score du modèle : 0.422413170626038
                   pre       rec       spe        f1       geo       iba       sup

          1       0.41      0.57      0.89      0.47      0.71      0.49      1647
          2       0.08      0.32      0.89      0.13      0.53      0.27       405
          3       0.13      0.28      0.89      0.18      0.50      0.24       764
          4       0.24      0.29      0.78      0.26      0.48      0.22      2688
          5       0.76      0.45      0.78      0.57      0.60      0.34      8345

avg / total       0.56      0.42      0.80      0.46      0.58      0.33     13849



Classe prédite,1,2,3,4,5
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,932,247,182,120,166
2,108,128,72,51,46
3,136,172,217,123,116
4,292,349,379,783,885
5,827,701,852,2175,3790


In [9]:
# f1_score du modèle Bagging_1
print('f1_score Bagging_1 =', f1_score(y_test_stem, y_pred_stem, average='weighted'))

# f1_score par classe du modèle Bagging_1
BG_f1_stem_1 = f1_score(y_test_stem, y_pred_stem, average=None)
print('f1_score par classes Baggin_1 =', BG_f1_stem_1)

f1_score Bagging_1 = 0.46303819836220056
f1_score par classes Baggin_1 = [0.47285642 0.12787213 0.17599351 0.26363636 0.56787534]


*4) Enregistrement*

In [10]:
joblib.dump(bagging_random_1, '6.a) bagging_1.pkl')

['bagging_1.pkl']

# **4. MODELISATION_2 (BINAIRE)**

In [11]:
### Classification binaire
# Classe 0 : [1, 2, 3]
# Classe 1 : [4, 5]
df_stem['y'] = df_stem['y'].replace((1, 2, 3, 4, 5), (0, 0, 0, 1, 1))

In [12]:
# Target / Features
X_stem = df_stem.drop('y', axis = 1).to_numpy()
y_stem = df_stem['y'].to_numpy()

In [13]:
# Train_Test_Split
X_train_stem, X_test_stem, y_train_stem, y_test_stem = model_selection.train_test_split(X_stem, y_stem, test_size=0.2, random_state = 42)

In [14]:
# Undersampling
rUs = RandomUnderSampler(random_state = 42)
X_ru_stem, y_ru_stem = rUs.fit_resample(X_train_stem, y_train_stem)
print('Classes échantillon undersampled stemming :', dict(pd.Series(y_ru_stem).value_counts()))

Classes échantillon undersampled stemming : {0: 11376, 1: 11376}


*2) Modélisation*

In [15]:
bagging_model = BaggingClassifier(random_state=42)
bagging_params = {'n_estimators': [10, 50, 100],
                  'max_samples': [0.5, 1.0],
                  'max_features': [0.5, 1.0]}

# Initialisation de RandomizedSearchCV pour trouver les meilleurs hyperparamètres
bagging_random_2 = RandomizedSearchCV(bagging_model, bagging_params, n_iter=4, cv=5, n_jobs=-1)
bagging_random_2.fit(X_ru_stem, y_ru_stem)
# Sélection du meilleur modèle obtenu après la recherche aléatoire
best_bagging_model_2 = bagging_random_2.best_estimator_

*3) Performances*

In [16]:
y_pred_stem = best_bagging_model_2.predict(X_test_stem)

print("Score du modèle :", best_bagging_model_2.score(X_test_stem, y_test_stem))

print(classification_report_imbalanced(y_test_stem, y_pred_stem))
      
confusion_matrix = pd.crosstab(y_test_stem, y_pred_stem, rownames=['Classe réelle'], colnames=['Classe prédite'])
display(confusion_matrix)

Score du modèle : 0.7646761499025201
                   pre       rec       spe        f1       geo       iba       sup

          0       0.45      0.76      0.77      0.57      0.76      0.58      2816
          1       0.93      0.77      0.76      0.84      0.76      0.58     11033

avg / total       0.83      0.76      0.76      0.78      0.76      0.58     13849



Classe prédite,0,1
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2139,677
1,2582,8451


In [17]:
# f1_score du modèle Bagging_2
print('f1_score Bagging_2 =', f1_score(y_test_stem, y_pred_stem, average='weighted'))

# f1_score par classe du modèle Bagging_2
BG_f1_stem_2 = f1_score(y_test_stem, y_pred_stem, average=None)
print('f1_score par classes Baggin_2 =', BG_f1_stem_2)

f1_score Bagging_2 = 0.7832977643496211
f1_score par classes Baggin_2 = [0.56759984 0.83835127]


*4) Enregistrement du modèe*

In [18]:
joblib.dump(best_bagging_model_2, '6.a) bagging_2.pkl')

['bagging_2.pkl']