<h1>Sommaire<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-Post-data" data-toc-modified-id="Import-Post-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import Post data</a></span><ul class="toc-item"><li><span><a href="#Train-set" data-toc-modified-id="Train-set-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Train set</a></span></li><li><span><a href="#Val-set" data-toc-modified-id="Val-set-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Val set</a></span></li></ul></li><li><span><a href="#Import-Tag-data" data-toc-modified-id="Import-Tag-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Import Tag data</a></span><ul class="toc-item"><li><span><a href="#Train-set" data-toc-modified-id="Train-set-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Train set</a></span></li><li><span><a href="#Val-set" data-toc-modified-id="Val-set-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Val set</a></span></li></ul></li><li><span><a href="#Logistic-regression-multilabels-(one-vs-rest)" data-toc-modified-id="Logistic-regression-multilabels-(one-vs-rest)-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Logistic regression multilabels (one vs rest)</a></span></li><li><span><a href="#Evaluation-of-Logistic-regression-multilabels" data-toc-modified-id="Evaluation-of-Logistic-regression-multilabels-3"><span class="toc-item-num">4&nbsp;&nbsp;</span>Evaluation of Logistic regression multilabels</a></span></li><li><span><a href="#SVM-multilabels-(one-vs-rest)" data-toc-modified-id="SVM-multilabels-(one-vs-rest)-4"><span class="toc-item-num">5&nbsp;&nbsp;</span>SVM multilabels (one vs rest)</a></span></li><li><span><a href="#Evaluation-of-SVM-multilabels-(one-vs-rest)" data-toc-modified-id="Evaluation-of-SVM-multilabels-(one-vs-rest)-4"><span class="toc-item-num">6&nbsp;&nbsp;</span>Evaluation of SVM multilabels (one vs rest)</a></span></li><li><span><a href="#Random-forest-multilabels-(one-vs-rest)" 
data-toc-modified-id="Random-forest-multilabels-(one-vs-rest)-5"><span class="toc-item-num">7&nbsp;&nbsp;</span>Random forest multilabels (one vs rest)</a></span></li><li><span><a href="#Evaluation-of-Random-forest-multilabels-(one-vs-rest)" data-toc-modified-id="Evaluation-of-Random-forest-multilabels-(one-vs-rest)-5"><span class="toc-item-num">8&nbsp;&nbsp;</span>Evaluation of Random forest multilabels (one vs rest)</a></span></li></ul></div>

In [1]:
import os
import pandas as pd
import glob
import numpy as np
import pickle
import gensim
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score

# Import Post data

## Train set 

In [2]:
# LDA topic features of the training posts. The frame also carries a 'post'
# column, which is kept in df_X_train but removed from the feature matrix.
df_X_train = pd.read_pickle('PickleData/lda_features_12.pkl')
X_train = df_X_train.drop('post', axis=1)

X_train.shape

(103225, 12)

## Val set

In [3]:
# Load the trained LDA model and record its number of topics.
lda_model = gensim.models.ldamodel.LdaModel.load('model/lda_model_12')
num_topic = 12

# Rebuild the post vocabulary from the training posts.
# NOTE(review): this assumes the LDA model was trained with a dictionary built
# the same way (same documents, same order, no filtering) -- confirm.
# The vocabulary gets its own name on purpose: the generic name `dictionary`
# is rebound to the *tag* vocabulary later in the notebook, and
# get_lda_features must never pick that one up.
post_dictionary = gensim.corpora.Dictionary(df_X_train['post'].values)

# Load the validation posts.
df_X_val = pd.read_pickle('val/X_val_filtre.pkl')


def get_lda_features(post):
    """Return the dense LDA topic vector of a single post.

    Parameters
    ----------
    post : iterable of str
        Tokens of the post, in the form accepted by ``Dictionary.doc2bow``.

    Returns
    -------
    numpy.ndarray
        Vector of length ``num_topic`` holding the topic weights of the post.

    Notes
    -----
    Reads the notebook-level ``post_dictionary``, ``lda_model`` and
    ``num_topic``.
    """
    post_bow = post_dictionary.doc2bow(post)
    # minimum_probability=0. asks gensim not to discard low-weight topics;
    # any topic still missing from the sparse result is filled with 0 below.
    sparse_topics = lda_model.get_document_topics(post_bow,
                                                  minimum_probability=0.)
    return gensim.matutils.sparse2full(sparse_topics, num_topic)


# One topic vector per validation post, expanded to a 2-D feature matrix.
df_features = df_X_val['Text'].apply(func=get_lda_features)
X_val = df_features.apply(pd.Series).values

X_val.shape

(34477, 12)

# Import Tag data

## Train set

In [4]:
# Tag lists of the training posts: drop the 'Tags' column and expose the
# filtered tags under the name 'tag'.
df_Y_train = (
    pd.read_pickle('train/Y_train_filtre.pkl')
    .drop(columns=['Tags'])
    .rename(columns={'tags_filtered': 'tag'})
)

# Tag vocabulary, built from the training tags only.
dictionary = gensim.corpora.Dictionary(df_Y_train['tag'])

# Bag-of-words of each post's tags, densified and transposed so that rows are
# posts and columns are tag ids.
Y_train = gensim.matutils.corpus2dense(
    [dictionary.doc2bow(tags) for tags in df_Y_train['tag'].values],
    num_terms=len(dictionary),
    num_docs=len(df_Y_train),
).T

Y_train.shape

(103225, 100)

In [5]:
# Total number of tag occurrences over the whole training matrix.
count_tags = Y_train.sum()
print('Nombre de tags dans le corpus : {}'.format(count_tags))

Nombre de tags dans le corpus : 168815.0


## Val set 

In [6]:
# Tag lists of the validation posts: drop the 'Tags' column and expose the
# filtered tags under the name 'tag'.
df_Y_val = (
    pd.read_pickle('val/Y_val_filtre.pkl')
    .drop(columns=['Tags'])
    .rename(columns={'tags_filtered': 'tag'})
)

# Encode with the tag dictionary built on the training set so that the columns
# line up with Y_train; doc2bow skips tags that are not in that dictionary.
Y_val = gensim.matutils.corpus2dense(
    [dictionary.doc2bow(tags) for tags in df_Y_val['tag'].values],
    num_terms=len(dictionary),
    num_docs=len(df_Y_val),
).T

Y_val.shape

(34477, 100)

# Logistic regression multilabels (one vs rest)

In [7]:
def evaluate_clf(clf, csv_path=None, clf_name=None):
    """Print validation metrics of a fitted multilabel classifier.

    Predicts on the notebook-level ``X_val`` and prints the number of true and
    predicted tags followed by the per-label classification report. When both
    ``csv_path`` and ``clf_name`` are given, the train and validation F1
    scores (macro, micro, weighted, samples) are also written to ``csv_path``
    as one ';'-separated record:
    ``name;train_macro;val_macro;train_micro;val_micro;train_weighted;
    val_weighted;train_samples;val_samples``.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``
        Classifier to evaluate.
    csv_path : str, optional
        Destination file. It is overwritten if it exists, and its parent
        directory is created when missing.
    clf_name : str, optional
        Label written as the first field of the record.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``Y_train``, ``X_val`` and
    ``Y_val``.
    """
    Y_pred_val = clf.predict(X_val)
    print('Nombre de tags présents : {}'.format(Y_val.sum()))
    print('Nombre de tags prédits : {}'.format(Y_pred_val.sum()))
    print('---------------------------------------------------------')
    print(classification_report(Y_val, Y_pred_val, zero_division=0))

    if not (csv_path and clf_name):
        return

    # Predicting on the full training set is costly, so it is only done when
    # the scores are actually exported.
    Y_pred_train = clf.predict(X_train)

    fields = [clf_name]
    for average in ('macro', 'micro', 'weighted', 'samples'):
        # zero_division=0 yields the same values as the default while staying
        # consistent with the report above and avoiding one warning per call.
        fields.append(f1_score(Y_train, Y_pred_train,
                               average=average, zero_division=0))
        fields.append(f1_score(Y_val, Y_pred_val,
                               average=average, zero_division=0))

    # Make sure the output directory exists before writing.
    out_dir = os.path.dirname(csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(csv_path, 'w', encoding='utf-8') as file:
        file.write(';'.join('{}'.format(field) for field in fields))

In [8]:
# Reproducible sub-sample of the training features.
num_samples = 15000
X_train_s = X_train.sample(n=num_samples, random_state=1)

# Pick the tag rows that carry the same index labels as the sampled features.
index_to_keep = X_train_s.index.tolist()
df_Y_train_s = df_Y_train.loc[index_to_keep]

# Tag vocabulary restricted to the sample.
# NOTE(review): ids are assigned independently of `dictionary`, so the columns
# of Y_train_s are not guaranteed to match those of Y_train / Y_val.
dictionary_s = gensim.corpora.Dictionary(df_Y_train_s['tag'])

# Bag-of-words of the sampled tags, densified with rows = posts.
Y_train_s = gensim.matutils.corpus2dense(
    [dictionary_s.doc2bow(tags) for tags in df_Y_train_s['tag'].values],
    num_terms=len(dictionary_s),
    num_docs=len(df_Y_train_s),
).T

Afin de déterminer les meilleurs paramètres du modèle, nous allons utiliser `GridSearchCV` du module `model_selection` de `scikit-learn`. Nous appliquerons cela à un échantillon de 15000 posts afin de réduire le temps d'exécution.

In [9]:
# One binary logistic regression per tag (one-vs-rest).
log_clf = OneVsRestClassifier(
    estimator=LogisticRegression(random_state=0, max_iter=10000, verbose=0)
)

# Candidate values for C of the underlying logistic regression.
param_grid = [{'estimator__C': [0.1, 0.5, 1., 5, 10]}]

# 5-fold grid search on the sampled training set, ranked by weighted F1.
clf = GridSearchCV(
    estimator=log_clf,
    param_grid=param_grid,
    scoring='f1_weighted',
    n_jobs=5,
    cv=5,
    verbose=0,
    return_train_score=True,
)
clf.fit(X_train_s, Y_train_s)

print('Meilleurs hyper-paramètres :\n')
print(clf.best_params_)

Meilleurs hyper-paramètres :

{'estimator__C': 10}


Une fois le meilleur paramètre déterminé, nous appliquerons la classification à toute la base en utilisant ce paramètre.

In [10]:
# Final one-vs-rest logistic regression, trained on the whole training set
# with C=10.
clf = OneVsRestClassifier(
    estimator=LogisticRegression(C=10,
                                 max_iter=10000,
                                 random_state=0,
                                 verbose=0)
)

clf.fit(X_train, Y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=10, max_iter=10000,
                                                 random_state=0))

# Evaluation of Logistic regression multilabels

In [11]:
# Report the validation metrics and export the F1 scores of the logistic regression.
evaluate_clf(clf,
             csv_path='results/log_reg_lda.csv',
             clf_name='Regression logistique lda')

Nombre de tags présents : 56448.0
Nombre de tags prédits : 2634
---------------------------------------------------------
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       559
           1       0.00      0.00      0.00       216
           2       0.00      0.00      0.00      1144
           3       0.29      0.00      0.00      2376
           4       0.00      0.00      0.00      1858
           5       0.00      0.00      0.00       364
           6       0.00      0.00      0.00       294
           7       0.00      0.00      0.00       802
           8       0.00      0.00      0.00      1492
           9       0.00      0.00      0.00       191
          10       0.00      0.00      0.00       292
          11       0.00      0.00      0.00       186
          12       0.00      0.00      0.00      3009
          13       0.00      0.00      0.00       438
          14       0.00      0.00      0.00       324
          15 

# SVM multilabels (one vs rest)

In [12]:
# One linear SVM per tag (one-vs-rest), with 'balanced' class weights.
svm_clf = OneVsRestClassifier(
    estimator=LinearSVC(class_weight='balanced',
                        max_iter=10000,
                        random_state=0,
                        verbose=0)
)

# Candidate values for C of the underlying linear SVM.
param_grid = [{'estimator__C': [0.1, 0.5, 1., 5, 10]}]

# Grid search on the sampled training set, ranked by weighted F1.
# `cv` is not set, so scikit-learn's default splitting strategy applies.
clf = GridSearchCV(
    estimator=svm_clf,
    param_grid=param_grid,
    scoring='f1_weighted',
    n_jobs=5,
    verbose=0,
    return_train_score=True,
)
clf.fit(X_train_s, Y_train_s)

print('Meilleurs hyper-paramètres :\n')
print(clf.best_params_)

Meilleurs hyper-paramètres :

{'estimator__C': 0.1}


In [13]:
# Final one-vs-rest linear SVM, trained on the whole training set with C=0.1.
# NOTE(review): max_iter is 50000 here but 10000 in the grid search -- confirm
# the difference is intentional.
clf = OneVsRestClassifier(
    estimator=LinearSVC(C=0.1,
                        class_weight='balanced',
                        max_iter=50000,
                        random_state=0,
                        verbose=0)
)

clf.fit(X_train, Y_train)

OneVsRestClassifier(estimator=LinearSVC(C=0.1, class_weight='balanced',
                                        max_iter=50000, random_state=0))

# Evaluation of SVM multilabels (one-vs-rest)

In [14]:
# Report the validation metrics and export the F1 scores of the linear SVM.
evaluate_clf(clf,
             csv_path='results/svm_lda.csv',
             clf_name='SVM lda')

Nombre de tags présents : 56448.0
Nombre de tags prédits : 747452
---------------------------------------------------------
              precision    recall  f1-score   support

           0       0.09      0.81      0.15       559
           1       0.03      0.78      0.05       216
           2       0.12      0.83      0.20      1144
           3       0.16      0.68      0.26      2376
           4       0.11      0.81      0.20      1858
           5       0.05      0.82      0.10       364
           6       0.05      0.84      0.10       294
           7       0.06      0.79      0.11       802
           8       0.08      0.65      0.14      1492
           9       0.02      0.70      0.03       191
          10       0.03      0.85      0.05       292
          11       0.03      0.85      0.05       186
          12       0.13      0.70      0.21      3009
          13       0.03      0.71      0.06       438
          14       0.06      0.81      0.11       324
          1

# Random forest multilabels (one vs rest) 

In [15]:
# One random forest of 50 gini trees per tag (one-vs-rest).
randf = OneVsRestClassifier(
    estimator=RandomForestClassifier(criterion='gini',
                                     n_estimators=50,
                                     random_state=0)
)

# Candidate tree-shape hyper-parameters of the underlying forest.
param_grid = {
    'estimator__max_features': [0.25, 0.5, 0.75],
    'estimator__min_samples_leaf': [1, 3, 10],
    'estimator__max_depth': [5, 10],
}

# Grid search on the sampled training set, ranked by weighted F1.
# `cv` is not set, so scikit-learn's default splitting strategy applies.
clf = GridSearchCV(
    estimator=randf,
    param_grid=param_grid,
    scoring='f1_weighted',
    n_jobs=5,
    verbose=0,
    return_train_score=True,
)
clf.fit(X_train_s, Y_train_s)

print('Meilleurs hyper-paramètres :\n')
print(clf.best_params_)

Meilleurs hyper-paramètres :

{'estimator__max_depth': 10, 'estimator__max_features': 0.75, 'estimator__min_samples_leaf': 1}


In [17]:
# Final one-vs-rest random forest, trained on the whole training set with
# max_depth=10, min_samples_leaf=1 and max_features=0.75.
clf = OneVsRestClassifier(
    estimator=RandomForestClassifier(criterion='gini',
                                     n_estimators=50,
                                     max_depth=10,
                                     min_samples_leaf=1,
                                     max_features=0.75,
                                     random_state=0)
)

clf.fit(X_train, Y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(max_depth=10,
                                                     max_features=0.75,
                                                     n_estimators=50,
                                                     random_state=0))

# Evaluation of Random forest multilabels (one-vs-rest)

In [18]:
# Report the validation metrics and export the F1 scores of the random forest.
evaluate_clf(clf,
             csv_path='results/rand_forest_lda.csv',
             clf_name='Random forest lda')

Nombre de tags présents : 56448.0
Nombre de tags prédits : 1053
---------------------------------------------------------
              precision    recall  f1-score   support

           0       0.47      0.01      0.02       559
           1       0.00      0.00      0.00       216
           2       1.00      0.00      0.00      1144
           3       0.44      0.00      0.01      2376
           4       0.00      0.00      0.00      1858
           5       0.00      0.00      0.00       364
           6       0.00      0.00      0.00       294
           7       0.00      0.00      0.00       802
           8       0.00      0.00      0.00      1492
           9       0.00      0.00      0.00       191
          10       0.00      0.00      0.00       292
          11       0.00      0.00      0.00       186
          12       0.00      0.00      0.00      3009
          13       0.00      0.00      0.00       438
          14       0.00      0.00      0.00       324
          15 