<h1>Sommaire<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Topics-matrix" data-toc-modified-id="Topics-matrix-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Topics matrix</a></span><ul class="toc-item"><li><span><a href="#Train-set" data-toc-modified-id="Train-set-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Train set</a></span></li><li><span><a href="#Val-set" data-toc-modified-id="Val-set-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Val set</a></span></li></ul></li><li><span><a href="#Tags-matrix" data-toc-modified-id="Tags-matrix-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Tags matrix</a></span><ul class="toc-item"><li><span><a href="#Train-set" data-toc-modified-id="Train-set-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Train set</a></span></li><li><span><a href="#Val-set" data-toc-modified-id="Val-set-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Val set</a></span></li></ul></li><li><span><a href="#Tags-/-Topics-matrix" data-toc-modified-id="Tags-/-Topics-matrix-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Tags / Topics matrix</a></span></li><li><span><a href="#Predict-tags" data-toc-modified-id="Predict-tags-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Predict tags</a></span><ul class="toc-item"><li><span><a href="#Train-set" data-toc-modified-id="Train-set-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Train set</a></span></li><li><span><a href="#Val-set" data-toc-modified-id="Val-set-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Val set</a></span></li></ul></li></ul></div>

In [1]:
import os
import pandas as pd
import glob
import numpy as np
import pickle
import gensim
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report, f1_score

# Topics matrix

## Train set 

In [2]:
# Pickled frame holding the 'post' column alongside the LDA feature columns
df_X_train = pd.read_pickle('PickleData/lda_features_12.pkl')

# Feature matrix: every column except the posts themselves
X_train = df_X_train.drop('post', axis='columns')

X_train.shape

(103225, 12)

## Val set

In [3]:
# Load the trained LDA model; the feature width is read from the model itself
# so it cannot drift from the number of topics the model was trained with.
lda_model = gensim.models.ldamodel.LdaModel.load('model/lda_model_12')
num_topic = lda_model.num_topics

# Rebuild the vocabulary from the training posts. It is bound to its own name
# because a later cell reuses `dictionary` for the tags vocabulary, which would
# silently change what get_lda_features() does if it were called afterwards.
# NOTE(review): token ids must match the dictionary used when the LDA model was
# trained (e.g. no filter_extremes applied at training time) - confirm.
post_dictionary = gensim.corpora.Dictionary(df_X_train['post'].values)

# Validation posts
df_X_val = pd.read_pickle('val/X_val_filtre.pkl')


def get_lda_features(post):
    """Return the dense topic distribution of one tokenised post.

    Parameters
    ----------
    post : list of str
        Tokens of a single document.

    Returns
    -------
    numpy.ndarray
        Vector of length ``num_topic`` with one probability per topic.
    """
    post_bow = post_dictionary.doc2bow(post)
    # minimum_probability=0. keeps every topic, so the vector is always complete
    features = lda_model.get_document_topics(post_bow, minimum_probability=0.)
    return gensim.matutils.sparse2full(features, num_topic)


df_features = df_X_val['Text'].apply(func=get_lda_features)
# Stack the per-document vectors directly; expanding each one through pd.Series
# creates one Series object per row and is much slower for the same result.
X_val = np.vstack(df_features.tolist())

X_val.shape

(34477, 12)

# Tags matrix

## Train set

In [4]:
# Training tags: drop the raw 'Tags' column and expose the filtered ones as 'tag'
df_Y_train = (
    pd.read_pickle('train/Y_train_filtre.pkl')
    .drop(columns=['Tags'])
    .rename(columns={'tags_filtered': 'tag'})
)

# Vocabulary of tags, fitted on the training set only
dictionary = gensim.corpora.Dictionary(df_Y_train['tag'])

# One bag-of-words (tag id, count) list per document
bow_train = [dictionary.doc2bow(tags) for tags in df_Y_train['tag'].values]

# corpus2dense yields (tags x documents); transpose to (documents x tags)
Y_train = gensim.matutils.corpus2dense(
    bow_train, num_terms=len(dictionary), num_docs=len(bow_train)
).T

Y_train.shape

(103225, 100)

In [5]:
# Total number of tag occurrences over the whole training matrix
count_tags = Y_train.sum()
print('Nombre de tags dans le corpus : {}'.format(count_tags))

Nombre de tags dans le corpus : 168815.0


## Val set 

In [6]:
# Validation tags: drop the raw 'Tags' column and expose the filtered ones as 'tag'
df_Y_val = (
    pd.read_pickle('val/Y_val_filtre.pkl')
    .drop(columns=['Tags'])
    .rename(columns={'tags_filtered': 'tag'})
)

# Encode with the dictionary fitted on the training tags so the columns of
# Y_val line up with those of Y_train
bow_val = [dictionary.doc2bow(tags) for tags in df_Y_val['tag'].values]

# corpus2dense yields (tags x documents); transpose to (documents x tags)
Y_val = gensim.matutils.corpus2dense(
    bow_val, num_terms=len(dictionary), num_docs=len(bow_val)
).T

Y_val.shape

(34477, 100)

# Tags / Topics matrix

Matrice(docs-topics) . Matrice(transformation) = Matrice(docs-tags)

Matrice(transformation) ≈ pseudo-inverse(Matrice(docs-topics)) . Matrice(docs-tags)

In [7]:
# Solve X_train @ T ≈ Y_train for T through the Moore-Penrose pseudo-inverse
topics_pinv = np.linalg.pinv(X_train)
transformation_matrix = topics_pinv @ Y_train
transformation_matrix.shape

(12, 100)

# Predict tags

## Train set 

In [8]:
# predict
def predict_tags(X, threshold):
    """Predict a 0/1 tag matrix from a documents x topics matrix.

    The topic features are projected onto tag space with the module-level
    ``transformation_matrix``; a tag is predicted when its score is strictly
    greater than ``threshold``.
    """
    scores = X @ transformation_matrix
    is_predicted = scores > threshold
    # multiplying the boolean mask by 1 turns it into integers
    return is_predicted * 1

In [9]:
# Scan candidate thresholds and keep the one with the highest macro-F1
# on the training set
best_score, best_threshold = 0, 0.5

for candidate in np.arange(0.001, 0.1, 0.01):
    candidate_score = f1_score(Y_train,
                               predict_tags(X_train, candidate),
                               average='macro')
    # strict comparison: on a tie the earlier (smaller) threshold is kept
    if candidate_score > best_score:
        best_score, best_threshold = candidate_score, candidate

print('Best threshold ; {}'.format(best_threshold))

Best threshold ; 0.040999999999999995


In [10]:
# Training-set predictions and macro-F1 at the selected threshold
Y_pred_train = predict_tags(X=X_train, threshold=best_threshold)
train_score = f1_score(y_true=Y_train, y_pred=Y_pred_train, average='macro')

In [11]:
# Highest macro-F1 reached on the training set during the threshold search
best_score

0.09520023746073797

## Val set 

In [12]:
# Predict the validation tags with the threshold selected on the training set
Y_pred_val = predict_tags(X_val, best_threshold)

# Compare how many tags are present with how many are predicted
print('Nombre de tags présents : {}'.format(Y_val.sum()))
print('Nombre de tags prédits : {}'.format(Y_pred_val.sum()))
print('---------------------------------------------------------')

csv_path = 'results/lda_matrix.csv'
clf_name = 'lda matrix'

# One (train, val) F1 pair per averaging mode, in the column order of the
# results file: macro, micro, weighted, samples.
# zero_division=0 scores undefined cases as 0 (the value already used when
# sklearn only warns) without emitting an UndefinedMetricWarning.
scores = []
for average in ('macro', 'micro', 'weighted', 'samples'):
    scores.append(f1_score(Y_train, Y_pred_train,
                           average=average, zero_division=0))
    scores.append(f1_score(Y_val, Y_pred_val,
                           average=average, zero_division=0))

# Create the output directory first so the write cannot fail on a fresh checkout
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
with open(csv_path, 'w') as file:
    file.write(';'.join('{}'.format(field) for field in [clf_name, *scores]))

print(classification_report(Y_val, Y_pred_val, zero_division=0))

Nombre de tags présents : 56448.0
Nombre de tags prédits : 395154
---------------------------------------------------------
              precision    recall  f1-score   support

           0       0.09      0.81      0.17       559
           1       0.06      0.22      0.10       216
           2       0.09      0.92      0.16      1144
           3       0.11      0.93      0.19      2376
           4       0.08      0.92      0.15      1858
           5       0.07      0.63      0.12       364
           6       0.06      0.68      0.11       294
           7       0.06      0.75      0.12       802
           8       0.07      0.75      0.13      1492
           9       0.00      0.00      0.00       191
          10       0.00      0.00      0.00       292
          11       0.02      0.02      0.02       186
          12       0.10      0.96      0.18      3009
          13       0.05      0.25      0.09       438
          14       0.07      0.53      0.12       324
          1

  _warn_prf(average, modifier, msg_start, len(result))
