## Modélisation supervisée ##

Je passe maintenant à une approche supervisée, qui utilise directement les tags du dataset pour entraîner un modèle prédictif.  
Je choisis d'utiliser un simple réseau de neurones dense à trois couches cachées.

### Importation et fonctions ###


#### Environnement de travail ####

In [None]:
# Générique
import random

# Manipulation de données
from collections import Counter
import pandas as pd
import numpy as np

# NLP
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from gensim.models import Word2Vec, LdaModel, CoherenceModel
from gensim import corpora
from gensim.utils import simple_preprocess
from transformers import BertTokenizer, BertModel
from transformers import DistilBertTokenizer, DistilBertModel
import torch

# MLOps
import mlflow

# Modèle
import tensorflow_hub as hub
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


# DataViz
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Chargement des dictionnaires NLP de NLTK

In [None]:
# MLflow initialization
# The tracking URI is set first so that the experiment lookup below goes to
# the local tracking server rather than the default location.

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("NLP StackOverflow Tagging")

<Experiment: artifact_location='mlflow-artifacts:/635357062346878224', creation_time=1726234904716, experiment_id='635357062346878224', last_update_time=1726234904716, lifecycle_stage='active', name='NLP StackOverflow Tagging', tags={}>

#### Importation des données ####

In [None]:
# Load the precomputed feature matrix. raw_data is bound to the very same
# DataFrame object as data (no copy is made).
raw_data = data = pd.read_json('data/feature_matrix.json')

In [None]:
data.head()

Unnamed: 0,processed_title_tokens,processed_body_tokens,processed_tags,combined_text,use_embeddings
0,"[convert, decimal, double, c]","[want, assign, decimal, variable, trans, doubl...","c#,floating-point,type-conversion,double,decimal",convert decimal double c want assign decimal v...,"[-0.0449068211, -0.0657903776, -0.0003035644, ..."
1,"[calculate, relative, time, c]","[given, specific, datetime, value, display, re...","c#,datetime,time,datediff,relative-time-span",calculate relative time c given specific datet...,"[-0.0344359428, -0.07839460670000001, -0.02548..."
2,"[determine, user, timezone]","[standard, way, web, server, able, determine, ...","html,browser,timezone,user-agent,timezone-offset",determine user timezone standard way web serve...,"[0.014654723000000001, 0.0442878939, -0.052892..."
3,"[fastest, way, get, value, π]","[looking, fastest, way, obtain, value, π, pers...","performance,algorithm,language-agnostic,unix,pi",fastest way get value π looking fastest way ob...,"[-0.048530597200000004, -0.0524718091, -0.0530..."
4,"[use, c, socket, api, c, z, o]","[issue, getting, c, socket, api, work, properl...","c++,c,sockets,mainframe,zos",use c socket api c z o issue getting c socket ...,"[-0.0037920545, 0.05711698900000001, -0.058412..."


#### Définition des fonctions ####

In [None]:
def generate_dictionary_and_bow(corpus):
    """Build a gensim dictionary and bag-of-words corpus from raw documents.

    Args:
        corpus: Iterable of documents. Each document is tokenized by calling
            its ``split()`` method without arguments.

    Returns:
        A ``(dictionary, corpus_gensim)`` tuple: the ``corpora.Dictionary``
        built from the tokenized documents, and a list with the ``doc2bow``
        encoding of every document, in input order.
    """
    tokenized_docs = []
    for document in corpus:
        tokenized_docs.append(document.split())

    vocabulary = corpora.Dictionary(tokenized_docs)
    bow_corpus = list(map(vocabulary.doc2bow, tokenized_docs))
    return vocabulary, bow_corpus

def train_lda_model(corpus, dictionary, num_topics, alpha, beta, passes, iterations):
    """Fit a gensim ``LdaModel`` with the given hyperparameters.

    Args:
        corpus: Bag-of-words corpus forwarded as ``corpus``.
        dictionary: Mapping forwarded as ``id2word``.
        num_topics: Number of topics to fit.
        alpha: Forwarded as ``alpha``.
        beta: Forwarded to gensim under its ``eta`` keyword.
        passes: Forwarded as ``passes``.
        iterations: Forwarded as ``iterations``.

    Returns:
        The trained ``LdaModel``. The seed is pinned (``random_state=42``).
    """
    lda_settings = {
        "corpus": corpus,
        "id2word": dictionary,
        "num_topics": num_topics,
        "alpha": alpha,
        "eta": beta,  # gensim's keyword for the parameter called beta here
        "passes": passes,
        "iterations": iterations,
        "random_state": 42,
    }
    return LdaModel(**lda_settings)

def test_lda_hyperparams(param_grid, n_iter):
    """Random-search LDA hyperparameters and keep the most coherent model.

    ``n_iter`` parameter combinations are drawn from ``param_grid`` with a
    fixed seed. One LDA model is trained per combination and scored with the
    ``c_v`` coherence measure; the best-scoring combination is kept.

    NOTE(review): this function reads the module-level names
    ``corpus_gensim``, ``dictionary`` and ``texts``. They must be defined
    before the call; none of them is passed in as an argument.

    Args:
        param_grid: Search space handed to ``ParameterSampler``. Every sampled
            combination must provide the keys ``num_topics``, ``alpha``,
            ``beta``, ``passes`` and ``iterations``.
        n_iter: Number of combinations to sample and evaluate.

    Returns:
        A ``(best_params, best_model, best_coherence)`` tuple. ``best_params``
        and ``best_model`` stay ``None`` (and ``best_coherence`` stays
        ``-inf``) if no combination scores above ``-inf``.
    """
    from sklearn.model_selection import ParameterSampler

    param_sampler = ParameterSampler(param_grid, n_iter=n_iter, random_state=42)

    best_coherence = -np.inf
    best_params = None
    best_model = None

    for params in param_sampler:
        lda_model = train_lda_model(corpus_gensim, dictionary, num_topics=params['num_topics'],
                                    alpha=params['alpha'], beta=params['beta'],
                                    passes=params['passes'], iterations=params['iterations'])

        # Score the topics of this candidate model
        coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence = coherence_model_lda.get_coherence()

        # Keep track of the best combination seen so far
        if coherence > best_coherence:
            best_coherence = coherence
            best_params = params
            best_model = lda_model

    # Report the winner. These prints used to sit after the return statement
    # and therefore never ran.
    print(f"Meilleure cohérence: {best_coherence}")
    print(f"Meilleure combinaison d'hyperparamètres: {best_params}")

    return best_params, best_model, best_coherence

def multi_label_binarizer(corpus, sep=' '):
    """One-hot encode separator-delimited label strings.

    Args:
        corpus: pandas Series of strings; each value is split on ``sep`` to
            obtain the labels of that row.
        sep: Separator between labels inside one string.

    Returns:
        A DataFrame with one column per distinct label (named after
        ``MultiLabelBinarizer.classes_``) holding the binarizer's indicator
        values. It is built without an explicit index, so rows line up with
        ``corpus`` by position only.
    """
    binarizer = MultiLabelBinarizer()
    label_lists = corpus.apply(lambda labels: labels.split(sep))
    indicator_matrix = binarizer.fit_transform(label_lists)
    return pd.DataFrame(indicator_matrix, columns=binarizer.classes_)

def display_token_info(corpus):
    """Print token statistics for a whitespace-delimited corpus string.

    Three lines are printed: the total number of tokens, the number of
    distinct tokens, and the mean number of occurrences per distinct token.

    The counts are based on ``corpus.split()``. The previous version used
    ``len(corpus)``, which is the number of characters, so both the token
    count and the mean were wrong.

    Args:
        corpus: A single string containing the whole corpus.
    """
    tokens = corpus.split()
    unique_tokens = set(tokens)

    print(f'Le corpus contient {len(tokens)} tokens')
    print(f"Le corpus contient {len(unique_tokens)} tokens uniques")

    # An empty corpus has no distinct token; report 0.0 instead of dividing by zero.
    mean_occurrences = len(tokens) / len(unique_tokens) if unique_tokens else 0.0
    print(f"Occurences moyennes par token: {mean_occurrences}")

def inspect_non_null_matrix_values(matrix):
    """Print a sample of the strictly positive entries of one random column.

    A column is drawn with ``random.choice`` from ``matrix.columns``. Its name
    is printed, followed by the first rows (``head()``) whose value in that
    column is greater than zero.

    Args:
        matrix: DataFrame to inspect.
    """
    picked = random.choice(matrix.columns)
    print("Colonne choisie:", picked)

    positive_mask = matrix[picked] > 0
    positive_rows = matrix[positive_mask]
    print(positive_rows[[picked]].head())

def get_document_vector(doc, model):
    """Average the embeddings of the tokens of ``doc`` known to ``model``.

    Args:
        doc: Iterable of tokens.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookup by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        no token is found.
    """
    known_vectors = []
    for token in doc:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)


### Préparation des données ###

Le réseau de neurones étant un modèle gourmand, je vais l'entraîner sur un échantillon du dataset, dont j'adapterai la taille en fonction des performances.

In [17]:
# Work on a random subset of at most 50000 rows. The seed makes the subset
# (and every result derived from it) reproducible across runs, and the min()
# keeps the cell from raising when the dataset has fewer than 50000 rows.
data = data.sample(n=min(50000, len(data)), random_state=42)

Pour ce modèle, je vais utiliser:
- Pour **X**, une matrice de features sous forme d'embeddings
- Pour **y**, une matrice de labels binarisée qui indique la présence ou non d'un tag dans le document

In [18]:
# Feature matrix: stack the per-document embedding lists of the
# 'use_embeddings' column into one 2-D array (one row per document).
X = np.vstack(data['use_embeddings'].to_numpy())

In [19]:
X

array([[-0.05925827, -0.00815879, -0.01140629, ...,  0.06363516,
        -0.0503395 , -0.05821633],
       [ 0.05438701, -0.05166201,  0.06055912, ...,  0.06311379,
        -0.03812636, -0.01134035],
       [-0.02250627, -0.03708333, -0.02851975, ...,  0.05835588,
        -0.02248105, -0.05677763],
       ...,
       [ 0.04403068, -0.05935622,  0.03934072, ...,  0.06161219,
        -0.03890656,  0.05266009],
       [-0.0348973 ,  0.01732324, -0.0458566 , ...,  0.06214431,
        -0.05859448, -0.0357019 ],
       [-0.04001129, -0.02488029,  0.01704159, ...,  0.05725216,
        -0.00332964, -0.04315857]])

In [20]:
# Binarized label matrix: one indicator column per tag found in the
# comma-separated 'processed_tags' strings.
y = multi_label_binarizer(data['processed_tags'], sep=',')

Je veux également filtrer les données sur les tags les plus fréquents (top 100), pour éviter les outliers et l'overfitting.

In [21]:
# Number of documents carrying each tag (column-wise sum of the indicators),
# then the names of the 100 most frequent tags, most frequent first.
tag_frequencies = y.sum()
top_100_tags = tag_frequencies.sort_values(ascending=False).index[:100]

In [22]:
top_100_tags

Index(['c#', 'java', 'c++', 'javascript', 'php', 'asp.net', 'iphone', 'jquery',
       'python', 'sql', 'html', 'c', 'sql-server', 'objective-c', 'mysql',
       'database', 'windows', 'wpf', 'linux', 'ajax', 'performance', 'xml',
       'css', 'ruby-on-rails', 'ruby', 'asp.net-mvc', 'flash', 'cocoa-touch',
       'multithreading', 'visual-studio', 'cocoa', 'image', 'django', 'macos',
       'apache-flex', 'web-services', 'string', 'security', 'arrays',
       'actionscript-3', 'visual-studio-2008', 'user-interface', 'vb.net',
       'algorithm', 't-sql', 'hibernate', 'sql-server-2005', 'forms',
       'eclipse', 'oracle', 'debugging', 'winforms', 'internet-explorer',
       'file', 'winapi', 'unit-testing', 'http', 'parsing', 'json', 'xcode',
       'events', 'spring', 'class', 'xaml', 'apache', 'linq', 'android',
       'delphi', 'unix', 'authentication', 'data-binding', 'firefox',
       'silverlight', 'optimization', 'ios', 'jakarta-ee', 'wcf', 'regex',
       'oop', 'gcc', 'memory

In [23]:
# Keep only the label columns of the 100 most frequent tags, in frequency order.
y = y.loc[:, top_100_tags]

J'ai mes deux matrices, je peux maintenant séparer le jeu de données en subsets d'entraînement et de test.

In [24]:
# Hold out 20% of the samples as the test set; the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

### Modélisation d'un réseau de neurones ###

In [25]:
# Neural network definition
# Dense feed-forward network: three hidden ReLU layers (256, 128, 64 units)
# with dropout after the first two, and one sigmoid output unit per tag so
# that each tag gets its own independent probability (multi-label setup).
# The input shape is declared with an explicit Input object rather than the
# `input_shape` argument on the first Dense layer, which makes Keras emit a
# warning when used inside a Sequential model.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(y_train.shape[1], activation='sigmoid'),
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [26]:
# Model compilation
# Binary cross-entropy matches the independent sigmoid outputs.
# NOTE(review): with a multi-unit sigmoid output, the 'accuracy' shortcut may
# not resolve to a per-label accuracy — confirm which metric Keras selects
# before interpreting the reported value.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [27]:
# Train for 20 epochs with mini-batches of 32 samples; validation_split=0.2
# asks Keras to set aside part of the training data for validation.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.2,
)

Epoch 1/20
1000/1000 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 0.2083 - loss: 0.1516 - val_accuracy: 0.5092 - val_loss: 0.0585
Epoch 2/20
1000/1000 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - accuracy: 0.4528 - loss: 0.0627 - val_accuracy: 0.4969 - val_loss: 0.0530
Epoch 3/20
1000/1000 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - accuracy: 0.4615 - loss: 0.0575 - val_accuracy: 0.4979 - val_loss: 0.0507
Epoch 4/20
1000/1000 ━━━━━━━━━━━━━━━━━━━━ 3s 3ms/step - accuracy: 0.4653 - loss: 0.0553 - val_accuracy: 0.4992 - val_loss: 0.0490
Epoch 5/20
1000/1000 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - accuracy: 0.4620 - loss: 0.0542 - val_accuracy: 0.4931 - val_loss: 0.0480
Epoch 6/20
1000/1000 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - accuracy: 0.4677 - loss: 0.0525 - val_accuracy: 0.5098 - val_loss: 0.0473
Epoch 7/20
[1m1

### Validation et évaluation sur le jeu de test ###

In [28]:
# Evaluate on the held-out test split. The second value returned is the
# Keras 'accuracy' metric, so it is printed as "Exactitude": labelling it
# "Précision" made it look like the precision score reported further down.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Perte: {loss}, Exactitude: {accuracy}')

313/313 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.4923 - loss: 0.0445
Perte: 0.044765688478946686, Précision: 0.4959000051021576


In [29]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Per-tag probabilities predicted for the test set (sigmoid outputs)
y_pred = model.predict(X_test)

# Turn probabilities into 0/1 decisions with a 0.5 threshold
y_pred_binary = (y_pred > 0.5).astype(int)

# Micro-averaged precision, recall and F1 over all tags, computed in one
# pass over the three scorers
precision, recall, f1 = (
    scorer(y_test, y_pred_binary, average='micro')
    for scorer in (precision_score, recall_score, f1_score)
)

print(f'Précision: {precision}')
print(f'Rappel: {recall}')
print(f'F1: {f1}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 518us/step
Précision: 0.7135528710905179
Rappel: 0.423700501129999
F1: 0.5316892725030826


J'enregistre les informations du modèle dans MLflow.

In [35]:
import json

# Column order of y is the order of the model's output units, so the list is
# saved as-is to map predictions back to tag names.
tags_list = y.columns.tolist()

with open("data/tags.json", "w") as f:
    f.write(json.dumps(tags_list))

In [36]:
with mlflow.start_run():
    # Training hyperparameters, logged in one batch
    mlflow.log_params({
        "epochs": 20,
        "batch_size": 32,
        "layer_1_units": 256,
        "layer_2_units": 128,
        "layer_3_units": 64,
    })

    # Micro-averaged test-set scores
    mlflow.log_metrics({
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
    })

    # Tag list needed to decode the model's output columns
    mlflow.log_artifact("data/tags.json")

    # Store the trained Keras model under the "model" artifact path
    mlflow.keras.log_model(model, "model")




2024/09/13 16:41:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run unruly-hen-610 at: http://localhost:5000/#/experiments/635357062346878224/runs/1197fc00a368469087e5c1913d810a4d.
2024/09/13 16:41:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/635357062346878224.


In [43]:
data[0:1]

Unnamed: 0,processed_title_tokens,processed_body_tokens,processed_tags,combined_text,use_embeddings
46432,"[adding, jpanel, jframe, netbeans]","[particularly, using, netbeans, gui, editor, n...","java,swing,netbeans,jframe,jpanel",adding jpanel jframe netbeans particularly usi...,"[-0.0592582673, -0.0081587909, -0.011406286600..."


In [48]:
# Print the retained tags in alphabetical order, one per line
sorted_list = list(top_100_tags)
sorted_list.sort()
for tag in sorted_list:
    print(tag)

actionscript-3
ajax
algorithm
android
apache
apache-flex
arrays
asp.net
asp.net-mvc
authentication
bash
c
c#
c++
caching
class
cocoa
cocoa-touch
css
data-binding
database
debugging
delphi
django
eclipse
events
exception
file
firefox
flash
forms
gcc
hibernate
html
http
iis
image
internet-explorer
ios
iphone
jakarta-ee
java
javascript
jpa
jquery
json
linq
linq-to-sql
linux
macos
math
memory
memory-management
multithreading
mysql
objective-c
oop
optimization
oracle
orm
parsing
performance
perl
php
python
regex
ruby
ruby-on-rails
scripting
security
shell
silverlight
sockets
spring
sql
sql-server
sql-server-2005
string
svn
swing
t-sql
testing
unit-testing
unix
user-interface
validation
vb.net
visual-c++
visual-studio
visual-studio-2008
wcf
web-applications
web-services
winapi
windows
winforms
wpf
xaml
xcode
xml
