## Modélisation non supervisée ##

L'objectif de la modélisation non supervisée va être d'entraîner un modèle à découvrir de quoi une question traite, par "lui-même", sans l'utilisation des tags associés.  
Un modèle idéal pour découvrir les sujets d'un large corpus de documents est la **Latent Dirichlet Allocation**, ou *LDA*.

### Importation et fonctions ###


#### Environnement de travail ####

In [34]:
# Générique
import random

# Manipulation de données
import pandas as pd
import numpy as np

# NLP
from sklearn.preprocessing import MultiLabelBinarizer
from gensim.models import LdaModel, CoherenceModel
from gensim import corpora

# MLOps
import mlflow

# Modèle
import tensorflow_hub as hub
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterSampler


# DataViz
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import pyLDAvis.gensim

# Project modules
from config.paths import DATA_DIR

In [3]:
# Initialisation de MLFlow

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("NLP StackOverflow Tagging")

MlflowException: API request to http://localhost:5000/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=NLP+StackOverflow+Tagging (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x2984c0a50>: Failed to establish a new connection: [Errno 61] Connection refused'))

#### Importation des données ###

In [13]:
data = pd.read_json(f'{DATA_DIR}/gold/feature_matrix.json')
raw_data = data

In [14]:
data.head()

Unnamed: 0,processed_title_tokens,processed_body_tokens,processed_tags,combined_text
25399,"[reset, as3, list, component, nothing, selecte...","[list, component, stage, as3, movie, populate,...","flash,actionscript-3,combobox,flash-cs4,select...",reset as3 list component nothing selected sele...
387,"[wiki, central, development, project, repository]","[played, idea, wiki, mediawiki, centralize, pr...","svn,integration,wiki,projects,bugzilla",wiki central development project repository pl...
9870,"[identifying, type, variable, c, project]","[trying, write, program, check, c, source, cod...","python,c,variables,coding-style,code-analysis",identifying type variable c project trying wri...
49231,"[jquery, fade, swap, element, clicked, also, r...","[notice, click, posture, description, drop, im...","jquery,ajax,hide,fadein,fadeout",jquery fade swap element clicked also relate a...
39120,"[cache, flush, invalidate, operation]","[question, cache, synchronization, operation, ...","caching,hardware,driver,device-driver,dma",cache flush invalidate operation question cach...


#### Définition des fonctions ####

In [17]:
def generate_dictionary_and_bow(corpus):

    words = [doc.split() for doc in corpus]
    dictionary = corpora.Dictionary(words)
    corpus_gensim = [dictionary.doc2bow(doc) for doc in words]
    
    return dictionary, corpus_gensim

def train_lda_model(corpus, dictionary, num_topics, alpha, beta, passes, iterations):
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, alpha=alpha, eta=beta,
                         passes=passes, iterations=iterations, random_state=42)
    return lda_model

def test_lda_hyperparams(param_grid, n_iter):


    param_sampler = ParameterSampler(param_grid, n_iter=n_iter, random_state=42)

    best_coherence = -np.inf
    best_params = None
    best_model = None

    for params in param_sampler:
        lda_model = train_lda_model(corpus_gensim, dictionary, num_topics=params['num_topics'],
                                    alpha=params['alpha'], beta=params['beta'],
                                    passes=params['passes'], iterations=params['iterations'])

        # Calculer la cohérence des topics
        coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence = coherence_model_lda.get_coherence()

        # Mise à jour des meilleurs paramètres
        if coherence > best_coherence:
            best_coherence = coherence
            best_params = params
            best_model = lda_model
    
    return best_params, best_model, best_coherence

def display_token_info(corpus):
    print(f'Le corpus contient {len(corpus)} tokens')
    unique_tokens = set(corpus.split())
    print(f"Le corpus contient {len(unique_tokens)} tokens uniques")
    print(f"Occurences moyennes par token: {len(corpus) / len(unique_tokens)}")

def inspect_non_null_matrix_values(matrix):
    column_names = matrix.columns
    column_name = random.choice(column_names)
    print("Colonne choisie:", column_name)
    non_zero_column = matrix[matrix[column_name] > 0]
    print(non_zero_column[[column_name]].head())

def get_document_vector(doc, model):
    vectors = [model.wv[token] for token in doc if token in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)


### Préparation des données ###

LDA prend en entrée une matrice de fréquence, via une technique d'extraction de features comme Bag of Words.

In [18]:
corpus = data.combined_text.tolist()
dictionary, corpus_gensim = generate_dictionary_and_bow(corpus)

In [21]:
corpus_gensim[0:2]

[[(0, 2),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 3),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 6),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 3),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 3),
  (22, 1),
  (23, 1),
  (24, 5),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 2),
  (32, 1),
  (33, 1)],
 [(30, 1),
  (34, 2),
  (35, 1),
  (36, 3),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 2),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 2),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 3),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 2),
  (60, 1),
  (61, 1),
  (62, 2),
  (63, 2),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 2),
  (68, 2),
  (69, 1),
  (70, 1),
  (71, 2),
  (72, 2),
  (73, 5)]]

Pour mesurer la cohérence des topics trouvés par LDA, j'aurai également besoin des documents sous forme de listes de tokens.

In [22]:
texts = [doc.split() for doc in data.combined_text]

In [23]:
texts

[['reset',
  'as3',
  'list',
  'component',
  'nothing',
  'selected',
  'selectedindex',
  'list',
  'component',
  'stage',
  'as3',
  'movie',
  'populate',
  'value',
  'runtime',
  'user',
  'select',
  'multiple',
  'value',
  'button',
  'clicked',
  'want',
  'list',
  'reset',
  'state',
  'nothing',
  'selected',
  'figure',
  'set',
  'selected',
  'index',
  'first',
  'position',
  'list',
  'either',
  'blank',
  'reading',
  'choose',
  'list',
  'make',
  'code',
  'ignore',
  'selected',
  'way',
  'reset',
  'list',
  'combobox',
  'component',
  'nothing',
  'selected',
  'thanks'],
 ['wiki',
  'central',
  'development',
  'project',
  'repository',
  'played',
  'idea',
  'wiki',
  'mediawiki',
  'centralize',
  'project',
  'information',
  'development',
  'project',
  'done',
  'extension',
  'pull',
  'information',
  'svn',
  'svnkit',
  'linking',
  'bugzilla',
  'extract',
  'work',
  'assigned',
  'developer',
  'work',
  'remaining',
  'release',
  'examp

### Modélisation avec LDA ###

Le corpus de matrices Bag of Words et le dictionnaire peuvent maintenant être passés au modèle LDA, pour découvrir les topics associés.

In [24]:
lda_model = LdaModel(
    corpus=corpus_gensim, 
    id2word=dictionary, 
    num_topics=10, 
    random_state=0, 
    passes=10
    )

In [25]:
# Obtenir les topics dominants pour chaque document
dominant_topics = []
for bow in corpus_gensim:
    topics_per_doc = lda_model.get_document_topics(bow)
    dominant_topics.append(max(topics_per_doc, key=lambda x: x[1])[0])

# Ajouter les topics dominants au DataFrame
data['dominant_topic'] = dominant_topics


### Evaluation des premiers résultats ###

Je veux maintenant évaluer la pertinence des topics devinés et assignés par le modèle.  
Pour cela, plusieurs mesures sont pertinentes:
- La **cohérence**
- La **perplexité**
- L'analyse de la **distribution des topics au sein des documents**, pour vérifier si les topics sont variés et bien répartis
- L'analyse de la **distribution des mots au sein des topics**, pour vérifier si les mots présents dans le même topic sont liés

D'abord, la cohérence, qui mesure à quel point les mots assignés au même topic co-occurent dans plusieurs documents du corpus

In [26]:
coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')

In [27]:
coherence_lda = coherence_model.get_coherence()

In [28]:
print(coherence_lda)

0.5200948487535204


*0.51* est un score de cohérence plutôt moyen. Une optimisation des hyperparamètres (nombre de topics, passes...) permettra peut-être d'obtenir une plus grande cohérence.

La perplexité mesure la capacité du modèle à prédire les tokens d'un document, à partir des distributions de topics et de mots qu'il a apprises.

In [29]:
log_perplexity_lda = lda_model.log_perplexity(corpus_gensim)
perplexity_lda = np.exp(log_perplexity_lda)

In [30]:
print(perplexity_lda)

0.00016174743656212804


Le score est très proche de 0; le modèle est peu perplexe, ou confiant en sa capacité à regénerer un document à partir des distributions qu'il a apprises.  
Cela ne garantit pas la qualité des topics et de leur distribution; simplement l'effacicité du modèle à s'ajuster aux données fournies.

### Optimisation des hyperparamètres  ###

Avant d'analyser visuellement et "à la main" la composition des topics, je vais tester plusieurs combinaisons d'hyperparamètres pour maximiser le score de cohérence, et minimiser le score de perplexité.

In [31]:
param_grid = {
    'num_topics': [5, 10, 15, 20],
    'alpha': ['symmetric', 'asymmetric', 0.01, 0.1],
    'beta': ['symmetric', 0.01, 0.1],
    'passes': [10, 20, 50],
    'iterations': [50, 100, 200]
}

In [32]:
best_params, best_model, best_coherence = test_lda_hyperparams(param_grid=param_grid, n_iter=10)

KeyboardInterrupt: 

In [33]:
print(f"Meilleure cohérence: {best_coherence}")
print(f"Meilleure combinaison d'hyperparamètres: {best_params}")

Meilleure cohérence: 0.52089569935783
Meilleure combinaison d'hyperparamètres: {'passes': 50, 'num_topics': 20, 'iterations': 100, 'beta': 0.01, 'alpha': 0.01}


Il semble que le score de cohérence ne puisse pas être beaucoup amélioré.

### Interprétation des topics ###

In [37]:
lda_model = best_model

In [38]:
topics = lda_model.print_topics(num_topics=20, num_words=10)
for topic in topics:
    print(topic)

(0, '0.020*"error" + 0.018*"self" + 0.016*"thread" + 0.013*"code" + 0.011*"python" + 0.010*"memory" + 0.010*"function" + 0.009*"h" + 0.009*"object" + 0.008*"def"')
(1, '0.023*"database" + 0.021*"data" + 0.018*"sql" + 0.016*"model" + 0.011*"stored" + 0.010*"db" + 0.010*"end" + 0.010*"code" + 0.009*"mysql" + 0.009*"table"')
(2, '0.057*"1" + 0.029*"backup" + 0.029*"product" + 0.026*"oracle" + 0.026*"piece" + 0.024*"2" + 0.020*"0" + 0.020*"10" + 0.019*"b" + 0.017*"app"')
(3, '0.035*"image" + 0.016*"event" + 0.011*"way" + 0.011*"like" + 0.011*"control" + 0.011*"button" + 0.010*"want" + 0.009*"work" + 0.009*"would" + 0.008*"problem"')
(4, '0.035*"0" + 0.031*"1" + 0.028*"array" + 0.020*"2" + 0.015*"value" + 0.014*"date" + 0.014*"number" + 0.014*"list" + 0.013*"3" + 0.012*"like"')
(5, '0.031*"x" + 0.026*"binding" + 0.026*"value" + 0.021*"property" + 0.016*"grid" + 0.014*"row" + 0.014*"element" + 0.013*"control" + 0.012*"xaml" + 0.011*"set"')
(6, '0.022*"user" + 0.017*"server" + 0.013*"using" +

A première vue, certains topics semblent plus cohérents que d'autres.

In [52]:
print(corpus[0])
lda_model.get_document_topics(corpus_gensim[0])

database structure mysql right choice currently planning database structure quite complex e commerce web app flexibility main cornerstone app feature large amount data product run slight headache trying keep performance high without compromizing normalization rule database leaving highly beloved flexibility concept behind integrating product option also widely known product attribute parameter based various reference source available made list pro con major well known database pattern solve comparing come two final alternative eav entity attribute value model pro database used sorting con related query include number join multiple table order complete collection data slob serialized lob also known facade pro flexible keeping number necessary join low compared eav design pattern easy update add remove data product hard keep data integrity without additional table con sorting done application instead database use lot performance memory big datasets processed large number user main questi

[(1, 0.35653096),
 (2, 0.13312623),
 (6, 0.13638742),
 (7, 0.08073259),
 (10, 0.07670015),
 (12, 0.21567431)]

In [53]:
i = random.randint(1, 100)
print(corpus[i])
lda_model.get_document_topics(corpus_gensim[i])

difference factory strategy pattern one explain difference factory strategy pattern looking extra factory class create object product factory pattern


[(2, 0.16456428), (10, 0.45397753), (14, 0.37260234)]

En regardant quelques documents au hasard, la distribution de topics au sein du corpus semble plutôt bonne.

In [54]:

vis = pyLDAvis.gensim.prepare(lda_model, corpus_gensim, dictionary)
pyLDAvis.display(vis)