                        Projet n°5 Catégorisez automatiquement des questions
Nous allons dans cette étude proposez une analyse textuelle des questions posées sur le site Stackoverflow. 
Notre objectif final est de suggérer des tags relatifs à la question posée afin d'aider les débutants dans leurs premiers pas sur le site.

Ce notebook représente le test des différents modèles
1. Importation des données
2. Approche non supervisée

2.1 Structure
2.2 Champs vides et dupliqués
2.3 Analyse univariée
2.4 Analyse multivariée
3. Réduction des Posts
3.1 Nettoyage du code dans le corps du post
3.2 Nettoyage des Posts
3.2.1 Initialisation
3.2.2 Traitement de l'ensemble du corpus
3.2.3 Nettoyage des textes et des Tags
3.3 Analyse textuelle
3.3.1 Représentation visuelle
3.3.2 Analyse de la distribution des tokens
3.3.3 Nettoyage aprés analyse
4. Sauvegarde du jeu de données nettoyé

In [73]:
# Bult-in
import os, sys, time, random

# Data
import numpy as np
import pandas as pd
from ast import literal_eval

# Visualizationraw_corpus
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from PIL import Image

import re, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn import cluster, metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score,\
                            recall_score, f1_score, jaccard_score

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

# NLTK
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import words

from pandarallel import pandarallel

import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


sns.set()

# <a name="C1">1. Importation des données</a>

In [3]:
df = pd.read_csv("data/cleaned/df_final_version.csv",sep=';')

In [4]:
for col in ['Title', 'Body', 'Tags']:
     df[col] = df[col].apply(literal_eval)

In [5]:
df.head()

Unnamed: 0,Title,Body,Tags
0,"[selenium, button, webdriver, find]","[loosing, page, left, focus, href, working, cl...","[html, webdriver, selenium, python]"
1,"[policy, rest, access, nifi, api, using]","[policy, following, set, rest, instance, get, ...","[apache, rest, python]"
2,"[based, table, existence, subtable, row]","[fruit, table, select, enabled, seeing, fruits...","[mysql, sql]"
3,"[route, uris, go, localizing]","[register, go, multiple, route, french, versio...",[laravel]
4,"[service, public, app, access, restriction, az...","[image, networking, src, explorer, checked, fa...",[azure]


# <a name="C2">2. Approche non supervisée</a>

In [None]:
Le modèle Latent Dirichlet Allocation suppose que chaque document est un mélange d’un petit nombre de topics, 
et que chaque occurrence d’un mot correspond à l’un des sujets du document. 
Chaque mot se voit attribuer un topic selon la loi de Dirichlet.

In [35]:
# Create Dictionary
id2word = corpora.Dictionary(df.Body)
# Create Corpus
Texts = df.Body
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in Texts]
# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1)]]


In [36]:
n_topics = 10

# Créer le modèle LDA
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=n_topics, passes=2, iterations=25)

In [37]:
lda_model.print_topics(5)

[(4,
  '0.009*"using" + 0.008*"like" + 0.007*"problem" + 0.006*"time" + 0.006*"function" + 0.006*"wa" + 0.006*"ha" + 0.006*"data" + 0.005*"way" + 0.005*"tried"'),
 (7,
  '0.022*"class" + 0.021*"lang" + 0.021*"override" + 0.021*"pre" + 0.021*"prettyprint" + 0.013*"lt" + 0.009*"true" + 0.008*"j" + 0.007*"false" + 0.007*"return"'),
 (5,
  '0.014*"file" + 0.013*"run" + 0.009*"using" + 0.009*"tried" + 0.007*"get" + 0.007*"running" + 0.007*"version" + 0.007*"command" + 0.007*"project" + 0.006*"following"'),
 (0,
  '0.021*"table" + 0.018*"container" + 0.018*"div" + 0.018*"td" + 0.018*"class" + 0.017*"tbody" + 0.017*"thead" + 0.009*"like" + 0.009*"column" + 0.008*"data"'),
 (2,
  '0.012*"page" + 0.010*"using" + 0.009*"tried" + 0.009*"button" + 0.009*"like" + 0.008*"way" + 0.008*"click" + 0.008*"change" + 0.006*"html" + 0.006*"j"')]

In [38]:
#Print Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, 
                                     texts=Texts, 
                                     dictionary=id2word, 
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', round(coherence_lda,4))


Coherence Score:  0.465


In [39]:
for n_topics in range (3,20):
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=n_topics, passes=2, iterations=25)  
    coherence_model_lda = CoherenceModel(model=lda_model, 
                                     texts=Texts, 
                                     dictionary=id2word, 
                                     coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Nb_topics : ' + str(n_topics) +  ', Coherence Score : ' + str(coherence_lda))

Nb_topics : 3, Coherence Score : 0.45528558396442104
Nb_topics : 4, Coherence Score : 0.5078678025780694
Nb_topics : 5, Coherence Score : 0.5004623049188166
Nb_topics : 6, Coherence Score : 0.4753899003932973
Nb_topics : 7, Coherence Score : 0.474959786747556
Nb_topics : 8, Coherence Score : 0.4786610618168548
Nb_topics : 9, Coherence Score : 0.4921675071118009
Nb_topics : 10, Coherence Score : 0.47603597127137165
Nb_topics : 11, Coherence Score : 0.4517544887652524
Nb_topics : 12, Coherence Score : 0.4731373000541159
Nb_topics : 13, Coherence Score : 0.5032193101681075
Nb_topics : 14, Coherence Score : 0.46135328781091994
Nb_topics : 15, Coherence Score : 0.45587558099360065
Nb_topics : 16, Coherence Score : 0.5152972270713547
Nb_topics : 17, Coherence Score : 0.4664432548733696
Nb_topics : 18, Coherence Score : 0.48570045721774474
Nb_topics : 19, Coherence Score : 0.47044513124105647
Nb_topics : 20, Coherence Score : 0.48650259663046597
Nb_topics : 21, Coherence Score : 0.46474098920

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Applications/Anaconda/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Applications/Anaconda/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
  File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/gensim/__init__.py", line 11, in <module>
    from gensim import parsing, corpora, matutils, interfaces, models, similarities, utils  # noqa:F401
  File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/gensim/corpora/__init__.py", line 6, in <module>
    from .indexedcorpus import IndexedCorpus  # noqa:F401 must appear before the other classes
  File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/gensim/corpora/indexedcorpus.py", line 14, in <module>
    from gensim import interfaces, utils
  File "/Applications/Anaconda/anacond

KeyboardInterrupt: 

In [43]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=16, passes=2, iterations=25) 

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word, mds='mmds',R=20)
vis


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Applications/Anaconda/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Applications/Anaconda/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
  File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/gensim/__init__.py", line 11, in <module>
    from gensim import parsing, corpora, matutils, interfaces, models, similarities, utils  # noqa:F401
  File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/gensim/corpora/__init__.py", line 6, in <module>
    from .indexedcorpus import IndexedCorpus  # noqa:F401 must appear before the other classes
  File "/Applications/Anaconda/anaconda3/lib/python3.9/site-packages/gensim/corpora/indexedcorpus.py", line 14, in <module>
    from gensim import interfaces, utils
  File "/Applications/Anaconda/anacond

KeyboardInterrupt: 

# Fonctions communes

In [46]:
df

Unnamed: 0,Title,Body,Tags
0,"[selenium, button, webdriver, find]","[loosing, page, left, focus, href, working, cl...","[html, webdriver, selenium, python]"
1,"[policy, rest, access, nifi, api, using]","[policy, following, set, rest, instance, get, ...","[apache, rest, python]"
2,"[based, table, existence, subtable, row]","[fruit, table, select, enabled, seeing, fruits...","[mysql, sql]"
3,"[route, uris, go, localizing]","[register, go, multiple, route, french, versio...",[laravel]
4,"[service, public, app, access, restriction, az...","[image, networking, src, explorer, checked, fa...",[azure]
...,...,...,...
78446,"[phone, device, near, bluetooth, detect, scann...","[part, enabled, even, alright, video, message,...","[java, android]"
78447,"[missing_enum_constant_in_switch, make, dart, ...","[github, image, severity, src, complete, langu...","[flutter, dart]"
78448,"[default, cplex, option]","[something, faster, native, ol, problem, linea...",[python]
78449,"[term, many, efficiently, filter]","[elsewhere, command, expect, something, questi...","[r, data, dataframe]"


In [59]:
%%time
# Define X and y
X = df["Body"]
y = df["Tags"]

# Initialize the "CountVectorizer" TFIDF for the Body
vectorizer = TfidfVectorizer(analyzer="word",
                             max_df=.97,
                             min_df= 3,
                             tokenizer=None,
                             preprocessor=' '.join,
                             stop_words=None,
                             lowercase=False)
vectorizer.fit(X)
X_tfidf = vectorizer.transform(X)

print(X_tfidf.shape)

# Multilabel binarizer for targets
mlb = MultiLabelBinarizer()
mlb.fit(y)
y_mlb = mlb.transform(y)

print(y_mlb.shape)

(78451, 26530)
(78451, 100)
CPU times: user 5.93 s, sys: 172 ms, total: 6.1 s
Wall time: 6.12 s


In [60]:
# Create train and test split (30%)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_mlb,
                                                    test_size=0.3, random_state=42)
print("X_train shape : {}".format(X_train.shape))
print("X_test shape : {}".format(X_test.shape))
print("y_train shape : {}".format(y_train.shape))
print("y_test shape : {}".format(y_test.shape))

X_train shape : (54915, 26530)
X_test shape : (23536, 26530)
y_train shape : (54915, 100)
y_test shape : (23536, 100)


In [62]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
#ARI = np.round(metrics.adjusted_rand_score(y_pred, clf.labels_),4)
        

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [63]:
y_pred

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [64]:
y_pred.shape

(23536, 100)

In [68]:
y_test.shape

(23536, 100)

In [74]:
print("Accuracy : ", accuracy_score(y_test, y_pred))
print("Precision : ", precision_score(y_test, y_pred, average='samples'))
print("Recall : ", recall_score(y_test, y_pred, average='samples')) 
print("F1 Score : ", f1_score(y_test, y_pred, average='samples'))
print("Jaccard :", jaccard_score(y_test, y_pred, average='samples'))

Accuracy :  0.16374915023793338


  _warn_prf(average, modifier, msg_start, len(result))


Precision :  0.39852313376064225
Recall :  0.307191870204914
F1 Score :  0.3276766017142462
Jaccard : 0.285180354947945
