In [2]:
# spaCy based imports
import spacy
nlp = spacy.load('fr_core_news_sm')

In [3]:
# Usual imports 
import pandas as pd
from tqdm import tqdm, tqdm_notebook

# NLP imports
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import stop_words
import re, sys
from gensim.models import phrases, Phrases

# Sklearn imports
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV

# Dataviz imports
import pyLDAvis.sklearn

In [4]:
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""
pd.set_option('display.max_colwidth', -1)

import nltk
nltk.download('punkt')

In [5]:
tqdm.pandas()
tqdm_notebook()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

0it [00:00, ?it/s]

In [16]:
# Data import
answers = pd.read_csv('preprocessed/answers_lemmatized.csv', sep="\t", encoding="utf-8")
answers['theme'].unique()

Unnamed: 0.1,Unnamed: 0,customer_id,question_name,answer,theme,tokens,tokens_clean,tokens_clean_joined,lemmatized_answer
0,0,17 943,DC11,"Dites leur la vérité. Si une situation est jugée catastrophique c'est aux citoyen(ne)s d'y répondre et non à un gouvernement qui ne nous entends pas ! && Il y en à marre des faux semblant. Vous voulez que la Démocratie se renforce ? Ecoutez les citoyen(ne)s, cessez de faire la sourde oreille. Nous avons toutes et tous nos mots à dire pour la gestion de nos vies, comme celles de nos enfants ! Un Pays libre est un pays qui vit par et pour le Peuple !",Démocratie et citoyenneté,"['Dites', 'leur', 'la', 'vérité', '.', 'Si', 'une', 'situation', 'est', 'jugée', 'catastrophique', ""c'est"", 'aux', 'citoyen', '(', 'ne', ')', 's', 'd', ""'"", 'y', 'répondre', 'et', 'non', 'à', 'un', 'gouvernement', 'qui', 'ne', 'nous', 'entends', 'pas', '!', '&', '&', 'Il', 'y', 'en', 'à', 'marre', 'des', 'faux', 'semblant', '.', 'Vous', 'voulez', 'que', 'la', 'Démocratie', 'se', 'renforce', '?', 'Ecoutez', 'les', 'citoyen', '(', 'ne', ')', 's', ',', 'cessez', 'de', 'faire', 'la', 'sourde', 'oreille', '.', 'Nous', 'avons', 'toutes', 'et', 'tous', 'nos', 'mots', 'à', 'dire', 'pour', 'la', 'gestion', 'de', 'nos', 'vies', ',', 'comme', 'celles', 'de', 'nos', 'enfants', '!', 'Un', 'Pays', 'libre', 'est', 'un', 'pays', 'qui', 'vit', 'par', 'et', 'pour', 'le', 'Peuple', '!']","['dites', 'vérité', 'situation', 'jugée', 'catastrophique', 'citoyen', 'répondre', 'non', 'gouvernement', 'entends', 'marre', 'faux', 'semblant', 'voulez', 'démocratie', 'renforce', 'ecoutez', 'citoyen', 'cessez', 'sourde', 'oreille', 'toutes', 'mots', 'dire', 'gestion', 'vies', 'celles', 'enfants', 'pays', 'libre', 'pays', 'vit', 'peuple']",dites vérité situation jugée catastrophique citoyen répondre non gouvernement entends marre faux semblant voulez démocratie renforce ecoutez citoyen cessez sourde oreille toutes mots dire gestion vies celles enfants pays libre pays vit peuple,dites vérité situation jugée catastrophique citoyen répondre non gouvernement entends marre faux semblant voulez démocratie renforce ecoutez citoyen cessez sourd oreille toutes mots dire gestion vie celles enfant pays libre pays vit peuple
1,1,17 965,DC11,Rendre la politique intéressante,Démocratie et citoyenneté,"['Rendre', 'la', 'politique', 'intéressante']","['rendre', 'politique', 'intéressante']",rendre politique intéressante,rendre politique intéressante
2,2,17 971,DC11,"Plus d'éducation civique, y compris au lycée. Proposer des formations en ligne pour mieux comprendre le fonctionnement de notre état.",Démocratie et citoyenneté,"['Plus', ""d'éducation"", 'civique', ',', 'y', 'compris', 'au', 'lycée', '.', 'Proposer', 'des', 'formations', 'en', 'ligne', 'pour', 'mieux', 'comprendre', 'le', 'fonctionnement', 'de', 'notre', 'état', '.']","['plus', 'éducation', 'civique', 'compris', 'lycée', 'proposer', 'formations', 'ligne', 'mieux', 'comprendre', 'fonctionnement']",plus éducation civique compris lycée proposer formations ligne mieux comprendre fonctionnement,plus éducation civique compris lycée proposer formation ligne mieux comprendre fonctionnement
3,3,17 974,DC11,Information du citoyen + vote oblugatoire,Démocratie et citoyenneté,"['Information', 'du', 'citoyen', '+', 'vote', 'oblugatoire']","['information', 'citoyen', 'vote', 'oblugatoire']",information citoyen vote oblugatoire,information citoyen vote oblugatoire
4,4,18 019,DC11,Qu'il arrête leur connerie comme les 80km/h,Démocratie et citoyenneté,"[""Qu'il"", 'arrête', 'leur', 'connerie', 'comme', 'les', '80km/h']","['arrête', 'connerie', 'km']",arrête connerie km,arrête connerie kilomètre


In [9]:
sw = stop_words.get_stop_words(language='fr')

In [10]:
# listes des mots à retirer du tf_idf
stopwords_additionnel = [
    'les',
    'a',
    'plus',
    'faut',
    'tout',
    'tous',
    'prends',
    'tre',
    'si',
    'non',
    'doit',
    'avoir',
    'comme',
    'trop',
    'leurs',
    'faire',
    'ils',
    'peut',
    'bien',
    'aussi',
    'cela',
    'gens',
    'sans',
    'car',
    'très',
    'fait',
    '\'',
    'qu\'',
    'd\'',
    'l\'',
    '"',
    'être',
    'mai',
    'faudrait'
]
stop_words_complete = sw + stopwords.words('french') + stopwords_additionnel
stop_words_complete = list(set(stop_words_complete))

## Analyse thème contribution libre

In [18]:
vectorizer = CountVectorizer(max_df=0.95, min_df=20, stop_words=stop_words_complete)
%time data_vectorized = vectorizer.fit_transform(answers[answers['theme'] == "Contribution libre"]['lemmatized_answer'].values.astype('U'))
data_vectorized.shape

CPU times: user 60.5 ms, sys: 8.3 ms, total: 68.8 ms
Wall time: 72.6 ms


(581, 158)

In [20]:
# Define Search Param
search_params = {'n_components': [3, 4, 5], 'learning_decay': [.5, .9], 'max_iter': [30]}

# Init the Model
lda = LatentDirichletAllocation(max_iter=10, learning_method='batch', verbose=True)

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params, verbose=0)

# Do the Grid Search
model.fit(data_vectorized)



iteration: 1 of max_iter: 30
iteration: 2 of max_iter: 30
iteration: 3 of max_iter: 30
iteration: 4 of max_iter: 30
iteration: 5 of max_iter: 30
iteration: 6 of max_iter: 30
iteration: 7 of max_iter: 30
iteration: 8 of max_iter: 30
iteration: 9 of max_iter: 30
iteration: 10 of max_iter: 30
iteration: 11 of max_iter: 30
iteration: 12 of max_iter: 30
iteration: 13 of max_iter: 30
iteration: 14 of max_iter: 30
iteration: 15 of max_iter: 30
iteration: 16 of max_iter: 30
iteration: 17 of max_iter: 30
iteration: 18 of max_iter: 30
iteration: 19 of max_iter: 30
iteration: 20 of max_iter: 30
iteration: 21 of max_iter: 30
iteration: 22 of max_iter: 30
iteration: 23 of max_iter: 30
iteration: 24 of max_iter: 30
iteration: 25 of max_iter: 30
iteration: 26 of max_iter: 30
iteration: 27 of max_iter: 30
iteration: 28 of max_iter: 30
iteration: 29 of max_iter: 30
iteration: 30 of max_iter: 30
iteration: 1 of max_iter: 30
iteration: 2 of max_iter: 30
iteration: 3 of max_iter: 30
iteration: 4 of max_it

iteration: 9 of max_iter: 30
iteration: 10 of max_iter: 30
iteration: 11 of max_iter: 30
iteration: 12 of max_iter: 30
iteration: 13 of max_iter: 30
iteration: 14 of max_iter: 30
iteration: 15 of max_iter: 30
iteration: 16 of max_iter: 30
iteration: 17 of max_iter: 30
iteration: 18 of max_iter: 30
iteration: 19 of max_iter: 30
iteration: 20 of max_iter: 30
iteration: 21 of max_iter: 30
iteration: 22 of max_iter: 30
iteration: 23 of max_iter: 30
iteration: 24 of max_iter: 30
iteration: 25 of max_iter: 30
iteration: 26 of max_iter: 30
iteration: 27 of max_iter: 30
iteration: 28 of max_iter: 30
iteration: 29 of max_iter: 30
iteration: 30 of max_iter: 30
iteration: 1 of max_iter: 30
iteration: 2 of max_iter: 30
iteration: 3 of max_iter: 30
iteration: 4 of max_iter: 30
iteration: 5 of max_iter: 30
iteration: 6 of max_iter: 30
iteration: 7 of max_iter: 30
iteration: 8 of max_iter: 30
iteration: 9 of max_iter: 30
iteration: 10 of max_iter: 30
iteration: 11 of max_iter: 30
iteration: 12 of max



iteration: 1 of max_iter: 30
iteration: 2 of max_iter: 30
iteration: 3 of max_iter: 30
iteration: 4 of max_iter: 30
iteration: 5 of max_iter: 30
iteration: 6 of max_iter: 30
iteration: 7 of max_iter: 30
iteration: 8 of max_iter: 30
iteration: 9 of max_iter: 30
iteration: 10 of max_iter: 30
iteration: 11 of max_iter: 30
iteration: 12 of max_iter: 30
iteration: 13 of max_iter: 30
iteration: 14 of max_iter: 30
iteration: 15 of max_iter: 30
iteration: 16 of max_iter: 30
iteration: 17 of max_iter: 30
iteration: 18 of max_iter: 30
iteration: 19 of max_iter: 30
iteration: 20 of max_iter: 30
iteration: 21 of max_iter: 30
iteration: 22 of max_iter: 30
iteration: 23 of max_iter: 30
iteration: 24 of max_iter: 30
iteration: 25 of max_iter: 30
iteration: 26 of max_iter: 30
iteration: 27 of max_iter: 30
iteration: 28 of max_iter: 30
iteration: 29 of max_iter: 30
iteration: 30 of max_iter: 30


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=True),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_components': [3, 4, 5], 'learning_decay': [0.5, 0.9], 'max_iter': [30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.9, 'max_iter': 30, 'n_components': 3}
Best Log Likelihood Score:  -12006.353750560787
Model Perplexity:  148.629891430917


In [23]:
# Latent Dirichlet Allocation Model
NUM_TOPICS = 3
LEARNING_DECAY = 0.9
MAX_ITER = 30
lda = LatentDirichletAllocation(learning_decay=LEARNING_DECAY, n_components=NUM_TOPICS, max_iter=MAX_ITER, learning_method='batch',verbose=True)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 30
iteration: 2 of max_iter: 30
iteration: 3 of max_iter: 30
iteration: 4 of max_iter: 30
iteration: 5 of max_iter: 30
iteration: 6 of max_iter: 30
iteration: 7 of max_iter: 30
iteration: 8 of max_iter: 30
iteration: 9 of max_iter: 30
iteration: 10 of max_iter: 30
iteration: 11 of max_iter: 30
iteration: 12 of max_iter: 30
iteration: 13 of max_iter: 30
iteration: 14 of max_iter: 30
iteration: 15 of max_iter: 30
iteration: 16 of max_iter: 30
iteration: 17 of max_iter: 30
iteration: 18 of max_iter: 30
iteration: 19 of max_iter: 30
iteration: 20 of max_iter: 30
iteration: 21 of max_iter: 30
iteration: 22 of max_iter: 30
iteration: 23 of max_iter: 30
iteration: 24 of max_iter: 30
iteration: 25 of max_iter: 30
iteration: 26 of max_iter: 30
iteration: 27 of max_iter: 30
iteration: 28 of max_iter: 30
iteration: 29 of max_iter: 30
iteration: 30 of max_iter: 30


In [24]:
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
