In [1]:
# spaCy based imports
import spacy
nlp = spacy.load('fr_core_news_sm')

In [2]:
# Usual imports 
import pandas as pd
from tqdm import tqdm, tqdm_notebook

# NLP imports
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import stop_words
import re, sys
from gensim.models import phrases, Phrases

# Sklearn imports
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV

# Dataviz imports
import pyLDAvis.sklearn

In [3]:
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""
pd.set_option('display.max_colwidth', -1)

import nltk
nltk.download('punkt')

In [4]:
tqdm.pandas()
tqdm_notebook()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

0it [00:00, ?it/s]

In [5]:
# Data import
answers = pd.read_csv('preprocessed/answers_lemmatized.csv', sep="\t", encoding="utf-8")
answers['theme'].unique()

array(['Démocratie et citoyenneté', 'Impôts et dépenses publiques',
       "Organisation de l'Etat", 'Contribution libre',
       'Transition écologique'], dtype=object)

In [6]:
sw = stop_words.get_stop_words(language='fr')

In [7]:
# listes des mots à retirer du tf_idf
stopwords_additionnel = [
    'les',
    'a',
    'plus',
    'faut',
    'tout',
    'tous',
    'prends',
    'tre',
    'si',
    'non',
    'doit',
    'avoir',
    'comme',
    'trop',
    'leurs',
    'faire',
    'ils',
    'peut',
    'bien',
    'aussi',
    'cela',
    'gens',
    'sans',
    'car',
    'très',
    'fait',
    '\'',
    'qu\'',
    'd\'',
    'l\'',
    '"',
    'être',
    'mai',
    'faudrait'
]
stop_words_complete = sw + stopwords.words('french') + stopwords_additionnel
stop_words_complete = list(set(stop_words_complete))

## Analyse thème contribution libre

In [8]:
vectorizer = CountVectorizer(max_df=0.95, min_df=20, stop_words=stop_words_complete)
%time data_vectorized = vectorizer.fit_transform(answers[answers['theme'] == "Contribution libre"]['lemmatized_answer'].values.astype('U'))
data_vectorized.shape

CPU times: user 83.8 ms, sys: 12 ms, total: 95.8 ms
Wall time: 106 ms


(1024, 364)

In [9]:
# Define Search Param
search_params = {'n_components': [3, 4, 5], 'learning_decay': [.5, .9], 'max_iter': [30]}

# Init the Model
lda = LatentDirichletAllocation(max_iter=10, learning_method='batch', verbose=True)

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params, verbose=0)

# Do the Grid Search
model.fit(data_vectorized)



iteration: 1 of max_iter: 30
iteration: 2 of max_iter: 30
iteration: 3 of max_iter: 30
iteration: 4 of max_iter: 30
iteration: 5 of max_iter: 30
iteration: 6 of max_iter: 30
iteration: 7 of max_iter: 30
iteration: 8 of max_iter: 30
iteration: 9 of max_iter: 30
iteration: 10 of max_iter: 30
iteration: 11 of max_iter: 30
iteration: 12 of max_iter: 30
iteration: 13 of max_iter: 30
iteration: 14 of max_iter: 30
iteration: 15 of max_iter: 30
iteration: 16 of max_iter: 30
iteration: 17 of max_iter: 30
iteration: 18 of max_iter: 30
iteration: 19 of max_iter: 30
iteration: 20 of max_iter: 30
iteration: 21 of max_iter: 30
iteration: 22 of max_iter: 30
iteration: 23 of max_iter: 30
iteration: 24 of max_iter: 30
iteration: 25 of max_iter: 30
iteration: 26 of max_iter: 30
iteration: 27 of max_iter: 30
iteration: 28 of max_iter: 30
iteration: 29 of max_iter: 30
iteration: 30 of max_iter: 30
iteration: 1 of max_iter: 30
iteration: 2 of max_iter: 30
iteration: 3 of max_iter: 30
iteration: 4 of max_it

iteration: 7 of max_iter: 30
iteration: 8 of max_iter: 30
iteration: 9 of max_iter: 30
iteration: 10 of max_iter: 30
iteration: 11 of max_iter: 30
iteration: 12 of max_iter: 30
iteration: 13 of max_iter: 30
iteration: 14 of max_iter: 30
iteration: 15 of max_iter: 30
iteration: 16 of max_iter: 30
iteration: 17 of max_iter: 30
iteration: 18 of max_iter: 30
iteration: 19 of max_iter: 30
iteration: 20 of max_iter: 30
iteration: 21 of max_iter: 30
iteration: 22 of max_iter: 30
iteration: 23 of max_iter: 30
iteration: 24 of max_iter: 30
iteration: 25 of max_iter: 30
iteration: 26 of max_iter: 30
iteration: 27 of max_iter: 30
iteration: 28 of max_iter: 30
iteration: 29 of max_iter: 30
iteration: 30 of max_iter: 30
iteration: 1 of max_iter: 30
iteration: 2 of max_iter: 30
iteration: 3 of max_iter: 30
iteration: 4 of max_iter: 30
iteration: 5 of max_iter: 30
iteration: 6 of max_iter: 30
iteration: 7 of max_iter: 30
iteration: 8 of max_iter: 30
iteration: 9 of max_iter: 30
iteration: 10 of max_i

iteration: 13 of max_iter: 30
iteration: 14 of max_iter: 30
iteration: 15 of max_iter: 30
iteration: 16 of max_iter: 30
iteration: 17 of max_iter: 30
iteration: 18 of max_iter: 30
iteration: 19 of max_iter: 30
iteration: 20 of max_iter: 30
iteration: 21 of max_iter: 30
iteration: 22 of max_iter: 30
iteration: 23 of max_iter: 30
iteration: 24 of max_iter: 30
iteration: 25 of max_iter: 30
iteration: 26 of max_iter: 30
iteration: 27 of max_iter: 30
iteration: 28 of max_iter: 30
iteration: 29 of max_iter: 30
iteration: 30 of max_iter: 30


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=True),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_components': [3, 4, 5], 'learning_decay': [0.5, 0.9], 'max_iter': [30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [10]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.9, 'max_iter': 30, 'n_components': 3}
Best Log Likelihood Score:  -38045.98947589181
Model Perplexity:  311.3040831671076


In [11]:
# Latent Dirichlet Allocation Model
NUM_TOPICS = 3
LEARNING_DECAY = 0.9
MAX_ITER = 30
lda = LatentDirichletAllocation(learning_decay=LEARNING_DECAY, n_components=NUM_TOPICS, max_iter=MAX_ITER, learning_method='batch',verbose=True)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 30
iteration: 2 of max_iter: 30
iteration: 3 of max_iter: 30
iteration: 4 of max_iter: 30
iteration: 5 of max_iter: 30
iteration: 6 of max_iter: 30
iteration: 7 of max_iter: 30
iteration: 8 of max_iter: 30
iteration: 9 of max_iter: 30
iteration: 10 of max_iter: 30
iteration: 11 of max_iter: 30
iteration: 12 of max_iter: 30
iteration: 13 of max_iter: 30
iteration: 14 of max_iter: 30
iteration: 15 of max_iter: 30
iteration: 16 of max_iter: 30
iteration: 17 of max_iter: 30
iteration: 18 of max_iter: 30
iteration: 19 of max_iter: 30
iteration: 20 of max_iter: 30
iteration: 21 of max_iter: 30
iteration: 22 of max_iter: 30
iteration: 23 of max_iter: 30
iteration: 24 of max_iter: 30
iteration: 25 of max_iter: 30
iteration: 26 of max_iter: 30
iteration: 27 of max_iter: 30
iteration: 28 of max_iter: 30
iteration: 29 of max_iter: 30
iteration: 30 of max_iter: 30


In [12]:
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
