# Data

After downloading the JSON file from Le Grand Débat, I converted it to `data_file.csv`.  

In [1]:
# NOTE(review): %pylab populates the interactive namespace with numpy and
# matplotlib names, so it is hard to tell where a bare name comes from.
%pylab inline
import pandas as pd

# CSV conversion of the downloaded JSON export (relative path).
f = "data_file.csv"
df = pd.read_csv(f)

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Show the column labels of the loaded frame.
df.columns

Index(['reference', 'title', 'createdAt', 'publishedAt', 'updatedAt',
       'trashed', 'trashedStatus', 'authorId', 'authorType', 'authorZipCode',
       'responses'],
      dtype='object')

Preprocessing: extract the list of answers we want from the `responses` column.

In [47]:
# Pull one answer string out of every raw `responses` cell.
responces = df.responses
answers = []
for raw in responces:
    # Keep the text after the last "questionId", then the text after the last
    # "formattedValue" inside that tail (rpartition yields the whole string
    # when the marker is absent, exactly like split(...)[-1]).
    tail = raw.rpartition("questionId")[2].rpartition("formattedValue")[2]
    # Strip 4 leading and 3 trailing characters around the value
    # (presumably quoting/bracket residue -- verify against the raw data),
    # then delete every literal backslash-n sequence.
    # NOTE(review): the sequence is removed, not replaced by a space, so the
    # words on either side of it end up glued together.
    answers.append(tail[4:-3].replace("\\n", ""))

Removing missing (NaN) answers, which the extraction above leaves as the string `"on"`.

In [48]:
# Drop every answer equal to the placeholder string "on".
# A single filtering pass replaces the repeated answers.remove("on") calls
# (each call rescans the list) and the bare `except`, which would also have
# swallowed unrelated errors such as KeyboardInterrupt.
# Slice assignment keeps the same list object, as the in-place remove() did.
answers[:] = [answer for answer in answers if answer != "on"]
    

In [55]:
# Wrap the cleaned answers in a pandas Series.
texts = pd.Series(answers)

In [56]:
# Preview the answers (pandas abbreviates the printout of a long Series).
print(texts)

0                   Multiplier les centrales géothermiques
1        Les problèmes auxquels se trouve confronté l’e...
2          Une vrai politique écologique et non économique
3        Les bonnes idées ne grandissent que par le par...
4        Pédagogie dans ce sens là dés la petite école ...
                               ...                        
33102    Construire des usines marée motrice, des panne...
33103                                                  Non
33104    Demandons aux grandes chaines de fast food de ...
33105    REMUEZ-VOUS, ARRETEZ VOTRE BLABLA .... DES ACTES,
33106    L'Etat devrait davantage communiquer sur les a...
Length: 33107, dtype: object


## preprocessing with spacy

In [81]:
import spacy

# Load the "fr_core_news_sm" spaCy pipeline with all of its default components.
# NOTE(review): the visible cells only read lemma_, orth_ and is_stop; if no
# later cell needs the other components, disabling them could shorten the
# multi-minute processing step -- confirm before changing.
nlp = spacy.load("fr_core_news_sm")

In [82]:
# Run the pipeline over every answer and keep all resulting documents in a list;
# %time reports how long the statement took.
%time spacy_docs = list(nlp.pipe(texts))

Wall time: 5min 40s


we remove all words of 3 characters or fewer (these are often fairly uninteresting from a topical point of view) <br>
we drop all stopwords <br>
we take the lemmas of the remaining words and lowercase them <br>

In [83]:
# One list of lower-cased lemmas per document, keeping only tokens whose
# surface form is longer than 3 characters and that are not stop words.
docs = [
    [tok.lemma_.lower() for tok in spacy_doc if not (len(tok.orth_) <= 3 or tok.is_stop)]
    for spacy_doc in spacy_docs
]

In [84]:
# Inspect the token lists of the first three documents.
print(docs[:3])

[['multiplier', 'centrale', 'géothermique'], ['problème', 'trouve', 'confronter', 'ensemble', 'planète', 'dénoncent', 'parfaire', 'désordre', 'gilet', 'jaune', 'france', 'il', 'surpopulation', 'mondial', 'population', 'passer', 'd’1,5', 'milliard', 'habitant', '1900', 'milliard', '2020', 'monter', 'bientôt', 'milliard', '2040', 'progrès', 'communication', 'village', 'mondial', 'individu', 'fond', 'asie', 'fond', 'afrique', 'passer', 'quartiers', 'campagne', 'pays', 'aspir', 'vivre', 'blâmer', 'lotir', 'concitoyen', 'logement', 'nourriture', 'bien', 'consommation', 'déplacement', 'etc.', 'mère', 'problème', 'bien', 'solution', 'problème', 'stabilisation', 'croissance', 'démographique', 'partage', 'richesse', 'partage', 'terre', 'partage', 'protection', 'biodiversité', 'règlemer', 'conflit', 'lutte', 'contre', 'déforestation', 'lutte', 'contre', 'dérèglemer', 'climatique', 'règlemer', 'conflit', 'stabilisation', 'migration', 'concurrence', 'commercial', 'mondial', 'etc.', 'français', 'eu

Note: topic models are bag-of-words models that ignore word position. <br>
We use the Gensim library to identify the frequent bigrams in the corpus, then append them to the list of tokens of the documents in which they appear. 

In [86]:
import re
from gensim.models import Phrases

# Fit the phrase (bigram) detector on the tokenized corpus.
bigram = Phrases(docs, min_count=10)

# Append each detected bigram to the token list of the document it occurs in.
# NOTE(review): this mutates `docs` in place, so re-running the cell appends
# the same bigrams a second time.
for doc_tokens in docs:
    # Bigrams can be recognized by the "_" that joins the individual words.
    # The list of matches is fully built before extend() runs, so the document
    # is never modified while the phrase model is reading it.
    doc_tokens.extend([tok for tok in bigram[doc_tokens] if '_' in tok])
docs[2]

['vrai', 'politique', 'écologique', 'économique']

In [87]:
from gensim.corpora import Dictionary

# Build the token <-> integer id mapping from the tokenized documents.
dictionary = Dictionary(docs)
print('Number of unique words in original documents:', len(dictionary))

# Prune the vocabulary in place: drop tokens that appear in fewer than
# 3 documents or in more than 25% of the documents.
dictionary.filter_extremes(no_below=3, no_above=0.25)
print('Number of unique words after removing rare and common words:', len(dictionary))

# Bag-of-words form of the third document: a list of (token id, count) pairs.
print("Example representation of document 3:", dictionary.doc2bow(docs[2]))

Number of unique words in original documents: 53657
Number of unique words after removing rare and common words: 16080
Example representation of document 3: [(87, 1), (88, 1), (89, 1), (90, 1)]


bag-of-word representations for each document in the corpus

In [88]:
# Convert every tokenized document to its bag-of-words vector.
corpus = list(map(dictionary.doc2bow, docs))

## Training 

corpus: the bag-of-words representations of our documents <br>
id2word: the mapping from indices to words <br>
num_topics: the number of topics we want the model to identify <br>
chunksize: the number of documents the model sees for every update (equivalent to a batch size) <br>
passes: the number of times we show the total corpus to the model during training <br>
random_state: a seed, to ensure reproducibility.

In [89]:
from gensim.models import LdaModel

# Fit a 10-topic LDA model on the bag-of-words corpus (5 passes, chunks of
# 1000 documents, fixed random_state); %time reports the training duration.
%time model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, chunksize=1000, passes=5, random_state=1)

Wall time: 46.9 s


## Results

In [90]:
# List each topic with its weighted top words; the ids returned by
# print_topics() are shifted by one so the listing is numbered from 1.
for topic_id, terms in model.print_topics():
    print(topic_id + 1, ":", terms)

1 : 0.036*"nucléaire" + 0.027*"énergie" + 0.015*"falloir" + 0.015*"production" + 0.015*"électricité" + 0.015*"centrale" + 0.015*"centrale_nucléaire" + 0.013*"solution" + 0.011*"éolien" + 0.010*"france"
2 : 0.019*"panneau_solaire" + 0.016*"lutter_contre" + 0.016*"ville" + 0.015*"solaire" + 0.014*"panneau" + 0.012*"centre_ville" + 0.010*"grand_surface" + 0.010*"grand" + 0.009*"construction" + 0.008*"bâtiment"
3 : 0.046*"mettre_place" + 0.045*"énergie_renouvelable" + 0.035*"énergie" + 0.019*"place" + 0.018*"énergie_fossile" + 0.018*"mettre" + 0.018*"transition_énergétique" + 0.017*"renouvelable" + 0.014*"énergétique" + 0.007*"service_civique"
4 : 0.011*"falloir" + 0.011*"faire" + 0.009*"bien" + 0.007*"politique" + 0.006*"pouvoir" + 0.006*"réchauffement_climatique" + 0.006*"environnement" + 0.006*"devoir" + 0.005*"problème" + 0.005*"citoyen"
5 : 0.043*"produit" + 0.017*"taxer" + 0.011*"production" + 0.011*"sac_plastique" + 0.010*"obsolescence_programmer" + 0.009*"produit_importer" + 0.009*