### Imports

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
#Sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

#other
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')


%matplotlib inline

# Plotly 
from plotly import tools
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

#spaCy
import spacy
nlp = spacy.load("fr_core_news_lg")
from spacy.lang.fr.stop_words import STOP_WORDS
from spacy.lang.fr import French
import string
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

### Load (acceptably clean) data

In [4]:
data=pd.read_pickle("../../data/cleaned1.pkl")
data_np=data.to_numpy()

In [7]:
i=5648
data_np[i]

'calendrier et résultats toulouse olympique 13. Région Occitanie Pyrénées Méditerranée. jeu de données présentant le calendrier et les résultats associés de la saison du toulouse olympique .. championship rugby sport toxiii'

In [31]:
import progressbar
bar = progressbar.ProgressBar(maxval=len(data_np), \
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])


In [30]:
spacy.displacy.render(nlp(data_np_noloc[2]), style='ent',jupyter=True)

## Process data to extract keywords only

### Remove the locs (there might be a better way)

In [32]:
def remove_locs(doc):
    bar.start()
    for i in range(len(doc)):
        temp = ""
        for token in nlp(doc[i]):
            if not token.ent_type:
                temp += token.text
                if token.whitespace_:
                    temp += " "
        doc[i]=temp
        bar.update(i+1)
    bar.finish()
    return(doc)

In [33]:
data_np_noloc=data_np
data_np_noloc=remove_locs(data_np_noloc)



In [39]:
np.save('../../data/data_np_noloc.npy', data_np_noloc, allow_pickle=True)

### Remove stopwords

In [44]:
parser=French()
def spacy_tokenizer(sentence):
    tokens=parser(sentence)
    tokens = [ word.lower_ for word in tokens ]
    tokens = [ word for word in tokens if word not in stopwords]
    tokens = " ".join([i for i in tokens])
    return tokens

In [45]:
data_np_nostop=[]
bar.start()
for i in range(len(data_np_noloc)):
    data_np_nostop.append(spacy_tokenizer(data_np_noloc[i]))
    bar.update(i+1)
bar.finish()

IOPub message rate exceeded.                                             ]  15%
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.                                             ]  23%
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs

In [50]:
data_flat1=''.join(data_np_nostop)

In [53]:
len(data_flat1)

30743145

In [51]:
from collections import Counter

# all tokens that arent stop words or punctuations
words = [token.text for token in nlp(data_flat1[:1000000])] 

# five most common tokens
word_freq = Counter(words)
common_words = word_freq.most_common(50)

In [52]:
common_words

[('.', 10023),
 (':', 2203),
 ('données', 1953),
 ('..', 786),
 ('"', 770),
 ('eau', 737),
 ('donnees', 613),
 ('datasets', 535),
 ('nombre', 528),
 ('ouvertes', 525),
 ('consulter', 490),
 ('jeu', 472),
 ('risques', 441),
 ('base', 431),
 ('origine', 419),
 ('fiche', 407),
 ('région', 397),
 ('partenaires', 392),
 ('information', 391),
 ('commune', 381),
 ('organisations', 380),
 ('territoire', 377),
 ("'", 377),
 ('zones', 363),
 ('inspire', 354),
 ('ville', 351),
 ('réseau', 333),
 ('passerelle', 333),
 ('l', 328),
 ('liens', 327),
 ('annexes', 326),
 ('services', 324),
 ('communes', 324),
 ('code', 319),
 ('environnement', 304),
 ('département', 273),
 ('ensemble', 272),
 ('site', 272),
 ('jour', 268),
 ('public', 268),
 ('métropole', 265),
 ('plan', 265),
 ('informations', 253),
 ('mise', 248),
 ('risque', 247),
 ('transport', 238),
 ('liste', 229),
 ('prélèvement', 223),
 ('points', 220),
 ('plans', 220)]

### Vectorizer read documentation again

In [55]:
vectorizer = CountVectorizer(min_df=10, max_df=4000, lowercase=True, strip_accents='unicode', ngram_range=(2,3))
data_vectorized = vectorizer.fit_transform(data_np_nostop)

### LDA read documentation and paper again

In [57]:
NUM_TOPICS = 30

In [58]:
# Latent Dirichlet Allocation Model
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=50, evaluate_every=5, learning_method='online',verbose=True)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 50
iteration: 2 of max_iter: 50
iteration: 3 of max_iter: 50
iteration: 4 of max_iter: 50
iteration: 5 of max_iter: 50
iteration: 6 of max_iter: 50
iteration: 7 of max_iter: 50
iteration: 8 of max_iter: 50
iteration: 9 of max_iter: 50
iteration: 10 of max_iter: 50
iteration: 11 of max_iter: 50
iteration: 12 of max_iter: 50
iteration: 13 of max_iter: 50
iteration: 14 of max_iter: 50
iteration: 15 of max_iter: 50
iteration: 16 of max_iter: 50
iteration: 17 of max_iter: 50
iteration: 18 of max_iter: 50
iteration: 19 of max_iter: 50
iteration: 20 of max_iter: 50
iteration: 21 of max_iter: 50
iteration: 22 of max_iter: 50
iteration: 23 of max_iter: 50
iteration: 24 of max_iter: 50
iteration: 25 of max_iter: 50
iteration: 26 of max_iter: 50
iteration: 27 of max_iter: 50
iteration: 28 of max_iter: 50
iteration: 29 of max_iter: 50
iteration: 30 of max_iter: 50
iteration: 31 of max_iter: 50
iteration: 32 of max_iter: 50
iteration: 33 of max_iter: 50
iteration: 34 of ma

In [59]:
np.save('../../data/data_ldav3.npy', data_lda, allow_pickle=True)

In [60]:
# Keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]]) 

In [61]:
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('qualite air', 513.4830503853624), ('donnees fournies', 291.5851968940687), ('particules suspension', 282.49952790169544), ('enseignement formation', 276.9156494438349), ('surveillance qualite', 263.7467731158401), ('surveillance qualite air', 262.72427309644746), ('μg m3', 256.9787970509656), ('valeurs limites', 221.99000120411694), ('formation recherche', 217.21574028938144), ('concentrations moyennes', 211.83850809614933)]
Topic 1:
[('origine risque', 2245.9588226688397), ('grand public', 1615.149040715145), ('source donnees', 1077.0110888158947), ('cahier charges', 1046.8935594806508), ('inondation tri', 940.3836078977771), ('directive europeenne', 882.1971781383365), ('ouvertes dreal', 871.8911829372689), ('gestion risques inondation', 867.3298978270477), ('organisations partenaires dreal', 853.2783435167782), ('partenaires dreal', 853.2780420298394)]
Topic 2:
[('risques naturels', 5404.241321140376), ('code environnement', 3550.9245856145835), ('article code

[('contient perimetres', 1191.245380803598), ('donnees contient perimetres', 1138.0965286490882), ('acte officiel', 1077.3499141402667), ('fichier contient', 673.3668364820473), ('referentiel geographique', 616.8685100450866), ('foret incendie', 551.0021492853779), ('donnees referentiel geographique', 490.7645215348185), ('depend type', 461.3165568472762), ('represente polygone', 460.2834901956549), ('source donnees referentiel', 459.29396286510837)]
Topic 20:
[('budget primitif', 917.4039174431067), ('campagne mer', 736.9023461029209), ('point vue', 676.7962217254574), ('bd ortho', 442.1408780034), ('referentiel grande', 441.3217905248244), ('referentiel grande echelle', 441.3217905248244), ('echelle rge', 385.30672351072434), ('grande echelle rge', 385.30672351072434), ('casq geospatial', 376.11898535596686), ('integration donnees', 375.5858864716163)]
Topic 21:
[('temps reel', 630.8593441062992), ('services proposes', 454.1012006697242), ('donnee renseigne', 371.4857445676057), ('ac

In [62]:
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash