### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
#Sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import TSNE

#other
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')


%matplotlib inline

# Plotly 
from plotly import tools
#import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

#spaCy
import spacy
nlp = spacy.load("fr_core_news_lg")
from spacy.lang.fr.stop_words import STOP_WORDS
from spacy.lang.fr import French
import string
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

### Load (acceptably clean) data

In [3]:
data=pd.read_pickle("../../data/tags_only1.pkl")
data_np=data.to_numpy()

In [4]:
data_np[251]

array(['commune,donnee-ouverte,economie,entreprise,epci,fiscalite,hauts-de-france,oise,unites-administratives,zone-de-restructuration-de-la-defense,zrd'],
      dtype=object)

In [7]:
import progressbar
bar = progressbar.ProgressBar(maxval=len(data_np), \
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])


In [8]:
i=random.randint(1,len(data_np))

print(i)

36119


In [9]:
spacy.displacy.render(nlp(data_np[i][0]), style='ent',jupyter=True)

## Process data to extract keywords only

### Locations remove

In [10]:
LOCS=[]
for i in range(len(data_np[i])):
    for ent in nlp(data_np[i]).ents:
        if ent.label_ == 'LOC':
            LOCS.append(ent.text)

TypeError: Argument 'string' has incorrect type (expected str, got numpy.ndarray)

In [104]:
LOCS

['Paris',
 'Laboratoire',
 'Paris',
 'Paris',
 'Cartographie des laboratoires](https://capgeo.maps.arcgis.com',
 'Paris',
 'Alpes',
 'Maritimes',
 'Paris',
 'Ville de Nice',
 'Chambéry',
 'Chambéry',
 "communauté d'agglomération Grand Chambéry",
 'Grand Chambéry',
 'Grand Chambéry',
 'Chambéry',
 "communauté d'agglomération Grand Chambéry",
 'Corse',
 'Corse',
 'Monreseaumobile.fr](http://monreseaumobile.fr/',
 'Corse',
 'Corse',
 'Monreseaumobile.fr](http://monreseaumobile.fr/',
 'Orléans Métropole',
 'Orléans Métropole',
 'Orléans Métropole',
 'Orléans Métropole',
 'Routiers',
 'Saint-Denis',
 'La Réunion',
 'Points Routiers',
 'PR',
 'Saint-Denis',
 'lInstitut Culturel Basque',
 'CAPVERN',
 'Capvern',
 'département des Hautes-Pyrénées',
 'Occitanie',
 'Tronçons',
 'Gamarthe',
 'Gamarthe',
 'lInstitut Culturel Basque',
 'Gamarthe',
 'Gamarthe',
 'Gamarthe',
 'BAL',
 'RGF93',
 'CAPVERN',
 'Île-de-France',
 'Île-de-France',
 'Copie_jardins',
 'Paris Seine Ouest',
 'Territoire de Grand 

### Quicker to remove basics

In [61]:
parser=French()
def spacy_tokenizer(sentence):
    tokens=parser(sentence)
    tokens = [ word.lower_ for word in tokens ]
    tokens = [ word for word in tokens if word not in stopwords and word not in punctuations ]
    tokens = " ".join([i for i in tokens])
    return tokens

In [62]:
spacy_tokenizer(data_np[i][0])

'donnees ouvertes geoscientific information passerelle inspire risque zonages risque zones risque'

In [64]:
data_np_proc0=[]
bar.start()
for i in range(len(data_np)):
    data_np_proc0.append(spacy_tokenizer(data_np[i][0]))
    bar.update(i+1)
bar.finish()



In [65]:
type(data_np_proc0[1])

str

In [18]:
data_proc0_pd=pd.DataFrame(data_np_proc0, dtype="str")

ValueError: failed to cast to '<U0' (Exception was: Unable to allocate 5.22 GiB for an array with shape (42837, 1) and data type <U32734)

In [16]:
data_proc0_pd.to_csv('data_proc0_pd.pkl')

AttributeError: 'PandasArray' object has no attribute 'to_csv'

### Remove the locs (there might be a better way)

In [226]:
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
def remove_tokens_on_match(doc):
    indexes = []
    for index, token in enumerate(doc):
        if (token.ent_type_ =='LOC') :
            indexes.append(index)
    np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    np_array = np.delete(np_array, indexes, axis = 0)
    doc2 = Doc(doc.vocab, words=[t.text for i, t in enumerate(doc) if i not in indexes])
    doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
    return doc2

In [227]:
data_np_proc1=[]
bar.start()
for i in range(len(data_np_proc0)):
    data_np_proc1.append(remove_tokens_on_match(nlp(data_np_proc0[i])))
    bar.update(i+1)
bar.finish()



In [148]:
data_np_proc[261].type

AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'type'

In [247]:
np.save('../src/data_proc1.npy', np.array(data_np_proc1_str), allow_pickle=True)

MemoryError: Unable to allocate 5.16 GiB for an array with shape (42837,) and data type <U32340

In [228]:
data_flat1=''.join(str(data_np_proc1))

In [66]:
data_flat=''.join(str(data_np_proc0))

In [68]:
from collections import Counter

# all tokens that arent stop words or punctuations
words = [token.text for token in nlp(data_flat[:1000000])] 

# five most common tokens
word_freq = Counter(words)
common_words = word_freq.most_common(50)

In [69]:
common_words

[("'", 25054),
 (',', 12528),
 ('ouvertes', 2467),
 ('donnees', 2216),
 ('inspire', 2084),
 ('passerelle', 2066),
 ('geospatial', 1041),
 ('risque', 989),
 ('citoyennete', 951),
 ('public', 922),
 ('finances', 910),
 ('environnement', 884),
 ('prise', 858),
 ('transport', 845),
 ('budget', 767),
 ('sols', 740),
 ('territoire', 685),
 ('information', 666),
 ('geoscientific', 612),
 ('sante', 553),
 ('grand', 543),
 ('pedologiques', 536),
 ('urbanisme', 531),
 ('ortho', 529),
 ('localisation', 521),
 ('administration', 520),
 ('social', 507),
 ('eau', 506),
 ('publiques', 480),
 ('mobilite', 479),
 ('culture', 450),
 ('transports', 444),
 ('amenagement', 442),
 ('vue', 437),
 ('photo', 436),
 ('orthophotographie', 435),
 ('logement', 433),
 ('vues', 426),
 ('1973,casq', 424),
 ('institution', 407),
 ('tourisme', 393),
 ('france', 388),
 ('election', 372),
 ('patrimoine', 370),
 ('economie', 359),
 ('pollution', 353),
 ('developpement', 346),
 ('saint', 342),
 ('equipements', 325),
 ('pub

In [225]:
print(data_np_proc0[25])
print(data_np[25])

données relatives résultats tests virologiques covid-19 région centre val loire région centre val loire présent jeu données renseigne échelle départementale régionale nombre personnes testées nombre personnes déclarées positives classe âge quotidiennement source   santé publique france mission améliorer protéger santé populations crise sanitaire liée épidémie covid-19 santé publique france charge surveiller comprendre dynamique épidémie anticiper scénarii mettre place actions prévenir limiter transmission virus territoire national système dinformation dépistage dep déploiement 13 mai 2020 plateforme sécurisée systématiquement enregistrés résultats laboratoires tests rt pcr réalisés lensemble laboratoires ville établissements hospitaliers sars cov2.la création système information autorisée durée 6 mois compter fin état urgence sanitaire application décret n° 2020 551 12 mai 2020](https://www.legifrance.gouv.fr affichtexte.do?cidtexte jorftext000041869923 relatif systèmes dinformation me

In [246]:
data_np_proc1_str=list(map(str, data_np_proc1))

### Vectorizer read documentation again

In [70]:
vectorizer = CountVectorizer(min_df=5, max_df=3000, lowercase=True, strip_accents='unicode', ngram_range=(1,2), token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data_np_proc0)

In [78]:
vectorizer=TfidfVectorizer(norm = False, smooth_idf = False)
sentence_vectors = vectorizer.fit_transform(data_np_proc0)

In [82]:
from gensim.models import word2vec
model = word2vec.Word2Vec(data_np_proc0, workers = 1, size = 2, min_count = 1, window = 3, sg = 0)


KeyError: "word 'football' not in vocabulary"

In [83]:
similar_word = model.wv.most_similar('eau')[0]

KeyError: "word 'eau' not in vocabulary"

### LDA read documentation and paper again

In [71]:
NUM_TOPICS = 20

In [79]:
# Latent Dirichlet Allocation Model
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',verbose=True)
#data_lda = lda.fit_transform(data_vectorized)
data_lda=lda.fit_transform(sentence_vectors)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [252]:
np.save('../src/data_ldav0_tag.npy', data_lda, allow_pickle=True)

In [80]:
# Keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]]) 

In [81]:
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('gestion', 5341.8214588303845), ('bruit', 3011.2059255572317), ('structure', 1823.0644292897682), ('dechets', 1779.6672072756737), ('assainissement', 1271.9610640303024), ('gironde', 1158.871567526775), ('marches', 1125.5712244093006), ('dreal', 1037.734505867101), ('masse', 1016.2393401947495), ('bourgogne', 874.557892879109)]
Topic 1:
[('transport', 7835.560919325121), ('transports', 4336.995450798813), ('mobilite', 3622.682108047099), ('reseau', 3614.901720140605), ('reseaux', 2863.0134716795465), ('plan', 2372.7612410216198), ('montpellier', 2098.3092470312754), ('bus', 2071.845846397683), ('route', 1343.618878079925), ('cartes', 1342.5610022081694)]
Topic 2:
[('plu', 7850.35381586174), ('rhone', 7177.2665446461815), ('alpes', 6294.2065681207405), ('auvergne', 4801.139754662964), ('tourisme', 4692.193468289646), ('saint', 3274.218913949427), ('geospatial', 3220.107176751219), ('sport', 2876.634152326039), ('communes', 2265.299019443635), ('pays', 2071.78389795

In [None]:
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash

In [73]:
# Keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]]) 

In [74]:
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('sante', 1083.5924146676573), ('ddt', 848.1486647037392), ('social', 786.4882454746638), ('sociale', 414.93801433939365), ('aide', 384.0271140798738), ('securite', 305.8671881102135), ('population', 296.7688854809127), ('haute', 294.8428760745599), ('sante social', 279.85804955154947), ('society', 266.46593152446434)]
Topic 1:
[('saint', 642.6109308118696), ('zone', 436.5608331916399), ('stationnement', 298.38539797213707), ('isere', 287.95699563669456), ('seine', 261.3298710730353), ('france', 256.1894110102095), ('marche', 227.24352862444456), ('opendata', 217.22885923271025), ('inondation risque', 205.16664704992027), ('parking', 194.20326629060114)]
Topic 2:
[('environnement', 1121.6061272411112), ('occupation', 767.968717593325), ('sites', 528.8917021648571), ('sol', 508.15847093595175), ('habitats', 498.88785382527567), ('espace', 477.657478042351), ('terres', 433.4080957137626), ('occupation terres', 422.1503501259908), ('proteges', 404.4832523126346), ('si

In [75]:
pyLDAvis.enable_notebook()
dash = pyLDAvis.sklearn.prepare(lda, data_vectorized, vectorizer, mds='tsne')
dash