In [1]:
import collections
import os
import yake
import string
import sys
from os.path import isfile, join

import pandas as pd
from pathlib import Path
from nltk import word_tokenize
from nltk.corpus import stopwords
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/charalambos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
data_path = "../data/txt/"

# Choix d'une décennie et du nombre de clusters

In [4]:
DECADE = '1960'
N_CLUSTERS = 10

# Chargement des fichiers de la décennie

In [5]:
files = [f for f in sorted(os.listdir(data_path)) if f"_{DECADE[:-1]}" in f]

In [6]:
# Exemple de fichiers
files[:5]

['Bxl_1960_Tome_II1_Part_1.txt',
 'Bxl_1960_Tome_II1_Part_2.txt',
 'Bxl_1960_Tome_II1_Part_3.txt',
 'Bxl_1960_Tome_II1_Part_4.txt',
 'Bxl_1960_Tome_II1_Part_5.txt']

In [7]:
texts = [open(data_path + f).read() for f in files]

In [8]:
# Exemple de textes
texts[0][:900]

'PUBLICATION\n\nPERIODIQUE\n\nVILLE DE BRUXELLES\n\nAnnée 1960\n\nBULLETIN COMMUNAL\nTOME II "\n_u\n\nImpr. H . 6 M . S C H A U M A N S\nSociété Anonyme\n\nParvis Saint-Gilles, 41 Bruxelles 1960\n\n\x0c\x0cN" 14\n\nCOMPTE\n\nR E N D U D E L A S E A N C E D U 1"\' A O U T\n\n1960\n\nVILLE\n\nDE\n\nBRUXELLES\n\nBULLETIN\n(Publication\n\nCOMMUNAL\npériodique)\n\nA n n é e 1960\n\nCONSEIL\n\nCOMMUNAL\n\nS é a n c e du\n\n1\n\ner\n\naoût\n\n1960.\n\nPRESIDENCE\n\nDE M .\n\nCOOREMANS,\n\nBourgmestre.\n\nSOMMAIRE : Pages 1. Evénements tragiques du Congo. — Vote d\'un crédit extraordinaire en faveur des rapatriéa du Congo . . . . . . Sports. — Avant-projet de construction d\'un bâtiment vestiaire et d\'une installation d\'éclairage électrique, au Stade Annexe II du Heysel . Approbation. Académie royale des Beaux-Arts. — Construction d\'un escalier en béton. — Approbation des devis et projets, ainsi que du montant de la dépense . . . . . Approbation. Ecole Industrielle'

# Vectorisation du texte

In [9]:
def process_text(text, stem=True):
    """ Tokenize text and remove punctuation """
    text = text.translate(string.punctuation)
    tokens = word_tokenize(text)
    return tokens

In [10]:
vectorizer = TfidfVectorizer(tokenizer=process_text,
                                stop_words=stopwords.words('french'),
                                max_df=0.5,
                                min_df=0.1,
                                lowercase=True)

In [11]:
%time tfidf_vectors = vectorizer.fit_transform(texts)

CPU times: user 1min 22s, sys: 427 ms, total: 1min 22s
Wall time: 1min 23s


In [12]:
tfidf_vectors

<420x6668 sparse matrix of type '<class 'numpy.float64'>'
	with 608083 stored elements in Compressed Sparse Row format>

In [13]:
# Exemple de vecteur TFIDF
pd.Series(
    tfidf_vectors[0].toarray()[0],
    index=vectorizer.get_feature_names()
    ).sort_values(ascending=False)

fer        0.285979
curage     0.210133
chemins    0.159929
tension    0.159280
dma        0.153070
             ...   
munal      0.000000
mur        0.000000
murs       0.000000
musin      0.000000
#          0.000000
Length: 6668, dtype: float64

# Comprendre les vecteurs et leurs "distances"

In [14]:
from scipy.spatial.distance import cosine

In [15]:
cosine([1, 2, 3], [1, 2, 3])

0.0

In [16]:
cosine([1, 2, 3], [1, 2, 2])

0.02004211298777725

In [17]:
cosine([1, 2, 3], [2, 2, 2])

0.07417990022744858

In [18]:
tfidf_array = tfidf_vectors.toarray()

In [19]:
tfidf_array[0]

array([0.        , 0.        , 0.02430379, ..., 0.        , 0.01537096,
       0.00911774])

In [20]:
tfidf_array[0]

array([0.        , 0.        , 0.02430379, ..., 0.        , 0.01537096,
       0.00911774])

In [21]:
cosine(tfidf_array[0], tfidf_array[1])

0.5890213388262817

# Clustering des vecteurs TFIDF

Article intéressant sur le KMeans clustering :
- https://medium.com/dataseries/k-means-clustering-explained-visually-in-5-minutes-b900cc69d175

In [22]:
km_model = KMeans(n_clusters=N_CLUSTERS)

In [23]:
km_model.fit(tfidf_vectors)

KMeans(n_clusters=10)

In [24]:
km_model.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 1, 1, 1, 9, 9, 3, 3, 1, 1, 1, 1, 3,
       1, 1, 1, 1, 1, 1, 1, 1, 5, 9, 9, 5, 5, 5, 5, 5, 9, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 8, 8, 8, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1,
       1, 5, 5, 5, 5, 9, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8, 1, 1, 1,
       3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 5, 9, 9, 9, 5, 5, 5,
       9, 1, 1, 1, 1, 6, 6, 1, 7, 7, 1, 6, 3, 3, 6, 7, 7, 7, 8, 8, 6, 6,
       1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 5, 9, 9, 9, 9, 5, 5, 9, 1,
       1, 1, 1, 1, 6, 6, 6, 1, 7, 6, 7, 3, 3, 7, 7, 8, 8, 6, 7, 9, 7, 6,
       1, 6, 6, 1, 3, 1, 1, 1, 1, 1, 6, 1, 1, 5, 9, 9, 9, 5, 5, 6, 6, 6,
       6, 6, 1, 6, 1, 6, 7, 6, 6, 6, 3, 3, 7, 7, 1, 0, 1, 7, 7, 6, 6, 0,
       0, 6, 6, 3, 6, 6, 6, 6, 6, 6, 0, 0, 9, 9, 9, 7, 7, 7, 7, 6, 6, 0,
       0, 6, 6, 6, 6, 7, 6, 6, 3, 3, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 0, 7, 9, 6, 6, 3, 3, 6, 0, 8, 7, 6, 6, 6, 6, 9,
       9, 5, 7, 7, 7, 0, 6, 1, 0, 0, 6, 6, 6, 7, 6,

In [25]:
clustering = collections.defaultdict(list)

for idx, label in enumerate(km_model.labels_):
    clustering[label].append(files[idx])

In [26]:
pprint(dict(clustering))

{0: ['Bxl_1965_Tome_II2_Part_5.txt',
     'Bxl_1965_Tome_I_Part_10.txt',
     'Bxl_1965_Tome_I_Part_11.txt',
     'Bxl_1965_Tome_I_Part_8.txt',
     'Bxl_1965_Tome_I_Part_9.txt',
     'Bxl_1966_Tome_I1_Part_2.txt',
     'Bxl_1966_Tome_I1_Part_3.txt',
     'Bxl_1966_Tome_I2_Part_2.txt',
     'Bxl_1966_Tome_I2_Part_3.txt',
     'Bxl_1966_Tome_II1_Part_9.txt',
     'Bxl_1966_Tome_II2_Part_3.txt',
     'Bxl_1967_Tome_I1_Part_1.txt',
     'Bxl_1967_Tome_I1_Part_2.txt',
     'Bxl_1967_Tome_I1_Part_3.txt',
     'Bxl_1967_Tome_II1_Part_7.txt',
     'Bxl_1968_Tome_I1_Part_6.txt',
     'Bxl_1968_Tome_I1_Part_7.txt',
     'Bxl_1968_Tome_II2_Part_10.txt',
     'Bxl_1968_Tome_II2_Part_11.txt',
     'Bxl_1968_Tome_II2_Part_12.txt'],
 1: ['Bxl_1960_Tome_II1_Part_1.txt',
     'Bxl_1960_Tome_II1_Part_2.txt',
     'Bxl_1960_Tome_II1_Part_3.txt',
     'Bxl_1960_Tome_II1_Part_4.txt',
     'Bxl_1960_Tome_II1_Part_5.txt',
     'Bxl_1960_Tome_II1_Part_6.txt',
     'Bxl_1960_Tome_II1_Part_7.txt',
     'Bxl_19

# Analyse des mots-clés de chaque cluster

In [27]:
kw_extractor = yake.KeywordExtractor(lan="fr", top=5)
kw_extractor

<yake.yake.KeywordExtractor at 0x7fb503d6d4c0>

In [28]:
ignored = set(["conseil communal", "conseil général","brussel","van","het","bourgmestre","échevin","madame","monsieur","messieurs","mesdames","les", "plus", "cette", "fait", "faire", "être", "deux", "comme", "dont", "tout", "ils", "bien", "sans", "peut", "tous", "après", "ainsi", "donc", "cet", "sous", "celle", "entre", "encore", "toutes", "pendant", "moins", "dire", "cela", "non", "faut", "trois", "aussi", "dit", "avoir", "doit", "contre", "depuis", "autres", "van", "het", "autre", "jusqu", "oktober", "klein", "décembre", "ville", "bruxelles", "conseil communal","mesdames", "messieurs", "bourgmestre", "commune","communal", "franc", "question", "belge","commission", "très", "rue","francs","ceux","pourrait","belgique", "conseil","collège", "année","assistance","demande","service","cour","proposition","article", "septembre","favorable","grand","concerne","total","personnel","moment", "toute","certain","août","leurs", "peuvent","relative","peu", "également","voir","elles","grande","nombre","celui", "cours","laquelle","née","mar","divers"])

In [29]:
for key,value in clustering.items():
    print("Clé : " + str(key))
    for listitem in value:
        text = open(f'{data_path}/{listitem}', encoding='utf-8').read()
        keywords = kw_extractor.extract_keywords(text)
        kept = []
        for kw, score in keywords:
            words = kw.split()
            if len(words) > 1 and kw not in ignored:
                kept.append(kw)
        print(f"{listitem} : {' - '.join(kept)}...")

Clé : 1
Bxl_1960_Tome_II1_Part_1.txt : commission d'assistance publique...
Bxl_1960_Tome_II1_Part_2.txt : commission d'assistance publique - pouvoir compétent d'acquérir - pouvoir compétent l'autorisation - pouvoir compétent...
Bxl_1960_Tome_II1_Part_3.txt : rue rue marché - rue van artevelde - rue van - rue marché...
Bxl_1960_Tome_II1_Part_4.txt : commission d'assistance publique - principe d'une dépense - mesdames et messieurs...
Bxl_1960_Tome_II1_Part_5.txt : commission d'assistance publique...
Bxl_1960_Tome_II1_Part_6.txt : bourgmestre et echevins - voie publique...
Bxl_1960_Tome_II1_Part_7.txt : collège van burgemeester - het collège van - commission d'assistance publique...
Bxl_1960_Tome_II1_Part_8.txt : année année année...
Bxl_1960_Tome_II2_Part_3.txt : ...
Bxl_1960_Tome_II2_Part_4.txt : crédit communal...
Bxl_1960_Tome_II2_Part_5.txt : ...
Bxl_1960_Tome_I_Part_1.txt : commission d'assistance publique - recettes dépenses excédent - avis favorable...
Bxl_1960_Tome_I_Part_10.txt 

ValueError: max() arg is an empty sequence