In [1]:
import collections
import os
import string
import sys

import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sarah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
data_path = "../data/txt/"

## Choix d'une décennie et du nombre de clusters

In [4]:
DECADE = '1950'
N_CLUSTERS = 5

## Chargement des fichiers de la décennie



In [5]:
files = [f for f in sorted(os.listdir(data_path)) if f"_{DECADE[:-1]}" in f]

In [7]:
# Exemple de fichiers
files[:5]

['Bxl_1950_Tome_III_Part_1.txt',
 'Bxl_1950_Tome_III_Part_2.txt',
 'Bxl_1950_Tome_III_Part_3.txt',
 'Bxl_1950_Tome_III_Part_4.txt',
 'Bxl_1950_Tome_III_Part_5.txt']

In [8]:
texts = [open(data_path + f, encoding='utf-8').read() for f in files]

In [9]:
# Exemple de textes
texts[0][:400]

'C O M P T E R E N D U D E L A SÉANCE D U 1 6 OCTOBRE 1 9 5 0 .\n\nVILLE\n\nDE BRUXELLES\n\nBULLETIN COMMUNAL\nAnnée 1950\nCONSEIL\nSéance\n\nCOMMUNAL\n\ndu 16 octobre\n\n1950.\n\nP r é s i d e n c e de M . J . V A N D E M E U L E B R O E C K ,\nBourgmestre.\n\nPAGES\n\nSOMMAIRE\n1.\n\nCommunications\n\n2.\n\nD é c è s de M . F r a n ç o i s\ncommunal\n\n3.\n4.\n\n,\nDe Ceuster, ancien\n\n904\n\nconseiller\n905\n\nR è g l e m e n t de polic'

## Vectorisation du texte¶


In [10]:
def process_text(text, stem=True):
    """ Tokenize text and remove punctuation """
    text = text.translate(string.punctuation)
    tokens = word_tokenize(text)
    return tokens

In [11]:
vectorizer = TfidfVectorizer(tokenizer=process_text,
                                stop_words=stopwords.words('french'),
                                max_df=0.5,
                                min_df=0.1,
                                lowercase=True)

In [14]:
%time tfidf_vectors = vectorizer.fit_transform(texts)

Wall time: 1min 10s


In [15]:
tfidf_vectors

<312x6355 sparse matrix of type '<class 'numpy.float64'>'
	with 425544 stored elements in Compressed Sparse Row format>

In [16]:
# Exemple de vecteur TFIDF
pd.Series(
    tfidf_vectors[0].toarray()[0],
    index=vectorizer.get_feature_names()
    ).sort_values(ascending=False)

colportage    0.288932
op            0.246078
frs           0.237571
of            0.222885
thonet        0.184822
                ...   
majorer       0.000000
majoré        0.000000
malade        0.000000
malades       0.000000
il+s'agit     0.000000
Length: 6355, dtype: float64

## Comprendre les vecteurs et leurs "distances"


In [17]:
from scipy.spatial.distance import cosine

In [18]:
cosine([1, 2, 3], [1, 2, 3])

0.0

In [19]:
cosine([1, 2, 3], [1, 2, 2])

0.02004211298777725

In [20]:
cosine([1, 2, 3], [2, 2, 2])

0.07417990022744858

In [21]:
tfidf_array = tfidf_vectors.toarray()

In [22]:
tfidf_array[0]

array([0.00739289, 0.03186491, 0.0047495 , ..., 0.        , 0.        ,
       0.        ])

In [23]:
texts[0][:200]

'C O M P T E R E N D U D E L A SÉANCE D U 1 6 OCTOBRE 1 9 5 0 .\n\nVILLE\n\nDE BRUXELLES\n\nBULLETIN COMMUNAL\nAnnée 1950\nCONSEIL\nSéance\n\nCOMMUNAL\n\ndu 16 octobre\n\n1950.\n\nP r é s i d e n c e de M . J . V A N D'

In [24]:
cosine(tfidf_array[0], tfidf_array[1])

0.7886070589754197

## Clustering des vecteurs TFIDF
Article intéressant sur le KMeans clustering :

https://medium.com/dataseries/k-means-clustering-explained-visually-in-5-minutes-b900cc69d175

In [25]:
km_model = KMeans(n_clusters=N_CLUSTERS)


In [26]:
km_model.fit(tfidf_vectors)


KMeans(n_clusters=5)

In [27]:
clustering = collections.defaultdict(list)

for idx, label in enumerate(km_model.labels_):
    clustering[label].append(files[idx])

In [29]:
pprint(dict(clustering))

{0: ['Bxl_1950_Tome_II_Part_10.txt',
     'Bxl_1950_Tome_II_Part_4.txt',
     'Bxl_1950_Tome_II_Part_5.txt',
     'Bxl_1950_Tome_II_Part_6.txt',
     'Bxl_1950_Tome_II_Part_9.txt',
     'Bxl_1951_Tome_II_Part_11.txt',
     'Bxl_1951_Tome_II_Part_3.txt',
     'Bxl_1951_Tome_II_Part_4.txt',
     'Bxl_1951_Tome_II_Part_5.txt',
     'Bxl_1951_Tome_II_Part_6.txt',
     'Bxl_1951_Tome_II_Part_9.txt',
     'Bxl_1952_Tome_II_Part_10.txt',
     'Bxl_1952_Tome_II_Part_4.txt',
     'Bxl_1952_Tome_II_Part_5.txt',
     'Bxl_1952_Tome_II_Part_6.txt',
     'Bxl_1952_Tome_II_Part_9.txt',
     'Bxl_1952_Tome_I_Part_9.txt',
     'Bxl_1953_Tome_II_Part_4.txt',
     'Bxl_1953_Tome_II_Part_5.txt',
     'Bxl_1953_Tome_II_Part_9.txt',
     'Bxl_1953_Tome_I_Part_5.txt',
     'Bxl_1954_Tome_II_Part_10.txt',
     'Bxl_1954_Tome_II_Part_4.txt',
     'Bxl_1954_Tome_II_Part_5.txt',
     'Bxl_1954_Tome_II_Part_6.txt',
     'Bxl_1955_Tome_II1_Part_11.txt',
     'Bxl_1955_Tome_II1_Part_12.txt',
     'Bxl_1955_Tome_II

In [None]:
# Résultat un peu froid. Essayer de comprendre pourquoi ces documents sont proches (thématiques?)