In [1]:
import collections
import os
import string
import sys

import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emeld\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
data_path = "../data/txt/"

# Choix d'une décennie et du nombre de clusters

In [4]:
DECADE = '1870'
N_CLUSTERS = 5

# Chargement des fichiers de la décennie

In [5]:
files = [f for f in sorted(os.listdir(data_path)) if f"_{DECADE[:-1]}" in f]

In [6]:
# Exemple de fichiers
files[:5]

['Bxl_1870_Tome_I1_Part_1.txt',
 'Bxl_1870_Tome_I1_Part_2.txt',
 'Bxl_1870_Tome_I1_Part_3.txt',
 'Bxl_1870_Tome_I1_Part_4.txt',
 'Bxl_1870_Tome_I1_Part_5.txt']

In [15]:
texts = [open(data_path + f, encoding='utf8', errors='ignore').read() for f in files]

In [16]:
# Exemple de textes
texts[0][:400]

'VILLE\n\nDE\n\nBRUXELLES.\n\nBULLETIN COMMUNAL.\nA N N É E\n\nP R E M I E R\n\n1870.\n\nS E M E S T R E .\n\nBRUXELLES,\nIMPRIMERIE\n\nBOLS-WITTOUCK.\n\n\x0cï\n\nSo\ni\n»\n\n1\n\nV\n\nFu\n\nte\n\nG)\n\ni\n\nÛ\n\n\x0cVILLE DE BRUXELLES.\n\nBULLETIN\n\nCOMMUNAL.\n\nA N N É E 1870.\n\nNUMÉRO 1 .\n\nSAMEDI 1\n\ner\n\nCONSEIL\n\ner\n\nJANVIER.\n\nCOMMUNAL.\n\nSéance du 1 janvier 1870.\ner\n\nPrésidence de M. JULES ANSPACH, Bourgmestre.\n\n— Prestation de serment et installa'

# Vectorisation du texte

In [17]:
def process_text(text, stem=True):
    """ Tokenize text and remove punctuation """
    text = text.translate(string.punctuation)
    tokens = word_tokenize(text)
    return tokens

In [18]:
vectorizer = TfidfVectorizer(tokenizer=process_text,
                                stop_words=stopwords.words('french'),
                                max_df=0.5,
                                min_df=0.1,
                                lowercase=True)

In [19]:
%time tfidf_vectors = vectorizer.fit_transform(texts)

Wall time: 1min 6s


In [20]:
tfidf_vectors

<111x8170 sparse matrix of type '<class 'numpy.float64'>'
	with 199143 stored elements in Compressed Sparse Row format>

In [21]:
# Exemple de vecteur TFIDF
pd.Series(
    tfidf_vectors[0].toarray()[0],
    index=vectorizer.get_feature_names()
    ).sort_values(ascending=False)

bochart        0.371732
pavés          0.176135
statue         0.163264
anglaise       0.157298
jottrand       0.150558
                 ...   
lorsqu'ils     0.000000
lorsqu'un      0.000000
los            0.000000
lotissement    0.000000
#              0.000000
Length: 8170, dtype: float64

# Clustering des vecteurs TFIDF

In [22]:
km_model = KMeans(n_clusters=N_CLUSTERS)

In [23]:
km_model.fit(tfidf_vectors)

KMeans(n_clusters=5)

In [24]:
clustering = collections.defaultdict(list)

for idx, label in enumerate(km_model.labels_):
    clustering[label].append(files[idx])

In [25]:
pprint(dict(clustering))

{0: ['Bxl_1872_Tome_II1_Part_4.txt',
     'Bxl_1872_Tome_II1_Part_5.txt',
     'Bxl_1873_Tome_I1_Part_1.txt',
     'Bxl_1873_Tome_I1_Part_2.txt',
     'Bxl_1873_Tome_I1_Part_3.txt',
     'Bxl_1874_Tome_I1_Part_1.txt',
     'Bxl_1874_Tome_I1_Part_2.txt',
     'Bxl_1874_Tome_I1_Part_3.txt',
     'Bxl_1874_Tome_I1_Part_4.txt',
     'Bxl_1874_Tome_I1_Part_5.txt',
     'Bxl_1876_Tome_I1_Part_1.txt',
     'Bxl_1876_Tome_I1_Part_2.txt',
     'Bxl_1876_Tome_I1_Part_3.txt',
     'Bxl_1876_Tome_II1_Part_1.txt',
     'Bxl_1876_Tome_II1_Part_4.txt',
     'Bxl_1876_Tome_II1_Part_5.txt',
     'Bxl_1877_Tome_I1_Part_1.txt',
     'Bxl_1877_Tome_I1_Part_2.txt',
     'Bxl_1877_Tome_I1_Part_3.txt',
     'Bxl_1877_Tome_I1_Part_4.txt',
     'Bxl_1877_Tome_II1_Part_1.txt',
     'Bxl_1877_Tome_II1_Part_2.txt',
     'Bxl_1877_Tome_II1_Part_6.txt',
     'Bxl_1877_Tome_II1_Part_7.txt',
     'Bxl_1878_Tome_I1_Part_1.txt',
     'Bxl_1878_Tome_I1_Part_2.txt',
     'Bxl_1878_Tome_I1_Part_3.txt',
     'Bxl_1878_Tome