In [8]:
import collections
import os
import string
import sys

import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emeld\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
data_path = "../data/txt/"

# Choix d'une décennie et du nombre de clusters

In [11]:
DECADE = '1920'
N_CLUSTERS = 10

# Chargement des fichiers de la décennie

In [12]:
files = [f for f in sorted(os.listdir(data_path)) if f"_{DECADE[:-1]}" in f]

In [8]:
# Exemple de fichiers
files

['Bxl_1920_Tome_I2_Part_1.txt',
 'Bxl_1920_Tome_I2_Part_10.txt',
 'Bxl_1920_Tome_I2_Part_11.txt',
 'Bxl_1920_Tome_I2_Part_12.txt',
 'Bxl_1920_Tome_I2_Part_13.txt',
 'Bxl_1920_Tome_I2_Part_14.txt',
 'Bxl_1920_Tome_I2_Part_15.txt',
 'Bxl_1920_Tome_I2_Part_16.txt',
 'Bxl_1920_Tome_I2_Part_17.txt',
 'Bxl_1920_Tome_I2_Part_18.txt',
 'Bxl_1920_Tome_I2_Part_19.txt',
 'Bxl_1920_Tome_I2_Part_2.txt',
 'Bxl_1920_Tome_I2_Part_3.txt',
 'Bxl_1920_Tome_I2_Part_4.txt',
 'Bxl_1920_Tome_I2_Part_5.txt',
 'Bxl_1920_Tome_I2_Part_6.txt',
 'Bxl_1920_Tome_I2_Part_7.txt',
 'Bxl_1920_Tome_I2_Part_8.txt',
 'Bxl_1920_Tome_I2_Part_9.txt',
 'Bxl_1921_Tome_I1_Part_1.txt',
 'Bxl_1921_Tome_I1_Part_2.txt',
 'Bxl_1921_Tome_I1_Part_3.txt',
 'Bxl_1921_Tome_I1_Part_4.txt',
 'Bxl_1921_Tome_I1_Part_5.txt',
 'Bxl_1921_Tome_I1_Part_6.txt',
 'Bxl_1921_Tome_I1_Part_7.txt',
 'Bxl_1921_Tome_I1_Part_8.txt',
 'Bxl_1921_Tome_I1_Part_9.txt',
 'Bxl_1922_Tome_II1_Part_1.txt',
 'Bxl_1922_Tome_II1_Part_2.txt',
 'Bxl_1922_Tome_II1_Part_3.t

In [13]:
texts = [open(data_path + f, encoding='utf8', errors='ignore').read() for f in files]

In [14]:
# Exemple de textes
texts[0][:400]

'\x0cVILLE\n\nDÈ\n\nBRUXELLES\n\nBULLETIN COMMUNAL\nA N N É E\n\nPREMIÈRE\nTOME\n\nCOMPTE\n\nRENDU\n\n1920\n\nPARTIE\nII\n\nDES\n\nSÉANCES\n\nBRUXELLES\nTYPOGRAPHIE ET LITHOGRAPHIE E. GUYOT\n12, rue P a c l i é c o ,\n\n1920\n\n12\n\n\x0c\x0cCOMPTE RENDU DE L A SÉANCE D U 1 2 J U I L L E T 1 9 2 0 .\n\nVILLE\n\nD E\n\nBULLETIN\n\nCOMMUNAL\n\nAnnée\n\nC O N S E I L\nSéance\n\nB R U X E L L E S\n\n1920.\n\nC O M M U N A L\ndu 1 2 Juillet\n\n1920.\n\nPrésidence de M'

# Vectorisation du texte

In [15]:
def process_text(text, stem=True):
    """ Tokenize text and remove punctuation """
    text = text.translate(string.punctuation)
    tokens = word_tokenize(text)
    return tokens

In [16]:
vectorizer = TfidfVectorizer(tokenizer=process_text,
                                stop_words=stopwords.words('french'),
                                max_df=0.5,
                                min_df=0.1,
                                lowercase=True)

In [17]:
%time tfidf_vectors = vectorizer.fit_transform(texts)

Wall time: 47.6 s


In [18]:
tfidf_vectors

<178x6665 sparse matrix of type '<class 'numpy.float64'>'
	with 251545 stored elements in Compressed Sparse Row format>

In [19]:
# Exemple de vecteur TFIDF
pd.Series(
    tfidf_vectors[0].toarray()[0],
    index=vectorizer.get_feature_names()
    ).sort_values(ascending=False)

camille       0.236540
huysmans      0.231977
conrardy      0.221826
hallet        0.188366
bremaecker    0.153632
                ...   
literies      0.000000
litre         0.000000
litres        0.000000
lits          0.000000
#             0.000000
Length: 6665, dtype: float64

# Comprendre les vecteurs et leurs "distances"

In [20]:
from scipy.spatial.distance import cosine

In [21]:
cosine([1, 2, 3], [1, 2, 3])

0.0

In [22]:
cosine([1, 2, 3], [1, 2, 2])

0.02004211298777725

In [23]:
cosine([1, 2, 3], [2, 2, 2])

0.07417990022744858

In [24]:
tfidf_array = tfidf_vectors.toarray()

In [25]:
tfidf_array[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [26]:
texts[0][:200]

'\x0cVILLE\n\nDÈ\n\nBRUXELLES\n\nBULLETIN COMMUNAL\nA N N É E\n\nPREMIÈRE\nTOME\n\nCOMPTE\n\nRENDU\n\n1920\n\nPARTIE\nII\n\nDES\n\nSÉANCES\n\nBRUXELLES\nTYPOGRAPHIE ET LITHOGRAPHIE E. GUYOT\n12, rue P a c l i é c o ,\n\n1920\n\n12\n\n\x0c\x0cC'

In [27]:
tfidf_array[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [28]:
cosine(tfidf_array[0], tfidf_array[1])

0.8879884802924477

# Clustering des vecteurs TFIDF

Article intéressant sur le KMeans clustering :
- https://medium.com/dataseries/k-means-clustering-explained-visually-in-5-minutes-b900cc69d175

In [29]:
km_model = KMeans(n_clusters=N_CLUSTERS)

In [30]:
km_model.fit(tfidf_vectors)

KMeans(n_clusters=10)

In [31]:
clustering = collections.defaultdict(list)

for idx, label in enumerate(km_model.labels_):
    clustering[label].append(files[idx])

In [28]:
pprint(dict(clustering))

{0: ['Bxl_1920_Tome_I2_Part_10.txt',
     'Bxl_1920_Tome_I2_Part_11.txt',
     'Bxl_1920_Tome_I2_Part_9.txt',
     'Bxl_1925_Tome_II1_2_Part_3.txt',
     'Bxl_1925_Tome_II1_2_Part_4.txt',
     'Bxl_1925_Tome_II1_2_Part_5.txt',
     'Bxl_1925_Tome_II1_2_Part_6.txt',
     'Bxl_1926_Tome_I2_Part_12.txt',
     'Bxl_1926_Tome_I2_Part_13.txt',
     'Bxl_1926_Tome_I2_Part_14.txt',
     'Bxl_1926_Tome_II1_Part_12.txt',
     'Bxl_1926_Tome_II1_Part_13.txt',
     'Bxl_1926_Tome_II1_Part_14.txt',
     'Bxl_1928_Tome_II1_Part_10.txt',
     'Bxl_1928_Tome_II1_Part_9.txt'],
 1: ['Bxl_1921_Tome_I1_Part_1.txt',
     'Bxl_1921_Tome_I1_Part_6.txt',
     'Bxl_1923_Tome_I1_Part_9.txt',
     'Bxl_1923_Tome_I2_Part_2.txt',
     'Bxl_1924_Tome_I1_Part_5.txt',
     'Bxl_1924_Tome_I1_Part_6.txt',
     'Bxl_1924_Tome_I1_Part_7.txt',
     'Bxl_1926_Tome_I2_Part_3.txt',
     'Bxl_1926_Tome_II1_Part_3.txt',
     'Bxl_1927_Tome_I_Part_9.txt',
     'Bxl_1929_Tome_I_Part_4.txt'],
 2: ['Bxl_1927_Tome_I_Part_1.txt',
  

In [57]:
cluster_0 =  ['Bxl_1920_Tome_I2_Part_10.txt',
     'Bxl_1920_Tome_I2_Part_11.txt',
     'Bxl_1920_Tome_I2_Part_9.txt',
     'Bxl_1925_Tome_II1_2_Part_3.txt',
     'Bxl_1925_Tome_II1_2_Part_4.txt',
     'Bxl_1925_Tome_II1_2_Part_5.txt',
     'Bxl_1925_Tome_II1_2_Part_6.txt',
     'Bxl_1926_Tome_I2_Part_12.txt',
     'Bxl_1926_Tome_I2_Part_13.txt',
     'Bxl_1926_Tome_I2_Part_14.txt',
     'Bxl_1926_Tome_II1_Part_12.txt',
     'Bxl_1926_Tome_II1_Part_13.txt',
     'Bxl_1926_Tome_II1_Part_14.txt',
     'Bxl_1928_Tome_II1_Part_10.txt',
     'Bxl_1928_Tome_II1_Part_9.txt']

In [59]:
this_file = cluster_0
this_file

['Bxl_1920_Tome_I2_Part_10.txt', 'Bxl_1920_Tome_I2_Part_11.txt', 'Bxl_1920_Tome_I2_Part_9.txt', 'Bxl_1925_Tome_II1_2_Part_3.txt', 'Bxl_1925_Tome_II1_2_Part_4.txt', 'Bxl_1925_Tome_II1_2_Part_5.txt', 'Bxl_1925_Tome_II1_2_Part_6.txt', 'Bxl_1926_Tome_I2_Part_12.txt', 'Bxl_1926_Tome_I2_Part_13.txt', 'Bxl_1926_Tome_I2_Part_14.txt', 'Bxl_1926_Tome_II1_Part_12.txt', 'Bxl_1926_Tome_II1_Part_13.txt', 'Bxl_1926_Tome_II1_Part_14.txt', 'Bxl_1928_Tome_II1_Part_10.txt', 'Bxl_1928_Tome_II1_Part_9.txt']

In [None]:
import os
import yake

ignored = set(["conseil communal", "conseil général"])

kw_extractor = yake.KeywordExtractor(lan="fr", top=20)
data_path = "data/txt/"
files = os.listdir(data_path)
texts = [open(data_path + f, encoding='utf8', errors='ignore').read() for f in files]

data_path = "data/txt/"
files = os.listdir(data_path)
for f in sorted(files):
    if f.startswith("Bxl_"):
        text = open(data_path + f).read()
        keywords = kw_extractor.extract_keywords(text)
        kept = []
        for score, kw in keywords:
            words = kw.split()
            if len(words) > 1 and kw not in ignored: # only bigrams and more
                kept.append(kw)
        print(f"{f} mentions these keywords: {', '.join(kept)}...")
