In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk.corpus import stopwords


import re

### Preprocessing

In [25]:
df = pd.read_csv('comments.csv')

In [26]:
df['event'] = df['event'].map(lambda x : re.sub('[,.\/!?]', '', x))
df['event'] = df['event'].map(lambda x : re.sub('\d', '', x))

In [27]:
df['event'] = df['event'].map(lambda x : word_tokenize(x))

In [28]:
french_stopword = stopwords.words('french')

In [29]:
df['event'] = df['event'].apply(lambda x : [element for element in x if element not in french_stopword])

In [30]:
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('french')

In [31]:
df['event'] = df['event'].apply(lambda x : [stemmer.stem(element) for element in x])

### TF-IDF

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
vectorizer = TfidfVectorizer()

vectors = vectorizer.fit_transform(df['event'].map(lambda x : ' '.join(x)))

### LDA sous sklearn

In [46]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()

count_data = count_vectorizer.fit_transform(df['event'].map(lambda x : ' '.join(x)))

In [48]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

lda = LDA(n_components = 5, random_state=42, learning_method= 'online')

In [49]:
lda.fit(count_data)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [50]:
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [53]:
print_topics(lda, count_vectorizer, 5)


Topic #0:
souhait cred compt financ fair

Topic #1:
eur apport interess interesse proposit

Topic #2:
souhait lo financ pai fair

Topic #3:
apport desir financ mois san

Topic #4:
offre financ cred interess souhait
