In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
# Data files are resolved relative to the kernel's current working directory.
path = os.getcwd()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn


# Latent Dirichlet Allocation

In [12]:
# Load the lemmatized corpus and normalise the dtypes used downstream.
# - The stray 'Unnamed: 0' column (a saved index) is dropped when present.
# - 'Date' is parsed with pd.to_datetime: casting to the unit-less
#   'datetime64' dtype is rejected by pandas >= 2.0.
# - Missing text is replaced by an empty string *before* the cast to str,
#   otherwise NaN becomes the literal token 'nan' and pollutes the TF-IDF
#   vocabulary.
df = (
    pd.read_csv(os.path.join(path, 'lem_clean_version_8.csv'), low_memory=True)
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        statement=lambda frame: frame['statement'].fillna('').astype(str),
        lemmatized=lambda frame: frame['lemmatized'].fillna('').astype(str),
        chair_in_charge=lambda frame: frame['chair_in_charge'].astype(str),
    )
)

In [13]:
# Inspect column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156082 entries, 0 to 156081
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   index              156082 non-null  int64         
 1   Date               156082 non-null  datetime64[ns]
 2   interlocutor_name  156080 non-null  object        
 3   statement _size    156082 non-null  int64         
 4   statement          156082 non-null  object        
 5   statement_number   156082 non-null  object        
 6   chair_in_charge    156082 non-null  object        
 7   score_academ       156082 non-null  float64       
 8   score_hostile      156082 non-null  float64       
 9   score_econo        156082 non-null  float64       
 10  score_virtue       156082 non-null  float64       
 11  score_vice         156082 non-null  float64       
 12  score_hawkish      156082 non-null  float64       
 13  score_posi         156082 non-null  float64 

Fonctions de visualisation de la LDA:

In [17]:
def LDA_fit(series, n_topic):
    """Fit an LDA topic model on a text collection and prepare its pyLDAvis view.

    The documents are vectorised with TF-IDF (terms appearing in more than
    95% of the documents or in fewer than 2 documents are ignored, and the
    vocabulary is capped at 1000 features). A LatentDirichletAllocation model
    with ``n_topic`` components is then fitted using the online learning
    method, 5 iterations and a fixed random seed.

    Parameters
    ----------
    series : iterable of str
        One string per document, e.g. a pandas Series of lemmatized text.
    n_topic : int
        Number of topics to model.

    Returns
    -------
    The object produced by ``pyLDAvis.sklearn.prepare`` for the fitted model,
    the TF-IDF matrix and the vectorizer.
    """
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000)
    tfidf = tfidf_vectorizer.fit_transform(series)

    lda = LatentDirichletAllocation(
        n_components=n_topic,
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=0,
    )
    lda.fit(tfidf)

    # The original body also instantiated a second, never-used estimator (a
    # pasted repr of the fitted model) and fetched the feature names without
    # using them; both were dead code and have been removed.
    return pyLDAvis.sklearn.prepare(lda, tfidf, tfidf_vectorizer)



def LDA_on_dataset(series, n_topic):
    """Fit an LDA topic model on ``series`` and prepare its pyLDAvis view.

    This function was a line-for-line copy of ``LDA_fit``; the only
    difference was a call to ``fit_transform`` whose result was discarded.
    It now delegates to ``LDA_fit`` so the pipeline is defined in one place,
    and is kept under its own name for existing callers.

    Parameters
    ----------
    series : iterable of str
        One string per document, e.g. a pandas Series of lemmatized text.
    n_topic : int
        Number of topics to model.

    Returns
    -------
    The object produced by ``pyLDAvis.sklearn.prepare`` (see ``LDA_fit``).
    """
    return LDA_fit(series, n_topic)



## Choix des documents

Chaque *statement* est un document : 

In [15]:
# Keep only the statements whose meeting date falls in 2014.
df1 = df.loc[df['Date'].dt.year.eq(2014)]
df1.head()

Unnamed: 0,index,Date,interlocutor_name,statement _size,statement,statement_number,chair_in_charge,score_academ,score_hostile,score_econo,score_virtue,score_vice,score_hawkish,score_posi,score_affi,score_uncert,lemmatized
123499,123556,2014-01-29,DUDLEY,13,thought wed couple hours laughter,statement_0,CHAIRMAN BERNANKE,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,-0.0,thought wed couple hour laughter
123500,123557,2014-01-29,BERNANKE,66,thank thank much good afternoon welcome annual...,statement_1,CHAIRMAN BERNANKE,0.030303,0.030303,0.015152,0.030303,0.030303,0.0,1.0,1.0,-0.0,thank thank good afternoon welcome annual org...
123501,123558,2014-01-29,TARULLO,74,thank chairman impending change leadership pos...,statement_2,CHAIRMAN BERNANKE,0.013514,0.013514,0.040541,0.027027,0.027027,0.0,0.0,0.0,-0.0,thank chairman impending change leadership po...
123502,123559,2014-01-29,STEIN,7,would like nominate ben bernanke,statement_3,CHAIRMAN BERNANKE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,like nominate ben bernanke
123503,123560,2014-01-29,TARULLO,4,second,statement_4,CHAIRMAN BERNANKE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,second


In [18]:
# Statement-level model: every 2014 statement is one document, 5 topics.
print('Statement : \n')
pyLDAvis.enable_notebook()
LDA_fit(df1['lemmatized'], n_topic=5)

Statement : 



  default_term_info = default_term_info.sort_values(


On agrège ensuite les statements par meeting (donc par date) : chaque document est la concaténation de tous les statements prononcés à la même date.

In [24]:
# One document per meeting: concatenate the lemmatized text of every
# statement sharing the same date. The text column is selected before
# aggregating so the function only receives that column; DataFrame-level
# groupby.apply that touches the grouping columns is deprecated in recent
# pandas releases.
df2 = df.groupby('Date')['lemmatized'].agg(' '.join)
df2.head()

Date
1976-08-17     ready start meeting morning item business act...
1976-09-21     meeting way good approval minute august meeti...
1976-10-19     let outset oconnell brief report unintelligib...
1976-11-16     item thought best executive session discus li...
1976-12-21     discussing legislative administrative detail ...
dtype: object

In [25]:
# Meeting-level model: each document is a whole meeting (df2), 5 topics.
# The header previously read 'Statement', copied from the statement-level
# cell, which mislabelled this visualisation.
print('Meeting : \n')
pyLDAvis.enable_notebook()
LDA_fit(df2, 5)

Statement : 



  default_term_info = default_term_info.sort_values(


In [19]:
# One document per (interlocutor, meeting): concatenate everything a given
# speaker said on a given date. Rows with a missing interlocutor_name are
# left out by groupby's default handling of NaN keys.
df3 = df.groupby(['interlocutor_name', 'Date'])['lemmatized'].agg(' '.join)

In [20]:
# Preview the first (interlocutor_name, Date) documents.
df3.head()

interlocutor_name  Date      
AARONSON           2008-12-16     referring exhibit follow green nonfinancial c...
AHMED              2008-12-16     referring exhibit follow blue international o...
ALEXANDER          1999-02-03     main change forecast international driven eve...
ALTMA              1979-02-06     vice chairman volcker yes president baughman ...
                   1979-03-20     secretary murray altmann assistant secretary ...
dtype: object

In [22]:
# Speaker-level model: each document is everything one interlocutor said
# during one meeting (df3), 5 topics. The header previously read
# 'Statement', copied from the statement-level cell.
print('Interlocutor per meeting : \n')
pyLDAvis.enable_notebook()
LDA_fit(df3, 5)

Statement : 



  default_term_info = default_term_info.sort_values(
