In [22]:
import ast

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [23]:
import gensim

In [24]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt


In [25]:
import plotly.express as px
import plotly.graph_objects as go

In [26]:
import warnings
# Silences every warning category for the rest of the session, including
# deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [27]:
# Registers the `progress_apply` methods that later cells call on pandas objects.
tqdm.pandas()

# Loading Data

In [28]:
# Load the preprocessed corpus; later cells split it into newspaper articles
# ("Le Monde", "Le Figaro") and party manifestos (everything else).
df = pd.read_csv('./manifesto_articles_preprocessed.csv')

In [29]:
# Rename the CSV's 'index' column so it does not collide with the 'index'
# columns created by the reset_index() calls further down.
df = df.rename(columns={'index': 'id_fichier'})

In [30]:
# Each cell of `words` is a string read from the CSV; parse it back into a
# Python object. ast.literal_eval only accepts literals, so unlike eval() it
# cannot execute code that might be embedded in the file.
df.words = df.words.progress_apply(ast.literal_eval)

  0%|          | 0/69935 [00:00<?, ?it/s]

In [31]:
# Remove the arrow and curly-quote tokens from every token list.
df.words = df.words.progress_apply(
    lambda tokens: [tok for tok in tokens if tok not in ["→", "”", "“"]]
)

  0%|          | 0/69935 [00:00<?, ?it/s]

In [32]:
# Rows whose partyname is not one of the two newspapers are the manifestos.
manifestos = df.loc[~df['partyname'].isin(["Le Monde", "Le Figaro"])].reset_index(drop=True)

In [33]:
# Rows published by "Le Monde" or "Le Figaro" are the newspaper articles.
articles = df.loc[df['partyname'].isin(["Le Monde", "Le Figaro"])].reset_index(drop=True)

In [34]:
# Turn the fresh 0..n-1 row number into a regular 'index' column.
manifestos = manifestos.reset_index(drop=False)

In [35]:
# The row number becomes the manifesto identifier used for grouping and merging.
manifestos = manifestos.rename(columns={'index': 'id_manifesto'})

In [36]:
# Maps each party name found in the data to the short label used for grouping
# and colouring. Several names share one label, so they are treated as one party.
dict_party = dict([
    ('Socialist Party', 'PS'),
    ('Europe Ecology - The Greens', 'EELV'),
    ('The Republicans', 'RPR/UMP/LR'),
    ('National Front', 'FN'),
    ('Democratic Movement', 'MoDem/EM'),
    ('Republic Onwards!', 'MoDem/EM'),
    ('Left Front', 'PCF/PG/LFI'),
    ('Indomitable France', 'PCF/PG/LFI'),
    ('French Communist Party', 'PCF/PG/LFI'),
])

In [37]:
# Replace each full party name by its short label; a name missing from
# dict_party raises KeyError.
manifestos['partyname'] = manifestos['partyname'].apply(dict_party.__getitem__)

In [38]:
# Turn the fresh 0..n-1 row number into a regular 'index' column.
articles = articles.reset_index(drop=False)

In [39]:
# The row number becomes the article identifier used for grouping and merging.
articles = articles.rename(columns={'index': 'id_article'})

# Processing

Splitting each manifesto and each article into 100-token documents

In [40]:
# Cut every manifesto into consecutive windows of at most 100 tokens. Each
# window is stored as [id_manifesto, tokens] so it can be traced back later.
manifestos['chunks'] = manifestos.apply(
    lambda row: [
        [row['id_manifesto'], row['words'][offset:offset + 100]]
        for offset in range(0, len(row['words']), 100)
    ],
    axis=1,
)

In [41]:
# Cut every article into consecutive windows of at most 100 tokens. Each
# window is stored as [id_article, tokens] so it can be traced back later.
articles['chunks'] = articles.progress_apply(
    lambda row: [
        [row['id_article'], row['words'][offset:offset + 100]]
        for offset in range(0, len(row['words']), 100)
    ],
    axis=1,
)

  0%|          | 0/69912 [00:00<?, ?it/s]

In [98]:
# Flatten the per-manifesto chunk lists into one row per chunk.
chunks_manifesto = pd.DataFrame(
    [chunk for doc_chunks in manifestos['chunks'] for chunk in doc_chunks]
)
chunks_manifesto.columns = ['id_manifesto', 'words']

In [99]:
# Flatten the per-article chunk lists into one row per chunk.
chunks_articles = pd.DataFrame(
    [chunk for doc_chunks in articles['chunks'] for chunk in doc_chunks]
)
chunks_articles.columns = ['id_article', 'words']

Creating bigrams

In [44]:
# Learn bigram collocations from the manifesto chunks.
# The cell previously read `chunked.words`, but `chunked` is not defined
# anywhere in this notebook, so it failed on a fresh kernel. `chunks_manifesto`
# is the frame the dictionary and the LDA corpus are built from.
# NOTE(review): confirm the phrase model was meant to be trained on manifestos only.
bigram = gensim.models.Phrases(chunks_manifesto.words, min_count=5, threshold=100)

Freezing the trained bigram model into a Phraser and applying it to every 100-token document

In [45]:
# Wrap the trained Phrases model in a Phraser; later cells apply it to a token
# list with `bigram_mod[tokens]`.
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [100]:
# Run the phrase model over the tokens of every manifesto chunk.
chunks_manifesto['bigrams'] = chunks_manifesto['words'].apply(
    lambda tokens: bigram_mod[tokens]
)

In [101]:
# Run the same phrase model over the tokens of every article chunk.
chunks_articles['bigrams'] = chunks_articles['words'].progress_apply(
    lambda tokens: bigram_mod[tokens]
)

  0%|          | 0/153836 [00:00<?, ?it/s]

# LDA

In [102]:
# The vocabulary is built from the manifesto chunks only; article chunks are
# encoded with this same dictionary further down.
dictionary = gensim.corpora.Dictionary(documents=chunks_manifesto['bigrams'])

In [103]:
# Vocabulary size before pruning.
len(dictionary)

9776

In [104]:
# Prune the vocabulary in place. no_above=0.5 removes tokens present in more
# than half of the chunks; the other filter_extremes arguments are not passed,
# so gensim's defaults for them apply as well.
dictionary.filter_extremes(no_above=0.5)
# Vocabulary size after pruning.
len(dictionary)

3205

In [105]:
# Encode every manifesto chunk as a bag of words over the pruned dictionary.
chunks_manifesto['corpus'] = chunks_manifesto['bigrams'].apply(
    lambda doc: dictionary.doc2bow(doc)
)

In [106]:
# Encode every article chunk with the same (manifesto-derived) dictionary.
chunks_articles['corpus'] = chunks_articles['bigrams'].progress_apply(
    lambda doc: dictionary.doc2bow(doc)
)

  0%|          | 0/153836 [00:00<?, ?it/s]

In [120]:
def compute_coherence_values(start, limit, step):
    """Train one LDA model per topic count and score each with c_v coherence.

    The topic counts tried are ``range(start, limit + step, step)``, so
    ``limit`` itself is included whenever ``limit - start`` is a multiple of
    ``step``.

    The function reads two notebook globals: ``chunks_manifesto`` (its
    ``corpus`` and ``bigrams`` columns) and ``dictionary``. It previously read
    them from ``chunked``, a name that is not defined anywhere in the notebook
    and therefore raised NameError on a fresh kernel.

    Parameters
    ----------
    start : int
        First number of topics to try.
    limit : int
        Upper bound of the grid (see above for inclusiveness).
    step : int
        Increment between two consecutive topic counts.

    Returns
    -------
    tuple of (list, list)
        ``model_list`` holds the trained models and ``coherence_values`` the
        matching coherence scores, both in the order of the grid.
    """
    coherence_values = []
    model_list = []
    for num_topics in tqdm(range(start, limit + step, step)):
        # random_state is fixed so that a given topic count always yields the
        # same model across runs.
        model = gensim.models.ldamodel.LdaModel(corpus=chunks_manifesto.corpus,
                                                id2word=dictionary,
                                                chunksize=100,
                                                alpha='auto',
                                                eta='auto',
                                                iterations=400,
                                                num_topics=num_topics,
                                                passes=20,
                                                eval_every=None,
                                                random_state=42,
                                                minimum_probability=0)
        model_list.append(model)
        coherencemodel = gensim.models.CoherenceModel(model=model,
                                                      texts=chunks_manifesto.bigrams,
                                                      dictionary=dictionary,
                                                      coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [54]:
# Grid search over 12, 14, ..., 26 topics (8 models are trained).
model_list, coherence_values = compute_coherence_values(start=12, limit=26, step=2)

  0%|          | 0/8 [00:00<?, ?it/s]

In [60]:
# Rebuild the grid passed to compute_coherence_values so that the x axis
# shows the number of topics of each model.
start = 12
limit = 26
step = 2
x = range(start, limit + step, step)
px.line(x=x, y=coherence_values)

In [61]:
# Position 5 in the 12, 14, ..., 26 grid corresponds to 22 topics.
model = model_list[5] #22 topics

In [198]:
# Train the retained 22-topic configuration directly, with the same
# hyper-parameters and seed as the grid search, so this cell does not depend
# on model_list. The corpus is read from `chunks_manifesto`; the former
# `chunked` name is not defined in this notebook and raised NameError on a
# fresh kernel.
model = gensim.models.ldamodel.LdaModel(corpus=chunks_manifesto.corpus,
                                        id2word=dictionary,
                                        chunksize=100,
                                        alpha='auto',
                                        eta='auto',
                                        iterations=400,
                                        num_topics=22,
                                        passes=20,
                                        eval_every=None,
                                        random_state=42,
                                        minimum_probability=0)

In [199]:
# c_v coherence of the selected model, computed on the manifesto chunks.
# `chunks_manifesto` replaces the undefined `chunked` name, which raised
# NameError on a fresh kernel.
coherencemodel = gensim.models.CoherenceModel(model=model,
                                              texts=chunks_manifesto.bigrams,
                                              dictionary=dictionary,
                                              coherence='c_v')

In [200]:
# Display the coherence score of the selected model.
coherencemodel.get_coherence()

0.4608623579399778

In [201]:
# Let pyLDAvis render its visualisations inline in the notebook.
pyLDAvis.enable_notebook()

In [202]:
# Prepare the interactive topic view from the model, the manifesto corpus and
# the dictionary. sort_topics=False presumably keeps the topics in the model's
# own order so ids match the tables below — verify against the pyLDAvis docs.
vis = pyLDAvis.gensim_models.prepare(model, chunks_manifesto.corpus, dictionary,sort_topics=False)

In [203]:
# Display the prepared visualisation.
vis

In [204]:
# One row per topic: its id and its 20 highest-weighted terms, comma-separated.
topics = pd.DataFrame([
    {
        'id_topic': topic_id,
        'keywords': ', '.join(term for term, _ in model.show_topic(topic_id, topn=20)),
    }
    for topic_id in range(model.num_topics)
])

In [205]:
# Display the keyword table used to name the topics by hand.
topics

Unnamed: 0,id_topic,keywords
0,0,"école, établissement, scolaire, élève, ville, ..."
1,1,"logement, revenu, enfant, famille, droit, femm..."
2,2,"européen, pays, politique, international, mond..."
3,3,"entreprise, travail, emploi, professionnel, ac..."
4,4,"santé, soin, médical, hôpital, prison, prise_c..."
5,5,"social, pourcent, place, an, grand, aide, mesu..."
6,6,"frontière, extérieur, pluralisme, capable, con..."
7,7,"paix, urgence, maladie, assurance, médecine, r..."
8,8,"retraite, régime, prix, obligatoire, génératio..."
9,9,"territoire, national, français, recherche, pla..."


In [None]:
# Cap the text shown per column in DataFrame displays at 50 characters.
pd.options.display.max_colwidth=50

Interpreting topics

In [206]:
# Hand-written label for each LDA topic; the position in the list is the topic id.
# NOTE(review): 'emploie' and 'indentité' look like misspellings of 'emploi'
# and 'identité'. They are left untouched because these strings are displayed
# on the figures.
dict_topics = dict(enumerate([
    'éducation',                    # 0
    'famille',                      # 1
    'international/europe',         # 2
    'emploie/salaire',              # 3
    'santé',                        # 4
    'cotisation',                   # 5
    'frontière/indentité',          # 6
    'protection sociale/médecine',  # 7
    'retraite',                     # 8
    'plan',                         # 9
    'énergie',                      # 10
    'écologie/agriculture',         # 11
    'service public',               # 12
    'loi/justice',                  # 13
    'immigration',                  # 14
    'enseignement supérieur',       # 15
    'administration',               # 16
    'institutions',                 # 17
    'régions/territoire',           # 18
    'valeurs conservatrices',       # 19
    'infrastructure',               # 20
    'environnement/pollution',      # 21
]))

In [208]:
# Attach the hand-written label of every topic; a topic id missing from
# dict_topics raises KeyError.
# (Alternative when no labels exist yet: topics['id_topic'].apply(str).)
topics['topic_name'] = topics['id_topic'].apply(dict_topics.__getitem__)

Inferring the topic distribution of each 100-token document

In [209]:
# Keep only the probabilities from the (topic_id, probability) pairs the model
# returns for each manifesto chunk.
chunks_manifesto['topic_distribution'] = chunks_manifesto['corpus'].progress_apply(
    lambda bow: [pair[1] for pair in model[bow]]
)

  0%|          | 0/1499 [00:00<?, ?it/s]

In [210]:
# Keep only the probabilities from the (topic_id, probability) pairs the model
# returns for each article chunk.
chunks_articles['topic_distribution'] = chunks_articles['corpus'].progress_apply(
    lambda bow: [pair[1] for pair in model[bow]]
)

  0%|          | 0/153836 [00:00<?, ?it/s]

The topic distribution of a manifesto (or of an article) is the average of the topic distributions of its 100-token documents

In [211]:
# Average the chunk-level topic vectors of each manifesto, element by element.
manifesto_topic = (
    pd.DataFrame(
        chunks_manifesto.groupby('id_manifesto').apply(
            lambda grp: np.mean(grp['topic_distribution'].tolist(), axis=0).tolist()
        )
    )
    .reset_index()
    .rename(columns={0: 'topic_distribution'})
)

In [212]:
# Bring back the party label and the year of every manifesto.
manifesto_topic = manifesto_topic.merge(
    manifestos[['id_manifesto', 'partyname', 'annee']],
    on='id_manifesto',
    how='left',
)

In [213]:
# Average the chunk-level topic vectors of each article, element by element.
article_topic = (
    pd.DataFrame(
        chunks_articles.groupby('id_article').apply(
            lambda grp: np.mean(grp['topic_distribution'].tolist(), axis=0).tolist()
        )
    )
    .reset_index()
    .rename(columns={0: 'topic_distribution'})
)

In [214]:
# Bring back the newspaper name and the year of every article.
article_topic = article_topic.merge(
    articles[['id_article', 'partyname', 'annee']],
    on='id_article',
    how='left',
)

In [215]:
# Collapse the articles to one averaged topic vector per newspaper and year.
journal_year_topic = (
    pd.DataFrame(
        article_topic.groupby(['partyname', 'annee']).apply(
            lambda grp: np.mean(grp['topic_distribution'].tolist(), axis=0).tolist()
        )
    )
    .reset_index()
    .rename(columns={0: 'topic_distribution'})
)

# PCA

In [216]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [217]:
from sklearn.preprocessing import normalize

def add_correlation_circle(figure, coeffs, texts, normalization=True, add_circle=True):
    """Draw one labelled red arrow per variable on a 2-D plotly figure.

    Parameters
    ----------
    figure : plotly figure (anything exposing ``add_shape`` / ``add_annotation``)
        Modified in place and also returned.
    coeffs : array with a ``shape`` of (2, n_variables)
        Row 0 gives the x coordinate of each arrow head, row 1 the y coordinate.
    texts : sequence of str
        One label per column of ``coeffs``, read by position. A pandas Series
        is accepted whatever its index: it is converted to a list first,
        because ``series[i]`` is a label lookup and fails or mislabels when
        the index is not 0..n-1.
    normalization : bool, default True
        When true, ``coeffs`` is passed through ``normalize(coeffs, axis=0)``
        before drawing.
    add_circle : bool, default True
        When true, a blue circle spanning -1..1 on both axes is drawn first.

    Returns
    -------
    The ``figure`` that was passed in.

    Raises
    ------
    ValueError
        If there are fewer labels than columns in ``coeffs``. Raised before
        the figure is touched.
    """
    labels = list(texts)
    n_variables = coeffs.shape[1]
    if len(labels) < n_variables:
        raise ValueError(
            f"expected at least {n_variables} labels, got {len(labels)}"
        )

    if add_circle:
        figure.add_shape(type="circle",
                         xref="x", yref="y",
                         x0=-1, y0=-1, x1=1, y1=1,
                         line_color="blue"
                        )
    if normalization:
        coeffs = normalize(coeffs,axis=0)
    for i in range(n_variables):
        # The arrow itself: an annotation without text, tail at the origin.
        figure.add_annotation(
            x=coeffs[0,i],  # arrows' head
            y=coeffs[1,i],  # arrows' head
            ax=0,  # arrows' tail
            ay=0,  # arrows' tail
            xref='x',
            yref='y',
            axref='x',
            ayref='y',
            text='',  # if you want only the arrow
            showarrow=True,
            arrowhead=2,
            arrowsize=1,
            arrowwidth=2,
            arrowcolor='red'
        )

        # The label, pushed 25% beyond the arrow head so it does not overlap it.
        figure.add_annotation(
            x=coeffs[0,i]*1.25,
            y=coeffs[1,i]*1.25,
            text=labels[i],
            showarrow=False,
            font=dict(size=10,color="red")
        )
    return figure

In [218]:
# One topic vector per manifesto, as a plain list of lists.
data = manifesto_topic['topic_distribution'].tolist()

In [219]:
# Standardise every topic dimension across the manifestos. `data` is rebound
# to the scaled array, so this cell must not be run twice in a row.
scaler = StandardScaler()
data = scaler.fit_transform(data)

In [220]:
# Fit a PCA on the standardised manifesto vectors. No n_components is passed,
# so the PCA() default decides how many components are kept.
pca = PCA()
pca.fit(data)

PCA()

In [221]:
# Share of variance carried by each principal component.
px.line(
    pca.explained_variance_ratio_,
    labels={
        'index': 'Principal Component',
        'value': 'Explained Variance Ratio',
    },
)

In [222]:
# Loadings of the first two principal components (one column per topic).
comp = pca.components_[[0, 1]]

In [223]:
# Square canvas with identical axis ranges so the circle is not distorted.
cir_corr = go.Figure()
cir_corr.update_layout(
    width=600,
    height=600,
    xaxis={'range': [-1.5, 1.5]},
    yaxis={'range': [-1.5, 1.5]},
)

# One arrow per topic, labelled with the hand-written topic names.
cir_corr = add_correlation_circle(
    cir_corr,
    comp,
    topics['topic_name'],
    normalization=True,
    add_circle=True,
)
cir_corr.show()

In [224]:
# Coordinates of every manifesto on the first two principal components;
# the resulting columns are named 0 and 1.
projected = pca.transform(data)
pca_manifesto = pd.DataFrame(projected[:, [0, 1]])
del projected

In [225]:
# Put the party label and the year next to the coordinates (aligned on the row index).
pca_manifesto = pd.concat(
    [pca_manifesto, manifesto_topic[['partyname', 'annee']]],
    axis=1,
)

In [226]:
# Store the year with the generic object dtype instead of a numeric dtype.
pca_manifesto['annee'] = pca_manifesto['annee'].astype(object)

In [227]:
# Remove the row labelled 0 from the plotted points.
# NOTE(review): the reason for excluding this manifesto is not stated anywhere
# in the notebook — document it. Running this cell a second time raises
# KeyError because label 0 no longer exists.
pca_manifesto = pca_manifesto.drop(0)

In [228]:
# Point label of the form "<party>_<year>".
pca_manifesto['name_year'] = (
    pca_manifesto['partyname']
    + '_'
    + pca_manifesto['annee'].apply(str)
)

In [229]:
# Colour of each series on the PCA figures.
dict_color = {
    # political parties
    "PS": "pink",
    "MoDem/EM": "orange",
    "RPR/UMP/LR": "blue",
    "FN": "darkslateblue",
    "EELV": "green",
    "PCF/PG/LFI": "red",
    # newspapers
    "Le Monde": "black",
    "Le Figaro": "cyan",
}

In [230]:
fig = go.Figure()

# One trace per party, in order of first appearance, so each party gets its
# own colour and a line joining its manifestos.
for partname in pca_manifesto['partyname'].unique():
    tmp = pca_manifesto.loc[pca_manifesto['partyname'] == partname]
    fig.add_trace(
        go.Scatter(
            x=tmp[0],
            y=tmp[1],
            mode="lines+markers+text",
            name=partname,
            text=tmp['name_year'],
            marker={'color': dict_color[partname], 'size': 12},
        )
    )

# Axis ranges and titles are left to plotly here; a fixed [-5, 5] window with
# 'PC 1' / 'PC 2' titles was tried and is currently disabled.
fig.update_layout(height=1000, width=1200)

In [243]:
# Same scatter drawn with plotly express, without the connecting lines. This
# rebinds `fig`, replacing the figure built in the previous cell.
fig = px.scatter(pca_manifesto, x=0, y=1, color='partyname', text='name_year', 
                 color_discrete_map=dict_color,
                 labels={"0":"PC 1", "1":"PC 2"},
                 range_x=[-5,5], range_y=[-5,5],
                 height=1000, width=1200)

# Optional overlay of the topic arrows, currently disabled:
#add_correlation_circle(fig, coeffs=comp, texts=topics['topic_name'], normalization=True, add_circle=True)
# Enlarge the markers; the returned figure is what the cell displays.
fig.update_traces(marker=dict(size=12))

In [232]:
# Stack the manifesto vectors and the newspaper-by-year vectors in one frame.
all_docs = pd.concat(
    [manifesto_topic.drop(columns='id_manifesto'), journal_year_topic],
    ignore_index=True,
)

In [233]:
# Point label of the form "<party or newspaper>_<year>".
all_docs['name_year'] = (
    all_docs['partyname']
    + '_'
    + all_docs['annee'].apply(str)
)

In [234]:
# One topic vector per manifesto or newspaper-year, as a plain list of lists.
all_vectors = all_docs['topic_distribution'].tolist()

In [235]:
# Standardise the combined vectors with a scaler fitted on all of them. This
# rebinds `scaler`, so the manifesto-only scaler fitted earlier is lost.
scaler = StandardScaler()
all_vectors = scaler.fit(all_vectors).transform(all_vectors)

In [236]:
# Project every vector on the first two components of the PCA that was fitted
# on the manifestos only.
# NOTE(review): `pca` was fitted on vectors standardised with the
# manifesto-only scaler, whereas `all_vectors` was standardised with a scaler
# refitted on manifestos + newspapers, so the two inputs are not on the same
# scale. Confirm this is intended.
pca_all = pd.DataFrame(pca.transform(all_vectors)[:, [0,1]])

In [237]:
# Put the identifying columns next to the coordinates (aligned on the row index).
pca_all = pd.concat(
    [pca_all, all_docs[['partyname', 'annee', 'name_year']]],
    axis=1,
)

In [238]:
# Remove the row labelled 0 from the plotted points.
# NOTE(review): the reason for excluding this document is not stated anywhere
# in the notebook — document it. Running this cell a second time raises
# KeyError because label 0 no longer exists.
pca_all = pca_all.drop(0)

In [247]:
fig_all = go.Figure()

# One trace per party or newspaper, in order of first appearance. Newspaper
# points carry no permanent text label (only the hover text); party points
# show their "<party>_<year>" label next to the marker.
for partname in pca_all['partyname'].unique():
    tmp = pca_all.loc[pca_all['partyname'] == partname]
    fig_all.add_trace(
        go.Scatter(
            x=tmp[0],
            y=tmp[1],
            mode="markers+text",
            name=partname,
            text=None if partname in ['Le Monde', 'Le Figaro'] else tmp['name_year'],
            marker={'color': dict_color[partname]},
            hovertext=tmp['name_year'],
        )
    )

fig_all.update_layout(
    height=1000,
    width=1200,
    xaxis={'range': [-5, 5], 'title': 'PC 1'},
    yaxis={'range': [-6, 6], 'title': 'PC 2'},
)
# Optional export (disabled): fig_all.write_html('pca_all.html')