In [1]:
import logging
# Timestamped INFO-level logging for the whole notebook.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [2]:
import ast
from collections import Counter, defaultdict

import gensim
import pandas as pd
import plotly.express as px
from tqdm.auto import tqdm

tqdm.pandas()

In [4]:
# Location of the preprocessed manifesto/article corpus, relative to the notebook.
DATA_PATH = './manifesto_articles_preprocessed.csv'
df = pd.read_csv(DATA_PATH)

In [6]:
# Turn each `words` cell from its string form back into a Python object.
# ast.literal_eval only accepts Python literals, so, unlike eval, it cannot
# execute arbitrary expressions that might be present in the CSV.
# NOTE(review): presumably every cell is the repr of a token list - confirm.
df['words'] = df['words'].progress_apply(ast.literal_eval)

  0%|          | 0/69935 [00:00<?, ?it/s]

In [42]:
# Short label -> every full source name that is reported under that label.
_names_by_label = {
    'PS': ['Socialist Party'],
    'EELV': ['Europe Ecology - The Greens'],
    'RPR/UMP/LR': ['The Republicans'],
    'FN': ['National Front'],
    'MoDem/EM': ['Democratic Movement', 'Republic Onwards!'],
    'PCF/PG/LFI': ['Left Front', 'Indomitable France', 'French Communist Party'],
    'Le Monde': ['Le Monde'],
    'Le Figaro': ['Le Figaro'],
}

# Flatten the grouping into the lookup used elsewhere: full name -> short label.
dict_party = {
    full_name: label
    for label, full_names in _names_by_label.items()
    for full_name in full_names
}

In [43]:
# Line colour for each short label, passed to plotly as color_discrete_map.
dict_color = dict([
    ('PS', 'pink'),
    ('MoDem/EM', 'orange'),
    ('RPR/UMP/LR', 'blue'),
    ('FN', 'darkslateblue'),
    ('EELV', 'green'),
    ('PCF/PG/LFI', 'red'),
    ('Le Monde', 'black'),
    ('Le Figaro', 'cyan'),
])

In [44]:
# Replace every full name with its short label from dict_party.
# A name missing from dict_party raises KeyError.
df['partyname'] = df['partyname'].map(lambda full_name: dict_party[full_name])

In [45]:
# Split the corpus by `fichier` code (1, 2, 3, 4), giving each subset a fresh
# 0-based index.
df2002, df2007, df2012, df2017 = (
    df.loc[df['fichier'] == code].reset_index(drop=True)
    for code in (1, 2, 3, 4)
)

## TF-IDF

In [19]:
def tfidf(df, n_party):
    """
    Add TF-IDF cosine similarities to each party manifesto as new columns.

    Input:
        df: DataFrame with at least the columns `words` (an iterable of tokens
            per row), `text` (one raw string per row) and `partyname`.
            The first n_party rows must be the manifestos.
        n_party: number of party manifestos at the top of df.

    Output:
        The same DataFrame object (it is modified in place) with one extra
        column per manifesto, named after that row's `partyname`, holding the
        cosine similarity between every row and that manifesto (tf-idf
        embedding). If two manifestos share a `partyname`, the later one
        overwrites the earlier column.

    Raises:
        ValueError: if n_party is negative or larger than the number of rows.
    """
    if not 0 <= n_party <= len(df):
        raise ValueError(
            f"n_party must be between 0 and len(df)={len(df)}, got {n_party}"
        )

    # Corpus-wide token counts, taken from the pre-tokenised `words` column.
    frequency = Counter(token for words in df.words for token in words)

    # NOTE(review): the counts above come from `words`, while the documents
    # below are re-tokenised from `text`. Tokens that never occur in `words`
    # therefore have a count of 0 and are filtered out. Confirm this is intended.
    texts = [gensim.utils.simple_preprocess(x) for x in df.text]

    # Keep only tokens counted more than once. Counter lookups return 0 for
    # unseen tokens without inserting them.
    processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]

    dictionary = gensim.corpora.Dictionary(processed_corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

    # Named tfidf_model so the local does not shadow this function's own name.
    tfidf_model = gensim.models.TfidfModel(bow_corpus)
    index = gensim.similarities.SparseMatrixSimilarity(
        tfidf_model[bow_corpus], num_features=len(dictionary)
    )

    # Manifestos are addressed by position, matching bow_corpus, so the result
    # does not depend on df carrying a default 0..n-1 index.
    party_labels = df['partyname'].iloc[:n_party].tolist()
    for position, label in enumerate(party_labels):
        df[label] = index[tfidf_model[bow_corpus[position]]]

    return df

In [47]:
%%time

# Number of leading rows of df2002 that tfidf treats as manifestos.
N_MANIFESTOS_2002 = 5
sims2002 = tfidf(df2002, n_party=N_MANIFESTOS_2002)

2021-12-01 12:41:18,736 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-12-01 12:41:21,065 : INFO : adding document #10000 to Dictionary(20538 unique tokens: ['abandon', 'abrogation', 'accentué', 'accepte', 'accessibilité']...)
2021-12-01 12:41:22,908 : INFO : built Dictionary(21832 unique tokens: ['abandon', 'abrogation', 'accentué', 'accepte', 'accessibilité']...) from 18842 documents (total 3930095 corpus positions)
2021-12-01 12:41:22,909 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(21832 unique tokens: ['abandon', 'abrogation', 'accentué', 'accepte', 'accessibilité']...) from 18842 documents (total 3930095 corpus positions)", 'datetime': '2021-12-01T12:41:22.909208', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 12:59:45) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}
2021-12-01 12:41:25,635 : INFO : collecting document frequencies
2021-12-01 12:41:25,636 : INFO : PROGRESS: processing document 

CPU times: user 38.3 s, sys: 1.69 s, total: 40 s
Wall time: 42.6 s


In [56]:
# Skip the 5 manifesto rows and the first two columns, then average the
# remaining columns per (partyname, annee).
# numeric_only=True makes the averaging skip non-numeric columns explicitly;
# pandas >= 2.0 raises TypeError on them otherwise.
tmp2002 = (
    sims2002.iloc[5:, 2:]
    .groupby(['partyname', 'annee'])
    .mean(numeric_only=True)
    .reset_index()
)

In [60]:
# Inspect the columns from the fourth onward; these are the ones plotted as
# lines in the following figure.
tmp2002.columns[3:]

Index(['EELV', 'PS', 'MoDem/EM', 'RPR/UMP/LR', 'FN'], dtype='object')

In [61]:
# One facet per `partyname` value, one line per column from the fourth onward.
party_columns = tmp2002.columns[3:]
axis_labels = {
    'partyname': 'Journal',
    'variable': 'Party',
    'value': 'Cosine Similarity',
    'annee': 'year',
}
px.line(
    tmp2002,
    x='annee',
    y=party_columns,
    facet_col='partyname',
    title='TF-IDF - Manifesto 2002',
    labels=axis_labels,
    color_discrete_map=dict_color,
)

In [49]:
%%time

# Number of leading rows of df2007 that tfidf treats as manifestos.
N_MANIFESTOS_2007 = 6
sims2007 = tfidf(df2007, n_party=N_MANIFESTOS_2007)

2021-12-01 12:41:59,532 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-12-01 12:42:01,637 : INFO : adding document #10000 to Dictionary(21726 unique tokens: ['abaissement', 'abandon', 'abandonné', 'abandonnée', 'abattage']...)
2021-12-01 12:42:03,383 : INFO : adding document #20000 to Dictionary(22768 unique tokens: ['abaissement', 'abandon', 'abandonné', 'abandonnée', 'abattage']...)
2021-12-01 12:42:03,894 : INFO : built Dictionary(22936 unique tokens: ['abaissement', 'abandon', 'abandonné', 'abandonnée', 'abattage']...) from 23135 documents (total 4180262 corpus positions)
2021-12-01 12:42:03,895 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(22936 unique tokens: ['abaissement', 'abandon', 'abandonné', 'abandonnée', 'abattage']...) from 23135 documents (total 4180262 corpus positions)", 'datetime': '2021-12-01T12:42:03.895474', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 12:59:45) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-

CPU times: user 40.8 s, sys: 1.39 s, total: 42.2 s
Wall time: 44.2 s


In [62]:
# Skip the 6 manifesto rows and the first two columns, then average the
# remaining columns per (partyname, annee).
# numeric_only=True makes the averaging skip non-numeric columns explicitly;
# pandas >= 2.0 raises TypeError on them otherwise.
tmp2007 = (
    sims2007.iloc[6:, 2:]
    .groupby(['partyname', 'annee'])
    .mean(numeric_only=True)
    .reset_index()
)

In [64]:
# Inspect the columns from the fourth onward; these are the ones plotted as
# lines in the following figure.
tmp2007.columns[3:]

Index(['FN', 'EELV', 'MoDem/EM', 'PCF/PG/LFI', 'PS', 'RPR/UMP/LR'], dtype='object')

In [65]:
# One facet per `partyname` value, one line per column from the fourth onward.
party_columns = tmp2007.columns[3:]
axis_labels = {
    'partyname': 'Journal',
    'variable': 'Party',
    'value': 'Cosine Similarity',
    'annee': 'year',
}
px.line(
    tmp2007,
    x='annee',
    y=party_columns,
    facet_col='partyname',
    title='TF-IDF - Manifesto 2007',
    labels=axis_labels,
    color_discrete_map=dict_color,
)

In [51]:
%%time

# Number of leading rows of df2012 that tfidf treats as manifestos.
N_MANIFESTOS_2012 = 6
sims2012 = tfidf(df2012, n_party=N_MANIFESTOS_2012)

2021-12-01 12:42:47,399 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-12-01 12:42:50,393 : INFO : adding document #10000 to Dictionary(21518 unique tokens: ['abaissement', 'abandon', 'abattement', 'abolir', 'abolition']...)
2021-12-01 12:42:52,455 : INFO : adding document #20000 to Dictionary(22643 unique tokens: ['abaissement', 'abandon', 'abattement', 'abolir', 'abolition']...)
2021-12-01 12:42:52,509 : INFO : built Dictionary(22671 unique tokens: ['abaissement', 'abandon', 'abattement', 'abolir', 'abolition']...) from 20555 documents (total 4312953 corpus positions)
2021-12-01 12:42:52,510 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(22671 unique tokens: ['abaissement', 'abandon', 'abattement', 'abolir', 'abolition']...) from 20555 documents (total 4312953 corpus positions)", 'datetime': '2021-12-01T12:42:52.510465', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 12:59:45) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64b

CPU times: user 43 s, sys: 1.56 s, total: 44.6 s
Wall time: 47 s


In [66]:
# Skip the 6 manifesto rows and the first two columns, then average the
# remaining columns per (partyname, annee).
# numeric_only=True makes the averaging skip non-numeric columns explicitly;
# pandas >= 2.0 raises TypeError on them otherwise.
tmp2012 = (
    sims2012.iloc[6:, 2:]
    .groupby(['partyname', 'annee'])
    .mean(numeric_only=True)
    .reset_index()
)

In [67]:
# One facet per `partyname` value, one line per column from the fourth onward.
party_columns = tmp2012.columns[3:]
axis_labels = {
    'partyname': 'Journal',
    'variable': 'Party',
    'value': 'Cosine Similarity',
    'annee': 'year',
}
px.line(
    tmp2012,
    x='annee',
    y=party_columns,
    facet_col='partyname',
    title='TF-IDF - Manifesto 2012',
    labels=axis_labels,
    color_discrete_map=dict_color,
)

In [53]:
%%time

# Number of leading rows of df2017 that tfidf treats as manifestos.
N_MANIFESTOS_2017 = 6
sims2017 = tfidf(df2017, n_party=N_MANIFESTOS_2017)

2021-12-01 12:43:20,686 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-12-01 12:43:22,090 : INFO : built Dictionary(16732 unique tokens: ['abandonné', 'abattage', 'absence', 'absolue', 'accessibilité']...) from 7403 documents (total 1584056 corpus positions)
2021-12-01 12:43:22,091 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(16732 unique tokens: ['abandonné', 'abattage', 'absence', 'absolue', 'accessibilité']...) from 7403 documents (total 1584056 corpus positions)", 'datetime': '2021-12-01T12:43:22.091083', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 12:59:45) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}
2021-12-01 12:43:23,052 : INFO : collecting document frequencies
2021-12-01 12:43:23,053 : INFO : PROGRESS: processing document #0
2021-12-01 12:43:23,260 : INFO : TfidfModel lifecycle event {'msg': 'calculated IDF weights for 7403 documents and 16732 features (1158007 matrix non-zeros)', 'da

CPU times: user 15.5 s, sys: 380 ms, total: 15.9 s
Wall time: 16.3 s


In [68]:
# Skip the 6 manifesto rows and the first two columns, then average the
# remaining columns per (partyname, annee).
# numeric_only=True makes the averaging skip non-numeric columns explicitly;
# pandas >= 2.0 raises TypeError on them otherwise.
tmp2017 = (
    sims2017.iloc[6:, 2:]
    .groupby(['partyname', 'annee'])
    .mean(numeric_only=True)
    .reset_index()
)

In [69]:
# One facet per `partyname` value, one line per column from the fourth onward.
party_columns = tmp2017.columns[3:]
axis_labels = {
    'partyname': 'Journal',
    'variable': 'Party',
    'value': 'Cosine Similarity',
    'annee': 'year',
}
px.line(
    tmp2017,
    x='annee',
    y=party_columns,
    facet_col='partyname',
    title='TF-IDF - Manifesto 2017',
    labels=axis_labels,
    color_discrete_map=dict_color,
)

In [82]:
# Stack the four per-manifesto summaries row-wise. A column missing from one
# summary is filled with NaN for its rows, and original row labels are kept.
yearly_summaries = [tmp2002, tmp2007, tmp2012, tmp2017]
all_sims = pd.concat(yearly_summaries)

In [97]:
# Translate the `fichier` codes into the manifesto year they stand for.
fichier_to_year = {1: 2002, 2: 2007, 3: 2012, 4: 2017}
all_sims['fichier'] = all_sims['fichier'].replace(fichier_to_year)

In [101]:
# Grid of line charts: one row per `partyname` value, one column per manifesto
# (`fichier`), one line per column of all_sims from the fourth onward.
similarity_columns = all_sims.columns[3:]
grid_labels = {
    'partyname': 'Journal',
    'variable': 'Party',
    'value': 'Cosine Similarity',
    'annee': 'year',
    'fichier': 'manifesto',
}
fig = px.line(
    all_sims,
    x='annee',
    y=similarity_columns,
    facet_row='partyname',
    facet_col='fichier',
    title='TF-IDF',
    labels=grid_labels,
    color_discrete_map=dict_color,
)
# Let every facet keep its own x-axis range instead of sharing one.
fig.update_xaxes(matches=None)
fig.show()