In [1]:
import pandas as pd
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
from nltk.tokenize import word_tokenize

In [2]:
# Register tqdm's progress-bar variants (`progress_apply`, `progress_map`) on pandas objects.
# NOTE(review): the cells visible in this notebook only call plain `.apply`, so this
# registration currently has no visible effect -- confirm whether it is still needed.
tqdm.pandas()

In [9]:
# Load the Le Monde article dump. Later cells read its `annee` (year) and `texte` (text) columns.
df = pd.read_csv('./LeMonde_20211027.csv')

In [68]:
# Manifesto corpus: keep only the columns used downstream.
manif_columns = ['id', 'date', 'party', 'title', 'text']
manif = pd.read_csv('./corpus-tidy.csv')[manif_columns]

# Lookup table: party code -> party name.
party = pd.read_csv('mp_france.csv')
id_party = dict(zip(party.party, party.partyname))

# Attach the party name; an unknown party code raises KeyError.
manif['partyname'] = [id_party[code] for code in manif.party]

# The year is the first four characters of `date` (e.g. 201206 -> 2012).
manif['annee'] = manif['date'].astype(str).str[:4].astype('int64')

In [74]:
# Characters that `clean` turns into a plain space: carriage return, newline,
# non-breaking space (U+00A0) and the private-use code point U+F0A0.
_CLEAN_TABLE = str.maketrans({'\r': ' ', '\n': ' ', '\xa0': ' ', '\uf0a0': ' '})


def clean(text):
    """Return `text` with line breaks and odd space characters replaced by spaces.

    Each occurrence of ``\\r``, ``\\n``, ``\\xa0`` or ``\\uf0a0`` becomes exactly
    one ASCII space; runs are not collapsed and nothing is stripped.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        A string of the same length as `text`.
    """
    return text.translate(_CLEAN_TABLE)

In [75]:
# Normalise whitespace-like characters in every manifesto text.
manif['text'] = manif['text'].map(clean)

In [223]:
# Manifestos whose year is 2012.
ex_manif = manif.loc[manif['annee'].eq(2012)]

In [224]:
# Display the selected manifestos (one row per manifesto).
ex_manif

Unnamed: 0,id,date,party,title,text,partyname,annee
10,31021_201206,201206,31021,L'humain d'abord. Le programme du Front de Gau...,Le programme du Front de Gauche 1. PARTAGER L...,Left Front,2012
11,31110_201206,201206,31110,Vivre mieux. Vers une société écologique,1. SE LIBERER DE LA DEPENDANCE AUX ÉNERGIES FO...,Europe Ecology - The Greens,2012
12,31230_201206,201206,31230,L'audace à gauche. 30 propositions pour la France,I. FAIRE FACE À LA CRISE EUROPÉENNE ET MONDIAL...,Left Radical Party,2012
13,31320_201206,201206,31320,Le changement c'est maintenant. Mes 60 engagem...,"Je veux relancer la production, l'emploi et la...",Socialist Party,2012
14,31421_201206,201206,31421,Le Manifeste du Parti Radical 2012. Pour répon...,1RE PARTIE Fédérer tous les acteurs pour lanc...,Radical Party,2012
15,31624_201206,201206,31624,La France solidaire,FRANÇOIS BAYROU (mouvement démocrate) LA FRA...,Democratic Movement,2012
16,31626_201206,201206,31626,Projet 2012. Protéger et préparer l'avenir des...,LE COURAGE PRODUIRE PLUS ET DEPENSER MOINS P...,The Republicans,2012
17,31630_201206,201206,31630,Des idées neuves pour changer la France,"Des idées neuves pour changer la France, PROG...",New Centre,2012
18,31631_201206,201206,31631,Rassembler tous les centristes. Notre Programme,Alliance centriste Rassembler tous les centri...,Centrist Alliance,2012
19,31720_201206,201206,31720,Mon Projet pour la France et les francais. Mar...,POUVOIR D'ACHAT RELANCE DE L'ÉCONOMIE ~ Tous...,National Front,2012


In [225]:
# Le Monde articles for 2012, 2013 and 2014, one frame per year ...
lm2012, lm2013, lm2014 = (df[df['annee'] == year] for year in (2012, 2013, 2014))

# ... stacked in chronological order (original row labels are kept).
lm = pd.concat([lm2012, lm2013, lm2014])

In [226]:
# Stack the manifestos (first rows) on top of the Le Monde articles in a single frame
# with columns `texte`, `annee`, `partyname` and a fresh 0..n-1 index.
manif_part = ex_manif[['text', 'annee', 'partyname']].rename(columns={'text': 'texte'})
lm_part = lm[['texte', 'annee']]
embed = pd.concat([manif_part, lm_part], ignore_index=True)

# Le Monde rows have no party, so label them 'Le Monde'. Fill per column: a blanket
# fillna('Le Monde') would also write the label into any missing `texte`, injecting
# fake text into the TF-IDF corpus. Missing texts become empty documents instead.
embed = embed.fillna({'partyname': 'Le Monde', 'texte': ''})

In [227]:
# Sanity check: (rows, columns) of the combined manifesto + Le Monde frame.
embed.shape

(4227, 3)

In [228]:
# Load spaCy's small French pipeline. If the model is not installed yet,
# run the following command once (uncomment it) before loading:
#!python -m spacy download fr_core_news_sm
nlp = spacy.load("fr_core_news_sm")

In [None]:
%%time

docs = nlp.pipe(embed.texte)

texts = []
for doc in docs:
    words = []
    for tok in doc:
        if tok.pos_ in ['NOUN', 'ADJ']:
            words.append(tok.lemma_)
    texts.append(" ".join(words))

In [None]:
# TF-IDF vectoriser with spaCy's French stop-word list.
# scikit-learn only accepts 'english', a list or None for `stop_words`; spaCy's
# STOP_WORDS is a set, which recent scikit-learn releases reject during parameter
# validation. Convert it to a list (sorted, so the parameter is deterministic).
tf_idf_vect = TfidfVectorizer(stop_words=sorted(STOP_WORDS))

In [None]:
# Learn the vocabulary / IDF weights from `texts` and return the document-term
# matrix (one row per entry of `texts`, in the same order).
vectors = tf_idf_vect.fit_transform(texts)

In [None]:
# Materialise the TF-IDF matrix densely, then as nested Python lists
# (one list of weights per document).
# NOTE(review): memory use grows with n_documents x vocabulary size; this can be
# very large for a few thousand articles -- confirm it is acceptable.
dense = vectors.todense()
denselist = dense.tolist()

In [None]:
# feature_names = tf_idf_vect.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2

In [None]:
# Store each document's TF-IDF vector (a Python list) next to its text.
# pd.Series(denselist) gets a default 0..n-1 index, which lines up with `embed`
# because `embed` was built with ignore_index=True.
embed['tfidf'] = pd.Series(denselist)

In [None]:
# Number of manifesto rows; they occupy the first positions of `embed`
# because `ex_manif` comes first in the concat that builds it.
len(ex_manif)

In [None]:
# Cosine similarity between every manifesto (rows of `dist`) and every Le Monde
# article (columns of `dist`).
# The rows of `vectors` follow the row order of `embed` (manifestos first), so the
# sparse TF-IDF matrix can be sliced directly. This avoids converting thousands of
# high-dimensional vectors to Python lists and back, and yields the same values.
n_manif = len(ex_manif)
dist = cosine_similarity(vectors[:n_manif], vectors[n_manif:])

In [None]:
# Number of rows in the similarity matrix: one per vector passed as the first
# argument of cosine_similarity, i.e. one per manifesto.
len(dist)

In [None]:
# Add one 'distance <party>' column per manifesto. Only the Le Monde rows (from
# position len(ex_manif) onwards) receive values; manifesto rows stay NaN.
first_article = len(ex_manif)
for partname, similarities in zip(ex_manif.partyname, dist):
    embed.loc[first_article:, 'distance ' + partname] = similarities

In [None]:
# Make DataFrame.plot() produce plotly figures instead of matplotlib ones.
pd.set_option("plotting.backend", "plotly")

In [None]:
# Mean similarity to each manifesto per publication year, over Le Monde rows only
# (manifesto rows have NaN in the distance columns and are dropped).
# Aggregate the distance columns explicitly: `embed` also holds object columns
# (`texte`, `partyname`, `tfidf`), and calling .mean() on those raises TypeError
# on pandas 2.x instead of silently skipping them.
distance_cols = [col for col in embed.columns if col.startswith('distance ')]
fig = (
    embed
    .dropna(subset=distance_cols)
    .groupby('annee')[distance_cols]
    .mean()
    .plot()
)

In [None]:
# Render the figure in the notebook.
fig.show()

In [222]:
# Export the figure as a standalone HTML file named after the manifesto year.
# The year is derived from `ex_manif` so the file name cannot drift from the year
# being analysed; a hard-coded name silently overwrites another year's export
# when the year filter changes.
manifesto_year = int(ex_manif['annee'].iloc[0])
fig.write_html(f'pol_{manifesto_year}.html')