In [1]:
import pandas as pd
from pdfminer.high_level import extract_text
from gensim.test.utils import datapath
from gensim import utils
import spacy
from collections import Counter
import unidecode
import gensim.models

In [2]:
## create a .csv of EM's Twitter activity since 27 Jan 2022
!minet tw scrape tweets "(from:ZemmourEric) until:2022-03-24 since:2022-01-27" > tweets_EZ.csv

## convert the .csv file in a data frame using pandas
df_tw_EZ = pd.read_csv("./tweets_EZ.csv")

## create a list of tweets selecting the 'text' column of the data frame
list_tw_EZ = df_tw_EZ['text'].values.tolist()
len(list_tw_EZ)

Searching for "(from:ZemmourEric) until:2022-03-24 since:2022-01-27"
Collecting tweet: 1257 tweets [00:45, 27.79 tweets/s, queries=1, tokens=1]


1257

In [3]:
## retrieve EZ's affiliates Twitter activity
!minet tw scrape tweets "(from:Samuel_Lafont OR from:GilbertCollard OR from:jerome_riviere OR from:MarionMarechal OR from:G_Peltier OR from:NicolasBay_ OR from:DenisCieslik OR from:stanislasrig OR from:AntoineDiers OR from:de_beaujeu OR from:Stephane_Ravier OR from:MaxettePirbakas OR from:LaurenceTrochu) until:2022-03-24 since:2022-01-27" > tw_EZ_aff_all.csv

Searching for "(from:Samuel_Lafont OR from:GilbertCollard OR from:jerome_riviere OR from:MarionMarechal OR from:G_Peltier OR from:NicolasBay_ OR from:DenisCieslik OR from:stanislasrig OR from:AntoineDiers OR from:de_beaujeu OR from:Stephane_Ravier OR from:MaxettePirbakas OR from:LaurenceTrochu) until:2022-03-24 since:2022-01-27"
Collecting tweet: 9323 tweets [05:49, 26.69 tweets/s, queries=1, tokens=1]


In [4]:
## convert EZ's affiliates' tweets in a list
df_tw_EZ_aff_all = pd.read_csv("tw_EZ_aff_all.csv")
list_tw_EZ_aff_all = df_tw_EZ_aff_all['text'].values.tolist()
print(list_tw_EZ_aff_all[0])
print(len(list_tw_EZ_aff_all))

Continuez de tweeter sur #FaceABaba #ZemmourProgramme pour soutenir Eric Zemmour 👌 https://twitter.com/Samuel_Lafont/status/1506776886491463682/photo/1
9323


In [5]:
## merge EZ and his affiliates' lists
list_tw_EZ_all = list_tw_EZ + list_tw_EZ_aff_all

In [16]:
## i retrieve a string from the pdf of EZ's manifesto using extract_text of the pdfminer package
## the cleaning process is specific for this manifesto and it depends on the output of extract_text
manif_EZ = extract_text('/Users/simonemariaparazzoli/Documents/Università/Sciences Po/Diving into public digital spaces/research/manifesto_zemmour.pdf')
manif_clean_EZ = manif_EZ.replace('-\n','')
manif_clean_EZ = manif_clean_EZ.replace('\n\n','---')
manif_clean_EZ = manif_clean_EZ.replace('\n','')
manif_clean_EZ = manif_clean_EZ.replace('\xa0','')
manif_clean_EZ = manif_clean_EZ.replace('\x0c','')
manif_clean_EZ = manif_clean_EZ.replace('. .','')
manif_clean_EZ = manif_clean_EZ.replace('  ','')
manif_clean_EZ = manif_clean_EZ.replace('. ','---')
manif_clean_EZ = manif_clean_EZ.replace('------','---')
print(repr(manif_clean_EZ))

## convert the string of the manifesto into a list
list_manif_EZ = manif_clean_EZ.split("---")
list_manif_EZ = [s for s in list_manif_EZ if len(s)>30]
len(list_manif_EZ)
#print(list_manif_EZ)

'POUR QUE LA FRANCE RESTE LA FRANCEMON PROGRAMME---iNTROdUCTiON---Depuis1500ans,unpaysextraordinairedéroulelefildesonHistoire.Aucunenationau monde ne peut se targuer de lui ressembler---Ce pays, c’est le nôtre---Ce pays c’est la France---Vous me connaissez depuis longtemps : voilà près de trente ans que je pose un diagnostic sur la situation de la France, sur nos faiblesses, nos lacunes et nos ennemis---Cette réflexion sur l’état de la France, je l’ai nourrie par mes lectures, mes rencontres, et des analyses alimentées par les milliers de témoignages de mes compatriotes---Depuis des mois, je sillonne toutes les régions de notre pays à la rencontre des Français pour écouter ceux qui font battre le cœur de la nation française.---J’ail’intimeconvictionquecetteélectionprésidentiellen’estpasunscrutincommeles autres.LaFranceestàlacroiséedescheminsdesapropreHistoire:économiquement, culturellement,démographiquement,notrepayspourraits’effacerprogressivementdela grande marche du monde---Un proce

669

In [17]:
## merge the two lists of tweets and of the manifesto 
list_EZ = list_tw_EZ_all + list_manif_EZ
len(list_EZ)

## load a spacy model to retrieve stop words
nlp = spacy.load("fr_core_news_sm")
stop_words_fr = nlp.Defaults.stop_words

## clean the list of tweets and manifesto to get rid of useless words and make the list content homogeneous
list_EZ_clean = []
for i in list_EZ:
    doc = nlp(i)
    tokens = [unidecode.unidecode(token.text).lower() for token in doc 
              if (token.text not in stop_words_fr and
                  len(token.text)>2 and
                  token.like_url == False )]
    tokens_joined = ' '.join(tokens)
    list_EZ_clean.append(tokens_joined)
    
## test the output of the cleaning process
print(list_EZ[205])
print("---")
print(list_EZ_clean[205])

À ce soir, 20h20 sur TF1 ! 

#FaceALaGuerreTF1 https://twitter.com/ZemmourEric/status/1503411736589881345/photo/1
---
soir 20h20 tf1 facealaguerretf1


In [18]:
## prepare the corpus as a class
class MyCorpus_EZ:

    def __iter__(self):
        for i in list_EZ_clean:
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(i,min_len=3)
            
## train the word embeddings model_EZ
sentences = MyCorpus_EZ()
model_EZ = gensim.models.Word2Vec(sentences=sentences, min_count=10, vector_size=300)

In [19]:
## transform the corpus list (that is made of tweets and sentences from the manifesto)
## in a list containing all the words of the corpus as elements of the list
words_EZ = []

for i in list_EZ_clean:
    i_split = i.split(' ') #transform the i document into a list (split at blank space)
    words_EZ.extend(i_split)

## clean the list of tokens
words_EZ_clean = [x for x in words_EZ 
                   if x not in stop_words_fr
                   if x != "\n\n"
                   if len(x)>1]

## find the 30 most common words using Counter
words_freq_EZ = Counter(words_EZ_clean)
common_words_EZ = words_freq_EZ.most_common(30)
print(common_words_EZ)

[('zemmour', 4434), ('eric', 4116), ('france', 1295), ('partagez', 1033), ('macron', 1027), ('francais', 1003), ('@zemmoureric', 885), ('soutenez', 806), ('zemmourpresident', 784), ('soutenir', 686), ('pecresse', 554), ('meeting', 505), ('faire', 437), ('direct', 419), ('contre', 410), ('video', 399), ('zemmourlille', 376), ('programme', 363), ('elysee2022', 356), ('soir', 339), ('grand', 338), ('pouvoirdachat', 337), ('faceababa', 318), ('campagne', 310), ('zemmourtoulon', 310), ('@vpecresse', 309), ('000', 308), ('avez', 306), ('pays', 299), ('reconquete', 289)]


In [20]:
## first attempt with the most_similar function on our corpus using our model_EZ
result = model_EZ.wv.most_similar(positive=['france'], topn=30)
print(result)

[('nous', 0.9986869692802429), ('pays', 0.997687041759491), ('faut', 0.9966728091239929), ('faire', 0.9965434670448303), ('ans', 0.9960296154022217), ('ete', 0.9950423240661621), ('politique', 0.9949116706848145), ('guerre', 0.9914374947547913), ('etat', 0.9890145659446716), ('bien', 0.9823765158653259), ('sommes', 0.9819726347923279), ('ukraine', 0.9812071323394775), ('immigration', 0.9783967137336731), ('veux', 0.9763822555541992), ('peuple', 0.9763120412826538), ('vivre', 0.973588228225708), ('fin', 0.9717499613761902), ('elle', 0.9707645177841187), ('monde', 0.9696477651596069), ('identite', 0.9695180058479309), ('ils', 0.9685749411582947), ('rien', 0.9683544635772705), ('art', 0.9676238894462585), ('reconquerir', 0.9666887521743774), ('russie', 0.9664713144302368), ('francais', 0.9658918380737305), ('mettre', 0.9649530053138733), ('marine', 0.9644203782081604), ('non', 0.9638226628303528), ('campagne', 0.9636406898498535)]
