## ANALYSE DE SENTIMENTS

LIBRAIRIES

In [26]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import length, regexp_replace, substring, col, udf, isnan, concat_ws, to_timestamp, date_format
from pyspark.sql.types import ArrayType, StringType, IntegerType
from pyspark.ml.feature import Tokenizer

import nltk
import spacy
from pyspark.ml.feature import StopWordsRemover
fr_stopwords = nltk.corpus.stopwords.words('french')
import re

import pandas as pd
from textblob import Blobber
from textblob_fr import PatternTagger, PatternAnalyzer
tb = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
import pymongo


OUVERTURE SESSION PYSPARK

In [27]:
print(pyspark.__version__)

3.4.0


In [28]:
## OUVERTURE SESSION SPARK
spark = SparkSession.builder.appName("CSVReader").getOrCreate()
## CHARGEMENT DU CSV
df = spark.read.csv("./data.csv", header=True, inferSchema=True)
#TELECHARGEMENT POUR TOKENISATION
nltk.download('punkt')
#TELECHARGEMENT POUR STOP WORD
nlp = spacy.load("fr_core_news_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\33664\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NETTOYAGE DES DONNEES

In [29]:
num_rows = df.count()
num_columns = len(df.columns)

#df_size = num_rows * num_columns

print("Taille du DataFrame :", num_rows)

Taille du DataFrame : 974101


In [30]:
## NOMS DES COLONNES ET TYPOLOGIE
df

DataFrame[index: string, Datetime: string, Tweet Id: string, Text: string, Like Count: string, Reply Count: string, Retweet Count: string, Quote Count: string, Tweet_Language: string, User_username: string, User_location: string, User_description: string, User_followersCount: string, User_friendsCount: string, User_verified: string, User_favouritesCount: string, User_statusesCount: string, User_listedCount: string, User_url: string, User_created: string, User_profileImageUrl: string, Query: string]

In [31]:
## SUPRESSIONS DES COLONNES
df = df.drop("index","Tweet_Language","User_friendsCount","User_statusesCount","User_listedCount","User_favouritesCount","User_url","User_created","User_profileImageUrl","Query","User_location")

In [32]:
## CONVERTION INT
columns_to_convert = ["Like Count", "Reply Count", "Retweet Count", "Quote Count", "User_followersCount"]
for column in columns_to_convert:
    df = df.withColumn(column, col(column).cast(IntegerType()))

In [33]:
## NOMS DES COLONNES ET TYPOLOGIE
df

DataFrame[Datetime: string, Tweet Id: string, Text: string, Like Count: int, Reply Count: int, Retweet Count: int, Quote Count: int, User_username: string, User_description: string, User_followersCount: int, User_verified: string]

In [34]:
## SUPPRESSIONS DES LIGNES ERRONEES CERTIF
valid_values = ["False", "null", "True"]
df = df.filter(df.User_verified.isin(valid_values))


In [35]:
## SUPRESSIONS DES CARACTERES TEXT
df = df.withColumn("Text", regexp_replace(df.Text, r"@\w+", ""))
df = df.withColumn("Text", regexp_replace(df.Text, r"https\S+", ""))
df = df.withColumn("Text", regexp_replace(df.Text, "#ReformeDesRetraites", ""))

In [36]:
# Filtrer les lignes sans aucun caract√®re alphanum√©riques dans la colonne "Text" (permet de supprimer les vides)
df = df.filter(col("Text").rlike("[A-Za-z0-9]"))

df.show(5)

+--------------------+-------------------+--------------------+----------+-----------+-------------+-----------+--------------+--------------------+-------------------+-------------+
|            Datetime|           Tweet Id|                Text|Like Count|Reply Count|Retweet Count|Quote Count| User_username|    User_description|User_followersCount|User_verified|
+--------------------+-------------------+--------------------+----------+-----------+-------------+-----------+--------------+--------------------+-------------------+-------------+
|2023-04-02 20:42:...|1642598362414301184| un faux compte ?...|         0|          0|            0|          0|zitounbraghini|                null|                 18|        False|
|2023-04-02 20:40:...|1642597945634611202|üëèü•≥ Cette victoi...|         0|          0|            0|          0|CorbierVincent|En marche dans le...|                121|        False|
|2023-04-02 20:36:...|1642596869741182979|Le 6 avril, gr√®ve...|         1|      

In [37]:
## SUPPRESSIONS DES LIGNES ERRONEES
df = df.filter(length(df.Datetime) >= 10)
#Filtrer les lignes DATETIME
df = df.filter(~col("datetime").rlike("[a-zA-Z]"))

In [38]:
#RACCOURIR DATETIME
df = df.withColumn("Datetime", substring("Datetime", 1, 19))
df.show(5)

+-------------------+-------------------+--------------------+----------+-----------+-------------+-----------+--------------+--------------------+-------------------+-------------+
|           Datetime|           Tweet Id|                Text|Like Count|Reply Count|Retweet Count|Quote Count| User_username|    User_description|User_followersCount|User_verified|
+-------------------+-------------------+--------------------+----------+-----------+-------------+-----------+--------------+--------------------+-------------------+-------------+
|2023-04-02 20:42:22|1642598362414301184| un faux compte ?...|         0|          0|            0|          0|zitounbraghini|                null|                 18|        False|
|2023-04-02 20:40:42|1642597945634611202|üëèü•≥ Cette victoi...|         0|          0|            0|          0|CorbierVincent|En marche dans le...|                121|        False|
|2023-04-02 20:36:26|1642596869741182979|Le 6 avril, gr√®ve...|         1|          0|

In [39]:
df = df.withColumn("Datetime", to_timestamp(col("Datetime"), "yyyy-MM-dd HH:mm:ss"))

In [40]:
df

DataFrame[Datetime: timestamp, Tweet Id: string, Text: string, Like Count: int, Reply Count: int, Retweet Count: int, Quote Count: int, User_username: string, User_description: string, User_followersCount: int, User_verified: string]

In [41]:
df.printSchema()

root
 |-- Datetime: timestamp (nullable = true)
 |-- Tweet Id: string (nullable = true)
 |-- Text: string (nullable = true)
 |-- Like Count: integer (nullable = true)
 |-- Reply Count: integer (nullable = true)
 |-- Retweet Count: integer (nullable = true)
 |-- Quote Count: integer (nullable = true)
 |-- User_username: string (nullable = true)
 |-- User_description: string (nullable = true)
 |-- User_followersCount: integer (nullable = true)
 |-- User_verified: string (nullable = true)



ANALYSE DE SENTIMENT

In [42]:
#TOKENIZATION
tokenizer = Tokenizer(inputCol="Text", outputCol="Words")
df = tokenizer.transform(df)

In [43]:
df.show(5)

+-------------------+-------------------+--------------------+----------+-----------+-------------+-----------+--------------+--------------------+-------------------+-------------+--------------------+
|           Datetime|           Tweet Id|                Text|Like Count|Reply Count|Retweet Count|Quote Count| User_username|    User_description|User_followersCount|User_verified|               Words|
+-------------------+-------------------+--------------------+----------+-----------+-------------+-----------+--------------+--------------------+-------------------+-------------+--------------------+
|2023-04-02 20:42:22|1642598362414301184| un faux compte ?...|         0|          0|            0|          0|zitounbraghini|                null|                 18|        False|[, un, faux, comp...|
|2023-04-02 20:40:42|1642597945634611202|üëèü•≥ Cette victoi...|         0|          0|            0|          0|CorbierVincent|En marche dans le...|                121|        False|[ü

In [44]:
swr = StopWordsRemover(inputCol="Words", outputCol="FilteredWords", stopWords=fr_stopwords)
SwRemoved = swr.transform(df)
SwRemoved.show(truncate=False, n=5)

+-------------------+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+-----------+-------------+-----------+--------------+------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+-------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [45]:
df = SwRemoved.withColumn("FilteredWordsString", concat_ws(" ", SwRemoved.FilteredWords))


Conversion Pandas

In [46]:
# Convertir la DataFrame PySpark en pandas DataFrame
pdf = df.toPandas()

  series = series.astype(t, copy=False)


In [47]:
senti_list = []
for i in pdf["FilteredWordsString"]:
    vs = tb(i).sentiment[0]
    if (vs > 0):
        senti_list.append('Satisfait')
    elif (vs < 0):
        senti_list.append('Insatisfait')
    else:
        senti_list.append('Neutre')   

pdf["sentiment_class"]=senti_list

In [48]:
pdf

Unnamed: 0,Datetime,Tweet Id,Text,Like Count,Reply Count,Retweet Count,Quote Count,User_username,User_description,User_followersCount,User_verified,Words,FilteredWords,FilteredWordsString,sentiment_class
0,2023-04-02 20:42:22,1642598362414301184,un faux compte ? Probl√®me : l‚Äôarticle de Fran...,0,0,0,0,zitounbraghini,,18,False,"[, un, faux, compte, ?, probl√®me, :, l‚Äôarticle...","[, faux, compte, ?, probl√®me, :, l‚Äôarticle, fr...",faux compte ? probl√®me : l‚Äôarticle france inf...,Insatisfait
1,2023-04-02 20:40:42,1642597945634611202,üëèü•≥ Cette victoire est un message de fermet√© de...,0,0,0,0,CorbierVincent,En marche dans le 78 depuis 2016 - Anti-Gilets...,121,False,"[üëèü•≥, cette, victoire, est, un, message, de, fe...","[üëèü•≥, cette, victoire, message, fermet√©, part, ...",üëèü•≥ cette victoire message fermet√© part √©lecteu...,Insatisfait
2,2023-04-02 20:36:26,1642596869741182979,"Le 6 avril, gr√®ve g√©n√©rale ! via #Melenchon ...",1,0,1,0,Mercypolitics,French author of The Orwellian Empire journali...,3959,False,"[le, 6, avril,, gr√®ve, g√©n√©rale, !, , via, , #...","[6, avril,, gr√®ve, g√©n√©rale, !, , via, , #mele...","6 avril, gr√®ve g√©n√©rale ! via #melenchon der...",Satisfait
3,2023-04-02 20:27:38,1642594657312862208,sauver les institutions !! sinon un s√©nateur r...,0,0,0,0,Ys_ambre,"Les fant√¥me ne sont pas avares, ici Londres, m...",206,False,"[sauver, les, institutions, !!, sinon, un, s√©n...","[sauver, institutions, !!, sinon, s√©nateur, ri...",sauver institutions !! sinon s√©nateur risque m...,Neutre
4,2023-04-02 20:26:21,1642594332182999042,Qui s√®me le vent r√©colte la temp√™te ! Vous d√©c...,0,0,0,0,zitounbraghini,,18,False,"[qui, s√®me, le, vent, r√©colte, la, temp√™te, !,...","[s√®me, vent, r√©colte, temp√™te, !, d√©couvrez, a...",s√®me vent r√©colte temp√™te ! d√©couvrez ainsi qu...,Satisfait
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105777,2023-04-16 07:14:43,1647468541438836737,Malgr√© la d√©cision du #Conseil_Constitutionn...,0,1,0,0,brestoisdu29200,Ami de notre Pr√©sident de la R√©publique Fran√ßa...,30,False,"[, , malgr√©, la, d√©cision, du, #conseil_consti...","[, , malgr√©, d√©cision, #conseil_constitutionne...",malgr√© d√©cision #conseil_constitutionnel va ...,Satisfait
105778,2023-04-16 07:14:32,1647468493783064577,La france va pourrir tes J.O... #r√©formedesre...,1,0,0,0,greghs74,,153,False,"[, la, france, va, pourrir, tes, j.o..., #r√©fo...","[, france, va, pourrir, j.o..., #r√©formedesret...",france va pourrir j.o... #r√©formedesretraites,Neutre
105779,2023-04-17 00:07:39,1647723456845193219,"Et de temps en temps, la lucidit√© fait irrupti...",0,0,0,0,aw3rty92,Get your fix of politics and entertainment wit...,12,False,"[et, de, temps, en, temps,, la, lucidit√©, fait...","[temps, temps,, lucidit√©, fait, irruption, sur...","temps temps, lucidit√© fait irruption surprise ...",Satisfait
105780,2023-04-17 01:07:15,1647738455256965124,#Macron20h,0,0,0,0,Hersh__Bathens,"Ex dormeur, enfin r√©veill√©...",318,False,[#macron20h],[#macron20h],#macron20h,Neutre


In [49]:
#Cr√©ation de la colonne sentiment_class
#pdf.loc[pdf["sentiment"] > 0, "sentiment_class"] = 'Satisfait'
#pdf.loc[pdf["sentiment"] < 0, "sentiment_class"] = 'Insatisfait'
#pdf.loc[pdf["sentiment"] == 0, "sentiment_class"] = 'Neutre'


In [50]:
#Cr√©ation de la colonne sentiment_class_details
#pdf.loc[pdf["sentiment"] > 0.5, "sentiment_class_details"] = 'Tr√®s Satisfait'
#pdf.loc[(pdf["sentiment"] > 0) & (pdf["sentiment"] < 0.5), "sentiment_class_details"] = 'Satisfait'
#pdf.loc[pdf["sentiment"] == 0, "sentiment_class_details"] = 'Neutre'
#pdf.loc[(pdf["sentiment"] > -0.5) & (pdf["sentiment"] < 0), "sentiment_class_details"] = 'Satisfait'
#pdf.loc[pdf["sentiment"] < -0.5, "sentiment_class_details"] = 'Tr√®s Insatisfait'

#pdf.head()


In [51]:
#SUPPRIMER COLONNES NON NECESSAIRES
pdf = pdf.drop(columns=["FilteredWords", "Words"])



In [52]:
pdf

Unnamed: 0,Datetime,Tweet Id,Text,Like Count,Reply Count,Retweet Count,Quote Count,User_username,User_description,User_followersCount,User_verified,FilteredWordsString,sentiment_class
0,2023-04-02 20:42:22,1642598362414301184,un faux compte ? Probl√®me : l‚Äôarticle de Fran...,0,0,0,0,zitounbraghini,,18,False,faux compte ? probl√®me : l‚Äôarticle france inf...,Insatisfait
1,2023-04-02 20:40:42,1642597945634611202,üëèü•≥ Cette victoire est un message de fermet√© de...,0,0,0,0,CorbierVincent,En marche dans le 78 depuis 2016 - Anti-Gilets...,121,False,üëèü•≥ cette victoire message fermet√© part √©lecteu...,Insatisfait
2,2023-04-02 20:36:26,1642596869741182979,"Le 6 avril, gr√®ve g√©n√©rale ! via #Melenchon ...",1,0,1,0,Mercypolitics,French author of The Orwellian Empire journali...,3959,False,"6 avril, gr√®ve g√©n√©rale ! via #melenchon der...",Satisfait
3,2023-04-02 20:27:38,1642594657312862208,sauver les institutions !! sinon un s√©nateur r...,0,0,0,0,Ys_ambre,"Les fant√¥me ne sont pas avares, ici Londres, m...",206,False,sauver institutions !! sinon s√©nateur risque m...,Neutre
4,2023-04-02 20:26:21,1642594332182999042,Qui s√®me le vent r√©colte la temp√™te ! Vous d√©c...,0,0,0,0,zitounbraghini,,18,False,s√®me vent r√©colte temp√™te ! d√©couvrez ainsi qu...,Satisfait
...,...,...,...,...,...,...,...,...,...,...,...,...,...
105777,2023-04-16 07:14:43,1647468541438836737,Malgr√© la d√©cision du #Conseil_Constitutionn...,0,1,0,0,brestoisdu29200,Ami de notre Pr√©sident de la R√©publique Fran√ßa...,30,False,malgr√© d√©cision #conseil_constitutionnel va ...,Satisfait
105778,2023-04-16 07:14:32,1647468493783064577,La france va pourrir tes J.O... #r√©formedesre...,1,0,0,0,greghs74,,153,False,france va pourrir j.o... #r√©formedesretraites,Neutre
105779,2023-04-17 00:07:39,1647723456845193219,"Et de temps en temps, la lucidit√© fait irrupti...",0,0,0,0,aw3rty92,Get your fix of politics and entertainment wit...,12,False,"temps temps, lucidit√© fait irruption surprise ...",Satisfait
105780,2023-04-17 01:07:15,1647738455256965124,#Macron20h,0,0,0,0,Hersh__Bathens,"Ex dormeur, enfin r√©veill√©...",318,False,#macron20h,Neutre


Exportation vers MongoDB Compass

In [53]:
client = pymongo.MongoClient ("mongodb://localhost:27017")

In [54]:
db = client["twitter_analysis"]


In [55]:
data = pdf.to_dict(orient="record")

  data = pdf.to_dict(orient="record")


In [56]:
data

[{'Datetime': Timestamp('2023-04-02 20:42:22'),
  'Tweet Id': '1642598362414301184',
  'Text': ' un faux compte ? Probl√®me : l‚Äôarticle de France Info date de juin 2022 et le compte pr√©sent a √©t√© cr√©√©  en ao√ªt 2022 üòÖ Qui se cache derri√®re ce compte ? ü§î #retraites #censurepopulaire #macronie',
  'Like Count': 0,
  'Reply Count': 0,
  'Retweet Count': 0,
  'Quote Count': 0,
  'User_username': 'zitounbraghini',
  'User_description': None,
  'User_followersCount': 18,
  'User_verified': 'False',
  'FilteredWordsString': ' faux compte ? probl√®me : l‚Äôarticle france info date juin 2022 compte pr√©sent a cr√©√©  ao√ªt 2022 üòÖ cache derri√®re compte ? ü§î #retraites #censurepopulaire #macronie',
  'sentiment_class': 'Insatisfait'},
 {'Datetime': Timestamp('2023-04-02 20:40:42'),
  'Tweet Id': '1642597945634611202',
  'Text': "üëèü•≥ Cette victoire est un message de fermet√© de la part des √©lecteurs qui sont majoritairement pour la  ! Ce n'est pas le terrorisme intellectu

In [57]:
print(db)

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'twitter_analysis')


In [58]:
db.data.insert_many(data)

<pymongo.results.InsertManyResult at 0x25ea6976e60>