# Exploration des données

## 1. Chargement et aperçu des données

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, lower, trim, when, length

# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("Sentiment Analysis Preprocessing")
    .getOrCreate()
)

# Column names to assign: the CSV is read without a header row.
headers = ["target", "ids", "date", "flag", "user", "text"]

# Location of the raw CSV; change this to wherever your copy of the file lives.
file_path = "dataset/tweets.csv"
raw_reader = spark.read.option("sep", ",").option("inferSchema", True)
df = raw_reader.csv(file_path).toDF(*headers)

In [2]:
# Step 1: look at the first raw rows without truncating long cell values.
preview_rows = 10
print("Aperçu des données brutes :")
df.show(n=preview_rows, truncate=False)

Aperçu des données brutes :
+------+----------+----------------------------+--------+---------------+-------------------------------------------------------------------------------------------------------------------+
|target|ids       |date                        |flag    |user           |text                                                                                                               |
+------+----------+----------------------------+--------+---------------+-------------------------------------------------------------------------------------------------------------------+
|0     |1467810369|Mon Apr 06 22:19:45 PDT 2009|NO_QUERY|_TheSpecialOne_|@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D|
|0     |1467810672|Mon Apr 06 22:19:49 PDT 2009|NO_QUERY|scotthamilton  |is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!    |
|0     |1467810917|Mon

## 2. Dimensions du dataset

In [3]:
# Dataset dimensions: number of columns and number of rows.
num_cols = len(df.columns)
num_rows = df.count()

print(f"Le dataset contient {num_rows} lignes et {num_cols} colonnes.")


Le dataset contient 1600000 lignes et 6 colonnes.


In [4]:
# Print the column names and types of the loaded DataFrame.
df.printSchema()

# Summary statistics for every column.
summary_stats = df.describe()
summary_stats.show()


root
 |-- target: integer (nullable = true)
 |-- ids: long (nullable = true)
 |-- date: string (nullable = true)
 |-- flag: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)

+-------+-----------------+--------------------+--------------------+--------+--------------------+--------------------+
|summary|           target|                 ids|                date|    flag|                user|                text|
+-------+-----------------+--------------------+--------------------+--------+--------------------+--------------------+
|  count|          1600000|             1600000|             1600000| 1600000|             1600000|             1600000|
|   mean|              2.0|1.9988175522956276E9|                NULL|    NULL| 4.325887521835714E9|                NULL|
| stddev|2.000000625000293| 1.935760736226641E8|                NULL|    NULL|5.162733218454889E10|                NULL|
|    min|                0|          1467810369|Fri 

## 3. Distribution des sentiments (cibles)

In [5]:
# Number of tweets in each polarity class.
class_counts = df.groupBy("target").count()
class_counts.show()

+------+------+
|target| count|
+------+------+
|     0|800000|
|     4|800000|
+------+------+



In [6]:
# Number of distinct users. The value has to be printed explicitly: a bare
# expression is only displayed when it is the last statement of a cell, so the
# count would otherwise be computed and thrown away.
unique_users = df.select("user").distinct().count()
print(f"Nombre d'utilisateurs uniques : {unique_users}")

# Tweet length statistics. `length` is already imported in the first cell.
# The "text_length" column is kept on `df` because later cells display it.
df = df.withColumn("text_length", length("text"))
df.select("text_length").describe().show()

+-------+-----------------+
|summary|      text_length|
+-------+-----------------+
|  count|          1600000|
|   mean|     74.041759375|
| stddev|36.38799215515103|
|    min|                6|
|    max|              359|
+-------+-----------------+



# Nettoyage des données

## 1. Suppression des colonnes inutiles

In [7]:
# Columns removed from the working DataFrame; every other column is kept as is.
columns_to_drop = ["ids", "date", "flag", "user"]
df = df.select([name for name in df.columns if name not in columns_to_drop])

In [8]:
# Check which columns remain after the drop.
df.show(n=5)

+------+--------------------+-----------+
|target|                text|text_length|
+------+--------------------+-----------+
|     0|@switchfoot http:...|        115|
|     0|is upset that he ...|        111|
|     0|@Kenichan I dived...|         89|
|     0|my whole body fee...|         47|
|     0|@nationwideclass ...|        111|
+------+--------------------+-----------+
only showing top 5 rows



## 2. Nettoyage des textes

### 2.1 Supprimer les URLs

In [9]:
# Remove URLs: any token starting with "http" (case-insensitive) and bare
# "www." links that carry no scheme. The \b keeps words such as "awww.cute"
# from being treated as a link.
url_pattern = r"(?i)(?:http\S+|\bwww\.\S+)"
df = df.withColumn("text", regexp_replace(col("text"), url_pattern, ""))

### 2.2 Supprimer les mentions Twitter (@username)

In [10]:
# Strip Twitter mentions: "@" followed by one or more word characters.
mention_pattern = r"@\w+"
df = df.withColumn("text", regexp_replace(df["text"], mention_pattern, ""))


### 2.3 Supprimer les hashtags (#hashtag)

In [11]:
# Strip hashtags: "#" together with the word characters that follow it.
hashtag_pattern = r"#\w+"
df = df.withColumn("text", regexp_replace(df["text"], hashtag_pattern, ""))


### 2.4  Supprimer les caractères spéciaux (émoticônes, ponctuations, etc.)

In [12]:
# Delete every character that is neither an ASCII letter nor whitespace
# (digits, punctuation, emoticons, accented letters, ...).
non_letter_pattern = r"[^a-zA-Z\s]"
df = df.withColumn("text", regexp_replace(df["text"], non_letter_pattern, ""))


### 2.5 Convertir en minuscules

In [13]:
# Lower-case the tweet text.
lowered_text = lower(df["text"])
df = df.withColumn("text", lowered_text)


### 2.6 Supprimer les espaces multiples

In [14]:
# Collapse every run of whitespace into a single space.
whitespace_run = r"\s+"
df = df.withColumn("text", regexp_replace(df["text"], whitespace_run, " "))


### 2.7 Supprimer les espaces de début et de fin

In [15]:
# Remove leading and trailing spaces left over from the previous steps.
trimmed_text = trim(df["text"])
df = df.withColumn("text", trimmed_text)


### 2.8 Supprimer les textes vides ou très courts (3 caractères ou moins)

In [16]:
# Recompute the length of the cleaned text, keep only rows strictly longer than
# 3 characters, then discard the helper column.
df = (
    df.withColumn("text_length", length(col("text")))
    .where(col("text_length") > 3)
    .drop("text_length")
)

### 2.9 Gérer les valeurs manquantes

In [17]:
# Replace null or empty text with the placeholder "texte_vide".
# NOTE(review): the preceding length filter presumably already removes such
# rows, which would make this step a no-op — confirm before relying on it.
text_is_missing = col("text").isNull() | (col("text") == "")
df = df.withColumn("text", when(text_is_missing, "texte_vide").otherwise(col("text")))


In [18]:
# Show the dataset after the cleaning steps, without truncating the text.
df.show(n=5, truncate=False)

+------+--------------------------------------------------------------------------------------------------------+
|target|text                                                                                                    |
+------+--------------------------------------------------------------------------------------------------------+
|0     |awww thats a bummer you shoulda got david carr of third day to do it d                                  |
|0     |is upset that he cant update his facebook by texting it and might cry as a result school today also blah|
|0     |i dived many times for the ball managed to save the rest go out of bounds                               |
|0     |my whole body feels itchy and like its on fire                                                          |
|0     |no its not behaving at all im mad why am i here because i cant see you all over there                   |
+------+--------------------------------------------------------------------------------

## 3. Tokenisation :

### 3.1 Tokenisation Simple

In [19]:
from pyspark.ml.feature import Tokenizer

# Simple tokenization: reads the "text" column and writes the "tokens" column.
tokenizer = Tokenizer(outputCol="tokens", inputCol="text")
df = tokenizer.transform(df)

# Show each text next to its tokens.
token_preview = df.select("text", "tokens")
token_preview.show(truncate=False)

+--------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                    |tokens                                                                                                                        |
+--------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------+
|awww thats a bummer you shoulda got david carr of third day to do it d                                  |[awww, thats, a, bummer, you, shoulda, got, david, carr, of, third, day, to, do, it, d]                                       |
|is upset that he cant update his facebook by texting it and mig

### 3.2 Suppression des Mots Vides (Stop Words)

In [20]:
from pyspark.ml.feature import StopWordsRemover

# Stop-word removal: reads "tokens" and writes "filtered_tokens".
remover = StopWordsRemover(outputCol="filtered_tokens", inputCol="tokens")
df = remover.transform(df)

# Show the tokens before and after filtering.
filtered_preview = df.select("tokens", "filtered_tokens")
filtered_preview.show(truncate=False)


+------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------+
|tokens                                                                                                                        |filtered_tokens                                                                        |
+------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------+
|[awww, thats, a, bummer, you, shoulda, got, david, carr, of, third, day, to, do, it, d]                                       |[awww, thats, bummer, shoulda, got, david, carr, third, day, d]                        |
|[is, upset, that, he, cant, update, his, facebook, by, texting, it, and, might, cry, as, a, result, school, today, also, blah]|[ups

### 3.3 Extraction des n-grammes

In [21]:
from pyspark.ml.feature import NGram

# Both transformers read "filtered_tokens"; they differ only in n and output column.
ngram = NGram(n=2, inputCol="filtered_tokens", outputCol="bigrams")
ngram3 = NGram(n=3, inputCol="filtered_tokens", outputCol="trigrams")

# Bigrams (n=2): add the column, then display it next to its source tokens.
df = ngram.transform(df)
df.select("filtered_tokens", "bigrams").show(truncate=False)

# Trigrams (n=3): same procedure.
df = ngram3.transform(df)
df.select("filtered_tokens", "trigrams").show(truncate=False)


+---------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+
|filtered_tokens                                                                        |bigrams                                                                                                                                               |
+---------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+
|[awww, thats, bummer, shoulda, got, david, carr, third, day, d]                        |[awww thats, thats bummer, bummer shoulda, shoulda got, got david, david carr, carr third, third day, day d]                                          |
|[upset, cant, update, facebook, tex

## 4. Vectorisation des Textes avec Word2Vec

In [22]:
from pyspark.ml.feature import Word2Vec

# Fit Word2Vec on the stop-word-filtered tokens and add a "word_embeddings" column.
word2Vec = Word2Vec(
    inputCol="filtered_tokens",
    outputCol="word_embeddings",
    vectorSize=100,
    minCount=5,
)
model = word2Vec.fit(df)
df = model.transform(df)

# Preview the label, the cleaned text and the resulting vector.
embedding_preview = df.select("target", "text", "word_embeddings")
embedding_preview.show(5, truncate=False)


+------+--------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [23]:
# Keep only the label and the cleaned text; the other derived columns are left out.
clean_columns = ["target", "text"]
df_clean = df.select(*clean_columns)

# Preview the cleaned data.
df_clean.show(n=5, truncate=False)


+------+--------------------------------------------------------------------------------------------------------+
|target|text                                                                                                    |
+------+--------------------------------------------------------------------------------------------------------+
|0     |awww thats a bummer you shoulda got david carr of third day to do it d                                  |
|0     |is upset that he cant update his facebook by texting it and might cry as a result school today also blah|
|0     |i dived many times for the ball managed to save the rest go out of bounds                               |
|0     |my whole body feels itchy and like its on fire                                                          |
|0     |no its not behaving at all im mad why am i here because i cant see you all over there                   |
+------+--------------------------------------------------------------------------------

In [24]:
# Number of tweets kept for each sentiment class; the balanced dataset therefore
# holds at most twice this many rows. (`col` and `when` are imported in the
# first cell, so they are not re-imported here.)
SAMPLES_PER_CLASS = 50000

# Split the cleaned tweets by polarity: target 0 = negative, target 4 = positive.
df_target_0 = df_clean.filter(col("target") == 0)
df_target_4 = df_clean.filter(col("target") == 4)

# Keep at most SAMPLES_PER_CLASS rows of each class so both are equally represented.
df_target_0_sampled = df_target_0.limit(SAMPLES_PER_CLASS)
df_target_4_sampled = df_target_4.limit(SAMPLES_PER_CLASS)

# Stack the two samples into a single DataFrame.
df_balanced = df_target_0_sampled.union(df_target_4_sampled)

# Recode the positive label from 4 to 1; other label values are left untouched.
df_balanced = df_balanced.withColumn("target", when(col("target") == 4, 1).otherwise(col("target")))

# Check the number of rows after balancing.
print(f"Nombre total de lignes après équilibrage : {df_balanced.count()}")


Nombre total de lignes après équilibrage : 100000


In [25]:
# Write the balanced dataset as CSV with a header row, replacing any previous
# output at the same path. coalesce(1) reduces the data to one partition first.
output_balanced_path = "cleaned_dataset.csv"
csv_writer = (
    df_balanced.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
)
csv_writer.csv(output_balanced_path)

# Confirmation message.
print(f"Le dataset équilibré a été sauvegardé sous {output_balanced_path}.")


Le dataset équilibré a été sauvegardé sous cleaned_dataset.csv.


In [30]:
import glob
import os

import pandas as pd

# Number of leading rows of each class left out of the test file.
ROWS_TO_SKIP_PER_CLASS = 50000

# Load the cleaned dataset. Spark's DataFrameWriter.csv() creates a *directory*
# of part-*.csv files at the given path, which pd.read_csv cannot open directly;
# in that case read the part files it contains. A plain CSV file at the same
# path is still supported.
csv_path = "cleaned_dataset.csv"
if os.path.isdir(csv_path):
    part_files = sorted(glob.glob(os.path.join(csv_path, "part-*.csv")))
    if not part_files:
        raise FileNotFoundError(f"Aucun fichier part-*.csv trouvé dans {csv_path}.")
    df = pd.concat([pd.read_csv(part) for part in part_files], ignore_index=True)
else:
    df = pd.read_csv(csv_path)

# Split the rows by label (0 and 1).
df_target_0 = df[df['target'] == 0]
df_target_1 = df[df['target'] == 1]

# Skip the first ROWS_TO_SKIP_PER_CLASS rows of each class.
df_target_0 = df_target_0.iloc[ROWS_TO_SKIP_PER_CLASS:]
df_target_1 = df_target_1.iloc[ROWS_TO_SKIP_PER_CLASS:]

# Combine what is left of both classes.
df_modified = pd.concat([df_target_0, df_target_1])

# Fail loudly instead of silently writing a header-only file: if the input holds
# no more than ROWS_TO_SKIP_PER_CLASS rows per class, nothing remains after the skip.
if df_modified.empty:
    raise ValueError(
        f"Aucune ligne restante après avoir ignoré les {ROWS_TO_SKIP_PER_CLASS} "
        f"premières lignes de chaque classe dans {csv_path}."
    )

# Save the result as CSV without the pandas index.
df_modified.to_csv("testing.csv", index=False)
