# Réseau Neuronal Récurrent (RNN Simple)

## Création d'un vocabulaire

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the tweets CSV (path relative to the notebook) into a DataFrame.
df = pd.read_csv("../data/tweets.csv")

In [3]:
# Preview the sentiment label next to the raw tweet text (first rows only).
df.loc[:, ["airline_sentiment", "text"]].head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [4]:
def remove_mentions(text):
    """Drop every whitespace-separated token that starts with '@'.

    The surviving tokens are re-joined with single spaces, so any run of
    whitespace in the input collapses to one space.
    """
    kept_tokens = []
    for token in text.split():
        if token.startswith('@'):
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def remove_hashtags(text):
    """Drop every whitespace-separated token that starts with '#'.

    Tokens that merely contain '#' further in are kept; the result is joined
    with single spaces.
    """
    def is_not_hashtag(token):
        return not token.startswith('#')

    return ' '.join(filter(is_not_hashtag, text.split()))

# First cleaning pass on the raw tweets: strip @mention tokens, then #hashtag
# tokens, then lowercase what is left.
df["cleaned_text"] = (
    df["text"]
    .apply(remove_mentions)
    .apply(remove_hashtags)
    .str.lower()
)

In [5]:
# Display the cleaned column (pandas truncates the rendered Series).
df.loc[:, "cleaned_text"]

0                                               what said.
1        plus you've added commercials to the experienc...
2        i didn't today... must mean i need to take ano...
3        it's really aggressive to blast obnoxious "ent...
4                 and it's a really big bad thing about it
                               ...                        
14635    thank you we got on a different flight to chic...
14636    leaving over 20 minutes late flight. no warnin...
14637                    please bring american airlines to
14638    you have my money, you change my flight, and d...
14639    we have 8 ppl so we need 2 know how many seats...
Name: cleaned_text, Length: 14640, dtype: object

On veut ensuite retirer les mots vides (stop words, p. ex. « the », « a », …), qui n'apportent pas d'information utile pour l'analyse des sentiments.

In [7]:
import string
import re

# Stop words (add or remove entries to suit your dataset).
# NOTE(review): negations such as "no" / "not" are in this list; confirm that
# dropping them is intended for a sentiment task.
stop_words = {
    # Pronouns and possessives
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    # Interrogatives and demonstratives
    "what", "which", "who", "whom", "this", "that", "these", "those",
    # Auxiliary verbs
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    # Articles, conjunctions and prepositions
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
    "while", "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below", "to",
    "from", "up", "down", "in", "out", "on", "off", "over", "under",
    # Adverbs, quantifiers and negations
    "again", "further", "then", "once", "here", "there", "when", "where",
    "why", "how", "all", "any", "both", "each", "few", "more", "most",
    "other", "some", "such", "no", "nor", "not", "only", "own", "same",
    "so", "than", "too", "very",
    # Short fragments and common function words
    "s", "t", "can", "will", "just", "don", "should", "now",
    # Apostrophe-less contractions and tweet shorthand
    "dont", "wont", "im", "ive", "u", "ur", "2", "4", "rt", "youve",
}

# Extra words filtered out for this dataset in addition to the generic list.
domain_stop_words = {
    "flight", "minutes", "people", "seats",
    "time", "thank", "pls", "please",
}

# Cleaning function
def clean_text(text):
    """Normalize one tweet string before vocabulary building.

    Steps, in order: lowercase, remove URLs, remove @mentions and #hashtags,
    delete ASCII punctuation, then drop every token present in the
    module-level ``stop_words`` or ``domain_stop_words`` sets.

    Args:
        text: The string to clean. Must be a ``str``; ``.lower()`` is called
            on it directly.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        nothing survives the filtering).
    """
    # Lowercase
    text = text.lower()
    # Remove URLs. "http\S+" already matches "https..." links, so no separate
    # alternative is needed; no flag is required because the pattern has no
    # ^ / $ anchors.
    text = re.sub(r"http\S+|www\S+", "", text)
    # Remove mentions and hashtags (optional)
    text = re.sub(r"@\w+|#\w+", "", text)
    # Remove punctuation. Characters are deleted, not replaced by a space,
    # so "didn't" becomes "didnt".
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Build the exclusion set once per call instead of once per token. The
    # two sets are read at call time so later edits to them are honoured.
    excluded = stop_words | domain_stop_words
    # Remove stop words
    return ' '.join(word for word in text.split() if word not in excluded)

# Apply to the dataframe.
# Recompute from the raw "text" column (mention and hashtag tokens stripped,
# then clean_text, which lowercases first) rather than overwriting
# "cleaned_text" with a function of itself. A fresh run gives the same values
# as cleaning the already-lowercased column, and re-running this cell no
# longer cleans text that has already been cleaned.
df["cleaned_text"] = (
    df["text"]
    .apply(remove_mentions)
    .apply(remove_hashtags)
    .apply(clean_text)
)

In [8]:
# Inspect the column again after stop-word and punctuation removal.
df.loc[:, "cleaned_text"]

0                                                     said
1                  plus added commercials experience tacky
2             didnt today must mean need take another trip
3        really aggressive blast obnoxious entertainmen...
4                                     really big bad thing
                               ...                        
14635                                got different chicago
14637                              bring american airlines
14638    money change answer phones suggestions make co...
14639    8 ppl need know many next plz put us standby next
Name: cleaned_text, Length: 14640, dtype: object

In [None]:
from nltk.stem import PorterStemmer
import nltk
# Ask NLTK to download its 'punkt' resource.
# NOTE(review): nothing visible in this cell uses punkt; presumably a
# tokenizer called later needs it. Confirm, or drop the download.
nltk.download('punkt')
# Porter stemmer instance, presumably used later to reduce the cleaned tokens
# to their stems. TODO confirm against the following cells.
stemmer = PorterStemmer()