# Importation des bibliothèques nécessaires

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import os

# Importation et affichage des données

In [5]:
file_path = "sample.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0


# Nettoyage des données :

# Mise en minuscules

In [6]:
df = df[['tweet_id', 'text']]
def clean_text(text):
    return text.lower()

df['text_cleaned'] = df['text'].apply(clean_text)
print(df[['tweet_id', 'text', 'text_cleaned']].head())

   tweet_id                                               text  \
0    119237  @AppleSupport causing the reply to be disregar...   
1    119238  @105835 Your business means a lot to us. Pleas...   
2    119239  @76328 I really hope you all change but I'm su...   
3    119240  @105836 LiveChat is online at the moment - htt...   
4    119241  @VirginTrains see attached error message. I've...   

                                        text_cleaned  
0  @applesupport causing the reply to be disregar...  
1  @105835 your business means a lot to us. pleas...  
2  @76328 i really hope you all change but i'm su...  
3  @105836 livechat is online at the moment - htt...  
4  @virgintrains see attached error message. i've...  


# Suppression des ponctuations

In [7]:
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)


df['text_cleaned'] = df['text'].apply(remove_punctuation)
print(df[['tweet_id', 'text', 'text_cleaned']].head())

   tweet_id                                               text  \
0    119237  @AppleSupport causing the reply to be disregar...   
1    119238  @105835 Your business means a lot to us. Pleas...   
2    119239  @76328 I really hope you all change but I'm su...   
3    119240  @105836 LiveChat is online at the moment - htt...   
4    119241  @VirginTrains see attached error message. I've...   

                                        text_cleaned  
0  AppleSupport causing the reply to be disregard...  
1  105835 Your business means a lot to us Please ...  
2  76328 I really hope you all change but Im sure...  
3  105836 LiveChat is online at the moment  https...  
4  VirginTrains see attached error message Ive tr...  


# Suppression des mots vides (stopwords)

In [8]:
nltk.download('stopwords')
texts = df['text']
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)


df['text_cleaned'] = texts.apply(remove_stopwords)
print(df[['tweet_id', 'text', 'text_cleaned']].head())

   tweet_id                                               text  \
0    119237  @AppleSupport causing the reply to be disregar...   
1    119238  @105835 Your business means a lot to us. Pleas...   
2    119239  @76328 I really hope you all change but I'm su...   
3    119240  @105836 LiveChat is online at the moment - htt...   
4    119241  @VirginTrains see attached error message. I've...   

                                        text_cleaned  
0  @AppleSupport causing reply disregarded tapped...  
1  @105835 business means lot us. Please DM name,...  
2      @76328 really hope change I'm sure won't! to!  
3  @105836 LiveChat online moment - https://t.co/...  
4  @VirginTrains see attached error message. I've...  


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\d\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Suppression des mots fréquents

In [9]:
word_freq = pd.Series(' '.join(texts).split()).value_counts()
threshold = 10
most_common_words = word_freq[:threshold].index

def remove_common_words(text):
    words = text.split()
    filtered_words = [word for word in words if word not in most_common_words]
    return ' '.join(filtered_words)

df['text_cleaned'] = texts.apply(remove_common_words)
print(df[['tweet_id', 'text', 'text_cleaned']].head())

   tweet_id                                               text  \
0    119237  @AppleSupport causing the reply to be disregar...   
1    119238  @105835 Your business means a lot to us. Pleas...   
2    119239  @76328 I really hope you all change but I'm su...   
3    119240  @105836 LiveChat is online at the moment - htt...   
4    119241  @VirginTrains see attached error message. I've...   

                                        text_cleaned  
0  @AppleSupport causing reply be disregarded tap...  
1  @105835 Your business means lot us. Please DM ...  
2  @76328 really hope all change but I'm sure won...  
3  @105836 LiveChat online at moment - https://t....  
4  @VirginTrains see attached error message. I've...  


# Suppression des mots rares :

In [10]:
word_freq = pd.Series(' '.join(texts).split()).value_counts()
threshold = 1
rare_words = word_freq[word_freq <= threshold].index

def remove_rare_words(text):
    words = text.split()
    filtered_words = [word for word in words if word not in rare_words]
    return ' '.join(filtered_words)

df['text_cleaned'] = texts.apply(remove_rare_words)
print(df[['tweet_id', 'text', 'text_cleaned']].head())

   tweet_id                                               text  \
0    119237  @AppleSupport causing the reply to be disregar...   
1    119238  @105835 Your business means a lot to us. Pleas...   
2    119239  @76328 I really hope you all change but I'm su...   
3    119240  @105836 LiveChat is online at the moment - htt...   
4    119241  @VirginTrains see attached error message. I've...   

                                        text_cleaned  
0  @AppleSupport the reply to be and the under th...  
1  means a lot to us. Please DM your name, code a...  
2                I hope you all but I'm you you have  
3  @105836 is at the moment - or 031 031 option 3...  
4  @VirginTrains see I've tried a several times i...  


# Stemming :

In [11]:
nltk.download('punkt')
stemmer = PorterStemmer()

def stem_text(text):
    
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

df['text_stemmed'] = texts.apply(stem_text)
print(df[['tweet_id', 'text', 'text_stemmed']].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\d\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


   tweet_id                                               text  \
0    119237  @AppleSupport causing the reply to be disregar...   
1    119238  @105835 Your business means a lot to us. Pleas...   
2    119239  @76328 I really hope you all change but I'm su...   
3    119240  @105836 LiveChat is online at the moment - htt...   
4    119241  @VirginTrains see attached error message. I've...   

                                        text_stemmed  
0  @ applesupport caus the repli to be disregard ...  
1  @ 105835 your busi mean a lot to us . pleas dm...  
2  @ 76328 i realli hope you all chang but i 'm s...  
3  @ 105836 livechat is onlin at the moment - htt...  
4  @ virgintrain see attach error messag . i 've ...  


# Lemmatisation :

In [12]:
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    
    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

df['text_lemmatized'] = texts.apply(lemmatize_text)
print(df[['tweet_id', 'text', 'text_lemmatized']].head())

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\d\AppData\Roaming\nltk_data...


   tweet_id                                               text  \
0    119237  @AppleSupport causing the reply to be disregar...   
1    119238  @105835 Your business means a lot to us. Pleas...   
2    119239  @76328 I really hope you all change but I'm su...   
3    119240  @105836 LiveChat is online at the moment - htt...   
4    119241  @VirginTrains see attached error message. I've...   

                                     text_lemmatized  
0  @ AppleSupport causing the reply to be disrega...  
1  @ 105835 Your business mean a lot to u . Pleas...  
2  @ 76328 I really hope you all change but I 'm ...  
3  @ 105836 LiveChat is online at the moment - ht...  
4  @ VirginTrains see attached error message . I ...  


# Suppression des émojis , Suppression des émoticônes, Suppression des URL, Suppression des balises HTML.

In [13]:
def remove_special_characters(text):
    
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'[\U00010000-\U0010ffff]+', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    return text

df['text_cleaned'] = texts.apply(remove_special_characters)
print(df[['tweet_id', 'text', 'text_cleaned']].head())

   tweet_id                                               text  \
0    119237  @AppleSupport causing the reply to be disregar...   
1    119238  @105835 Your business means a lot to us. Pleas...   
2    119239  @76328 I really hope you all change but I'm su...   
3    119240  @105836 LiveChat is online at the moment - htt...   
4    119241  @VirginTrains see attached error message. I've...   

                                        text_cleaned  
0  @AppleSupport causing the reply to be disregar...  
1  @105835 Your business means a lot to us. Pleas...  
2  @76328 I really hope you all change but I'm su...  
3  @105836 LiveChat is online at the moment -  or...  
4  @VirginTrains see attached error message. I've...  
