# Data Mining LAB: Text Mining



*   réalisé par : chaimaabouabd
*   encadré par : Mme Khadija Bouzaachane



## 1. Acquisition des données et préparation des données

### Chargement du fichier

In [37]:
import pandas as pd

messagesTwitter = pd.read_csv('./global_warming_tweets.csv', delimiter=',', encoding='unicode_escape')

In [38]:
print(messagesTwitter.shape)

(6090, 3)


In [39]:
messagesTwitter.head(2)

Unnamed: 0,tweet,existence,existence.confidence
0,Global warming report urges governments to act...,Yes,1.0
1,Fighting poverty and global warming in Africa ...,Yes,1.0


In [40]:
messagesTwitter['existence'] = (messagesTwitter['existence'] == 'Yes').astype(int)
messagesTwitter.head(100)

Unnamed: 0,tweet,existence,existence.confidence
0,Global warming report urges governments to act...,1,1.0000
1,Fighting poverty and global warming in Africa ...,1,1.0000
2,Carbon offsets: How a Vatican forest failed to...,1,0.8786
3,Carbon offsets: How a Vatican forest failed to...,1,1.0000
4,URUGUAY: Tools Needed for Those Most Vulnerabl...,1,0.8087
...,...,...,...
95,Plants effective way of tackling global warmin...,1,0.7925
96,Climate change & sustainability will be a key ...,1,0.7874
97,Frederic Hague at #PEN: climate change isn't j...,1,0.5778
98,US Generals say: Climate Change Threatens Amer...,1,1.0000


### Normalisation

In [41]:
import re
def normalisation (message):
    message = re.sub('((www\.[^\s]+) | (https?://[^\s]+))', 'URL', message)
    message = re.sub('@[^\s]+', 'USER', message)
    message = message.lower().replace("e", "e")
    message = re.sub('[^a-zA-Za-яА-Я1-9]+', ' ', message)
    message = re.sub(' +',' ', message)
    return message.strip()

In [42]:
messagesTwitter["tweet"] = messagesTwitter["tweet"].apply(normalisation)
print(messagesTwitter.head(10))

                                               tweet  existence  \
0  global warming report urges governments to act...          1   
1  fighting poverty and global warming in africa ...          1   
2  carbon offsets how a vatican forest failed to ...          1   
3  carbon offsets how a vatican forest failed to ...          1   
4  uruguay tools needed for those most vulnerable...          1   
5  rt user rt user ocean saltiness shows global w...          1   
6  global warming evidence all around us|a messag...          1   
7  migratory birds new climate change strategy st...          1   
8  southern africa competing for limpopo water cl...          1   
9  global warming to impact wheat rice production...          1   

   existence.confidence  
0                1.0000  
1                1.0000  
2                0.8786  
3                1.0000  
4                0.8087  
5                1.0000  
6                1.0000  
7                1.0000  
8                1.0000  
9 

In [43]:
messagesTwitter.head()

Unnamed: 0,tweet,existence,existence.confidence
0,global warming report urges governments to act...,1,1.0
1,fighting poverty and global warming in africa ...,1,1.0
2,carbon offsets how a vatican forest failed to ...,1,0.8786
3,carbon offsets how a vatican forest failed to ...,1,1.0
4,uruguay tools needed for those most vulnerable...,1,0.8087


### Suppression des stop words

In [44]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [45]:
stopWords = stopwords.words('english')

In [46]:
messagesTwitter["tweet"] = messagesTwitter["tweet"].apply(
    lambda message: ' '.join([word for word in message.split() if word not in (stopWords)]))

messagesTwitter.head(10)

Unnamed: 0,tweet,existence,existence.confidence
0,global warming report urges governments act|br...,1,1.0
1,fighting poverty global warming africa link,1,1.0
2,carbon offsets vatican forest failed reduce gl...,1,0.8786
3,carbon offsets vatican forest failed reduce gl...,1,1.0
4,uruguay tools needed vulnerable climate change...,1,0.8087
5,rt user rt user ocean saltiness shows global w...,1,1.0
6,global warming evidence around us|a message gl...,1,1.0
7,migratory birds new climate change strategy st...,1,1.0
8,southern africa competing limpopo water climat...,1,1.0
9,global warming impact wheat rice production in...,1,1.0


### La stemmisation

In [47]:
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Téléchargez les données de stemming si ce n'est pas déjà fait
nltk.download('snowball_data')

# Définir la langue pour la stemmatisation (par exemple, 'english')
stemmer = SnowballStemmer('english')

# Définir la fonction lambda pour la stemmatisation des mots
messagesTwitter["tweet"] = messagesTwitter["tweet"].apply(
  lambda message: ' '.join([stemmer.stem(word) for word in message.split()]))


# Afficher le résultat
messagesTwitter.head(10)


[nltk_data] Downloading package snowball_data to /root/nltk_data...
[nltk_data]   Package snowball_data is already up-to-date!


Unnamed: 0,tweet,existence,existence.confidence
0,global warm report urg govern act|brussel belg...,1,1.0
1,fight poverti global warm africa link,1,1.0
2,carbon offset vatican forest fail reduc global...,1,0.8786
3,carbon offset vatican forest fail reduc global...,1,1.0
4,uruguay tool need vulner climat chang link,1,0.8087
5,rt user rt user ocean salti show global warm i...,1,1.0
6,global warm evid around us|a messag global war...,1,1.0
7,migratori bird new climat chang strategi stay ...,1,1.0
8,southern africa compet limpopo water climat ch...,1,1.0
9,global warm impact wheat rice product india|lu...,1,1.0


### La lemmatisation

In [49]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
messagesTwitter['tweet'] = messagesTwitter['tweet'].apply(
    lambda message: ' '.join([lemmatizer.lemmatize(word) for word in message.split()]))


[nltk_data] Downloading package wordnet to /root/nltk_data...


## 2. Phases d'apprentissage et de prédiction

### Découpage en jeux de tests et d'apprentissage

In [51]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(messagesTwitter['tweet' ].values,
messagesTwitter['existence'].values, test_size=0.2)

### Création d'un pipeline d'apprentissage

In [52]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

etapes_apprentissage = Pipeline([('frequence',
CountVectorizer ()),

('tfidf', TfidfTransformer ()),
('algorithme',

MultinomialNB () ) ])

### Apprentissage et analyse des résultats

In [55]:
modele = etapes_apprentissage.fit(X_train, y_train)

from sklearn.metrics import classification_report
print (classification_report(y_test, modele.predict (X_test),
digits=4))

              precision    recall  f1-score   support

           0     0.9105    1.0000    0.9532      1109
           1     0.0000    0.0000    0.0000       109

    accuracy                         0.9105      1218
   macro avg     0.4553    0.5000    0.4766      1218
weighted avg     0.8290    0.9105    0.8679      1218



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Classification d'un nouveau message

In [56]:
phrase = "Why should trust scientists with global warming if they didnt know Pluto wasnt a planet"
print (phrase)

#Normalisation
phrase = normalisation (phrase)

#Suppression des stops words
phrase = ' '.join([mot for mot in phrase.split() if mot not in
(stopWords) ])

#Stemmatisation
phrase = ' '.join([stemmer.stem(mot) for mot in phrase.split(' ')])


Why should trust scientists with global warming if they didnt know Pluto wasnt a planet


In [57]:
#Lemmatisation
phrase = ' '.join([lemmatizer. lemmatize(mot) for mot in
phrase.split(' ')])
print(phrase)

prediction = modele.predict([phrase])
print (prediction)
if[prediction[0] == 0]:
  print (">> Ne croit pas au rechauffement climatique ... ")
else:
  print (">> Croit au rechauffement climatique ... ")


trust scientist global warm didnt know pluto wasnt planet
[0]
>> Ne croit pas au rechauffement climatique ... 
