# Natural Language Processing: sentiment analysis

## Import modules

In [0]:
# import nltk
# nltk.download('all')

In [0]:
import numpy as np
import pandas as pd 

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer, sent_tokenize, word_tokenize
from nltk.probability import ConditionalFreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# For regex filter
import re

# For bag-of-words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer,mean_squared_error

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


## Mission 1 ✔
Crée un compte sur Kaggle, puis télécharge les données à partir de [cette adresse](https://www.kaggle.com/c/tweet-sentiment-extraction/data?select=train.csv). Tu peux aussi les télécharger via l'[API de Kaggle](https://www.kaggle.com/docs/api).

In [0]:
url_train = "https://raw.githubusercontent.com/h4r1c0t/WildCodeSchool/master/Odyssey/Dataset/tweet-sentiment-extraction/train.csv"
url_test = "https://raw.githubusercontent.com/h4r1c0t/WildCodeSchool/master/Odyssey/Dataset/tweet-sentiment-extraction/test.csv"

df_train = pd.read_csv(url_train)
df_test = pd.read_csv(url_test)

## Mission 2 ✔
Lis les jeux de données train et test dans Python avec pandas. Conserve uniquement les tweets positifs et négatifs (donc tu exclues les "neutrals"). Quel est le pourcentage de tweets positifs/négatifs dans le jeu d'entrainement ? Conserve uniquement les colonnes "text" et "sentiment" des 2 datasets.



In [0]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


In [0]:
df_train.describe(include='all').T

Unnamed: 0,count,unique,top,freq
textID,27481,27481,20251532a7,1
text,27480,27480,_addict yeah like super short I guess I`ll ju...,1
selected_text,27480,22463,good,199
sentiment,27481,3,neutral,11118


In [0]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3534 entries, 0 to 3533
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   textID     3534 non-null   object
 1   text       3534 non-null   object
 2   sentiment  3534 non-null   object
dtypes: object(3)
memory usage: 83.0+ KB


In [0]:
df_test.describe(include='all').T

Unnamed: 0,count,unique,top,freq
textID,3534,3534,ab464f70e6,1
text,3534,3534,someone`s a sweet tooth i was dying for some...,1
sentiment,3534,3,neutral,1430


On retire les tweets neutres

In [0]:
df_train_emo = df_train[['text', 'sentiment']][df_train['sentiment'] != 'neutral']

df_test_emo = df_test[['text', 'sentiment']][df_test['sentiment'] != 'neutral']

In [0]:
df_train_emo.describe(include='all').T

Unnamed: 0,count,unique,top,freq
text,16363,16363,still no pool key. wth. it`s even hot out today.,1
sentiment,16363,2,positive,8582


In [0]:
print('% de tweets positifs =',
      round(len(df_train_emo[df_train_emo['sentiment'] == 'positive']) / df_train_emo.shape[0] * 100, 2))

% de tweets positifs = 52.45


In [0]:
df_test_emo.describe(include='all').T

Unnamed: 0,count,unique,top,freq
text,2104,2104,someone`s a sweet tooth i was dying for some...,1
sentiment,2104,2,positive,1103


In [0]:
print('% de tweets positifs =',
      round(len(df_test_emo[df_test_emo['sentiment'] == 'positive']) / df_test_emo.shape[0] * 100, 2))

% de tweets positifs = 52.42


## Mission 3 & 4 ✔
Enlève les stop words et utilise un stemmer ou un lemmatizer pour affiner le corpus.



In [0]:
df_train_emo

Unnamed: 0,text,sentiment
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
6,2am feedings for the baby are fun when he is a...,positive
...,...,...
27475,enjoy ur night,positive
27476,wish we could come see u on Denver husband l...,negative
27477,I`ve wondered about rake to. The client has ...,negative
27478,Yay good for both of you. Enjoy the break - y...,positive


In [0]:
def WordsPreprocessing(data):
  # Filtre punctuation
  pattern = r"[^\w]"
  data['text_filtred'] = data['text'].apply(lambda x: re.sub(pattern, " ", x).lower())

  # Tokenization
  data['words'] = data['text_filtred'].apply(lambda x: word_tokenize(x))  

  stop_words = set(stopwords.words("english"))
  lem = WordNetLemmatizer()

  for row in range(data.shape[0]):
    words = data.iloc[row, 3] # On prend les mots qui correspondent au tweet
    filtred_words = []
    for w in words:
      if not w in stop_words:   # Stopwords
        filtred_words.append(lem.lemmatize(w))  # Lemmanization

    data.iloc[row, 3] = filtred_words # On remplace la liste de mots par les mots filtrés.
    txt=''
    for w in filtred_words:       # On refait une phrase avec les mots.
      txt += w + ' '
    txt = txt.strip()
    data.iloc[row, 2] = txt
    
    data.rename(columns={"words": "words_filtred"})


In [0]:
WordsPreprocessing(df_train_emo)

WordsPreprocessing(df_test_emo)

In [0]:
df_train_emo.head()

Unnamed: 0,text,sentiment,text_filtred,words
1,Sooo SAD I will miss you here in San Diego!!!,negative,sooo sad miss san diego,"[sooo, sad, miss, san, diego]"
2,my boss is bullying me...,negative,bos bullying,"[bos, bullying]"
3,what interview! leave me alone,negative,interview leave alone,"[interview, leave, alone]"
4,"Sons of ****, why couldn`t they put them on t...",negative,son put release already bought,"[son, put, release, already, bought]"
6,2am feedings for the baby are fun when he is a...,positive,2am feeding baby fun smile coo,"[2am, feeding, baby, fun, smile, coo]"


In [0]:
df_test_emo.head()

Unnamed: 0,text,sentiment,text_filtred,words
1,Shanghai is also really exciting (precisely -...,positive,shanghai also really exciting precisely skyscr...,"[shanghai, also, really, exciting, precisely, ..."
2,"Recession hit Veronique Branquinho, she has to...",negative,recession hit veronique branquinho quit compan...,"[recession, hit, veronique, branquinho, quit, ..."
3,happy bday!,positive,happy bday,"[happy, bday]"
4,http://twitpic.com/4w75p - I like it!!,positive,http twitpic com 4w75p like,"[http, twitpic, com, 4w75p, like]"
5,that`s great!! weee!! visitors!,positive,great weee visitor,"[great, weee, visitor]"


## Mission 5 ✔
Crée des features en utilisant scikit-learn (à toi de choisir combien tu en prends et pourquoi) en utilisant la méthode Countvectorizer ou TfidfVectorizer.



In [0]:
# Countvectorizer
def count_vectorizer(data):
  # Design the Vocabulary
  count_vectorizer = CountVectorizer()
  # Create the Bag-of-Words Model
  bag_of_words = count_vectorizer.fit_transform(data['text_filtred'])
  # # Show the Bag-of-Words Model as a pandas DataFrame
  # feature_names = count_vectorizer.get_feature_names()
  return bag_of_words.toarray()

In [0]:
df_train_emo_BoW = count_vectorizer(df_train_emo)
df_train_emo_BoW.head()

AttributeError: ignored

In [0]:
df_test_emo_BoW = count_vectorizer(df_test_emo)
df_test_emo_BoW.head()

In [0]:
def VectorWordsFreqCounter(data, threshold):
  '''
  Permet de compter la fréquence d'apparition et de garder un certain % de la 
  fréq cumulée et retourne une liste avec les mots à conserver.
  '''

  counter = pd.DataFrame(data.sum(), columns=['count'])
  counter.sort_values('count', ascending = False, inplace = True)

  counter['freq'] = counter['count'].apply(lambda x: x/counter['count'].sum())

  freq_cum = 0
  row = 0

  while freq_cum <= threshold:
    freq_cum += counter.iloc[row, 1]
    row += 1

  bag = counter.iloc[:row, :].T

  return bag.columns
  

In [0]:
# df_train_emo_BoW[VectorWordsFreqCounter(df_train_emo_BoW, .90)]

ValueError: ignored

In [0]:
# df_test_emo_BoW[VectorWordsFreqCounter(df_test_emo_BoW, .90)]

In [0]:
# TF-IDF Vextorizer
def TFIDF_vectorizer(data):
  tfidf_vectorizer = TfidfVectorizer()
  values = tfidf_vectorizer.fit_transform(data)

  # Show the Model as a pandas DataFrame
  feature_names = tfidf_vectorizer.get_feature_names()
  return values.toarray()
 

In [0]:
df_train_emo_tfidf = TFIDF_vectorizer(df_train_emo['text_filtred'])
# df_train_emo_tfidf[VectorWordsFreqCounter(df_train_emo_tfidf, .90)]

In [0]:
df_test_emo_tfidf = TFIDF_vectorizer(df_test_emo['text_filtred'])
# df_test_emo_tfidf[VectorWordsFreqCounter(df_test_emo_tfidf, .90)]

## Mission 6 & 7 ✔
En utilisant les features obtenus à l'étape 5 en tant qu'input, entraîne une régression logistique pour classifier les tweets.

Mesure la performance (via accuracy_score) de ton algorithme.



In [0]:
# Training a LogReg model
def LogRegModeling(data_X, data_y):
  X_train = data_X
  y_train = data_y['sentiment']

  LogisticModel = LogisticRegression().fit(X_train, y_train)

  print("Model accuracy score =", LogisticModel.score(X_train, y_train))

  return LogisticModel

In [0]:
LogisticModel_countv = LogRegModeling(df_train_emo_BoW, df_train_emo)

Model accuracy score = 0.9537982032634602


In [0]:
LogisticModel_tfidf = LogRegModeling(df_train_emo_tfidf, df_train_emo)

Model accuracy score = 0.9268471551671454


## Mission 8 ✔
Teste le modèle avec CountVectorizer et avec TfidfVectorizer. Quelle méthode de preprocessing te permet d'obtenir les résultats les plus performants sur le jeu de test ?

> Dans cet exemple, notre model obtient un meilleur score avec la méthode  **CountVectorizer**
>
>
>> ***CountVectorizer*** *accuracy score* : 95.4%
>>
>>***TfidfVectorizer*** *accuracy score* : 92.7%