# *Part 1 - Text data and known labels*

In [51]:
# read dataset 
import pandas as pd
# visualize dataset
import matplotlib.pyplot as plt
import seaborn as sns

In [52]:
## training dataset
dataset_train = pd.read_csv('EA-train.txt', header=None, names= ['text', 'emotion'] ,delimiter=';', quoting=3)
# quoting: ???

In [54]:
# first five rows
dataset_train.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [55]:
# last five rows
dataset_train.tail()

Unnamed: 0,text,emotion
15995,i just had a very brief time in the beanbag an...,sadness
15996,i am now turning and i feel pathetic that i am...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such a rude comment and i...,anger
15999,i know a lot but i feel so stupid because i ca...,sadness


In [None]:
dataset_train.info

In [None]:
# (rows, cols)
dataset_train.shape

In [None]:
# describe data
dataset_train.describe()

In [None]:
# Missing data
dataset_train.isnull().sum()

In [None]:
dataset_train['emotion'].value_counts()

In [None]:
# plot
sns.countplot(x=dataset_train['emotion'],data=dataset_train)

In [56]:
# test dataset
dataset_test = pd.read_csv('EA-test.txt', header=None, names= ['text', 'emotion'] ,delimiter=';', quoting=3) 

In [57]:
# (rows, cols)
dataset_test.shape

(2000, 2)

In [58]:
# first five rows
dataset_test.head()

Unnamed: 0,text,emotion
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


In [59]:
# last five rows
dataset_test.tail()

Unnamed: 0,text,emotion
1995,i just keep feeling like someone is being unki...,anger
1996,im feeling a little cranky negative after this...,anger
1997,i feel that i am useful to my people and that ...,joy
1998,im feeling more comfortable with derby i feel ...,joy
1999,i feel all weird when i have to meet w people ...,fear


In [60]:
# replace labels with integer numbers
dataset_train['emotion'] = dataset_train['emotion'].replace({'sadness':0, 'anger':1, 'love':2, 'surprise':3, 'fear':4, 'joy':5}).astype(int)
dataset_train.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [61]:
# replace labels with integer numbers
dataset_test['emotion'] = dataset_test['emotion'].replace({'sadness':0, 'anger':1, 'love':2, 'surprise':3, 'fear':4, 'joy':5}).astype(int)
dataset_test.head()

Unnamed: 0,text,emotion
0,im feeling rather rotten so im not very ambiti...,0
1,im updating my blog because i feel shitty,0
2,i never make her separate from me because i do...,0
3,i left with my bouquet of red and yellow tulip...,5
4,i was feeling a little vain when i did this one,0


## Missing data

In [62]:
dataset_train.isnull().sum()

text       0
emotion    0
dtype: int64

In [63]:
dataset_test.isnull().sum()

text       0
emotion    0
dtype: int64

# *Part 2 - Data preprocessing*

In [64]:
# libraries for NLP 
import re   # regular expressions: to select specific pattern
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [65]:
text_in= "IM updating my blog because i feel shitty"
text_out = re.sub('[^a-zA-z]', ' ', text_in)
print(word_tokenize(text_out.lower()))
print(type(text_out))

['im', 'updating', 'my', 'blog', 'because', 'i', 'feel', 'shitty']
<class 'str'>


In [66]:
def preprocess_text(data):
    corpus = []  # list to include the cleaned text in.

    for row in range(data.shape[0]): # number of rows 
        # replace everything in the text that is not a letter by space, in every row in column text 
        text = re.sub('[^a-zA-z]', ' ', data['text'][row])

        # convert text to tokens
        tokens = word_tokenize(str(text).lower())

        # remove stop words
        stop_words = stopwords.words("english")
        stop_words.remove('not')  ## remove (not) from stopwords
        tokens = [token for token in tokens if token not in stop_words]

        # Word Normalization
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

        # convert tokens to text
        text = ' '.join(tokens)
        corpus.append(text)
    
    return corpus

In [67]:
# cleaning the training text
corpus_train = preprocess_text(dataset_train)

In [68]:
len(corpus_train)

16000

In [69]:
corpus_train[0]
# didnt ??

'didnt feel humili'

In [70]:
## cleaning the test text
corpus_test = preprocess_text(dataset_test)

In [71]:
corpus_test[0]
# im ??

'im feel rather rotten im not ambiti right'

# *Part 3 - Feature Extraction*

## Bag of Words (BoW)

+ we can perform feature extraction using the CountVectorizer class from the scikit-learn library.
+ represents text as a bag of individual words, without considering the order or structure of the text.
+ CountVectorizer counts the frequency of each word in a text corpus and represents each document in the corpus as a vector of word frequencies

In [72]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# fit train set
X_train = vectorizer.fit_transform(corpus_train)
y_train = dataset_train['emotion']

# Transform test set
X_test = vectorizer.transform(corpus_test)
y_test = dataset_test['emotion']

# *Part 4 - The machine learning models*

In [73]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# to find the best hyperparameters for the Naive Bayes model
from sklearn.model_selection import RandomizedSearchCV

### Naive bayes model

In [74]:
# define pipeline with the CountVectorizer and Naive Bayes classifier
''' 
# Build the pipeline
makes it easier to train and evaluate the model
which allows you to reuse same preprocessing 
                    reuse same modeling steps on new data  
            without retrain the preprocessing steps
'''
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('nb', MultinomialNB())
])

# define parameters grid with different values for hyperparameters
params = {
    'bow__ngram_range': [(1, 1), (1, 2), (2, 2)],# determines the range of n-grams to be used for tokenization.
    # increasing the value of "max_df" to exclude terms that appear too frequently in the corpus.
    'bow__max_df': [0.5, 0.75, 1.0],# minimum document frequency of a term in the corpus
    # decreasing the value of "min_df" to allow more terms to be included,
    'bow__min_df': [1, 2, 3],# maximum document frequency of a term in the corpus
    'nb__alpha': [0.1, 0.5, 1.0],
}

# define randomized search
random_search = RandomizedSearchCV(pipeline, 
                                   param_distributions=params, 
                                   n_iter=10, 
                                   cv=5,# number of folds to use for cross-validation 
                                   n_jobs=-1)# number of jobs to run in parallel: -1 is run GPU, 1 is run CPU

# fit randomized search on training data and labels.
random_search.fit(X_train, y_train)

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/lib/python3/dist-packages/sklearn/pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/usr/lib/python3/dist-packages/sklearn/pipeline.py", line 292, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/elsayedelmandoh/.local/lib/python3.10/site-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/usr/lib/python3/dist-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py", line 1198, in fit_transform
    vocabulary, X = self._count_vocab(raw_documents,
  File "/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py", line 1110, in _count_voc

AttributeError: lower not found

In [None]:
# params for number of iteration
pd.DataFrame(random_search.cv_results_)[["mean_test_score","std_test_score","params"]]

In [None]:
'''
+ The best hyperparameters and score found by the random search object can be accessed using: 
    - the best_params_ attribute
    - The best_score_ attribute 
'''
print("Best parameters:", random_search.best_params_)
print("Best score:", random_search.best_score_)

In [None]:
'''
we can train the Naive Bayes model on the training data and labels 
using the pipeline with the best hyperparameters.
'''
best_pipeline = Pipeline([
    ('bow', CountVectorizer(ngram_range=random_search.best_params_['bow__ngram_range'], 
                            max_df=random_search.best_params_['bow__max_df'], 
                            min_df=random_search.best_params_['bow__min_df'])),
    ('nb', MultinomialNB(alpha=random_search.best_params_['nb__alpha']))
])

best_pipeline.fit(X_train, y_train)

In [None]:
# make predictions on test data using the trained model
y_pred = best_pipeline.predict(X_test)

# part 5 - Evaluation

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# compute accuracy score with y-test and y-predictions
# number of correct predictions divided by the total number of predictions.
accuracy = accuracy_score(dataset_test['emotion'], y_pred)
print("Accuracy:", accuracy)

### confusion matrix

In [None]:
'''
+ confusion matrix: Table that shows the 
    - true positive     - true negative
    - false positive    - false negative
    - values to predict actual values and create heatmap
'''
# compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# part 6 - Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### Heatmap of the confusion matrix

In [None]:
# visualize confusion matrix as heatmap
plt.figure(figsize = (10,10))
labels = ['sadness', 'anger', 'love', 'surprise', 'fear', 'joy']
sns.heatmap(cm, 
            xticklabels=labels, 
            yticklabels=labels, 
            annot=True, 
            cmap='Blues', 
            fmt='g')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Part 7 - Predicting new results

In [None]:
def pred_outcome(text):
    # STEP1: make pre-processing
    # replace everything in the text that is not a letter by space, in every row in column text 
    text = re.sub('[^a-zA-z]', ' ', text)
    # convert text to tokens
    tokens = word_tokenize(text.lower())
    # remove stop words
    stop_words = stopwords.words("english")
    stop_words.remove('not')  ## remove (not) from stopwords
    tokens = [token for token in tokens if token not in stop_words]
    # Word Normalization
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # convert tokens to text
    text = ' '.join(tokens) 
    
    # STEP2:
    # converted text into a vector, part 3
    text= [text]
    text_x = best_pipeline.transform(text).toarray()
    
    # STEP3:
    # predict method from nb (MultinomialNB), part 2
    # predict new comment based on the trained model.
    text_y = best_pipeline.predict(text_x)
    
    if (text_y == 0):
        print("Sadness")
    elif (text_y ==1):
        print("Anger")
    elif (text_y ==2):
        print("Love")
    elif (text_y ==3):
        print("Surprise")
    elif (text_y ==4):
        print("Fear")
    elif (text_y ==5):
        print("Joy")

In [None]:
# Make Prediction
text = "I am so happy"
pred_outcome(text)

In [None]:
# Probability of predictions
best_pipeline.predict_proba(text)

In [None]:
# To Know the classes
best_pipeline.classes_

# Part 8 - Save the pipline

In [None]:
# Save the model
import joblib
# reuse the same preprocessing steps and classifier on new data without having to redefine them each time.
joblib.dump(best_pipeline,'pipeline.pkl') 

# load the model use joblib.load