# To build a Natural Language Processing (NLP) based machine learning model that can automatically classify text into predefined emotion categories such as sadness, anger, love, joy, fear, etc.

In [81]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Loading the file

In [82]:
data = pd.read_csv("TEXT_DATA/train.txt",sep=";", header=None , names= ["text","emotion"] )

## Checking the first 5 rows of the file

In [126]:
data.head()

Unnamed: 0,text,emotion
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


## Checking the null values

In [84]:
data.isna().sum()

text       0
emotion    0
dtype: int64

## Quick info about the data

In [127]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     16000 non-null  object
 1   emotion  16000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 250.1+ KB


## Encoding the labels with a for loop (creating a dictionary that assigns a number to each emotion)

In [86]:
# Give every distinct emotion label an integer code, numbered in the order
# the labels are returned by unique(). The lookup table is kept in
# `emotion_numbers`.
unique_emotions = data["emotion"].unique()
emotion_numbers = {}
for code, label in enumerate(unique_emotions):
    emotion_numbers[label] = code

# Replace the label column with the corresponding integer codes.
data["emotion"] = data["emotion"].map(emotion_numbers)

In [87]:
data

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


## Converting all text to lowercase

In [88]:
# Lower-case every document with the vectorized string accessor instead of
# calling a Python lambda once per row.
data["text"] = data["text"].str.lower()

## Creating a function that removes all punctuation marks (like .,!?;: etc.) from the given text

In [128]:
import string

# Translation table that deletes every character in string.punctuation.
# Built once here so it is not recreated for every row the function is applied to.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuations(text):
    """Return ``text`` with all ASCII punctuation characters removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without any character listed in ``string.punctuation``;
        all other characters, including whitespace, are kept as they are.
    """
    return text.translate(_PUNCTUATION_TABLE)
data["text"] = data["text"].apply(remove_punctuations)

## Creating a function that removes all numeric digits (0–9) from the text and keeps only non-digit characters

In [91]:
def remove_numbers(text):
    """Return ``text`` with every digit character removed.

    A character is dropped when ``str.isdigit()`` is true for it, which
    covers ``0``-``9`` as well as other Unicode digits such as superscripts.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The non-digit characters of ``text`` in their original order.
    """
    # Join once instead of growing a string one character at a time.
    return "".join(ch for ch in text if not ch.isdigit())

data["text"] = data["text"].apply(remove_numbers)    

## Creating a function that removes emojis and other non-ASCII characters

In [92]:
def remove_emojis(text):
    """Return ``text`` with every non-ASCII character removed.

    Emojis are removed because they are non-ASCII, but so is any other
    non-ASCII character (for example accented letters).

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The ASCII characters of ``text`` in their original order.
    """
    # Join once instead of growing a string one character at a time.
    return "".join(ch for ch in text if ch.isascii())
    
data["text"] = data["text"].apply(remove_emojis)    
            

In [93]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [94]:
# nltk.download("punkt")
# nltk.download("stopwords")

## All English stopwords from NLTK

In [129]:
stop_words = set(stopwords.words("english"))

In [130]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

## Creating a function that removes common English stopwords from the text and keeps only meaningful words

In [97]:
def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` that is in ``stop_words``.

    Tokens are compared to the module-level ``stop_words`` set exactly as
    written. The surviving tokens are re-joined with single spaces, so the
    original spacing of ``text`` is not preserved.
    """
    kept_tokens = [token for token in text.split() if token not in stop_words]
    return " ".join(kept_tokens)

data["text"] = data["text"].apply(remove_stopwords)   

In [98]:
# Look up row label 1 and the "text" column in a single .loc call instead of
# chaining two indexing operations.
data.loc[1, "text"]

'go feeling hopeless damned hopeful around someone cares awake'

In [99]:
data.head()

Unnamed: 0,text,emotion
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


## Splitting the data into 80% training and 20% test sets

In [100]:

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(data["text"], data["emotion"], test_size=0.20, random_state=43)

In [101]:
X_train

13548            feel disturbed happens roughly everywhere
2658              feel part family universe rather fearful
4497     got really feel like hit lottery scared itd so...
10555                   dont know feel beloved teams draft
4263                             feeling quite overwhelmed
                               ...                        
10517    feel disturbed see people break pieces right f...
7985                      started feel sweet feeling peace
2303     highly doubt would see young jean scott consid...
3392     feel like leaves artistic equivalent crack cou...
14148    like croissants im feeling naughty eating alon...
Name: text, Length: 12800, dtype: object

In [102]:
X_test

11001    im feeling little regretful itll pass thats ha...
4150                                           earth crake
342                 feel like receiving end violent attack
1340     love tall guys make feel little innocent howev...
2833     could feel peace welcomed week packing saying ...
                               ...                        
12244    studied logic ethics know certainty motivation...
2905     im happy race pace officially ability pull tog...
15929                                   feel beaten worked
3028     bad dreams really weird dreams make feel like ...
5853     feel free enjoy possessions like rock book clo...
Name: text, Length: 3200, dtype: object

## Importing the bag-of-words and TF-IDF vectorizers to convert text into vector format

In [103]:
## Converting text into vectors: CountVectorizer = bag of words, TfidfVectorizer = TF-IDF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Training the model with the bag-of-words technique and the naive Bayes algorithm

In [104]:
# First approach: bag-of-words features with CountVectorizer.
# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
bow_vector = CountVectorizer()
X_train_bow = bow_vector.fit_transform(X_train)
X_test_bow = bow_vector.transform(X_test)

In [105]:
# First, we build the model with the naive Bayes algorithm.
# The second technique is the logistic regression algorithm.

In [106]:
# First: naive Bayes with the bag-of-words technique

from sklearn.naive_bayes import  MultinomialNB
nb_model = MultinomialNB()
from sklearn.metrics import accuracy_score

In [107]:
nb_model.fit(X_train_bow, Y_train) 

0,1,2
,"alpha  alpha: float or array-like of shape (n_features,), default=1.0 Additive (Laplace/Lidstone) smoothing parameter (set alpha=0 and force_alpha=True, for no smoothing).",1.0
,"force_alpha  force_alpha: bool, default=True If False and alpha is less than 1e-10, it will set alpha to 1e-10. If True, alpha will remain unchanged. This may cause numerical errors if alpha is too close to 0. .. versionadded:: 1.2 .. versionchanged:: 1.4  The default value of `force_alpha` changed to `True`.",True
,"fit_prior  fit_prior: bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used.",True
,"class_prior  class_prior: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",


In [108]:
test_pred = nb_model.predict(X_test_bow)

In [109]:
print("Test Accuracy:", accuracy_score(Y_test, test_pred))

Test Accuracy: 0.7725


In [110]:
train_pred = nb_model.predict(X_train_bow)

In [111]:
print("train accuracy", accuracy_score(Y_train, train_pred))

train accuracy 0.907578125


## Training the model with the TF-IDF technique and the naive Bayes algorithm

In [112]:
# Second approach: TF-IDF features with TfidfVectorizer.
# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.

tfidf_vector = TfidfVectorizer()

X_train_tfidf = tfidf_vector.fit_transform(X_train)
X_test_tfidf = tfidf_vector.transform(X_test)

In [113]:
nb2_model = MultinomialNB()
nb2_model.fit(X_train_tfidf, Y_train)

0,1,2
,"alpha  alpha: float or array-like of shape (n_features,), default=1.0 Additive (Laplace/Lidstone) smoothing parameter (set alpha=0 and force_alpha=True, for no smoothing).",1.0
,"force_alpha  force_alpha: bool, default=True If False and alpha is less than 1e-10, it will set alpha to 1e-10. If True, alpha will remain unchanged. This may cause numerical errors if alpha is too close to 0. .. versionadded:: 1.2 .. versionchanged:: 1.4  The default value of `force_alpha` changed to `True`.",True
,"fit_prior  fit_prior: bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used.",True
,"class_prior  class_prior: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",


In [114]:
test_predict_tfidf = nb2_model.predict(X_test_tfidf)

In [115]:
print("Test Accuracy:", accuracy_score(Y_test, test_predict_tfidf))

Test Accuracy: 0.6659375


In [116]:
train_predict_ifidf = nb2_model.predict(X_train_tfidf)

In [117]:
print("train accuracy", accuracy_score(Y_train, train_predict_ifidf))

train accuracy 0.74921875


## Now training a logistic regression model with the TF-IDF technique

In [131]:
from sklearn.linear_model import LogisticRegression
log = LogisticRegression(max_iter=1000)

In [132]:
log.fit(X_train_tfidf, Y_train)

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. 
seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [133]:
test_pred_log = log.predict(X_test_tfidf)

In [134]:
print("testing accuracy of logestic", accuracy_score(Y_test, test_pred_log))

testing accuracy of logestic 0.85125


In [135]:
train_pred_log = log.predict(X_train_tfidf)

In [136]:
print("training accuracy of logestic", accuracy_score(Y_train, train_pred_log))

training accuracy of logestic 0.94671875


In [137]:
# Conclusion: logistic regression with TF-IDF performs best (test accuracy 0.851, versus 0.773 for naive Bayes with bag of words and 0.666 for naive Bayes with TF-IDF).