In [110]:
import pandas as pd

# Loading and Preprocessing

## Load Dataset

In [111]:
# Read the raw comment/emotion dataset into a DataFrame.
df = pd.read_csv('/content/nlp_dataset.csv')

In [112]:
# Preview the loaded frame.
df

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear
...,...,...
5932,i begun to feel distressed for you,fear
5933,i left feeling annoyed and angry thinking that...,anger
5934,i were to ever get married i d have everything...,joy
5935,i feel reluctant in applying there because i w...,fear


In [113]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [114]:
# Number of missing values in each column.
df.isnull().sum()

Unnamed: 0,0
Comment,0
Emotion,0


## Text Cleaning

Remove URLs, punctuation, and convert text to lowercase.



*   **Purpose:** Remove unnecessary characters and noise such as punctuation, special symbols, URLs, and numeric values.
*   **Impact:** Cleaning helps reduce the complexity of the text, ensuring that the model focuses on meaningful words rather than noise. This can improve model accuracy and generalization.




In [115]:
import re
def clean_text(text):
    """Normalize a raw comment before tokenization.

    Steps, in order: strip URLs, strip punctuation (underscores included),
    then lowercase. Digits and whitespace are left untouched, so removing a
    URL can leave two consecutive spaces behind.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned, lowercased string.
    """
    # Remove URLs. IGNORECASE is required because lowercasing happens last:
    # without it "HTTP://..." or "WWW...." would slip through and then be
    # mangled into an ordinary-looking word by the punctuation pass.
    text = re.sub(r'http\S+|www\S+', '', text, flags=re.IGNORECASE)
    # Remove punctuation. "_" is listed explicitly because \w matches it.
    text = re.sub(r'[^\w\s]|_', '', text)
    # Convert to lowercase
    return text.lower()

# Apply the cleaning function to every comment and store the result in a new
# column, leaving the raw 'Comment' column untouched.
df['Cleaned_Text'] = df['Comment'].apply(clean_text)
print(df['Cleaned_Text'])

0       i seriously hate one subject to death but now ...
1                      im so full of life i feel appalled
2       i sit here to write i start to dig out my feel...
3       ive been really angry with r and i feel like a...
4       i feel suspicious if there is no one outside l...
                              ...                        
5932                   i begun to feel distressed for you
5933    i left feeling annoyed and angry thinking that...
5934    i were to ever get married i d have everything...
5935    i feel reluctant in applying there because i w...
5936    i just wanted to apologize to you because i fe...
Name: Cleaned_Text, Length: 5937, dtype: object


## Tokenization

Tokenization splits text into individual words.

* Purpose: Splitting the text into individual words or tokens. This helps in understanding each unit of the text that the model will work with.
* Impact: Tokenization allows the model to process individual words, making it easier to analyze text data and identify patterns or relationships between words.

In [116]:
import nltk
# Tokenizer models used by word_tokenize. Recent NLTK releases look up the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so fetch both to
# keep the notebook runnable on a fresh kernel regardless of NLTK version.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [117]:
from nltk.tokenize import word_tokenize

In [118]:
# Split each cleaned comment into a list of word tokens.
df["tokens"] = df["Cleaned_Text"].apply(word_tokenize)

In [119]:
# Inspect the token lists produced for each comment.
df["tokens"]

Unnamed: 0,tokens
0,"[i, seriously, hate, one, subject, to, death, ..."
1,"[im, so, full, of, life, i, feel, appalled]"
2,"[i, sit, here, to, write, i, start, to, dig, o..."
3,"[ive, been, really, angry, with, r, and, i, fe..."
4,"[i, feel, suspicious, if, there, is, no, one, ..."
...,...
5932,"[i, begun, to, feel, distressed, for, you]"
5933,"[i, left, feeling, annoyed, and, angry, thinki..."
5934,"[i, were, to, ever, get, married, i, d, have, ..."
5935,"[i, feel, reluctant, in, applying, there, beca..."


## Stopword Removal

Eliminating common words (is, are, am, etc.)


* Purpose: Remove common words like "the", "is", "and", etc., that don't carry significant meaning but appear frequently in text data.
* Impact: Removing stopwords reduces the dimensionality of the data and focuses on the more informative words, improving model training speed and accuracy.

In [120]:
from nltk.corpus import stopwords
# Fetch the NLTK stopword lists (a no-op when already downloaded).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [121]:
# English stopwords as a set, so membership checks during filtering are cheap.
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [122]:
# Drop stopwords from each token list and re-join the remaining tokens into a
# single space-separated string (the column therefore holds text, not lists).
df['filtered_tokens'] = df['tokens'].apply(
    lambda tokens: ' '.join(tok for tok in tokens if tok not in stop_words)
)
df['filtered_tokens']

Unnamed: 0,filtered_tokens
0,seriously hate one subject death feel reluctan...
1,im full life feel appalled
2,sit write start dig feelings think afraid acce...
3,ive really angry r feel like idiot trusting fi...
4,feel suspicious one outside like rapture happe...
...,...
5932,begun feel distressed
5933,left feeling annoyed angry thinking center stu...
5934,ever get married everything ready offer got to...
5935,feel reluctant applying want able find company...


## Lemmatization

Reduce words to their base or dictionary form. Stemming could also be used, but here we use lemmatization.


In [123]:
import spacy
from nltk.stem import WordNetLemmatizer

# NLTK WordNet lemmatizer instance (no later cell visible here references it;
# the lemmatization below goes through spaCy).
lemmatizer = WordNetLemmatizer()

# Download the WordNet and OMW 1.4 corpora.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [124]:

# Load the small English spaCy pipeline once; lemmatize_text reuses it.
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    """Return `text` with every spaCy token replaced by its lemma.

    The string is processed by the module-level `nlp` pipeline and the
    resulting lemmas are joined back together with single spaces. spaCy does
    its own tokenization, so the output may contain more tokens than the
    whitespace-separated input.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

# Lemmatize the stopword-filtered text row by row; the result is a string column.
df['lemmatized_tokens'] = df['filtered_tokens'].apply(lemmatize_text)
df['lemmatized_tokens']

Unnamed: 0,lemmatized_tokens
0,seriously hate one subject death feel reluctan...
1,I m full life feel appalled
2,sit write start dig feeling think afraid accep...
3,I ve really angry r feel like idiot trust firs...
4,feel suspicious one outside like rapture happe...
...,...
5932,begin feel distressed
5933,leave feel annoyed angry think center stupid joke
5934,ever get marry everything ready offer get toge...
5935,feel reluctant applying want able find company...


**Summary of Preprocessing Techniques:**
* Text Cleaning: Removed noise like URLs, punctuation, and numbers, and standardized the text to lowercase.
* Impact: This reduced the complexity of the text data, removing irrelevant parts that don't contribute to understanding the text.
* Tokenization: Split the cleaned text into individual words (tokens).
* Impact: This enabled the model to work with individual components of the text, allowing it to identify word-level patterns and relationships.
* Stopword Removal: Removed common English stopwords to reduce noise.
* Impact: By focusing on important words, this step helped in reducing the dimensionality of the data, leading to faster training and better model interpretability.


**Impact on Model Performance:**
* Improved Accuracy: Cleaning the data ensures that only meaningful information is passed to the model, reducing overfitting to noisy data.
* Faster Training: Tokenization and stopword removal reduce the number of features (words), leading to a smaller vocabulary and faster model training.
* Better Generalization: By removing irrelevant and noisy elements from the text, the model focuses on the important features, leading to better generalization on unseen data.

This preprocessing pipeline is crucial in almost all NLP tasks such as text classification, sentiment analysis, and machine translation.

# Feature Extraction

In [125]:
# Show the frame with every preprocessing column added so far.
df

Unnamed: 0,Comment,Emotion,Cleaned_Text,tokens,filtered_tokens,lemmatized_tokens
0,i seriously hate one subject to death but now ...,fear,i seriously hate one subject to death but now ...,"[i, seriously, hate, one, subject, to, death, ...",seriously hate one subject death feel reluctan...,seriously hate one subject death feel reluctan...
1,im so full of life i feel appalled,anger,im so full of life i feel appalled,"[im, so, full, of, life, i, feel, appalled]",im full life feel appalled,I m full life feel appalled
2,i sit here to write i start to dig out my feel...,fear,i sit here to write i start to dig out my feel...,"[i, sit, here, to, write, i, start, to, dig, o...",sit write start dig feelings think afraid acce...,sit write start dig feeling think afraid accep...
3,ive been really angry with r and i feel like a...,joy,ive been really angry with r and i feel like a...,"[ive, been, really, angry, with, r, and, i, fe...",ive really angry r feel like idiot trusting fi...,I ve really angry r feel like idiot trust firs...
4,i feel suspicious if there is no one outside l...,fear,i feel suspicious if there is no one outside l...,"[i, feel, suspicious, if, there, is, no, one, ...",feel suspicious one outside like rapture happe...,feel suspicious one outside like rapture happe...
...,...,...,...,...,...,...
5932,i begun to feel distressed for you,fear,i begun to feel distressed for you,"[i, begun, to, feel, distressed, for, you]",begun feel distressed,begin feel distressed
5933,i left feeling annoyed and angry thinking that...,anger,i left feeling annoyed and angry thinking that...,"[i, left, feeling, annoyed, and, angry, thinki...",left feeling annoyed angry thinking center stu...,leave feel annoyed angry think center stupid joke
5934,i were to ever get married i d have everything...,joy,i were to ever get married i d have everything...,"[i, were, to, ever, get, married, i, d, have, ...",ever get married everything ready offer got to...,ever get marry everything ready offer get toge...
5935,i feel reluctant in applying there because i w...,fear,i feel reluctant in applying there because i w...,"[i, feel, reluctant, in, applying, there, beca...",feel reluctant applying want able find company...,feel reluctant applying want able find company...


## CountVectorizer
The CountVectorizer is a technique used in Natural Language Processing (NLP) to convert text data into numerical features, specifically a bag-of-words representation. It is part of the scikit-learn library and is commonly used for text classification, clustering, and other NLP tasks. The idea is to convert textual data into a numerical format that can be used by machine learning models.

**Steps of Transformation in CountVectorizer:**

* **Tokenization:**
The CountVectorizer first tokenizes the text by splitting it into individual words (tokens). It typically removes punctuation and converts all words to lowercase by default.
* **Building Vocabulary:**
Once the text is tokenized, the CountVectorizer creates a vocabulary (a set of unique tokens) based on all the text data provided. Each unique token in the text corpus is assigned a unique index.
* **Word Count:**
After building the vocabulary, CountVectorizer counts the occurrences of each word in every document (row) and stores these counts in a sparse matrix, where each entry corresponds to a document-word pair.

If a word appears in a document, the matrix entry will have the count of occurrences; if the word is not present, the matrix will have a value of zero.


In [128]:
from sklearn.feature_extraction.text import CountVectorizer
# Build bag-of-words count features from the fully preprocessed text.
# The previous call passed `x`, which no remaining cell defines, so the
# notebook failed with NameError on a fresh kernel. The lemmatized column is
# the final preprocessing output and is assumed to be the intended input —
# TODO confirm against the original (deleted) cell.
vector = CountVectorizer()
vector_metrics = vector.fit_transform(df['lemmatized_tokens'])
vector_metrics

<5937x7146 sparse matrix of type '<class 'numpy.int64'>'
	with 53181 stored elements in Compressed Sparse Row format>

In [129]:
# Densify the sparse count matrix for a quick visual check. This materializes
# the full documents-by-vocabulary array in memory; slice the matrix first if
# memory is tight.
vector_metrics.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Model Development

In [130]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [131]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    vector_metrics,
    df['Emotion'],
    test_size=0.2,
    random_state=42,
)

## NAIVE BAYES



In [132]:
from sklearn.naive_bayes import GaussianNB
import numpy as np

In [133]:
# Create a Gaussian Naive Bayes model
# NOTE(review): the features are word counts, and the comparison notes later in
# this notebook list Multinomial NB as the variant for count-based text
# features — consider evaluating it as well.
model = GaussianNB()

# Train the model (the sparse training matrix is converted to a dense array
# before being passed to fit).
model.fit(X_train.toarray(), y_train)

In [134]:
# Make predictions on the held-out set (densified, matching how the model was fit)
y_pred = model.predict(X_test.toarray())

# Evaluate accuracy: fraction of test samples whose predicted label matches
accuracy = accuracy_score(y_test, y_pred)

In [135]:
# Confusion matrix for the Naive Bayes predictions; each row sums to the
# support of one true class in the classification report.
print(confusion_matrix(y_test, y_pred))

[[215 158  19]
 [ 38 351  27]
 [ 63 159 158]]


In [136]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.68      0.55      0.61       392
        fear       0.53      0.84      0.65       416
         joy       0.77      0.42      0.54       380

    accuracy                           0.61      1188
   macro avg       0.66      0.60      0.60      1188
weighted avg       0.66      0.61      0.60      1188



In [137]:
# Overall Naive Bayes accuracy computed above.
print("Accuracy: ",accuracy)

Accuracy:  0.6094276094276094


## SVM (SUPPORT VECTOR MACHINE)

In [138]:
from sklearn.svm import SVC

In [139]:
# Support vector classifier with default hyperparameters, trained directly on
# the sparse count matrix (no densification).
svm_model = SVC()
svm_model.fit(X_train, y_train)

In [140]:
# Predict with the SVM. Note: this rebinds `y_pred`, replacing the Naive Bayes
# predictions stored under the same name.
y_pred = svm_model.predict(X_test)
y_pred

array(['anger', 'joy', 'fear', ..., 'joy', 'fear', 'joy'], dtype=object)

In [141]:
# Confusion matrix for the SVM predictions; each row sums to the support of
# one true class in the classification report.
print(confusion_matrix(y_test, y_pred))

[[358  10  24]
 [ 29 368  19]
 [  6   7 367]]


In [142]:
# Per-class precision / recall / F1 for the SVM predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.91      0.91      0.91       392
        fear       0.96      0.88      0.92       416
         joy       0.90      0.97      0.93       380

    accuracy                           0.92      1188
   macro avg       0.92      0.92      0.92      1188
weighted avg       0.92      0.92      0.92      1188



In [143]:
# SVM accuracy. Note: this rebinds `accuracy`, replacing the Naive Bayes value.
accuracy=accuracy_score(y_test, y_pred)

In [144]:
# Overall SVM accuracy computed in the previous cell.
print("Accuracy: ",accuracy)

Accuracy:  0.92003367003367


# Model Comparison

**NAIVE BAYES**


Naive Bayes is a simple probabilistic classifier that predicts the category of an item based on the likelihood of its features, assuming each feature is independent of the others.




*   Multinomial Naive Bayes: Best for count-based features or text data.
*   Bernoulli Naive Bayes: Best for binary features.
*   Gaussian Naive Bayes: Best for continuous features with normal distribution.

f1-score
*   anger      0.61
*   fear       0.65
*   joy        0.54

Accuracy : 0.61

**SVM**

Support Vector Machine (SVM) is a powerful and versatile supervised machine learning algorithm commonly used for classification, regression, and outlier detection tasks.

SVMs are particularly well-suited for classification problems with high-dimensional data and are effective in both linear and non-linear classification tasks.

f1-score
* anger      0.91
* fear       0.92
* joy        0.93


Accuracy : 0.92

**Conclusion**



For emotion classification, SVM is the more suitable model: it clearly outperforms Naive Bayes, with much higher F1-scores and accuracy (0.92 vs. 0.61).