## Import libraries/modules

In [1]:
!pip install scikit-learn nltk matplotlib pandas gensim --quiet

In [84]:
import sklearn
import nltk
import matplotlib.pyplot as plt
import pandas  as pd
import gensim
from google.colab import drive
import os
nltk.download('punkt')
nltk.download('wordnet')
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
pd.set_option('display.max_colwidth', None)
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Explore the dataset

In [17]:
drive.mount('/content/drive')
!ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Datasets   'Sentiment Analysis Course Presentation.pptx'
 Notebooks   Sentiment_analysis.html


In [24]:
os.chdir("/content/drive/My Drive/Sentiment_analysis/Datasets")
!ls

Tweets.csv


In [27]:
# load file
df = pd.read_csv('Tweets.csv')
df.sample(5)

Unnamed: 0,textID,text,selected_text,sentiment
23441,b60e0550cd,faith and daphne,faith and daphne,neutral
13897,99aa6a57ae,@ work again,@ work again,neutral
3454,205e6c552b,'My problem isn`t that I miss you... `cause I ...,'My problem isn`t that I miss you... `cause I ...,neutral
663,153027be1c,wheres the music mannnn!!?? my inbox is still...,my inbox is still empty,negative
10693,0167c461c6,headache boo,headache,negative


In [29]:
# overview of data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


In [31]:
# check the shape/size of the dataframe
df.shape

(27481, 4)

In [33]:
# check statistics
df.describe()

Unnamed: 0,textID,text,selected_text,sentiment
count,27481,27480,27480,27481
unique,27481,27480,22463,3
top,cb774db0d1,"I`d have responded, if I were going",good,neutral
freq,1,1,199,11118


In [36]:
# find missing values
df.isna().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [38]:
# fill missing values
df[df['text'].isna()]

Unnamed: 0,textID,text,selected_text,sentiment
314,fdb77c3752,,,neutral


In [39]:
# remove missing values
df.dropna(inplace=True)

In [40]:
# check missing values
df.isna().sum()

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64

In [41]:
# check the overview of data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27480 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27480 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27480 non-null  object
dtypes: object(4)
memory usage: 1.0+ MB


## Data Cleaning

In [58]:
def clean_text(text):
    if text is not None and isinstance(text, (str, bytes)):
        # Remove extra spaces
        text = re.sub('\s+', ' ', text).strip()

        # Remove URLs
        text = re.sub(r'http\S+', '', text)

        # Remove HTML encodings
        text = re.sub('&\w+;', '', text)

        # Remove usernames
        text = re.sub('@\w+', '', text)

        # Remove punctuation, numbers, and emojis
        text = re.sub('[^a-zA-Z]', ' ', text)

        # Remove words with 3 letters or less
        text = ' '.join([word for word in text.split() if len(word) > 3])

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        text = ' '.join([word for word in text.split() if word not in stop_words])

        # Lemmatize words
        lemmatizer = WordNetLemmatizer()
        text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

        # Remove duplicate words
        text = ' '.join(set(text.split()))

    else:
        text = ''

    return text

In [59]:
# Apply the clean_text function to the 'raw_sentence' column of the dataframe
df['clean_text'] = df['text'].apply(clean_text)
df[['text', 'clean_text']].sample(10)

Unnamed: 0,text,clean_text
23411,"I know, they`re ****. I am trying to sort out my portfolio and I don`t know what to put in it!",trying know portfolio sort
3137,do it and I`ll give you the first hug,first give
3335,_h786 as ur coming on to here im just off to bed hows ur day been? x,hows coming
21790,they can`t be in their carriers anymore?,carrier anymore
20747,I think at this rate.. Ill be in class until 4... Sigh.,class Sigh rate think
11576,good morning tweets,good morning tweet
23192,I don`t find this stuff amusing any more,amusing stuff find
16008,Thanks.,Thanks
21608,i got ill and tomorrow iï¿½ve got birthday...,tomorrow birthday
2294,symphonic I,symphonic


In [60]:
# Define a function to get the sentiment polarity score
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

# Apply the function to the 'clean_text' column and round the result to 2 decimal places
df['textblob_polarity'] = df['clean_text'].apply(get_sentiment).round(2)

# Define a function to categorize the sentiment polarity score into 4 categories
def categorize_sentiment(score):
    if score >= 0.5:
        return 'Positive'
    elif score >= 0.05 and score < 0.5:
        return 'Partially Positive'
    elif score > -0.05 and score < 0.05:
        return 'Neutral'
    elif score > -0.5 and score <= -0.05:
        return 'Partially Negative'
    else:
        return 'Negative'

# Apply the categorize_sentiment function to the 'textblob_polarity' column
df['sentiment_textblob'] = df['textblob_polarity'].apply(categorize_sentiment)

# Select the relevant columns for display
df[['clean_text', 'textblob_polarity', 'sentiment_textblob']].sample(10)

Unnamed: 0,clean_text,textblob_polarity,sentiment_textblob
4586,update Need annoyed socialscope Starting,-0.2,Partially Negative
1063,discovered great site,0.8,Positive
13419,rental free coupon Coke Blockbuster need Reward inFamous Where play miss,-0.05,Partially Negative
7189,work,0.0,Neutral
17917,sorry patsy Anything generally,-0.22,Partially Negative
3466,sigh nowhere babe job morning come,0.0,Neutral
23714,Have movie want,0.0,Neutral
12161,Have unplugging good rest everybody,0.7,Positive
16030,guess finished exam,0.0,Neutral
20678,prom dress,0.0,Neutral


## Data Preprocessing

In [63]:
df_train, df_test = train_test_split(df, test_size=0.3, stratify=df['sentiment_textblob'], shuffle=True, random_state=20)

In [64]:
# Feature Extraction: Creating our dependent and independent variables
x_train = df_train['clean_text']
y_train = df_train['sentiment_textblob']

x_test =  df_test['clean_text']
y_test = df_test['sentiment_textblob']

In [73]:
df_train.shape

(19236, 7)

In [71]:
df_test.shape

(8244, 7)

In [87]:
vectorizer = CountVectorizer(max_features=3000)

x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

In [88]:
x_test_vec = x_test_vec.toarray()
x_train_vec = x_train_vec.toarray()

In [89]:
model = BernoulliNB()

In [90]:
# Train a Naive Bayes model
model.fit(x_train_vec, y_train)

In [92]:
model.score(x_train_vec, y_train)

0.8508525681014764

In [93]:
y_train_pred = model.predict(x_train_vec)
accuracy_train = accuracy_score(y_train, y_train_pred)
print("\nTraining Set Metrics:")
print("-" * 54)
print("Train Accuracy:", accuracy_train)
print("-" * 54)
print(classification_report(y_train, y_train_pred))
print("-" * 54)


Training Set Metrics:
------------------------------------------------------
Train Accuracy: 0.8508525681014764
------------------------------------------------------
                    precision    recall  f1-score   support

          Negative       0.89      0.50      0.64      1009
           Neutral       0.84      0.94      0.89      8410
Partially Negative       0.84      0.64      0.72      2065
Partially Positive       0.84      0.85      0.85      4730
          Positive       0.90      0.86      0.88      3022

          accuracy                           0.85     19236
         macro avg       0.86      0.76      0.79     19236
      weighted avg       0.85      0.85      0.85     19236

------------------------------------------------------


In [94]:
y_test_pred = model.predict(x_test_vec)
accuracy_test = accuracy_score(y_test, y_test_pred)
print("\nTest Set Metrics:")
print("-" * 54)
print("Test Accuracy:", accuracy_test)
print("-" * 54)
print(classification_report(y_test, y_test_pred))
print("-" * 54)


Test Set Metrics:
------------------------------------------------------
Test Accuracy: 0.7729257641921398
------------------------------------------------------
                    precision    recall  f1-score   support

          Negative       0.69      0.25      0.37       432
           Neutral       0.80      0.92      0.85      3604
Partially Negative       0.66      0.42      0.52       885
Partially Positive       0.75      0.77      0.76      2028
          Positive       0.81      0.77      0.79      1295

          accuracy                           0.77      8244
         macro avg       0.74      0.63      0.66      8244
      weighted avg       0.76      0.77      0.76      8244

------------------------------------------------------
