### Relevant Resources: https://www.analyticsvidhya.com/blog/2021/07/performing-sentiment-analysis-with-naive-bayes-classifier/

# Import Libraries

In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

# Read and Clean Dataset

In [2]:
df = pd.read_csv('data/test_data.csv')
df.head()

Unnamed: 0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,"@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right."
0,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
1,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
2,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
3,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
4,4,8,Mon May 11 03:22:00 UTC 2009,kindle2,GeorgeVHulme,@richardebaker no. it is too big. I'm quite ha...


In [27]:
def remove_at(text):
    # A username can only contain alphanumeric characters (letters A-Z, numbers 0-9) with the exception of underscores,
    # as noted above. Check to make sure your desired username doesn't contain any symbols, dashes, or spaces.
    pattern = r'@([A-Za-z0-9_])+'
    # Replace all occurrences of @username with an empty string
    # https://towardsdatascience.com/topic-modeling-and-sentiment-analysis-on-twitter-data-using-spark-a145bfcc433
    text = re.sub(pattern, '', text)
    pattern = r'http\S+'
    text = re.sub(pattern, '', text)
    pattern = r'bit.ly/\S+'
    # replace all links with empty string
    text = re.sub(pattern, '', text)
    pattern = r'#([A-Za-z]+[A-Za-z0-9-_]+)'
    # replace all hashtags with empty string
    text = re.sub(pattern, '', text)
    return text

In [91]:
def calculate_english_polarity(num):
    polarity = ''
    if num == 0:
        polarity = 'negative'
    elif num == 2:
        polarity = 'neutral'
    else:
        polarity = 'positive'
    return polarity

In [28]:
df.columns = ['Polarity', 'ID', 'Date', 'Topic', 'User', 'Text']
df['Text'] = df['Text'].apply(remove_at)

### Cleaned Dataset After Removing @usernames, links, and hashtags

In [32]:
df.head()

Unnamed: 0,Polarity,ID,Date,Topic,User,Text
0,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
1,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the ...it fucking rock..."
2,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,You'll love your Kindle2. I've had mine for a...
3,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,Fair enough. But i have the Kindle2 and I th...
4,4,8,Mon May 11 03:22:00 UTC 2009,kindle2,GeorgeVHulme,no. it is too big. I'm quite happy with the K...


# Feature Extraction

In [74]:
X = df['Text']
Y = df['Polarity'] #0: negative, 2: neutral, 4: positive

# Tweet Vectorization (tf-idf)

In [75]:
vec = TfidfVectorizer(stop_words='english')
X_vectorized = vec.fit_transform(X)

In [76]:
X_train, X_test, Y_train, Y_test = train_test_split(X_vectorized, Y, test_size = 0.1, random_state=1)

# Naive Bayes Sentiment Analysis

In [84]:
model = MultinomialNB()
model.fit(X_train, Y_train)
print('Accuracy using tf-idf vectorizer for tweets: ', model.score(X_test, Y_test))

Accuracy using tf-idf vectorizer for tweets:  0.76


In [90]:
print('Example tweet: ', X[0])
output = model.predict(X_vectorized[0])[0]

print('Predicted polarity: ', calculate_english_polarity(output))
print('Actual polarity: ', calculate_english_polarity(Y[0]))

Example tweet:  Reading my kindle2...  Love it... Lee childs is good read.
Predicted polarity:  positive
Actual polarity:  positive
