# Twitter Sentiment Analysis using Natural Language Processing

## Importing the necessary libraries

In [None]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score

## Downloading Twitter Samples

In [1]:
nltk.download("twitter_samples")

[nltk_data] Downloading package twitter_samples to C:\Users\Gandharv
[nltk_data]     Kulkarni\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

## Loading Twitter Samples

In [3]:
from nltk.corpus import twitter_samples
tweets = twitter_samples.strings()

## Text Processing

In [4]:
nltk.download("stopwords")
from nltk.corpus import stopwords

corpus = []

for i in range(0, 1000):
    review = re.sub("[^a-zA-Z]", " ", tweets[i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words("english")
    all_stopwords.remove("not")
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = " ".join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to C:\Users\Gandharv
[nltk_data]     Kulkarni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
print(corpus)

['hopeless tmr', 'everyth kid section ikea cute shame nearli month', 'hegelbon heart slide wast basket', 'ketchburn hate japanes call bani', 'dang start next week work', 'oh god babi face http co fcwgvaki', 'rileymcdonough make smile', 'f ggstar stuartthul work neighbour motor ask said hate updat search http co xvmtuikwln', 'tahuodyy sialan http co hv xcrl', 'athabasca glacier athabasca glacier jasper jaspernationalpark alberta explorealberta http co dzzdqmf cz', 'realli good amp g idea never go meet', 'rampageinthebox mare ivan', 'sophiamascardo happi trip keep safe see soon', 'tire hahahah', 'grumpycockney knee replac get amp day ouch', 'relat sweet n sour kind bi polar peopl life cuz life full', 'aysegul k pleass', 'sexykalamo im not sure tho', 'feel stupid seem grasp basic digit paint noth research help', 'good lord http co nc lkyuuvo', 'feel lone someon talk guy girl theonlyrazzyt imarieuda eirozpegasu amysque udotv', 'assign project realli', 'want play video game watch movi someo

## Vectorization of Text Data

In [6]:
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = [1 if i % 2 == 0 else 0 for i in range(1000)]

In [7]:
print(len(X[0]))

2913


## Splitting the dataset into the Training set and Test set

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Training the model

In [9]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)

## Making Predictions and Comparing Results

In [10]:
y_pred = classifier.predict(X_test)

y_pred_np = np.array(y_pred)
y_test_np = np.array(y_test)

print(np.concatenate((y_pred_np.reshape(len(y_pred), 1), y_test_np.reshape(len(y_test), 1)), 1))

[[0 0]
 [0 0]
 [0 1]
 [1 0]
 [1 1]
 [0 0]
 [1 0]
 [1 0]
 [0 1]
 [0 1]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [0 1]
 [1 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [1 0]
 [1 0]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [0 1]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 1]
 [0 1]
 [0 1]
 [1 0]
 [0 1]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 1]
 [0 0]
 [1 0]
 [0 1]
 [1 0]
 [0 1]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [1 0]
 [0 1]
 [0 1]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 1]
 [0 1]
 [1 1]
 [0 1]
 [0 1]
 [1 0]
 [1 0]
 [0 1]
 [0 0]
 [0 1]
 [0 1]
 [1 0]
 [0 0]
 [0 1]
 [1 1]
 [0 1]
 [1 0]
 [0 1]
 [1 0]
 [0 0]
 [1 1]

## Evaluating Model Performance

In [11]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

[[66 29]
 [76 29]]
0.475


In [12]:
print(X_test)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Creating a DataFrame of Actual and Predicted Values and Saving it to a CSV file

In [13]:
predictions = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
predictions.to_csv("predictions.csv")

<hr>