In [None]:
import pandas as pd

# The dataset carries three sentiment classes: negative (-1), neutral (0) and positive (+1).
# Load it and expose the 'category' column under the name 'label'.
data = pd.read_csv("Twitter_Data.csv")
data = data.rename(columns={'category': 'label'})

# peek at the text of the first tweet (row label 0)
data.loc[0, 'clean_text']

In [None]:
# explore the distribution of the labels
# A notebook cell only echoes its final expression, so the counts are printed
# explicitly; otherwise they are computed and silently discarded.
print(data['label'].value_counts())
# total number of rows (echoed as the cell output)
len(data)

In [None]:
# plot label distribution
import seaborn as sns
# Name the column through the x= keyword: recent seaborn releases accept only
# `data` positionally, so a Series passed as the first argument is no longer
# interpreted as the x variable.
ax = sns.countplot(x='label', data=data)

In [None]:
import gensim
import nltk
from gensim.utils import simple_preprocess
nltk.download('stopwords')
from nltk.corpus import stopwords
# stop words of the English dictionary
# NOTE: the corpus file id is the lowercase 'english'; 'English' only resolves on
# case-insensitive file systems and fails to open elsewhere.
stop_w = stopwords.words('english')
# optional: extend the list with custom words, e.g.
#stop_w.extend(['from', 'subject', 're', 'edu', 'use','of', 'as', 'by', 'uc'])

# Frozen copy of the stop-word list so each membership test is a hash lookup
# instead of a scan over the whole list.
stop_w_lookup = frozenset(stop_w)

def process(text):
    """Tokenise one tweet and strip English stop words.

    Parameters
    ----------
    text : str or missing value
        Raw tweet text. ``None`` / NaN (an empty CSV field) is treated as an
        empty tweet.

    Returns
    -------
    list of str
        Tokens produced by ``gensim.utils.simple_preprocess`` with the stop
        words removed; ``[]`` for a missing tweet.
    """
    # NaN is the only float that is not equal to itself. Without this guard
    # str(NaN) becomes the bogus token 'nan', which ends up in the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return []
    # simple_preprocess lower-cases and tokenises the text (punctuation is lost
    # during tokenisation); deacc=True additionally strips accent marks.
    tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
    # keep only the tokens that are not stop words
    return [word for word in tokens if word not in stop_w_lookup]

data['clean_text'] = data['clean_text'].apply(process)





In [None]:
# preview the tokenised tweets
data.loc[:, 'clean_text']

In [None]:
# data visualization
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import collections
from collections import Counter

# flatten the per-tweet token lists into one list of individual words
words = [word for row in data['clean_text'] for word in row]

# Counter is a dict subclass for counting: the words are the keys and their
# number of occurrences are the values.
word_freq = Counter(words)

# plotting a Word Cloud
# No `stopwords` argument: the tokens were already filtered during preprocessing,
# WordCloud ignores that option when built from frequencies, and the name
# `stopwords` refers to the NLTK corpus reader rather than a set of words.
word_cloud = WordCloud(
    background_color='white',
    max_words=2000,
).generate_from_frequencies(word_freq)
plt.figure(figsize=(10, 9))
plt.imshow(word_cloud)  # display as an image
plt.axis('off')
plt.show()

In [None]:
# Most of the words in the overall cloud look neutral, so it says little about
# what characterises each sentiment class.
# let's take a look at the positive tweets (label == 1)
positive_rows = list(data.loc[data['label'] == 1.0, 'clean_text'])
pos_words = [word for twt in positive_rows for word in twt]
#print(pos_words[:200])

# positive word frequencies
pos_freq = Counter(pos_words)

# positive words cloud
# No `stopwords` argument: the tokens were already filtered during preprocessing,
# WordCloud ignores that option when built from frequencies, and the name
# `stopwords` refers to the NLTK corpus reader rather than a set of words.
pos_cloud = WordCloud(
    background_color='white',
    max_words=2000,
).generate_from_frequencies(pos_freq)
plt.figure(figsize=(10, 9))
plt.imshow(pos_cloud)  # display as an image
plt.axis('off')
plt.show()

In [None]:
# data visualization
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import collections
from collections import Counter

# get individual words for negative tweets (label == -1)
neg_rows = list(data.loc[data['label'] == -1.0, 'clean_text'])
neg_words = [word for twt in neg_rows for word in twt]

# Counter is a dict subclass for counting: the words are the keys and their
# number of occurrences are the values.
# negative word frequencies
neg_freq = Counter(neg_words)

# negative words cloud
# No `stopwords` argument: the tokens were already filtered during preprocessing,
# WordCloud ignores that option when built from frequencies, and the name
# `stopwords` refers to the NLTK corpus reader rather than a set of words.
neg_cloud = WordCloud(
    background_color='black',
    max_words=2000,
).generate_from_frequencies(neg_freq)
plt.figure(figsize=(10, 9))
plt.imshow(neg_cloud, interpolation='bilinear')  # display as an image, smoothed by bilinear interpolation
plt.axis('off')
plt.show()

In [None]:
# Bag-of-words: a simplifying representation used in natural language processing
# in which a text is described by the words it contains (independent features),
# disregarding grammar but keeping multiplicity.
# scikit-learn's CountVectorizer converts the collection of text documents into
# a matrix of token counts (feature table); TfidfTransformer is then used as the
# normalization method.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# discard the rows of the dataset that contain NaN values
data = data.dropna()

# Split into training and testing sets, keeping 20% of the data for testing.
# The token lists in clean_text are joined back into strings first, because
# CountVectorizer raises an error on lists.
x_train, x_test, y_train, y_test = train_test_split(
    data["clean_text"].map(' '.join),
    data["label"],
    test_size=0.2,
    random_state=42,
)

count_vect = CountVectorizer(stop_words='english')
# Random forest does not assume normally distributed inputs; the transformation
# is applied to turn the textual (categorical) data into numerical features.
# One-hot encoding would be an alternative for the same purpose.
# norm='l2': the sum of squares of each vector's elements is 1
# sublinear_tf=True: apply sublinear tf scaling
transformer = TfidfTransformer(norm='l2', sublinear_tf=True)

# training split: fit the vocabulary / weights and transform in one step
x_train_features = count_vect.fit_transform(x_train)
x_train_norm = transformer.fit_transform(x_train_features)
#print(x_train_features.shape)
print(x_train_norm.shape)
#Output :(130378, 84916)

# testing split: reuse the already fitted vectorizer and transformer
x_test_features = count_vect.transform(x_test)
x_test_norm = transformer.transform(x_test_features)
print(x_test_features.shape)
print(x_test_norm.shape)
#Output :(32595, 84916)

In [None]:
# classification model; RandomForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,f1_score, accuracy_score
# n_estimators: the number of trees in the model
# criterion: used to measure the quality of a split
# n_jobs: run the algorithm in parallel (fit, predict, decision_path and apply
#         are all parallelized over the trees); -1 uses all processors
# random_state: fixed seed so that re-running the notebook reproduces the same
#               forest (same value as the train/test split seed)
RF_model = RandomForestClassifier(
    n_estimators=350,
    criterion='gini',
    n_jobs=-1,
    random_state=42,
)
RF_model.fit(x_train_norm, y_train)
predictions = RF_model.predict(x_test_norm)


In [None]:
# The F-score measures a test's accuracy and is used to compare the performance
# of two classifiers. With average=None one score per class is returned.
f1score = f1_score(y_test, predictions, average=None)

# accuracy, expressed as a percentage
acc_score = 100 * accuracy_score(y_test, predictions)

# confusion matrix for the model evaluation
CM = confusion_matrix(y_test, predictions)

# report the three metrics
print('f1_score:', f1score)
print('accuracy score:', acc_score)
print('confusion_matrix:\n', CM)



In [None]:
# applying Naive Bayes classification MultinomialNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

NB_model = MultinomialNB()
# fit the NB classifier on the raw token counts; the features are unigrams,
# i.e. each word is one token
NB_model.fit(x_train_features, y_train)

# NB classifier evaluation
# NOTE: this rebinds `predictions`, replacing the random-forest predictions.
predictions = NB_model.predict(x_test_features)
# scikit-learn metrics take (y_true, y_pred); keep that order so the call stays
# correct if it is swapped for an asymmetric metric such as precision or recall
metrics.accuracy_score(y_test, predictions)
