In [None]:
# utilities
import re
import numpy as np
import pandas as pd

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

# nltk
import nltk
from nltk.stem import WordNetLemmatizer

# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

# Fetch the WordNet corpora used by WordNetLemmatizer.
for corpus_name in ("wordnet", "omw-1.4"):
    nltk.download(corpus_name)

In [None]:
# Sentiment140: column names are passed explicitly via `names`.
sentiment140 = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding="ISO-8859-1",
    names=["sentiment", "id", "date", "flag", "user", "text"],
)

# Keep only what the models need. `.copy()` makes this an independent frame so
# the label assignment below never writes into a slice of the raw frame.
sentiment140 = sentiment140[['text', 'sentiment']].copy()

# Turn the numeric polarity codes into string labels in a single pass
# (0 -> "negative", 4 -> "positive"); any other value is left untouched.
sentiment140['sentiment'] = sentiment140['sentiment'].replace({0: "negative", 4: "positive"})

# Discard rows with a missing text or label.
sentiment140 = sentiment140.dropna(axis=0, how="any")
sentiment140.head()

In [None]:
# Twitter Tweets Sentiment: keep the text/label columns, drop the "neutral"
# class, then discard rows with any missing value.
TwitterTweetSentiment = (
    pd.read_csv('Tweets.csv')
    .loc[:, ['text', 'sentiment']]
    .loc[lambda frame: frame['sentiment'] != "neutral"]
    .dropna(axis=0, how="any")
)
TwitterTweetSentiment.head()

In [None]:
# Twitter Sentiment Analysis: column names are passed explicitly via `names`.
# Labels are lower-cased first so the class filter below is case-insensitive;
# the "neutral" and "irrelevant" classes are removed, then rows with missing
# values are dropped.
TwitterSentimentAnalysis = (
    pd.read_csv('twitter_training.csv', names=["id", "entity", "sentiment", "text"])
    .loc[:, ['text', 'sentiment']]
    .assign(sentiment=lambda frame: frame['sentiment'].str.lower())
    .loc[lambda frame: ~frame['sentiment'].isin(["neutral", "irrelevant"])]
    .dropna(axis=0, how="any")
)
TwitterSentimentAnalysis.head()

In [None]:
# Emoticon -> meaning lookup used by the text preprocessing step.
# Insertion order is significant: replacements are applied in this order.
emojis = {
    ':)': 'smile',
    ':-)': 'smile',
    ';d': 'wink',
    ':-E': 'vampire',
    ':(': 'sad',
    ':-(': 'sad',
    ':-<': 'sad',
    ':P': 'raspberry',
    ':O': 'surprised',
    ':-@': 'shocked',
    ':@': 'shocked',
    ':-$': 'confused',
    ':\\': 'annoyed',
    ':#': 'mute',
    ':X': 'mute',
    ':^)': 'smile',
    ':-&': 'confused',
    '$_$': 'greedy',
    '@@': 'eyeroll',
    ':-!': 'confused',
    ':-D': 'smile',
    ':-0': 'yell',
    'O.o': 'confused',
    '<(-_-)>': 'robot',
    'd[-_-]b': 'dj',
    ":'-)": 'sadsmile',
    ';)': 'wink',
    ';-)': 'wink',
    'O:-)': 'angel',
    'O*-)': 'angel',
    '(:-D': 'gossip',
    '=^.^=': 'cat',
}

# Stopwords removed during preprocessing. Contractions appear without their
# apostrophe ("youre", "shouldve") and as bare fragments ("ll", "ve", "re").
stopwordlist = """
a about above after again ain all am an
and any are as at be because been before
being below between both by can d did do
does doing down during each few for from
further had has have having he her here
hers herself him himself his how i if in
into is it its itself just ll m ma
me more most my myself now o of on once
only or other our ours ourselves out own re
s same she shes should shouldve so some such
t than that thatll the their theirs them
themselves then there these they this those
through to too under until up ve very was
we were what when where which while who whom
why will with won y you youd youll youre
youve your yours yourself yourselves
""".split()

In [None]:
def preprocessText(text):
    """Normalise an iterable of raw tweets into cleaned, space-separated tokens.

    Steps applied to every tweet, in order:

    1. lower-case the text;
    2. replace URLs with the placeholder `` URL``;
    3. replace every emoticon listed in ``emojis`` with ``EMOJI<meaning>``;
    4. replace ``@mentions`` with the placeholder `` USER``;
    5. turn every non-alphanumeric character into a space;
    6. collapse any run of three or more identical characters to two;
    7. drop stopwords (``stopwordlist``) and single-character tokens, and
       lemmatise what remains with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : iterable of str
        Raw tweets. Every element must be a string (``.lower()`` is called on it).

    Returns
    -------
    list of str
        One cleaned string per input tweet, in input order. Each kept token is
        followed by a single space, so a non-empty result ends with a space.
    """
    processedText = []

    wordLemm = WordNetLemmatizer()

    # Compile the patterns once instead of on every tweet.
    url_pattern = re.compile(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)")
    user_pattern = re.compile(r"@[^\s]+")
    non_alnum_pattern = re.compile(r"[^a-zA-Z0-9]")
    # {2,} so that a run of any length >= 3 collapses to exactly two characters.
    repeat_pattern = re.compile(r"(.)\1{2,}")

    # The tweet is lower-cased before emoticons are looked up, so the keys must
    # be lower-cased too or entries such as ':P' and ':-D' could never match.
    # Longest keys go first so compound emoticons ('O:-)', '(:-D') are not
    # pre-empted by a shorter emoticon they contain (':-)', ':-D'). The sort is
    # stable, so keys of equal length keep their dictionary order.
    emoji_tokens = sorted(
        ((emoji.lower(), "EMOJI" + meaning) for emoji, meaning in emojis.items()),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )

    # Set membership is O(1); the list would be scanned once per word.
    stopwords = set(stopwordlist)

    for tweet in text:
        # Lower Casing
        tweet = tweet.lower()

        # Replacing URL
        tweet = url_pattern.sub(' URL', tweet)

        # Replacing Emoji
        for emoji, token in emoji_tokens:
            tweet = tweet.replace(emoji, token)

        # Replacing Usernames
        tweet = user_pattern.sub(" USER", tweet)

        # Removing Non-alphabets
        tweet = non_alnum_pattern.sub(" ", tweet)

        # Collapsing runs of 3+ identical characters down to two
        tweet = repeat_pattern.sub(r"\1\1", tweet)

        # Drop stopwords and one-character tokens, lemmatise the rest.
        keptwords = [
            wordLemm.lemmatize(word)
            for word in tweet.split()
            if len(word) > 1 and word not in stopwords
        ]

        # Each token keeps its trailing space, matching the established output format.
        processedText.append(''.join(word + ' ' for word in keptwords))

    return processedText

In [None]:
# Clean the text column of each dataset with the shared preprocessing routine.
processedS140, processedTTS, processedTSA = (
    preprocessText(dataset['text'])
    for dataset in (sentiment140, TwitterTweetSentiment, TwitterSentimentAnalysis)
)

In [None]:
# Fixed seed so the 80/20 splits, and every metric computed from them, are
# reproducible across kernel restarts.
SPLIT_SEED = 42

# Hold out 20% of each dataset for testing.
X_trainS140, X_testS140, y_trainS140, y_testS140 = train_test_split(
    processedS140, sentiment140['sentiment'], test_size=0.2, random_state=SPLIT_SEED
)
X_trainTTS, X_testTTS, y_trainTTS, y_testTTS = train_test_split(
    processedTTS, TwitterTweetSentiment['sentiment'], test_size=0.2, random_state=SPLIT_SEED
)
X_trainTSA, X_testTSA, y_trainTSA, y_testTSA = train_test_split(
    processedTSA, TwitterSentimentAnalysis['sentiment'], test_size=0.2, random_state=SPLIT_SEED
)

In [None]:
# One TF-IDF vectoriser (uni- to tri-grams, capped at 500k features) is re-fitted
# on each dataset's training text in turn; the train and test texts of that
# dataset are transformed before the next fit replaces the vocabulary.
# NOTE: the text lists are overwritten by their matrices, so this cell cannot be
# re-run without re-running the split cell first.
vectoriser = TfidfVectorizer(ngram_range=(1,3), max_features=500000)

# Sentiment140
vectoriser.fit(X_trainS140)
X_trainS140, X_testS140 = map(vectoriser.transform, (X_trainS140, X_testS140))

# Twitter Tweets Sentiment
vectoriser.fit(X_trainTTS)
X_trainTTS, X_testTTS = map(vectoriser.transform, (X_trainTTS, X_testTTS))

# Twitter Sentiment Analysis
vectoriser.fit(X_trainTSA)
X_trainTSA, X_testTSA = map(vectoriser.transform, (X_trainTSA, X_testTSA))

In [None]:
def evaluate_model(model, X, y):
    """Print a classification report and plot the confusion matrix for `model`.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X : features passed to ``model.predict``
    y : true labels for ``X``

    Notes
    -----
    The heatmap annotations are laid out as a 2x2 grid, so the confusion matrix
    must be binary. The cell captions (True Negative, False Positive, ...)
    assume the negative class comes first in the matrix -- TODO confirm for
    each label set.
    """
    predicted = model.predict(X)
    print(classification_report(y, predicted))

    matrix = confusion_matrix(y, predicted)

    # Fraction of all samples that falls into each cell, in row-major order.
    cell_shares = matrix.flatten() / np.sum(matrix)
    cell_names = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
    annotations = np.asarray(
        [f'{name}\n{share:.2%}' for name, share in zip(cell_names, cell_shares)]
    ).reshape(2, 2)

    class_names = ['Negative', 'Positive']
    # fmt='' because the annotations are already formatted strings.
    sns.heatmap(matrix, annot=annotations, cmap='Blues', fmt='',
                xticklabels=class_names, yticklabels=class_names)

    plt.xlabel("Predicted values")
    plt.ylabel("Actual values")
    plt.title("Confusion Matrix")
    plt.show()

In [None]:
def train_test_acc(model, X_train, X_test, y_train, y_test):
    """Evaluate an already-fitted `model` on the training split, then the test split.

    A heading is printed before each evaluation; the report and plot themselves
    come from ``evaluate_model``.
    """
    splits = (
        ("+-+ Training Accuracy +-+", X_train, y_train),
        ("+-+ Testing Accuracy +--+", X_test, y_test),
    )
    for heading, features, targets in splits:
        print(heading)
        evaluate_model(model, features, targets)

In [None]:
def train_test_model(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training split and report train and test performance.

    The model is fitted in place on ``(X_train, y_train)``; evaluation on both
    splits is delegated to ``train_test_acc``.
    """
    model.fit(X_train, y_train)

    train_test_acc(
        model=model,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )

In [None]:
# The four classifiers compared in this notebook. The same instances are
# re-fitted for every dataset in the evaluation cells below.
BNBmodel = BernoulliNB(alpha=2)
SVCmodel = LinearSVC()
LRmodel = LogisticRegression(
    C=2,
    max_iter=1000,
    n_jobs=-1,
)
RFmodel = RandomForestClassifier(
    n_estimators=1000,
    max_depth=2,
    random_state=0,
)

In [None]:
# Fit and evaluate every classifier on the Sentiment140 split.
print("+===+ Sentiment140 Dataset +===+")
for banner, classifier in (
    ("+---+ Bernoulli Naive Bayes +---+", BNBmodel),
    ("+---+ Linear Support Vector Classification +---+", SVCmodel),
    ("+---+ Logisitic Regression +---+", LRmodel),
    ("+---+ Random Forest +---+", RFmodel),
):
    print(banner)
    train_test_model(classifier, X_trainS140, X_testS140, y_trainS140, y_testS140)

In [None]:
# Fit and evaluate every classifier on the Twitter Tweets Sentiment split.
print("+===+ Twitter Tweets Sentiment Dataset +===+")
for banner, classifier in (
    ("+---+ Bernoulli Naive Bayes +---+", BNBmodel),
    ("+---+ Linear Support Vector Classification +---+", SVCmodel),
    ("+---+ Logisitic Regression +---+", LRmodel),
    ("+---+ Random Forest +---+", RFmodel),
):
    print(banner)
    train_test_model(classifier, X_trainTTS, X_testTTS, y_trainTTS, y_testTTS)

In [None]:
# Fit and evaluate every classifier on the Twitter Sentiment Analysis split.
print("+===+ Twitter Sentiment Analysis Dataset +===+")
for banner, classifier in (
    ("+---+ Bernoulli Naive Bayes +---+", BNBmodel),
    ("+---+ Linear Support Vector Classification +---+", SVCmodel),
    ("+---+ Logisitic Regression +---+", LRmodel),
    ("+---+ Random Forest +---+", RFmodel),
):
    print(banner)
    train_test_model(classifier, X_trainTSA, X_testTSA, y_trainTSA, y_testTSA)