# Twitter Sentiment Analysis - Final Model Export
   * This iPython file is just to export the selected model obtained by experimenting using the SentimentAnalysisTrails.ipynb.

## Importing the libraries

In [1]:
# For working with data
import pandas as pd
import numpy as np

# Removing stopwords and Stemming
import nltk

# Time taken by functions
import time

# Cleaning Tweets
import re

## Importing the dataset
* Reading the Train CSV file
* Dropping a few useless columns
* Mapping Positive (4) to 1 and Negative (0) to 0
* Shuffling the data

In [2]:
# Reading the Train CSV file

raw_data = pd.read_csv("./train.csv" , header = None , encoding = 'latin')

# Dropping a few useless columns

raw_data.columns = ['Y', 'A', 'B', 'C', 'D', 'X']
raw_data = raw_data.drop(['A', 'B', 'C', 'D'], axis=1)

# Mapping Positive (4) to 1 and Negative (0) to 0

raw_data['Y'] = raw_data['Y'].map({4 : 1 , 0 : 0})
raw_data['Y'].value_counts()

# Shuffling the data

raw_data = raw_data.sample(frac=1).reset_index(drop=True)

## Preprocessing the Tweets
* Cleaning the Tweets
    * Lowercasing
    * Removing all the urls, user tags, hashtags and some punctuations.
    * Stemming and removing spaces
* Train-Test Split
* Removing very rare words ( <= 60 occurences ).
* Splitting into Targets and Features.
* Count Vectorizing and TF-IDF Vectorizing

In [3]:
# Declaring the function
def process_text(text):

    # Lowercasing
    text = text.lower()

    # Replacing all the urls
    text = re.sub('(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}/)(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:\'\\".,<>?\xc2\xab\xc2\xbb\xe2\x80\x9c\xe2\x80\x9d\xe2\x80\x98\xe2\x80\x99]))'
               , '', text)

    # Replacing all user tags
    text = re.sub(r"@[^\s]+", '', text)

    # Replacing all hashtags
    text = re.sub(r"#[^\s]+", '', text)

    # Remove some punctuations
    text = re.sub(r"[!?,'\"*)@#%(&$_.^-]", '', text)

    # Splitting on spaces
    text = text.split(' ')

    # Stemming and removing spaces
    stemmer_ps = nltk.stem.PorterStemmer()  
    text = [stemmer_ps.stem(word) for word in text if len(word)]

    return text

start = time.time()

raw_data['X'] = raw_data['X'].map(process_text)

end = time.time()
print(end - start)

767.2241866588593


In [4]:
# Train Test Split

from sklearn.model_selection import train_test_split

Train, Test = train_test_split(raw_data , stratify = raw_data["Y"], test_size=0.02)

Train = Train.to_numpy()
Test = Test.to_numpy()

del raw_data

In [5]:
# Creating the word frequency dictionary to remove very rare words.
wordfreq = {}

start = time.time()

for _ , text in Train:
    for w in text:
        if w not in wordfreq:
            wordfreq[w] = 1
    else:
        wordfreq[w] += 1

end = time.time()
print(end - start)

10.769980430603027


In [6]:
# Getting words that have occured more than 75 times in tweets

words = [word for word in wordfreq if wordfreq[word] <= 60]
low_words = len(words)
total_words = len(wordfreq)
print(low_words , total_words - low_words , (low_words / total_words * 100) , ((total_words - low_words) / total_words * 100))

398190 2441 99.39071115315589 0.6092888468441034


In [7]:
# Removing the low occurence words from the wordfreq dictionaries

for word in words:
    if word in wordfreq: 
        del wordfreq[word]

In [8]:
# Removing the low occurence words from the tweets

start = time.time()

for i in range(len(Train)):
    Train[i][1] = [ word for word in Train[i][1] if word in wordfreq]

end = time.time()
print(end - start)

8.897157192230225


In [9]:
# Joining for Vectorizer

start = time.time()

for i in range(len(Train)):
    Train[i][1] = " ".join(Train[i][1])

end = time.time()
print(end - start)

4.18176007270813


In [10]:
# Creating the targets and labels for train

X = []
Y = []

for i in range(len(Train)):
    if ( len(Train[i][1]) > 2 ):
        X.append(Train[i][1])
        Y.append(Train[i][0])

X = np.array(X)
Y = np.array(Y)

In [11]:
# Vectoritzing 

from sklearn.feature_extraction.text import CountVectorizer
vectorizer_CV = CountVectorizer()

X_CV = vectorizer_CV.fit_transform(X)

## Training the Model

In [12]:
# Importing the models

from sklearn.linear_model import LogisticRegression

In [13]:
# Instantiating the models 

# Logistic Regression
lr_CV = LogisticRegression(random_state=0 , max_iter=1000)

In [14]:
print("Logistic regression started!")
start = time.time()
clf_lr_CV = lr_CV.fit(X_CV, Y)
end = time.time()
print("Logistic regression ended in " + str(end - start) + " secs")

Logistic regression started!
Logistic regression ended in 107.57787108421326 secs


## Testing the model

In [15]:
# Preprocessing the test set

for i in range(len(Test)):
    Test[i][1] = [ word for word in Test[i][1] if word in wordfreq]

In [16]:
# Creating the targets and labels for train

X_test = []
Y_test = []

for i in range(len(Test)):
    X_test.append(" ".join(Test[i][1]))
    Y_test.append(Test[i][0])

X_test = np.array(X_test)
Y_test = np.array(Y_test)

In [17]:
# Vectorizing

X_CV_test = vectorizer_CV.transform(X_test)

In [18]:
# Predicting

Y_pred_lr_CV = clf_lr_CV.predict(X_CV_test)

In [19]:
# Importing the metrics

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [20]:
def metrics (model_name , Y_test , Y_pred ):
    print("------ Metrics for " + model_name + "------")
    print("Accuracy : " + str(accuracy_score(Y_test, Y_pred)))
    print("Precision : " + str(precision_score(Y_test, Y_pred)))
    print("Recall : " + str(recall_score(Y_test, Y_pred)))
    print("F1 Score : " + str(f1_score(Y_test, Y_pred)))
    print("------------------------------------------- ")

In [21]:
metrics("Logistic Regression", Y_test ,Y_pred_lr_CV)

------ Metrics for Logistic Regression------
Accuracy : 0.77703125
Precision : 0.7626978012208855
Recall : 0.8043125
F1 Score : 0.7829525750616008
------------------------------------------- 


## Saving the model

In [22]:
import joblib

joblib.dump(clf_lr_CV, "Logistic_Regression.mdl")
joblib.dump(vectorizer_CV, "Vectorizer_CV.mdl")

['Vectorizer_CV.mdl']

In [23]:
loaded_model = joblib.load("Logistic_Regression.mdl")
result = loaded_model.predict(X_CV_test)

metrics("Logistic Regression Loaded", Y_test ,result)

------ Metrics for Logistic Regression Loaded------
Accuracy : 0.77703125
Precision : 0.7626978012208855
Recall : 0.8043125
F1 Score : 0.7829525750616008
------------------------------------------- 


In [24]:
import json

json_object = json.dumps(wordfreq, indent = 4)
with open("wordfreq.json", "w") as outfile:
    outfile.write(json_object)