### Reading the data 

In [1]:
import pandas as pd

#### Using the original dataset provided with the project

In [2]:

'''
files = ['twitter-2013train.txt','twitter-2015train.txt','twitter-2016train.txt'] #original dataset

df0, df1, df2 = [pd.read_csv(name, delimiter = '\t', header = None) for name in files] #original dataset

data = pd.concat([df0, df1, df2], ignore_index=True) #concatinating the tweets data in 1 dataframe #original dataset
'''

"\nfiles = ['twitter-2013train.txt','twitter-2015train.txt','twitter-2016train.txt'] #original dataset\n\ndf0, df1, df2 = [pd.read_csv(name, delimiter = '\t', header = None) for name in files] #original dataset\n\ndata = pd.concat([df0, df1, df2], ignore_index=True) #concatinating the tweets data in 1 dataframe #original dataset\n"

#### Adding more data from a sample (30%) of the Kaggle sentiment analysis dataset

In [3]:

# Dataset including samples from Kaggle: the three original training files
# plus the Kaggle sample file.
files = [
    'twitter-2013train.txt',
    'twitter-2015train.txt',
    'twitter-2016train.txt',
    'kaggle_samples.txt',
]

# Every file is read as a tab-separated table without a header row.
df0, df1, df2, df3 = [
    pd.read_csv(path, delimiter='\t', header=None)
    for path in files
]

# Concatenate the tweets into one dataframe; ignore_index renumbers the rows.
data = pd.concat([df0, df1, df2, df3], ignore_index=True)


In [4]:
# The files were read with header=None, so assign the column names explicitly.
# This mutates `data` in place.
data.columns = ['serial', 'opinion', 'tweet_text']

In [5]:
# Non-null count of every other column per opinion label (shows the class balance).
data.groupby(by = 'opinion').count()

Unnamed: 0_level_0,serial,tweet_text
opinion,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,242150,242150
neutral,6840,6840
positive,247051,247051


In [6]:
data.shape

(496041, 3)

In [7]:
data.head()

Unnamed: 0,serial,opinion,tweet_text
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...


### Data Clean up & Pre-processing

In [8]:
# Step 1: casefold

import nltk

# Case-fold every tweet text so that later token comparisons ignore letter case.
lowerTweets = [tweet.casefold() for tweet in data['tweet_text']]
#lowerTweets[0:5]

In [9]:
# Make sure the NLTK stopword corpus used in Step 2 is available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Arch.Mona\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# Step 2: remove stopwords applying on all tweets 

from nltk.corpus import stopwords
import re

# A set gives constant-time membership checks for the stop-word filter below.
stops = set(stopwords.words("english"))

# Raw string: "\W" inside a normal string literal is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning on newer Python versions). Compiling once
# also avoids re-resolving the pattern for every tweet.
non_word_run = re.compile(r"\W+")

# NOTE(review): the tweet text appears to contain literal escape sequences such
# as \u2019 and \u002c (see the data.head() preview). Splitting on non-word
# characters turns them into tokens like "u2019m", which end up as "um" / "uc"
# after digit removal. Consider decoding these escapes before tokenising.
filtered_tweets = []
for doc in lowerTweets:
    # Tokens are maximal runs of word characters; everything else is a separator.
    kept_words = [word for word in non_word_run.split(doc) if word not in stops]
    # strip() drops the blank produced by an empty leading/trailing split token.
    filtered_tweets.append(" ".join(kept_words).strip())
#filtered_tweets[0:5]

In [11]:
# Step 3: Remove punctuation and digits from tweets and replace it by space
#### NOTE: Through different combinations, it is observed that accuracy is decreased after removing punctuation.

import string

def remove_punctuation(input_text):
    """Replace every ASCII punctuation character in each text with a space.

    Args:
        input_text: iterable of strings (one entry per tweet).

    Returns:
        A new list, in the same order, where each character found in
        ``string.punctuation`` has been replaced by a single space. The
        string lengths are unchanged.
    """
    # The translation table does not depend on the tweet, so build it once
    # instead of once per tweet.
    punct = string.punctuation
    trantab = str.maketrans(punct, ' ' * len(punct))
    return [tweet.translate(trantab) for tweet in input_text]

def remove_digits(input_text):
    """Delete every run of digits from each text.

    Args:
        input_text: iterable of strings (one entry per tweet).

    Returns:
        A new list, in the same order, with all digit runs removed. The
        surrounding characters (including whitespace) are left untouched,
        so removing a standalone number can leave consecutive spaces.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    return [re.sub(r'\d+', '', text) for text in input_text]

# Punctuation-free variant, kept for comparison only: the steps below do not use it.
# filtered_tweets was built by splitting on runs of non-word characters, so the
# only string.punctuation character it can still contain is the underscore.
punctuation_removed_tweets = remove_punctuation(filtered_tweets)
#punctuation_removed_tweets[0:5]

# Here we will skip removing the punctuation,
# and will use the "remove_digits" function with "filtered_tweets" (output of stopwords)
digits_removed_tweets = remove_digits(filtered_tweets)
#digits_removed_tweets[0:5]


In [12]:
# Step 4: Perform trimming to remove extra whitespaces:

# Splitting on whitespace and re-joining with single spaces collapses every
# whitespace run and drops leading/trailing whitespace.
spaces_removed_tweets = [" ".join(tweet.split()) for tweet in digits_removed_tweets]

#spaces_removed_tweets[0:5]

In [13]:
# Step 5: stemwords
from nltk.stem.porter import PorterStemmer

def stemDocs(f_docs):
    """Stem every whitespace-separated word of each document.

    Args:
        f_docs: iterable of strings (one entry per document/tweet).

    Returns:
        A new list, in the same order, where each document is the
        space-joined sequence of the PorterStemmer stems of its words.
    """
    # One stemmer instance serves all words; constructing a new PorterStemmer
    # for every single word is wasted work.
    stemmer = PorterStemmer()
    return [
        " ".join(stemmer.stem(word) for word in doc.split()).strip()
        for doc in f_docs
    ]
    
stemmed_tweets = stemDocs(spaces_removed_tweets)
#stemmed_tweets[0:5]

### After multiple trials with different combinations, the highest accuracy (64.99%) was reached with the steps below:

### 1- Casefolding
### 2- Remove stopwords
### 3- Remove digits
### 4- Trimming (remove whitespaces)

In [14]:
# Store both pre-processed variants next to the raw text so each can be used
# as model input. This adds columns to `data` in place.
data['stemmed_tweet'] = stemmed_tweets
data['non_stemmed_tweet'] = spaces_removed_tweets
data.head()

Unnamed: 0,serial,opinion,tweet_text,stemmed_tweet,non_stemmed_tweet
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...,ga hous hit um go chapel hill sat,gas house hit um going chapel hill sat
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...,theo walcott still shit uc watch rafa johnni d...,theo walcott still shit uc watch rafa johnny d...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...,um gsp fan uc hate nick diaz ut wait februari,um gsp fan uc hate nick diaz ut wait february
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...,iranian gener say israel us iron dome ut deal ...,iranian general says israel us iron dome ut de...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...,tehran uc mon amour obama tri establish tie mu...,tehran uc mon amour obama tried establish ties...


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

accuracies = []
f1_scores = []

### Count Vectorizer + Logistic Regression on Stemmed Tweets

In [16]:
# Hold out 30% of the provided data to test a classifier; the remaining 70% is the training data.
from sklearn.model_selection import train_test_split

# random_state is fixed so the split is the same on every run.
tweets_train, tweets_test, train_labels, test_labels = train_test_split(
    data["stemmed_tweet"],
    data['opinion'],
    test_size=0.3,
    random_state=0,
)

In [17]:
# CountVectorizer + Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Training Dataset:
# Learn the vocabulary from the training tweets and vectorize them in one pass
# (fit_transform is equivalent to fit followed by transform).
vectorizer = CountVectorizer()
tweets_train_vectorized = vectorizer.fit_transform(tweets_train)

# Test Dataset:
# Encoded with the vocabulary learned from the training tweets only.
tweets_test_vectorized = vectorizer.transform(tweets_test)

# Create a Logistic Regression classifier & use it with CountVectorizer:
clfr = LogisticRegression()
clfr.fit(tweets_train_vectorized,train_labels)

predicted = clfr.predict(tweets_test_vectorized)
acc = metrics.accuracy_score(test_labels,predicted)

print ('Accuracy of CountVectorizer + Logistic Regression (Stemmed Tweets) = '+str(acc*100)+'%')
print (metrics.classification_report(test_labels,predicted))

# Record the results as (label, value) pairs. The label names the tweet variant
# so entries from different experiments can be told apart.
accuracies.append(('Accuracy of CountVectorizer + Logistic Regression (Stemmed Tweets)', acc*100))
f1_scores.append(('F1-score of CountVectorizer + Logistic Regression (Stemmed Tweets)', metrics.f1_score(test_labels,predicted, average='weighted')))

Accuracy of CountVectorizer + Logistic Regression (Stemmed Tweets) = 76.1627008393084%
             precision    recall  f1-score   support

   negative       0.77      0.75      0.76     72460
    neutral       0.64      0.40      0.49      2073
   positive       0.76      0.78      0.77     74280

avg / total       0.76      0.76      0.76    148813



### Count Vectorizer + Logistic Regression on NON Stemmed Tweets

In [18]:
# Hold out 30% of the provided data to test a classifier; the remaining 70% is the training data.
from sklearn.model_selection import train_test_split

# Same test_size and random_state as the stemmed split, applied to the non-stemmed column.
tweets_train, tweets_test, train_labels, test_labels = train_test_split(
    data["non_stemmed_tweet"],
    data['opinion'],
    test_size=0.3,
    random_state=0,
)

In [19]:
# CountVectorizer + Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Training Dataset:
# Learn the vocabulary from the training tweets and vectorize them in one pass
# (fit_transform is equivalent to fit followed by transform).
vectorizer = CountVectorizer()
tweets_train_vectorized = vectorizer.fit_transform(tweets_train)

# Test Dataset:
# Encoded with the vocabulary learned from the training tweets only.
tweets_test_vectorized = vectorizer.transform(tweets_test)

# Create a Logistic Regression classifier & use it with CountVectorizer:
clfr = LogisticRegression()
clfr.fit(tweets_train_vectorized,train_labels)

predicted = clfr.predict(tweets_test_vectorized)
acc = metrics.accuracy_score(test_labels,predicted)

print ('Accuracy of CountVectorizer + Logistic Regression (NON Stemmed Tweets) = '+str(acc*100)+'%')
print (metrics.classification_report(test_labels,predicted))

# Record the results as (label, value) pairs. The label names the tweet variant
# so entries from different experiments can be told apart.
accuracies.append(('Accuracy of CountVectorizer + Logistic Regression (NON Stemmed Tweets)', acc*100))
f1_scores.append(('F1-score of CountVectorizer + Logistic Regression (NON Stemmed Tweets)', metrics.f1_score(test_labels,predicted, average='weighted')))

Accuracy of CountVectorizer + Logistic Regression (NON Stemmed Tweets) = 76.39251947074517%
             precision    recall  f1-score   support

   negative       0.77      0.75      0.76     72460
    neutral       0.65      0.40      0.50      2073
   positive       0.76      0.79      0.77     74280

avg / total       0.76      0.76      0.76    148813



In [20]:
# Printing output
from pandas import DataFrame

# Summary table with one row per experiment. The metric value is element 1 of
# the (label, value) pairs stored at positions 0 and 1 of each results list.
output = dict(
    Classifier=[
        'CountVectorizer + LR (Stemmed Tweets)',
        'CountVectorizer + LR (NON Stemmed Tweets)',
    ],
    Accuracy=[accuracies[0][1], accuracies[1][1]],
    FScore=[f1_scores[0][1], f1_scores[1][1]],
)

output_df = DataFrame(output, columns=['Classifier', 'Accuracy', 'FScore'])
print(output_df)

                                  Classifier   Accuracy    FScore
0      CountVectorizer + LR (Stemmed Tweets)  76.162701  0.760819
1  CountVectorizer + LR (NON Stemmed Tweets)  76.392519  0.763126


### Next, let's tune the hyperparameters of the logistic regression classifier. Note that we use the non-stemmed data from now on.

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np

In [22]:
# CountVectorizer + Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

vectorizer = CountVectorizer().fit(tweets_train)

# Training Dataset:
tweets_train_vectorized = vectorizer.transform(tweets_train)

# Test Dataset:
tweets_test_vectorized = vectorizer.transform(tweets_test)

# Create a Logistic Regression classifier & use it with CountVectorizer:
# The grid below searches both 'l1' and 'l2' penalties. The solver is pinned to
# 'liblinear', which supports both, instead of relying on the library default:
# newer scikit-learn releases default to 'lbfgs', which rejects penalty='l1'.
# NOTE(review): recent scikit-learn versions may warn about liblinear on
# multiclass targets - confirm against the installed version.
pipe = Pipeline([('classifier', LogisticRegression(solver='liblinear'))])

# 2 penalties x 20 log-spaced C values between 1e-3 and 1e3 = 40 candidates.
search_space = [{'classifier__penalty': ['l1','l2'],
                 'classifier__C': np.logspace(-3,3,20)}
               ]

# NOTE(review): the vectorizer is fitted on the full training set outside the
# pipeline, so each CV validation fold has already contributed to the vocabulary.
clf = GridSearchCV(pipe, search_space, cv=10, verbose=0)

# fit() returns the fitted search object itself, so `best_acc` is the same
# object as `clf` (not an accuracy value).
best_acc = clf.fit(tweets_train_vectorized, train_labels)

In [23]:
best_acc.best_score_

0.7647165551165228

In [24]:
print (best_acc.best_estimator_.get_params()['classifier'])

LogisticRegression(C=0.3359818286283781, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)


In [25]:
# For every candidate in the grid, print the mean cross-validated score and
# twice its standard deviation across the folds.
cv_results = best_acc.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for params, mean, std in zip(cv_results['params'], means, stds):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

0.602 (+/-0.002) for {'classifier__C': 0.001, 'classifier__penalty': 'l1'}
0.719 (+/-0.005) for {'classifier__C': 0.001, 'classifier__penalty': 'l2'}
0.646 (+/-0.004) for {'classifier__C': 0.00206913808111479, 'classifier__penalty': 'l1'}
0.728 (+/-0.005) for {'classifier__C': 0.00206913808111479, 'classifier__penalty': 'l2'}
0.677 (+/-0.005) for {'classifier__C': 0.004281332398719396, 'classifier__penalty': 'l1'}
0.737 (+/-0.004) for {'classifier__C': 0.004281332398719396, 'classifier__penalty': 'l2'}
0.706 (+/-0.005) for {'classifier__C': 0.008858667904100823, 'classifier__penalty': 'l1'}
0.745 (+/-0.004) for {'classifier__C': 0.008858667904100823, 'classifier__penalty': 'l2'}
0.724 (+/-0.004) for {'classifier__C': 0.018329807108324356, 'classifier__penalty': 'l1'}
0.752 (+/-0.003) for {'classifier__C': 0.018329807108324356, 'classifier__penalty': 'l2'}
0.738 (+/-0.004) for {'classifier__C': 0.0379269019073225, 'classifier__penalty': 'l1'}
0.757 (+/-0.004) for {'classifier__C': 0.037

In [26]:
# Evaluate the fitted grid search object `clf` on the held-out test set.
predicted = clf.predict(tweets_test_vectorized)
acc = metrics.accuracy_score(test_labels,predicted)

print ('Accuracy of CountVectorizer + Logistic Regression (NON Stemmed Tweets) = '+str(acc*100)+'%')
print (metrics.classification_report(test_labels,predicted))

# Record the results as (label, value) pairs. The label marks this run as the
# tuned model so it is not confused with the earlier default-parameter runs.
accuracies.append(('Accuracy of CountVectorizer + tuned Logistic Regression (NON Stemmed Tweets)', acc*100))
f1_scores.append(('F1-score of CountVectorizer + tuned Logistic Regression (NON Stemmed Tweets)', metrics.f1_score(test_labels,predicted, average='weighted')))

Accuracy of CountVectorizer + Logistic Regression (NON Stemmed Tweets) = 76.5685793579862%
             precision    recall  f1-score   support

   negative       0.78      0.75      0.76     72460
    neutral       0.66      0.37      0.48      2073
   positive       0.76      0.79      0.77     74280

avg / total       0.77      0.77      0.76    148813

