In [3]:
import numpy as np 
import pandas as pd 
import re
from gensim.models import Doc2Vec, doc2vec
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
nltk.download('stopwords')
import xgboost as xgb
from xgboost import XGBRegressor

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\agupt69\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Read the data from the excel file
xls = pd.ExcelFile('training-Obama-Romney-tweets.xlsx')
df = pd.read_excel(xls, 'Obama')
# drop the first row from the data "1: positive, -1: negative, 0: neutral, 2: mixed"
df = df.drop([0])
# Rename the unnamed column 4 that contains the class
df.rename(columns={'Unnamed: 4':'classlabel'}, inplace=True )
# print the unique class label inorder to prepare for pre processing
print(df['classlabel'].value_counts())

-1            1922
0             1896
1             1653
2             1474
0               82
2               70
-1              46
1               26
irrevelant      23
irrelevant       1
Name: classlabel, dtype: int64


In [5]:
# Drop all the unnecessary and empty columns
df = df[['Anootated tweet', 'classlabel']]
# Drop all the rows where the class is mixed i.e 2
df = df.dropna(axis=0)
# The data provided had class 'irrelevant' and 'irrevelant', so dropping those
df = df[df.classlabel != 'irrelevant']
df = df[df.classlabel != 'irrevelant']
# Converting the classlabel column to type integer
df['classlabel'] = df['classlabel'].astype(int)
# Dropping all rows that have been labelled as mixed class, i.e. class (2)
df = df[df.classlabel != 2]
#df.loc[(df.classlabel == -1),'classlabel']= 2

In [6]:
# Check the unique values again to make that the data only consists of positive (1), negative(-1) and neutral(0) classes
print(df['classlabel'].value_counts())
# Extract all tweets and put them into a list for cleaning them up
tweets = df['Anootated tweet'].tolist()
class_tweets = df['classlabel'].tolist()

 0    1977
-1    1968
 1    1679
Name: classlabel, dtype: int64


In [7]:
# Tweets clean up
def tweets_cleanup(tweet):
    # Remove HTML tags from the tweet
    tweet = re.sub("<.*?>", "", tweet)
    # Remove Twitter usernames from the tweet
    tweet = re.sub("@[A-Za-z0-9_]+","", tweet)
    # Remove hashtags from the tweet
    tweet = re.sub("#[A-Za-z0-9_]+","", tweet)
    # Remove http/https link from the tweet
    tweet = re.sub(r'http\S+', "", tweet)
    # Remove special characters
    #tweet = re.sub("[<>!#@$:.,%\?(-)]+", "", tweet)
    tweet = re.sub('[^a-zA-Z\d\s\n]', "", tweet)
    # Remove all numbers from the tweet
    tweet = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", tweet)
    # Convert tweet to lower case
    tweet = tweet.lower()
    # Remove extra white spaces from start and end of tweet
    tweet = tweet.strip()
    # Removing stop words and applying stemming to the tweets
    words = tweet.split()
    tweet = ' '.join([w for w in words if not w in nltk.corpus.stopwords.words("english")])
    ps = nltk.stem.PorterStemmer()
    stemmedTweet = [ps.stem(word) for word in tweet.split(" ")]
    stemmedTweet = " ".join(stemmedTweet)
    tweet = str(stemmedTweet)
    tweet = tweet.replace("'", "")
    tweet = tweet.replace("\"","")
    return tweet

In [8]:
def write_to_file(predictions):
    with open('obama.txt', 'w') as f:
        i = 1
        f.write("Ayush 76 and Gagan 82 \n")
        for val in predictions:
            line = str(i)+";;"+str(val)
            f.write(line)
            i = i + 1
            f.write('\n')

In [9]:
# Apply the clean up function to the tweets
df['Cleaned tweet'] = df['Anootated tweet'].apply(tweets_cleanup)
df.to_excel("cleaned_tweets.xlsx")  
X = df['Cleaned tweet']
Y = df['classlabel']
vectorizer = CountVectorizer(max_features=3000).fit(X)
X = vectorizer.transform(X)
print(X.shape)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=101)

(5624, 3000)


In [13]:
# Gradient Descent
sgd = SGDClassifier()
x_train.shape
y_train.shape
sgd.fit(x_train, y_train)
sgd_predictions = sgd.predict(x_test)
print("SGD Accuracy", accuracy_score(y_test, sgd_predictions))
print("SGD F1 score", metrics.f1_score(y_test, sgd_predictions, average='macro'))
print("SGD Precision", metrics.precision_score(y_test, sgd_predictions, average='macro'))
print("SGD Recall", metrics.recall_score(y_test, sgd_predictions, average='macro'))

SGD Accuracy 0.5742222222222222
SGD F1 score 0.5757455731067996
SGD Precision 0.5770282722401543
SGD Recall 0.5749138422300367


In [14]:
# SVM 
from sklearn.svm import SVC
svclassifier = SVC(kernel='rbf')
svclassifier.fit(x_train, y_train)
svm_pred = svclassifier.predict(x_test)
print("SVM Accuracy", accuracy_score(y_test, svm_pred))
print("SVM F1 score", metrics.f1_score(y_test, svm_pred, average='macro'))
print("SVM Precision", metrics.precision_score(y_test, svm_pred, average='macro'))
print("SVM Recall", metrics.recall_score(y_test, svm_pred, average='macro'))

SVM Accuracy 0.5973333333333334
SVM F1 score 0.5995949622195567
SVM Precision 0.606929319288065
SVM Recall 0.596409180034528


In [15]:
#Naive Bayes
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(x_train, y_train)
nb_pred = svclassifier.predict(x_test)
print("NB Accuracy", accuracy_score(y_test, nb_pred))
print("NB F1 score", metrics.f1_score(y_test, nb_pred, average='macro'))
print("NB Precision", metrics.precision_score(y_test, nb_pred, average='macro'))
print("NB Recall", metrics.recall_score(y_test, nb_pred, average='macro'))

NB Accuracy 0.5973333333333334
NB F1 score 0.5995949622195567
NB Precision 0.606929319288065
NB Recall 0.596409180034528


In [21]:
# XGBoost Model
reg = xgb.XGBRegressor()
reg.fit(x_train, y_train)
pred = reg.predict(x_test)
xg_pred = list()
for p in pred:
    p = round(p)
    xg_pred.append(p)
print("XGBoost Accuracy", accuracy_score(y_test, xg_pred))
print("XGBoost F1 score", metrics.f1_score(y_test, xg_pred, average='macro'))
print("XGBoost Precision", metrics.precision_score(y_test, xg_pred, average='macro'))
print("XGBoost Recall", metrics.recall_score(y_test, xg_pred, average='macro'))

XGBoost Accuracy 0.44177777777777777
XGBoost F1 score 0.37557380675847574
XGBoost Precision 0.6180101670297748
XGBoost Recall 0.42676742866688827


In [18]:
# logistic regression
logisticRegr = LogisticRegression()
logisticRegr.fit(x_train, y_train)
lr_pred = logisticRegr.predict(x_test)
print("Logistic Regression Accuracy", accuracy_score(y_test, lr_pred))
print("Logistic Regression F1 score", metrics.f1_score(y_test, lr_pred, average='macro'))
print("Logistic Regression Precision", metrics.precision_score(y_test, lr_pred, average='macro'))
print("Logistic Regression Recall", metrics.recall_score(y_test, lr_pred, average='macro'))

Logistic Regression Accuracy 0.6026666666666667
Logistic Regression F1 score 0.6026666666666667
Logistic Regression Precision 0.6026666666666667
Logistic Regression Recall 0.6026666666666667


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [151]:
# Test data with SVM for Obama
test_xls = pd.ExcelFile('final-testData-no-label-Obama-tweets(3).xlsx')
test_df = pd.read_excel(test_xls, 'Obama')
# Drop all the rows where the class is mixed i.e 2
#test_df = test_df['Anootated tweet']
#test_df = test_df.dropna(axis=0)
test_df.rename(columns={'Unnamed: 1':'tweet'}, inplace=True )
test_df['Cleaned tweet'] = test_df['tweet'].apply(tweets_cleanup)
X_testing = test_df['Cleaned tweet']
vectorizer = CountVectorizer(max_features=3600).fit(X_testing)
X_testing = vectorizer.transform(X_testing)
print(X_testing.shape)

(1951, 3600)


In [152]:
final_pred = logisticRegr.predict(X_testing)
write_to_file(final_pred)