# Twitter Sentiment Analysis

## Import Libraries

In [1]:
import numpy as np
import pandas as pd

#### Helpers

In [2]:
from utils import *
from preprocessing import *

#### Feature Extraction

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split

#### Machine Learning

In [4]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import svm

#### Options

In [5]:
pd.options.mode.chained_assignment = None

## Load Data

In [6]:
data_path = '../data/'
pos_tweets_file = 'train_pos.txt'
neg_tweets_file = 'train_neg.txt'
test_tweets_file = 'test_data.txt'

### Train Data

In [7]:
pos_tweets = pd.read_table(data_path+pos_tweets_file, names=['tweet','sentiment'])
pos_tweets['sentiment'] = 'pos'
neg_tweets = pd.read_table(data_path+neg_tweets_file ,names=['tweet','sentiment'])
neg_tweets['sentiment'] = 'neg'

In [8]:
pos_tweets.head()

Unnamed: 0,tweet,sentiment
0,<user> i dunno justin read my mention or not ....,pos
1,"because your logic is so dumb , i won't even c...",pos
2,<user> just put casper in a box ! looved the...,pos
3,<user> <user> thanks sir > > don't trip lil ma...,pos
4,visiting my brother tmr is the bestest birthda...,pos


In [9]:
neg_tweets.head()

Unnamed: 0,tweet,sentiment
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,neg
1,glad i dot have taks tomorrow ! ! #thankful #s...,neg
2,1-3 vs celtics in the regular season = were fu...,neg
3,<user> i could actually kill that girl i'm so ...,neg
4,<user> <user> <user> i find that very hard to ...,neg


In [10]:
print('positive tweets shape: ',pos_tweets.shape)
print('negative tweets shape: ',neg_tweets.shape)

positive tweets shape:  (97902, 2)
negative tweets shape:  (99068, 2)


In [11]:
tweets = pd.concat([pos_tweets, neg_tweets], axis=0)
tweets.shape

(196970, 2)

In [12]:
tweets.head()

Unnamed: 0,tweet,sentiment
0,<user> i dunno justin read my mention or not ....,pos
1,"because your logic is so dumb , i won't even c...",pos
2,<user> just put casper in a box ! looved the...,pos
3,<user> <user> thanks sir > > don't trip lil ma...,pos
4,visiting my brother tmr is the bestest birthda...,pos


In [13]:
tweets.tail()

Unnamed: 0,tweet,sentiment
99063,can't wait to fake tan tonight ! hate being pale,neg
99064,<user> darling i lost my internet connection ....,neg
99065,kanguru defender basic 4 gb usb 2.0 flash driv...,neg
99066,rizan is sad now,neg
99067,"no text back ? yea , he mad",neg


### Test Data

In [14]:
test_tweets = pd.read_table(data_path+test_tweets_file, names=['tweet','sentiment'])
test_tweets['tweet'] = test_tweets.apply(lambda tweet: remove_tweet_id(tweet['tweet']), axis=1)
test_tweets.head()

Unnamed: 0,tweet,sentiment
0,sea doo pro sea scooter ( sports with the port...,
1,<user> shucks well i work all week so now i ca...,
2,i cant stay away from bug thats my baby,
3,<user> no ma'am ! ! ! lol im perfectly fine an...,
4,"whenever i fall asleep watching the tv , i alw...",


## Tweets Preprocessing

### Remove Duplicate Tweets

Duplicates are removed to avoid putting extra weight on any particular tweet.

In [15]:
print('number of tweets before duplicates removal:\t', tweets.shape[0])
tweets.drop_duplicates(subset='tweet', inplace=True)
print('number of tweets after duplicates removal:\t', tweets.shape[0])

number of tweets before duplicates removal:	 196970
number of tweets after duplicates removal:	 178483


### Fix repeated letters

We use preprocessing so that any letter occurring more than two times in a row is replaced with two occurrences.
As an example, the words haaaaaaaaappy and haaaaappy should be converted to haappy

In [16]:
tweets['tweet'] = tweets.apply(lambda tweet: filter_repeated_chars_on_tweet(tweet['tweet']), axis=1)

### Convert all tweets to lowercase

In [17]:
# done by default
# tweets['tweet'] = convert_to_lowercase(tweets['tweet'])

### Filter Punctuation

### Part of speech tagging

In [18]:
# tweets['tagged'] = tweets.apply(lambda tweet: pos_tag(tweet['tweet']), axis=1)

### Filter Stopwords

### Tweets final representation

In [19]:
tweets.head()

Unnamed: 0,tweet,sentiment
0,<user> i dunno justin read my mention or not ....,pos
1,"because your logic is so dumb , i won't even c...",pos
2,<user> just put casper in a box ! looved the...,pos
3,<user> <user> thanks sir > > don't trip lil ma...,pos
4,visiting my brother tmr is the bestest birthda...,pos


## Feature Extraction

In [None]:
X_train, X_test, y_train, y_test = train_test_split(tweets['tweet'], tweets['sentiment'], test_size=0.50, random_state=4)

### bag of words

#### occurencies

In [None]:
#initialize bag of words
count_vectorizer = CountVectorizer(min_df=1)

count_train_vectors = count_vectorizer.fit_transform(X_train)
count_test_vectors = count_vectorizer.transform(X_test)
#shape: (number_of_tweets, all_words)

#### frequencies

In [None]:
#initialize bag of words (tf-idf)
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df = 0.8, sublinear_tf=True, use_idf=True)

tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)
#shape: (number_of_tweets, all_words)

## Feature Expansion

### polynomial expansion

### standardization

## Classification

### Naive Bayes

In [None]:
clf = MultinomialNB()
clf.fit(count_train_vectors, y_train)
prediction_bayes = clf.predict(count_test_vectors)
prediction_bayes.shape

In [None]:
print(classification_report(y_test, prediction_bayes))
print(accuracy_score(y_test,prediction_bayes))

In [None]:
clf = MultinomialNB()
clf.fit(tfidf_train_vectors, y_train)
prediction_bayes = clf.predict(tfidf_test_vectors)
prediction_bayes.shape

In [None]:
print(classification_report(y_test, prediction_bayes))
print(accuracy_score(y_test,prediction_bayes))

### SVM

In [None]:
# classifier_linear = svm.SVC(kernel='linear')
# classifier_linear.fit(tfidf_train_vectors, y_train)
# prediction_linear = classifier_linear.predict(tfidf_test_vectors)
# prediction_linear.shape

In [None]:
# print(classification_report(y_test, prediction_linear))
# print(accuracy_score(y_test,prediction_bayes))

## Write results to file

In [None]:
pred_file = 'pred_submission.csv'
create_csv_submission(prediction_bayes, data_path+pred_file)