# Twitter Sentiment Analysis

### Parameters

In [1]:
from options import *

## Import Libraries

In [2]:
import numpy as np
import pandas as pd
%matplotlib inline

#### Utils

In [3]:
from utils import *

#### Plots

In [4]:
from plots import *

#### Data Preprocessing

In [5]:
from preprocessing import *

#### Feature Extraction

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split

from vectorizer import init_tfidf_vectorizer

#### Cross validation

In [7]:
from cross_validation import *



#### Machine Learning

In [8]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

#### Options

In [9]:
# Turn off pandas' chained-assignment check (None = neither warn nor raise)
# for the whole session.
pd.set_option('mode.chained_assignment', None)

## Load Data

### Train Data

In [10]:
# Read the two training files into frames with columns ['tweet', 'sentiment'].
# The 'sentiment' column is then overwritten with a constant label: every row
# of the first file becomes 'pos', every row of the second 'neg'.
pos_tweets = pd.read_table(
    DATA_PATH + POS_TWEETS_FILE, names=['tweet', 'sentiment']
).assign(sentiment='pos')
neg_tweets = pd.read_table(
    DATA_PATH + NEG_TWEETS_FILE, names=['tweet', 'sentiment']
).assign(sentiment='neg')

In [11]:
# pos_tweets.head()

In [12]:
# neg_tweets.head()

In [13]:
# Report (rows, columns) of each labelled frame.
for caption, frame in (
    ('positive tweets shape: ', pos_tweets),
    ('negative tweets shape: ', neg_tweets),
):
    print(caption, frame.shape)

positive tweets shape:  (971, 2)
negative tweets shape:  (947, 2)


In [14]:
# Stack the positive and negative frames into one labelled training set.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# input keeps its own 0-based row labels, so every label of the shorter frame
# appears twice and label-based selection (.loc) becomes ambiguous.
tweets = pd.concat([pos_tweets, neg_tweets], axis=0, ignore_index=True)
tweets.shape

(1918, 2)

In [15]:
# tweets.head()

In [16]:
# tweets.tail()

### Test Data

In [17]:
# Read the unlabelled test file; 'sentiment' is declared but stays empty
# (see the preview below).
test_tweets = pd.read_table(
    DATA_PATH + TEST_TWEETS_FILE, names=['tweet', 'sentiment']
)
# Run every raw tweet through the project helper remove_tweet_id
# (presumably strips a leading id from each line — confirm in its definition).
test_tweets['tweet'] = test_tweets['tweet'].apply(remove_tweet_id)
test_tweets.head()

Unnamed: 0,tweet,sentiment
0,sea doo pro sea scooter ( sports with the port...,
1,<user> shucks well i work all week so now i ca...,
2,i cant stay away from bug thats my baby,
3,<user> no ma'am ! ! ! lol im perfectly fine an...,
4,"whenever i fall asleep watching the tv , i alw...",


## Tweets Preprocessing

### Part of speech tagging

In [18]:
# tweets['tagged'] = tweets.apply(lambda tweet: pos_tag(tweet['tweet']), axis=1)

In [19]:
# Clean the training tweets using the filters switched on in
# preprocessing_params (the settings and each completed step are echoed in the
# cell output). The result is bound back to `tweets`, so from here on `tweets`
# is the cleaned frame; re-run the loading cells to get the raw text back.
tweets = preprocessing(
    tweets,
    params=preprocessing_params,
    train=True
)

Preprocessing Settings:

furl :	 True
fpunctuation :	 True
fsmall_words :	 True
fdigits :	 True
fduplicates :	 True
frepeated_chars :	 True
save :	 False
fuser :	 True
fhashtag :	 True
-

Tweets Preprocessing for the Training set started


There is no cached file for preprocessed tweets

Number of tweets before duplicates removal:	 1918
Number of tweets after duplicates removal:	 1791
Duplicates removal DONE
Repeated characters filtering DONE
Punctuation filtering DONE
User filtering DONE
Url filtering DONE
Hashtag filtering DONE
Digits DONE
Small words filtering DONE

 Preprocessed tweets did not saved...

Tweets Preprocessing have been successfully finished!


### Tweets final representation

In [20]:
# Preview the cleaned training tweets. Only the first rows are shown: rendering
# the whole frame (about 1.8k rows after duplicate removal) bloats the notebook
# without adding information.
tweets.head(10)

Unnamed: 0,tweet,sentiment
0,dunno justin read mention not only justin and ...,pos
1,because your logic dumb won even crop out your...,pos
2,just put casper box looved the battle crakkbitch,pos
3,thanks sir don trip lil mama just keep doin thang,pos
4,visiting brother tmr the bestest birthday gift...,pos
5,yay lifecompleted tweet facebook let know please,pos
6,dnextalbumtitle feel for you rollercoaster lif...,pos
7,workin hard hardly workin hardee with future c...,pos
8,saw replying bit,pos
9,this were belong,pos


In [21]:
# pd.isnull(tweets).any(1).nonzero()[0]

## Feature Extraction

### bag of words

#### frequencies TF-IDF

Initialize bag of words (TF-IDF)

In [22]:
# Build the TF-IDF vectorizer through the project's vectorizer module.
# The call prints the settings it was created with and the stop-word list
# (see the output below). Nothing is fitted yet at this point.
tfidf_vectorizer = init_tfidf_vectorizer()

tf-idf Vectorizer settings

min_df :	 5
max_features :	 5000
sublinear_tf :	 True
use_idf :	 True
max_df :	 0.8
ngram_range :	 (1, 1)
tokenizer :	 True
number_of_stopwords :	 153
-

stopwords:
 frozenset({'elsewhere', 'everywhere', 'however', 'needn', 'in', 'whereupon', 'it', 'indeed', 'around', 'not', 'eight', 'empty', 'its', 'herein', 'couldn', 'yet', 'few', 'for', 'than', 'behind', 'wasn', 'anywhere', 'under', 'by', 'nobody', 'since', 'won', 'has', 'something', 'wherever', 'would', 'shouldn', 'co', 'whatever', 'myself', 'are', 'them', 'himself', 'first', 'through', 'this', 'many', 'was', 'down', 'wherein', 'him', 'forty', 's', 'until', 'me', 'an', 'mill', 'thence', 'often', 'etc', 'inc', 'had', 'another', 'us', 'two', 'my', 'i', 'herself', 'rather', 'hundred', 'most', 'must', 'others', 'next', 'be', 'please', 'about', 'amount', 'less', 'against', 'namely', 'such', 'take', 'been', 'll', 'doing', 'afterwards', 'back', 'but', 'top', 'ain', 'although', 'made', 'put', 'anyway', 'so', 'we

## Feature Expansion

### polynomial expansion

### standardization

## Classification (simple training)

In [23]:
# Hold out part of the labelled tweets for evaluation; the held-out fraction
# and the seed are read from split_params.
held_out_fraction = split_params['test_size']
split_seed = split_params['random_state']
X_train, X_test, y_train, y_test = train_test_split(
    tweets['tweet'], tweets['sentiment'],
    test_size=held_out_fraction, random_state=split_seed
)

# Learn vocabulary and IDF weights from the training texts only, then map the
# held-out texts onto that same vocabulary.
# Each matrix has shape (number_of_tweets, all_words).
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

### Naive Bayes

In [24]:
# Multinomial naive Bayes baseline on the TF-IDF features of the training split.
nbclf = MultinomialNB().fit(tfidf_train_vectors, y_train)
prediction_bayes = nbclf.predict(tfidf_test_vectors)

# One predicted label per held-out tweet, then per-class metrics and accuracy.
bayes_report = classification_report(y_test, prediction_bayes)
bayes_accuracy = accuracy_score(y_test, prediction_bayes)
print(prediction_bayes.shape)
print(bayes_report)
print('score: ', bayes_accuracy)

(180,)
             precision    recall  f1-score   support

        neg       0.68      0.40      0.50        81
        pos       0.63      0.85      0.72        99

avg / total       0.65      0.64      0.62       180

score:  0.644444444444


### Random Forests

The trees of a random forest are trained independently of each other, so fitting parallelises well across CPU cores (`n_jobs=-1` uses all of them).

In [25]:
# Random forest on the same split: 100 trees, depth capped at 100, fitted on
# all available cores (n_jobs=-1), with a fixed seed.
forest = RandomForestClassifier(
    n_estimators=100, max_depth=100, n_jobs=-1, random_state=4
).fit(tfidf_train_vectors, y_train)
y_pred_forest = forest.predict(tfidf_test_vectors)

# Per-class metrics followed by overall accuracy on the held-out tweets.
forest_report = classification_report(y_test, y_pred_forest)
forest_accuracy = accuracy_score(y_test, y_pred_forest)
print(forest_report)
print('score: ', forest_accuracy)

             precision    recall  f1-score   support

        neg       0.60      0.59      0.60        81
        pos       0.67      0.68      0.67        99

avg / total       0.64      0.64      0.64       180

score:  0.638888888889


### SVM

In [26]:
# classifier_linear = svm.SVC(kernel='linear')
# classifier_linear.fit(tfidf_train_vectors, y_train)
# prediction_linear = classifier_linear.predict(tfidf_test_vectors)

# print(classification_report(y_test, prediction_linear))
# print('score: ',accuracy_score(y_test,prediction_linear))

### Get top k most important features

#### Naive Bayes

In [27]:
# Print the k=10 vocabulary terms that the project helper
# topk_most_important_features ranks highest for the fitted naive Bayes model.
# NOTE(review): the ranking criterion is defined in the helper, not here —
# presumably the model's per-term log-probabilities; confirm before relying on it.
print(topk_most_important_features(tfidf_vectorizer, nbclf, k=10))

['love', 'good', 'follow', 'thanks', 'like', 'lol', 'day', 'know', 'time', 'got']


In [28]:
# Print a two-column table of (score, term) pairs for the naive Bayes model:
# the lowest-scoring terms on the left, the highest-scoring on the right
# (see the output below).
# NOTE(review): how n=1000 bounds the number of rows is decided inside the
# helper — confirm there; the recorded output is much shorter than 1000 rows.
show_most_informative_features(tfidf_vectorizer, nbclf, n=1000)

	-7.3987	battery        		-4.0926	love           
	-7.3987	brand          		-4.1579	good           
	-7.3987	camera         		-4.1766	follow         
	-7.3987	college        		-4.2666	thanks         
	-7.3987	complete       		-4.2742	like           
	-7.3987	custom         		-4.3019	lol            
	-7.3987	damn           		-4.3498	day            
	-7.3987	digital        		-4.4074	know           
	-7.3987	dvd            		-4.4748	time           
	-7.3987	edition        		-4.5181	got            
	-7.3987	electronics    		-4.5869	haha           
	-7.3987	frame          		-4.6852	wa             
	-7.3987	glass          		-4.7247	thank          
	-7.3987	hardcover      		-4.7327	want           
	-7.3987	health         		-4.8598	girl           
	-7.3987	inch           		-4.8744	make           
	-7.3987	kit            		-4.8864	night          
	-7.3987	memory         		-4.9688	wait           
	-7.3987	pack           		-4.9858	today          
	-7.3987	paperback      		-5.0206	going          


## K fold Cross validation & Model Selection

In [29]:
# Refit the vectorizer on every labelled tweet (not just the earlier training
# split) to build the feature matrix handed to the cross-validation cells.
# This rebinds tfidf_train_vectors and refits tfidf_vectorizer in place, so the
# vectorizer's vocabulary no longer corresponds to the models fitted on the
# earlier split (nbclf, forest).
# NOTE(review): vocabulary and IDF weights are learned here from all tweets,
# i.e. before any folds are formed, so held-out folds have already influenced
# the features and the CV scores may be slightly optimistic — confirm whether
# that is acceptable for model selection.
tfidf_train_vectors = tfidf_vectorizer.fit_transform(tweets['tweet'])

### Naive Bayes

In [30]:
# TODO: loop over candidate alpha values to find the best one
# (once server access is available).
naivebayesclf = MultinomialNB()

# Cross-validate on the full TF-IDF matrix; the number of folds comes from the
# kfold settings. The helper returns the average test accuracy and a second
# value kept as cv_bayes.
avg_test_accuracy, cv_bayes = cross_validation(
    naivebayesclf,
    tweets.shape[0],
    tfidf_train_vectors,
    tweets['sentiment'],
    n_folds=kfold['naive_bayes']
)
print('avg score: ', avg_test_accuracy)

avg score:  0.673367983692


### Random Forests (Model Selection)

In [31]:
# TODO: loop over candidate hyper-parameters to find the best combination
# (once server access is available).
forest_clf = RandomForestClassifier(
    n_estimators=100, max_depth=100, n_jobs=-1, random_state=4
)

# Cross-validate on the full TF-IDF matrix; the number of folds comes from the
# kfold settings. The helper returns the average test accuracy and a second
# value kept as cv_forest.
avg_test_accuracy, cv_forest = cross_validation(
    forest_clf,
    tweets.shape[0],
    tfidf_train_vectors,
    tweets['sentiment'],
    n_folds=kfold['random_forest']
)
print('avg score: ', avg_test_accuracy)

avg score:  0.651044957284


## Learning curves

### Naive Bayes

In [32]:
# plot_learning_curve(naivebayesclf, 'Learning Curve - Naive Bayes', tfidf_train_vectors, tweets['sentiment'], cv=cv_bayes)
# plt.show()

### Random Forests

In [33]:
# plot_learning_curve(forest_clf, 'Learning Curve - Random Forest', tfidf_train_vectors, tweets['sentiment'], cv=cv_forest)
# plt.show()

### Vocabulary

In [34]:
# List the terms of the fitted vocabulary in column order.
# (tfidf_vectorizer.vocabulary_ gives the term -> column-index mapping instead.)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(), so use whichever this installation has.
# The newer method returns an array, hence the list() to keep the same display.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    feature_names = tfidf_vectorizer.get_feature_names()
feature_names

['actually',
 'ago',
 'aha',
 'ahh',
 'amazing',
 'american',
 'answer',
 'anymore',
 'anyways',
 'ask',
 'asleep',
 'awake',
 'away',
 'awesome',
 'aww',
 'babe',
 'baby',
 'bad',
 'barca',
 'battery',
 'beach',
 'beat',
 'beautiful',
 'beauty',
 'bed',
 'believe',
 'best',
 'better',
 'big',
 'birthday',
 'bit',
 'bitch',
 'black',
 'blue',
 'body',
 'boo',
 'book',
 'box',
 'boy',
 'boyfriend',
 'brand',
 'break',
 'bring',
 'brother',
 'brown',
 'btw',
 'busy',
 'buy',
 'called',
 'came',
 'camera',
 'car',
 'care',
 'case',
 'cause',
 'cell',
 'chance',
 'change',
 'check',
 'class',
 'classic',
 'close',
 'cold',
 'college',
 'color',
 'come',
 'coming',
 'complete',
 'congrats',
 'cont',
 'cool',
 'course',
 'cover',
 'crazy',
 'cuddle',
 'custom',
 'cut',
 'cute',
 'dad',
 'damn',
 'dance',
 'date',
 'day',
 'ddr',
 'design',
 'dick',
 'didnt',
 'die',
 'digital',
 'dinner',
 'direction',
 'doe',
 'dog',
 'doin',
 'dont',
 'dream',
 'dude',
 'dvd',
 'early',
 'eat',
 'eating',


## Write results to file

In [35]:
# Clean the test tweets with the same preprocessing_params as the training
# set. With train=False the log below lists no duplicate-removal step.
# The result is bound back to `test_tweets`, replacing the raw frame.
test_tweets = preprocessing(
    test_tweets,
    params=preprocessing_params,
    train=False
)
test_tweets.head()

Preprocessing Settings:

furl :	 True
fpunctuation :	 True
fsmall_words :	 True
fdigits :	 True
fduplicates :	 True
frepeated_chars :	 True
save :	 False
fuser :	 True
fhashtag :	 True
-

Tweets Preprocessing for the Training set started


There is no cached file for preprocessed tweets

Repeated characters filtering DONE
Punctuation filtering DONE
User filtering DONE
Url filtering DONE
Hashtag filtering DONE
Digits DONE
Small words filtering DONE

 Preprocessed tweets did not saved...

Tweets Preprocessing have been successfully finished!


Unnamed: 0,tweet,sentiment
0,sea doo pro sea scooter sports with the portab...,
1,shucks well work all week now can come cheer y...,
2,cant stay away from bug thats baby,
3,lol perfectly fine and not contagious anymore ...,
4,whenever fall asleep watching the always wake ...,


In [36]:
# Final model: refit the vectorizer on every labelled tweet and map the
# unlabelled test tweets onto the resulting vocabulary.
tfidf_train_vectors = tfidf_vectorizer.fit_transform(tweets['tweet'])
test_data = tfidf_vectorizer.transform(test_tweets['tweet'])

# Train multinomial naive Bayes on all labelled data and predict one label per
# test tweet.
final_clf = MultinomialNB().fit(tfidf_train_vectors, tweets['sentiment'])
pred = final_clf.predict(test_data)

In [37]:
# Hand the predicted labels to the project helper create_csv_submission, which
# is given DATA_PATH + PRED_SUBMISSION_FILE as the output location.
# NOTE(review): the path is built by plain string concatenation, so this
# assumes DATA_PATH already ends with a path separator — confirm in options.
create_csv_submission(pred, DATA_PATH+PRED_SUBMISSION_FILE)

## Test Code & Useful stuff

In [38]:
# Scratch check: number of word tokens in NLTK's Brown corpus.
# NOTE(review): presumably requires the corpus data to be available locally
# (e.g. via nltk.download('brown')) — confirm on a fresh environment.
from nltk.corpus import brown
len(brown.words())

1161192

In [39]:
# Scratch check: column index that the fitted vectorizer assigns to the term
# 'follow' (raises KeyError if the term is not in the vocabulary).
print(tfidf_vectorizer.vocabulary_['follow'])

120


In [40]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Scratch comparison of stemming and lemmatisation on a single word.
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

word = "studying"

# The stemmer truncates the word to a stem that need not be a dictionary word.
print("Stem %s: %s" % (word, stemmer.stem(word)))

# Lemmatised without a part-of-speech hint the word comes back unchanged;
# with pos="v" (verb) it is reduced to its base form (see the output below).
print("Lemmatise %s: %s" % (word, lemmatiser.lemmatize(word)))
print("Lemmatise %s: %s" % (word, lemmatiser.lemmatize(word, pos="v")))

Stem studying: studi
Lemmatise studying: studying
Lemmatise studying: study
