# Twitter Sentiment Analysis

### Parameters

In [1]:
from options import *

## Import Libraries

In [2]:
import numpy as np
import pandas as pd
%matplotlib inline

#### Utils

In [3]:
from utils import *

#### Plots

In [4]:
from plots import *

#### Data Preprocessing

In [5]:
from preprocessing import *

#### Feature Extraction

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split

from vectorizer import init_tfidf_vectorizer

#### Cross validation

In [7]:
from cross_validation import *



#### Machine Learning

In [8]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

#### Options

In [9]:
pd.options.mode.chained_assignment = None

## Load Data

### Train Data

In [10]:
pos_tweets = pd.read_table(DATA_PATH+POS_TWEETS_FILE, names=['tweet','sentiment'])
pos_tweets['sentiment'] = 'pos'
neg_tweets = pd.read_table(DATA_PATH+NEG_TWEETS_FILE ,names=['tweet','sentiment'])
neg_tweets['sentiment'] = 'neg'

In [11]:
# pos_tweets.head()

In [12]:
# neg_tweets.head()

In [13]:
print('positive tweets shape: ',pos_tweets.shape)
print('negative tweets shape: ',neg_tweets.shape)

positive tweets shape:  (971, 2)
negative tweets shape:  (947, 2)


In [14]:
tweets = pd.concat([pos_tweets, neg_tweets], axis=0)
tweets.shape

(1918, 2)

In [15]:
# tweets.head()

In [16]:
# tweets.tail()

### Test Data

In [17]:
test_tweets = pd.read_table(DATA_PATH+TEST_TWEETS_FILE, names=['tweet','sentiment'])
test_tweets['tweet'] = test_tweets.apply(lambda tweet: remove_tweet_id(tweet['tweet']), axis=1)
test_tweets.head()

Unnamed: 0,tweet,sentiment
0,sea doo pro sea scooter ( sports with the port...,
1,<user> shucks well i work all week so now i ca...,
2,i cant stay away from bug thats my baby,
3,<user> no ma'am ! ! ! lol im perfectly fine an...,
4,"whenever i fall asleep watching the tv , i alw...",


## Tweets Preprocessing

### Part of speech tagging

In [18]:
# tweets['tagged'] = tweets.apply(lambda tweet: pos_tag(tweet['tweet']), axis=1)

In [19]:
tweets = preprocessing(tweets,train=True, params=preprocessing_params)

Preprocessing Settings:

fuser :	 True
fhashtag :	 True
fpunctuation :	 True
furl :	 True
fduplicates :	 True
frepeated_chars :	 True
fsmall_words :	 True
fdigits :	 True
save :	 False
-

Tweets Preprocessing for the Training set started


There is no cached file for preprocessed tweets

Number of tweets before duplicates removal:	 1918
Number of tweets after duplicates removal:	 1791
Duplicates removal DONE
Repeated characters filtering DONE
Punctuation filtering DONE
User filtering DONE
Url filtering DONE
Hashtag filtering DONE
Digits DONE
Small words filtering DONE

 Preprocessed tweets did not saved...

Tweets Preprocessing have been successfully finished!


### Tweets final representation

In [20]:
tweets

Unnamed: 0,tweet,sentiment
0,dunno justin read mention not only justin and ...,pos
1,because your logic dumb won even crop out your...,pos
2,just put casper box looved the battle crakkbitch,pos
3,thanks sir don trip lil mama just keep doin thang,pos
4,visiting brother tmr the bestest birthday gift...,pos
5,yay lifecompleted tweet facebook let know please,pos
6,dnextalbumtitle feel for you rollercoaster lif...,pos
7,workin hard hardly workin hardee with future c...,pos
8,saw replying bit,pos
9,this were belong,pos


In [21]:
# pd.isnull(tweets).any(1).nonzero()[0]

## Feature Extraction

### bag of words

#### frequencies TF-IDF

Initialize bag of words (TF-IDF)

In [22]:
tfidf_vectorizer = init_tfidf_vectorizer()

tf-idf Vectorizer settings

use_idf :	 True
min_df :	 5
tokenizer :	 True
max_features :	 None
max_df :	 0.8
number_of_stopwords :	 10
ngram_range :	 (1, 1)
sublinear_tf :	 True
-

stopwords:
 frozenset({'mustn', 'before', 'always', 'hence', 'our', 'perhaps', 'empty', 'give', 'above', 'just', 'whom', 'few', 'whence', 'nine', 'if', 'will', 'along', 'being', 'their', 'behind', 'whoever', 'anything', 'everything', 'system', 'all', 'except', 'ie', 'however', 'see', 'mill', 'side', 'every', 'we', 'yours', 'someone', 'others', 'ever', 'although', 'cant', 'get', 'do', 'forty', 'whenever', 're', 'him', 'by', 'against', 'indeed', 'my', 'whereas', 'hasnt', 'last', 'anyway', 'and', 'below', 'cannot', 'or', 'be', 'sincere', 'each', 'name', 'one', 'thereby', 'first', 'yet', 'towards', 'at', 'eight', 'seemed', 'per', 'has', 'only', 'so', 'into', 'seem', 'mostly', 'even', 'became', 'onto', 'five', 'well', 'somewhere', 'top', 'us', 'former', 'very', 'un', 'whether', 'than', 'ltd', 'take', 'own', 'mine

## Feature Expansion

### polynomial expansion

### standardization

## Classification (simple training)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(tweets['tweet'], tweets['sentiment'], test_size=split_params['test_size'], random_state=split_params['random_state'])
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)
#shape: (number_of_tweets, all_words)

### Naive Bayes

In [24]:
nbclf = MultinomialNB()
nbclf.fit(tfidf_train_vectors, y_train)
prediction_bayes = nbclf.predict(tfidf_test_vectors)
print(prediction_bayes.shape)
print(classification_report(y_test, prediction_bayes))
print('score: ',accuracy_score(y_test,prediction_bayes))

(180,)
             precision    recall  f1-score   support

        neg       0.69      0.42      0.52        81
        pos       0.64      0.85      0.73        99

avg / total       0.66      0.66      0.64       180

score:  0.655555555556


### Random Forests

Very nice algorithm because it runs in parallel!

In [25]:
forest = RandomForestClassifier(n_estimators=100,max_depth=100,n_jobs=-1,random_state=4)
forest.fit(tfidf_train_vectors, y_train)
y_pred_forest = forest.predict(tfidf_test_vectors)

print(classification_report(y_test, y_pred_forest))
print('score: ',accuracy_score(y_test,y_pred_forest))

             precision    recall  f1-score   support

        neg       0.60      0.60      0.60        81
        pos       0.67      0.67      0.67        99

avg / total       0.64      0.64      0.64       180

score:  0.638888888889


### SVM

In [26]:
# classifier_linear = svm.SVC(kernel='linear')
# classifier_linear.fit(tfidf_train_vectors, y_train)
# prediction_linear = classifier_linear.predict(tfidf_test_vectors)

# print(classification_report(y_test, prediction_linear))
# print('score: ',accuracy_score(y_test,prediction_bayes))

### Get top k most important features

#### Naive Bayes

In [27]:
print(topk_most_important_features(tfidf_vectorizer, nbclf, k=10))

['love', 'good', 'follow', 'thanks', 'lol', 'like', 'day', 'know', 'time', 'got']


In [28]:
show_most_informative_features(tfidf_vectorizer, nbclf, n=1000)

	-7.4151	battery        		-4.1147	love           
	-7.4151	brand          		-4.1763	good           
	-7.4151	camera         		-4.2003	follow         
	-7.4151	college        		-4.2945	thanks         
	-7.4151	complete       		-4.3191	lol            
	-7.4151	custom         		-4.3322	like           
	-7.4151	damn           		-4.3677	day            
	-7.4151	digital        		-4.4402	know           
	-7.4151	dvd            		-4.4913	time           
	-7.4151	edition        		-4.5472	got            
	-7.4151	electronics    		-4.6170	haha           
	-7.4151	frame          		-4.7016	wa             
	-7.4151	glass          		-4.7468	thank          
	-7.4151	hardcover      		-4.7652	want           
	-7.4151	health         		-4.8843	girl           
	-7.4151	inch           		-4.8976	make           
	-7.4151	kit            		-4.9028	night          
	-7.4151	memory         		-4.9880	wait           
	-7.4151	pack           		-5.0021	today          
	-7.4151	paperback      		-5.0232	don            


## K fold Cross validation & Model Selection

In [29]:
tfidf_train_vectors = tfidf_vectorizer.fit_transform(tweets['tweet'])

### Naive Bayes

In [30]:
# need to do a for loop to find best alpha param (when get access to server)
naivebayesclf = MultinomialNB()
avg_test_accuracy = cross_validation(naivebayesclf , tweets.shape[0], tfidf_train_vectors, tweets['sentiment'], n_folds=kfold['naive_bayes'])
print('avg score: ',avg_test_accuracy)

avg score:  0.676713714384


### Random Forests (Model Selection)

In [31]:
# need to do a for loop to find best parameters (when get access to server)
forest_clf = RandomForestClassifier(n_estimators=100,max_depth=100,n_jobs=-1,random_state=4)
avg_test_accuracy = cross_validation(forest_clf , tweets.shape[0], tfidf_train_vectors, tweets['sentiment'], n_folds=kfold['random_forest'])
print('avg score: ',avg_test_accuracy)

avg score:  0.645455252797


## Learning curves

### Naive Bayes

In [32]:
# plot_learning_curve(naivebayesclf, 'Learning Curve - Naive Bayes', tfidf_train_vectors, tweets['sentiment'], cv=cv)
# plt.show()

### Random Forests

In [33]:
# plot_learning_curve(forest_clf, 'Learning Curve - Random Forest', tfidf_train_vectors, tweets['sentiment'], cv=cv)
# plt.show()

## Write results to file

In [34]:
test_tweets = preprocessing(test_tweets,train=False, params=preprocessing_params)
test_tweets.head()

Preprocessing Settings:

fuser :	 True
fhashtag :	 True
fpunctuation :	 True
furl :	 True
fduplicates :	 True
frepeated_chars :	 True
fsmall_words :	 True
fdigits :	 True
save :	 False
-

Tweets Preprocessing for the Training set started


There is no cached file for preprocessed tweets

Repeated characters filtering DONE
Punctuation filtering DONE
User filtering DONE
Url filtering DONE
Hashtag filtering DONE
Digits DONE
Small words filtering DONE

 Preprocessed tweets did not saved...

Tweets Preprocessing have been successfully finished!


Unnamed: 0,tweet,sentiment
0,sea doo pro sea scooter sports with the portab...,
1,shucks well work all week now can come cheer y...,
2,cant stay away from bug thats baby,
3,lol perfectly fine and not contagious anymore ...,
4,whenever fall asleep watching the always wake ...,


In [35]:
final_clf = MultinomialNB()
tfidf_train_vectors = tfidf_vectorizer.fit_transform(tweets['tweet'])
test_data = tfidf_vectorizer.transform(test_tweets['tweet'])
final_clf.fit(tfidf_train_vectors, tweets['sentiment'])
pred = final_clf.predict(test_data)

In [36]:
create_csv_submission(pred, DATA_PATH+PRED_SUBMISSION_FILE)

## Test Code & Useful stuff

In [37]:
from nltk.corpus import brown
len(brown.words())

1161192

In [38]:
tfidf_vectorizer.vocabulary_

{'actually': 0,
 'ago': 1,
 'aha': 2,
 'ahh': 3,
 'ain': 4,
 'amazing': 5,
 'american': 6,
 'answer': 7,
 'anymore': 8,
 'anyways': 9,
 'ask': 10,
 'asleep': 11,
 'awake': 12,
 'away': 13,
 'awesome': 14,
 'aww': 15,
 'babe': 16,
 'baby': 17,
 'bad': 18,
 'barca': 19,
 'battery': 20,
 'beach': 21,
 'beat': 22,
 'beautiful': 23,
 'beauty': 24,
 'bed': 25,
 'believe': 26,
 'best': 27,
 'better': 28,
 'big': 29,
 'birthday': 30,
 'bit': 31,
 'bitch': 32,
 'black': 33,
 'blue': 34,
 'body': 35,
 'boo': 36,
 'book': 37,
 'box': 38,
 'boy': 39,
 'boyfriend': 40,
 'brand': 41,
 'break': 42,
 'bring': 43,
 'brother': 44,
 'brown': 45,
 'btw': 46,
 'busy': 47,
 'buy': 48,
 'called': 49,
 'came': 50,
 'camera': 51,
 'car': 52,
 'care': 53,
 'case': 54,
 'cause': 55,
 'cell': 56,
 'chance': 57,
 'change': 58,
 'check': 59,
 'class': 60,
 'classic': 61,
 'close': 62,
 'cold': 63,
 'college': 64,
 'color': 65,
 'come': 66,
 'coming': 67,
 'complete': 68,
 'congrats': 69,
 'cont': 70,
 'cool': 71,
 

In [39]:
print(tfidf_vectorizer.vocabulary_['follow'])

127


In [40]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
 
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
 
print("Stem %s: %s" % ("studying", stemmer.stem("studying")))
print("Lemmatise %s: %s" % ("studying", lemmatiser.lemmatize("studying")))
print("Lemmatise %s: %s" % ("studying", lemmatiser.lemmatize("studying", pos="v")))

Stem studying: studi
Lemmatise studying: studying
Lemmatise studying: study
