# Twitter Sentiment Analysis

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline

#### Utils

In [2]:
from utils import *

#### Plots

In [3]:
from plots import *

#### Data Preprocessing

In [4]:
from preprocessing import *

#### Feature Extraction

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split

#### Cross validation

In [6]:
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold as cross_validation_KFold



#### Machine Learning

In [7]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn import svm

#### Options

In [8]:
pd.options.mode.chained_assignment = None

## Load Data

In [9]:
data_path = '../data/'
pos_tweets_file = 'train_pos_small.txt'
neg_tweets_file = 'train_neg_small.txt'
test_tweets_file = 'test_data.txt'

### Train Data

In [10]:
pos_tweets = pd.read_table(data_path+pos_tweets_file, names=['tweet','sentiment'])
pos_tweets['sentiment'] = 'pos'
neg_tweets = pd.read_table(data_path+neg_tweets_file ,names=['tweet','sentiment'])
neg_tweets['sentiment'] = 'neg'

In [11]:
# pos_tweets.head()

In [12]:
# neg_tweets.head()

In [13]:
print('positive tweets shape: ',pos_tweets.shape)
print('negative tweets shape: ',neg_tweets.shape)

positive tweets shape:  (971, 2)
negative tweets shape:  (947, 2)


In [14]:
tweets = pd.concat([pos_tweets, neg_tweets], axis=0)
tweets.shape

(1918, 2)

In [15]:
# tweets.head()

In [16]:
# tweets.tail()

### Test Data

In [17]:
test_tweets = pd.read_table(data_path+test_tweets_file, names=['tweet','sentiment'])
test_tweets['tweet'] = test_tweets.apply(lambda tweet: remove_tweet_id(tweet['tweet']), axis=1)
test_tweets.head()

Unnamed: 0,tweet,sentiment
0,sea doo pro sea scooter ( sports with the port...,
1,<user> shucks well i work all week so now i ca...,
2,i cant stay away from bug thats my baby,
3,<user> no ma'am ! ! ! lol im perfectly fine an...,
4,"whenever i fall asleep watching the tv , i alw...",


## Tweets Preprocessing

### Part of speech tagging

In [18]:
# tweets['tagged'] = tweets.apply(lambda tweet: pos_tag(tweet['tweet']), axis=1)

In [19]:
tweets = preprocessing(tweets,train=True)
my_stop_words = find_stopwords()

Tweets Preprocessing for the Training set started


There is no cached file for preprocessed tweets

Number of tweets before duplicates removal:	 1918
Number of tweets after duplicates removal:	 1791
Duplicates removal DONE
Repeated characters filtering DONE
Punctuation filtering DONE
User filtering DONE
Url filtering DONE
Hashtag filtering DONE
Digits DONE
Small words filtering DONE

Saving preprocessed tweets...
DONE

Tweets Preprocessing have been successfully finished!


### Tweets final representation

In [20]:
tweets

Unnamed: 0,tweet,sentiment
0,dunno justin read mention not only justin and ...,pos
1,because your logic dumb won even crop out your...,pos
2,just put casper box looved the battle crakkbitch,pos
3,thanks sir don trip lil mama just keep doin thang,pos
4,visiting brother tmr the bestest birthday gift...,pos
5,yay lifecompleted tweet facebook let know please,pos
6,dnextalbumtitle feel for you rollercoaster lif...,pos
7,workin hard hardly workin hardee with future c...,pos
8,saw replying bit,pos
9,this were belong,pos


In [21]:
# pd.isnull(tweets).any(1).nonzero()[0]

## Feature Extraction

In [22]:
X_train, X_test, y_train, y_test = train_test_split(tweets['tweet'], tweets['sentiment'], test_size=0.10, random_state=4)

### bag of words

#### frequencies TF-IDF

In [23]:
#initialize bag of words (tf-idf)
#ngram_range=(1, 2)
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df = 0.8, sublinear_tf=True, use_idf=True,
                                   stop_words=my_stop_words, tokenizer=LemmaTokenizer())

## Feature Expansion

### polynomial expansion

### standardization

## Classification

### Naive Bayes

In [24]:
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)
#shape: (number_of_tweets, all_words)

clf = MultinomialNB()
clf.fit(tfidf_train_vectors, y_train)
prediction_bayes = clf.predict(tfidf_test_vectors)
print(prediction_bayes.shape)
print(classification_report(y_test, prediction_bayes))
print('score: ',accuracy_score(y_test,prediction_bayes))

(180,)
             precision    recall  f1-score   support

        neg       0.68      0.40      0.50        81
        pos       0.63      0.85      0.72        99

avg / total       0.65      0.64      0.62       180

score:  0.644444444444


#### Get top k most important features

In [25]:
print(topk_most_important_features(tfidf_vectorizer, clf, k=10))

['love', 'good', 'follow', 'thanks', 'like', 'lol', 'day', 'know', 'time', 'got']


In [26]:
show_most_informative_features(tfidf_vectorizer, clf, n=1000)

	-7.3987	battery        		-4.0926	love           
	-7.3987	brand          		-4.1579	good           
	-7.3987	camera         		-4.1766	follow         
	-7.3987	college        		-4.2666	thanks         
	-7.3987	complete       		-4.2742	like           
	-7.3987	custom         		-4.3019	lol            
	-7.3987	damn           		-4.3498	day            
	-7.3987	digital        		-4.4074	know           
	-7.3987	dvd            		-4.4748	time           
	-7.3987	edition        		-4.5181	got            
	-7.3987	electronics    		-4.5869	haha           
	-7.3987	frame          		-4.6852	wa             
	-7.3987	glass          		-4.7247	thank          
	-7.3987	hardcover      		-4.7327	want           
	-7.3987	health         		-4.8598	girl           
	-7.3987	inch           		-4.8744	make           
	-7.3987	kit            		-4.8864	night          
	-7.3987	memory         		-4.9688	wait           
	-7.3987	pack           		-4.9858	today          
	-7.3987	paperback      		-5.0206	going          


### SVM

In [27]:
# tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
# tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

# classifier_linear = svm.SVC(kernel='linear')
# classifier_linear.fit(tfidf_train_vectors, y_train)
# prediction_linear = classifier_linear.predict(tfidf_test_vectors)

# print(classification_report(y_test, prediction_linear))
# print(accuracy_score(y_test,prediction_bayes))

             precision    recall  f1-score   support

        neg       0.58      0.40      0.47        81
        pos       0.61      0.77      0.68        99

avg / total       0.60      0.60      0.58       180

0.644444444444


## K fold Cross validation

### Naive Bayes

In [28]:
# cv = cross_validation_KFold(tweets.shape[0], shuffle = True, n_folds=10, random_state=4)
# tfidf_train_vectors = tfidf_vectorizer.fit_transform(tweets['tweet'])
# clfkfold = MultinomialNB()
# avg_test_accuracy = np.mean(cross_val_score(clfkfold, tfidf_train_vectors, tweets['sentiment'], cv=cv, scoring='accuracy'))
# print('avg score: ',avg_test_accuracy)

## Learning curves

### Naive Bayes

In [29]:
# plot_learning_curve(clfkfold, 'Learning Curve - Naive Bayes', tfidf_train_vectors, tweets['sentiment'], cv=cv)
# plt.show()

## Write results to file

In [30]:
test_tweets = preprocessing(test_tweets,train=False)
test_tweets.head()

Tweets Preprocessing for the Training set started

Repeated characters filtering DONE
Punctuation filtering DONE
User filtering DONE
Url filtering DONE
Hashtag filtering DONE
Digits DONE
Small words filtering DONE

Tweets Preprocessing have been successfully finished!


Unnamed: 0,tweet,sentiment
0,sea doo pro sea scooter sports with the portab...,
1,shucks well work all week now can come cheer y...,
2,cant stay away from bug thats baby,
3,lol perfectly fine and not contagious anymore ...,
4,whenever fall asleep watching the always wake ...,


In [31]:
final_clf = MultinomialNB()
tfidf_train_vectors = tfidf_vectorizer.fit_transform(tweets['tweet'])
test_data = tfidf_vectorizer.transform(test_tweets['tweet'])
final_clf.fit(tfidf_train_vectors, tweets['sentiment'])
pred = final_clf.predict(test_data)

In [32]:
pred_file = 'pred_submission.csv'
create_csv_submission(pred, data_path+pred_file)

## Test Code & Useful stuff

In [33]:
from nltk.corpus import brown
len(brown.words())

1161192

In [34]:
tfidf_vectorizer.vocabulary_

{'actually': 0,
 'ago': 1,
 'aha': 2,
 'ahh': 3,
 'amazing': 4,
 'american': 5,
 'answer': 6,
 'anymore': 7,
 'anyways': 8,
 'ask': 9,
 'asleep': 10,
 'awake': 11,
 'away': 12,
 'awesome': 13,
 'aww': 14,
 'babe': 15,
 'baby': 16,
 'bad': 17,
 'barca': 18,
 'battery': 19,
 'beach': 20,
 'beat': 21,
 'beautiful': 22,
 'beauty': 23,
 'bed': 24,
 'believe': 25,
 'best': 26,
 'better': 27,
 'big': 28,
 'birthday': 29,
 'bit': 30,
 'bitch': 31,
 'black': 32,
 'blue': 33,
 'body': 34,
 'boo': 35,
 'book': 36,
 'box': 37,
 'boy': 38,
 'boyfriend': 39,
 'brand': 40,
 'break': 41,
 'bring': 42,
 'brother': 43,
 'brown': 44,
 'btw': 45,
 'busy': 46,
 'buy': 47,
 'called': 48,
 'came': 49,
 'camera': 50,
 'car': 51,
 'care': 52,
 'case': 53,
 'cause': 54,
 'cell': 55,
 'chance': 56,
 'change': 57,
 'check': 58,
 'class': 59,
 'classic': 60,
 'close': 61,
 'cold': 62,
 'college': 63,
 'color': 64,
 'come': 65,
 'coming': 66,
 'complete': 67,
 'congrats': 68,
 'cont': 69,
 'cool': 70,
 'course': 71

In [35]:
print(tfidf_vectorizer.vocabulary_['follow'])

120


In [36]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
 
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
 
print("Stem %s: %s" % ("studying", stemmer.stem("studying")))
print("Lemmatise %s: %s" % ("studying", lemmatiser.lemmatize("studying")))
print("Lemmatise %s: %s" % ("studying", lemmatiser.lemmatize("studying", pos="v")))

Stem studying: studi
Lemmatise studying: studying
Lemmatise studying: study
