# Twitter Sentiment Analysis

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline

#### Utils

In [2]:
from utils import *

#### Plots

In [3]:
from plots import *

#### Data Preprocessing

In [4]:
from preprocessing import *

#### Feature Extraction

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split

#### Cross validation

In [6]:
from cross_validation import *



#### Machine Learning

In [7]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

#### Options

In [8]:
pd.options.mode.chained_assignment = None

## Load Data

In [9]:
data_path = '../data/'
pos_tweets_file = 'train_pos.txt'
neg_tweets_file = 'train_neg.txt'
test_tweets_file = 'test_data.txt'

### Train Data

In [10]:
pos_tweets = pd.read_table(data_path+pos_tweets_file, names=['tweet','sentiment'])
pos_tweets['sentiment'] = 'pos'
neg_tweets = pd.read_table(data_path+neg_tweets_file ,names=['tweet','sentiment'])
neg_tweets['sentiment'] = 'neg'

In [11]:
# pos_tweets.head()

In [12]:
# neg_tweets.head()

In [13]:
print('positive tweets shape: ',pos_tweets.shape)
print('negative tweets shape: ',neg_tweets.shape)

positive tweets shape:  (97902, 2)
negative tweets shape:  (99068, 2)


In [14]:
tweets = pd.concat([pos_tweets, neg_tweets], axis=0)
tweets.shape

(196970, 2)

In [15]:
# tweets.head()

In [16]:
# tweets.tail()

### Test Data

In [17]:
test_tweets = pd.read_table(data_path+test_tweets_file, names=['tweet','sentiment'])
test_tweets['tweet'] = test_tweets.apply(lambda tweet: remove_tweet_id(tweet['tweet']), axis=1)
test_tweets.head()

Unnamed: 0,tweet,sentiment
0,sea doo pro sea scooter ( sports with the port...,
1,<user> shucks well i work all week so now i ca...,
2,i cant stay away from bug thats my baby,
3,<user> no ma'am ! ! ! lol im perfectly fine an...,
4,"whenever i fall asleep watching the tv , i alw...",


## Tweets Preprocessing

### Part of speech tagging

In [18]:
# tweets['tagged'] = tweets.apply(lambda tweet: pos_tag(tweet['tweet']), axis=1)

In [19]:
tweets = preprocessing(tweets,train=True)
my_stop_words = find_stopwords()

Tweets Preprocessing for the Training set started


There is no cached file for preprocessed tweets

Number of tweets before duplicates removal:	 196970
Number of tweets after duplicates removal:	 178483
Duplicates removal DONE
Repeated characters filtering DONE
Punctuation filtering DONE
User filtering DONE
Url filtering DONE
Hashtag filtering DONE
Digits DONE
Small words filtering DONE

Saving preprocessed tweets...
DONE

Tweets Preprocessing have been successfully finished!


### Tweets final representation

In [20]:
tweets

Unnamed: 0,tweet,sentiment
0,dunno justin read mention not only justin and ...,pos
1,because your logic dumb won even crop out your...,pos
2,just put casper box looved the battle crakkbitch,pos
3,thanks sir don trip lil mama just keep doin thang,pos
4,visiting brother tmr the bestest birthday gift...,pos
5,yay lifecompleted tweet facebook let know please,pos
6,dnextalbumtitle feel for you rollercoaster lif...,pos
7,workin hard hardly workin hardee with future c...,pos
8,saw replying bit,pos
9,this were belong,pos


In [21]:
# pd.isnull(tweets).any(1).nonzero()[0]

## Feature Extraction

### bag of words

#### frequencies TF-IDF

Initialize bag of words (TF-IDF)

In [22]:
tfidf_vectorizer = TfidfVectorizer(
                                   min_df=5, 
                                   max_df = 0.8, 
                                   sublinear_tf=True, 
                                   use_idf=True,
                                   stop_words=my_stop_words, 
                                   tokenizer=LemmaTokenizer(),
#                                    ngram_range=(1, 2)
#                                    max_features=1500
                                  )

## Feature Expansion

### polynomial expansion

### standardization

## Classification (simple training)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(tweets['tweet'], tweets['sentiment'], test_size=0.10, random_state=4)
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)
#shape: (number_of_tweets, all_words)

### Naive Bayes

In [24]:
nbclf = MultinomialNB()
nbclf.fit(tfidf_train_vectors, y_train)
prediction_bayes = nbclf.predict(tfidf_test_vectors)
print(prediction_bayes.shape)
print(classification_report(y_test, prediction_bayes))
print('score: ',accuracy_score(y_test,prediction_bayes))

(17849,)
             precision    recall  f1-score   support

        neg       0.78      0.68      0.73      9070
        pos       0.71      0.80      0.75      8779

avg / total       0.74      0.74      0.74     17849

score:  0.738640820214


### Random Forests

Very nice algorithm because it runs in parallel!

In [25]:
forest = RandomForestClassifier(n_estimators=100,max_depth=100,n_jobs=-1,random_state=4)
forest.fit(tfidf_train_vectors, y_train)
y_pred_forest = forest.predict(tfidf_test_vectors)

print(classification_report(y_test, y_pred_forest))
print('score: ',accuracy_score(y_test,y_pred_forest))

             precision    recall  f1-score   support

        neg       0.80      0.66      0.72      9070
        pos       0.70      0.83      0.76      8779

avg / total       0.75      0.74      0.74     17849

score:  0.743290940669


### SVM

In [26]:
# classifier_linear = svm.SVC(kernel='linear')
# classifier_linear.fit(tfidf_train_vectors, y_train)
# prediction_linear = classifier_linear.predict(tfidf_test_vectors)

# print(classification_report(y_test, prediction_linear))
# print('score: ',accuracy_score(y_test,prediction_bayes))

### Get top k most important features

#### Naive Bayes

In [27]:
print(topk_most_important_features(tfidf_vectorizer, nbclf, k=10))

['love', 'good', 'thanks', 'follow', 'lol', 'day', 'know', 'like', 'haha', 'wa']


In [28]:
show_most_informative_features(tfidf_vectorizer, nbclf, n=1000)

	-12.0764	aaa            		-4.6317	love           
	-12.0764	abandon        		-4.7722	good           
	-12.0764	abandoned      		-4.8393	thanks         
	-12.0764	abo            		-4.8520	follow         
	-12.0764	abraham        		-4.8835	lol            
	-12.0764	abstract       		-5.0253	day            
	-12.0764	acai           		-5.0279	know           
	-12.0764	accidental     		-5.0597	like           
	-12.0764	acclaimed      		-5.1937	haha           
	-12.0764	accused        		-5.2262	wa             
	-12.0764	acl            		-5.2915	thank          
	-12.0764	acorn          		-5.3951	got            
	-12.0764	acti           		-5.4678	happy          
	-12.0764	acupuncture    		-5.4837	time           
	-12.0764	adap           		-5.5094	think          
	-12.0764	adaptive       		-5.5149	today          
	-12.0764	adaptor        		-5.5529	make           
	-12.0764	adhesive       		-5.5876	want           
	-12.0764	adj            		-5.6395	going          
	-12.0764	adjustable     		-5.6

## K fold Cross validation & Model Selection

In [29]:
tfidf_train_vectors = tfidf_vectorizer.fit_transform(tweets['tweet'])

### Naive Bayes

In [30]:
# needless
naivebayesclf = MultinomialNB()
avg_test_accuracy = cross_validation(naivebayesclf , tweets.shape[0], tfidf_train_vectors, tweets['sentiment'], n_folds=5)
print('avg score: ',avg_test_accuracy)

avg score:  0.745028938


### Random Forests (Model Selection)

In [31]:
# need to do a for loop to find best parameters (when get access to server)
forest_clf = RandomForestClassifier(n_estimators=100,max_depth=100,n_jobs=-1,random_state=4)
avg_test_accuracy = cross_validation(forest_clf , tweets.shape[0], tfidf_train_vectors, tweets['sentiment'], n_folds=5)
print('avg score: ',avg_test_accuracy)

avg score:  0.738384011255


## Learning curves

### Naive Bayes

In [32]:
# plot_learning_curve(naivebayesclf, 'Learning Curve - Naive Bayes', tfidf_train_vectors, tweets['sentiment'], cv=cv)
# plt.show()

### Random Forests

In [33]:
# plot_learning_curve(forest_clf, 'Learning Curve - Random Forest', tfidf_train_vectors, tweets['sentiment'], cv=cv)
# plt.show()

## Write results to file

In [34]:
test_tweets = preprocessing(test_tweets,train=False)
test_tweets.head()

Tweets Preprocessing for the Training set started

Repeated characters filtering DONE
Punctuation filtering DONE
User filtering DONE
Url filtering DONE
Hashtag filtering DONE
Digits DONE
Small words filtering DONE

Tweets Preprocessing have been successfully finished!


Unnamed: 0,tweet,sentiment
0,sea doo pro sea scooter sports with the portab...,
1,shucks well work all week now can come cheer y...,
2,cant stay away from bug thats baby,
3,lol perfectly fine and not contagious anymore ...,
4,whenever fall asleep watching the always wake ...,


In [35]:
final_clf = MultinomialNB()
tfidf_train_vectors = tfidf_vectorizer.fit_transform(tweets['tweet'])
test_data = tfidf_vectorizer.transform(test_tweets['tweet'])
final_clf.fit(tfidf_train_vectors, tweets['sentiment'])
pred = final_clf.predict(test_data)

In [36]:
pred_file = 'pred_submission.csv'
create_csv_submission(pred, data_path+pred_file)

## Test Code & Useful stuff

In [37]:
from nltk.corpus import brown
len(brown.words())

1161192

In [38]:
tfidf_vectorizer.vocabulary_

{'dubstep': 3865,
 'immigration': 6370,
 'painter': 9364,
 'unbelievable': 13731,
 'suggestion': 12570,
 'smartphone': 11908,
 'kwento': 7201,
 'driveway': 3823,
 'tanker': 12827,
 'gomawo': 5354,
 'cause': 2055,
 'somthing': 12046,
 'commander': 2586,
 'comparison': 2617,
 'jaime': 6745,
 'teamfollowback': 12897,
 'claw': 2393,
 'inconvenient': 6417,
 'bowling': 1563,
 'swirl': 12723,
 'pentax': 9563,
 'ashbk': 694,
 'congrats': 2698,
 'peeta': 9544,
 'eva': 4299,
 'successfully': 12548,
 'hayy': 5808,
 'grief': 5503,
 'birthday': 1343,
 'scene': 11263,
 'barbara': 1044,
 'rejected': 10675,
 'fleet': 4752,
 'bfore': 1284,
 'explains': 4404,
 'cloak': 2433,
 'dae': 3132,
 'dan': 3158,
 'ahahahah': 249,
 'sensation': 11421,
 'forr': 4904,
 'chili': 2259,
 'usher': 13865,
 'buffalo': 1753,
 'scripting': 11325,
 'profession': 10175,
 'broadband': 1680,
 'autumn': 850,
 'annyeong': 498,
 'gloss': 5312,
 'liek': 7467,
 'folk': 4831,
 'connector': 2706,
 'pweasee': 10343,
 'gut': 5590,
 'ign

In [39]:
print(tfidf_vectorizer.vocabulary_['follow'])

4834


In [40]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
 
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
 
print("Stem %s: %s" % ("studying", stemmer.stem("studying")))
print("Lemmatise %s: %s" % ("studying", lemmatiser.lemmatize("studying")))
print("Lemmatise %s: %s" % ("studying", lemmatiser.lemmatize("studying", pos="v")))

Stem studying: studi
Lemmatise studying: studying
Lemmatise studying: study
