# Twitter Sentiment Analysis

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
%matplotlib inline

#### Utils

In [None]:
from utils import *

#### Plots

In [None]:
from plots import *

#### Data Preprocessing

In [4]:
from preprocessing import *

#### Feature Extraction

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split

#### Cross validation

In [6]:
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold as cross_validation_KFold



#### Machine Learning

In [7]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

#### Options

In [8]:
# Turn off pandas' SettingWithCopyWarning for the whole session (the default is 'warn').
# NOTE(review): this also hides genuine chained-assignment mistakes — presumably needed
# by the star-imported helpers; confirm, and prefer fixing them with .loc instead.
pd.options.mode.chained_assignment = None

## Load Data

In [9]:
# Directory holding the data files, relative to the notebook's working directory.
data_path = '../data/'
# Training tweets, one file per sentiment class (the label column is filled in at load time).
pos_tweets_file = 'train_pos_small.txt'
neg_tweets_file = 'train_neg_small.txt'
# Tweets that are vectorised and predicted at the end of the notebook.
test_tweets_file = 'test_data.txt'

### Train Data

In [10]:
# Each file is read as tab-separated text with the columns 'tweet' and 'sentiment'; the
# 'sentiment' column is then overwritten with the constant class label of that file.
#
# quoting=3 is csv.QUOTE_NONE. Tweets are free text, and with pandas' default quoting a
# stray double quote opens a quoted field that can run across line breaks, silently
# merging several tweets into one row. Disabling quote handling keeps one row per line.
pos_tweets = pd.read_table(data_path + pos_tweets_file, names=['tweet', 'sentiment'], quoting=3)
pos_tweets['sentiment'] = 'pos'

neg_tweets = pd.read_table(data_path + neg_tweets_file, names=['tweet', 'sentiment'], quoting=3)
neg_tweets['sentiment'] = 'neg'

In [11]:
# pos_tweets.head()

In [12]:
# neg_tweets.head()

In [13]:
# Report (rows, columns) for each class-specific frame.
for caption, frame in (
    ('positive tweets shape: ', pos_tweets),
    ('negative tweets shape: ', neg_tweets),
):
    print(caption, frame.shape)

positive tweets shape:  (97902, 2)
negative tweets shape:  (99068, 2)


In [14]:
# Stack the positive rows on top of the negative ones (row-wise is pd.concat's default).
# Each frame keeps its own row labels, so index values repeat across the two halves.
tweets = pd.concat([pos_tweets, neg_tweets])
tweets.shape

(196970, 2)

In [15]:
# tweets.head()

In [16]:
# tweets.tail()

### Test Data

In [17]:
# Read the unlabeled test tweets; 'sentiment' is only a placeholder column here.
# quoting=3 is csv.QUOTE_NONE: with the default quoting, a stray double quote inside a tweet
# can merge several lines into one row, which would change the number of test tweets.
test_tweets = pd.read_table(data_path + test_tweets_file, names=['tweet', 'sentiment'], quoting=3)

# Strip the leading tweet id with the project helper. Applying it to the column directly
# avoids building a row object per tweet, which DataFrame.apply(axis=1) does.
test_tweets['tweet'] = test_tweets['tweet'].apply(remove_tweet_id)
test_tweets.head()

Unnamed: 0,tweet,sentiment
0,sea doo pro sea scooter ( sports with the port...,
1,<user> shucks well i work all week so now i ca...,
2,i cant stay away from bug thats my baby,
3,<user> no ma'am ! ! ! lol im perfectly fine an...,
4,"whenever i fall asleep watching the tv , i alw...",


## Tweets Preprocessing

### Part of speech tagging

In [18]:
# tweets['tagged'] = tweets.apply(lambda tweet: pos_tag(tweet['tweet']), axis=1)

In [19]:
# Run the training tweets through the project's preprocessing() helper (train=True), and
# fetch the stop-word list that is later handed to the TF-IDF vectorizer.
# NOTE(review): `tweets` is replaced by its own processed version, so re-running this cell
# presumably feeds already-cleaned text through preprocessing() again — re-run the
# "Load Data" cells first.
tweets = preprocessing(tweets,train=True)
my_stop_words = find_stopwords()

Tweets Preprocessing for the Training set started


Tweets have been successfully loaded!


### Tweets final representation

In [20]:
# Preview the first rows only; a bare `tweets` asks the notebook to render the whole frame.
tweets.head(10)

Unnamed: 0,tweet,sentiment
0,dunno justin read mention not only justin and ...,pos
1,because your logic dumb won even crop out your...,pos
2,just put casper box looved the battle crakkbitch,pos
3,thanks sir don trip lil mama just keep doin thang,pos
4,visiting brother tmr the bestest birthday gift...,pos
5,yay lifecompleted tweet facebook let know please,pos
6,dnextalbumtitle feel for you rollercoaster lif...,pos
7,workin hard hardly workin hardee with future c...,pos
8,saw replying bit,pos
9,this were belong,pos


In [21]:
# pd.isnull(tweets).any(1).nonzero()[0]

## Feature Extraction

### bag of words

#### TF-IDF frequencies

Initialize the bag-of-words vectorizer (TF-IDF weighting)

In [34]:
# TF-IDF bag-of-words configuration:
#   min_df=5          -> ignore terms that appear in fewer than 5 tweets
#   max_df=0.8        -> ignore terms that appear in more than 80% of the tweets
#   sublinear_tf=True -> use 1 + log(tf) instead of the raw term count
#   use_idf=True      -> apply inverse-document-frequency weighting (scikit-learn's default)
#   stop_words        -> the list returned by find_stopwords()
#   tokenizer         -> LemmaTokenizer instance (brought in by one of the star imports)
# The two commented-out arguments (bigrams, vocabulary cap) are disabled alternatives.
# NOTE(review): stop words are matched against the tokenizer's output, so the list must
# already be in the form LemmaTokenizer emits — TODO confirm.
tfidf_vectorizer = TfidfVectorizer(
                                   min_df=5, 
                                   max_df = 0.8, 
                                   sublinear_tf=True, 
                                   use_idf=True,
                                   stop_words=my_stop_words, 
                                   tokenizer=LemmaTokenizer(),
#                                    ngram_range=(1, 2)
#                                    max_features=1500
                                  )

## Feature Expansion

### polynomial expansion

### standardization

## Classification (simple training)

In [35]:
# Hold out 10% of the tweets for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    tweets['tweet'],
    tweets['sentiment'],
    test_size=0.10,
    random_state=4,
)

# Vocabulary and IDF weights are learned from the training split only and then reused for
# the held-out tweets. Both matrices have shape (number_of_tweets, vocabulary_size).
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

### Naive Bayes

In [36]:
# Multinomial Naive Bayes on the TF-IDF features; fit() returns the fitted estimator.
clf = MultinomialNB().fit(tfidf_train_vectors, y_train)
prediction_bayes = clf.predict(tfidf_test_vectors)

# Number of predictions, per-class precision/recall/F1, and overall accuracy.
print(prediction_bayes.shape)
print(classification_report(y_test, prediction_bayes))
print('score: ', accuracy_score(y_test, prediction_bayes))

(17849,)
             precision    recall  f1-score   support

        neg       0.79      0.69      0.74      9070
        pos       0.72      0.81      0.76      8779

avg / total       0.75      0.75      0.75     17849

score:  0.747772984481


#### Get top k most important features

In [None]:
# Print what the project helper reports as the k=10 most important features for `clf`.
# NOTE(review): the ranking criterion lives in the helper and is not visible here.
print(topk_most_important_features(tfidf_vectorizer, clf, k=10))

In [None]:
# Ask the project helper for the n=1000 most informative features of `clf`.
# NOTE(review): presumably prints one entry per feature — expect a very long output.
show_most_informative_features(tfidf_vectorizer, clf, n=1000)

### Random Forests

Random forests are a convenient choice here: each tree is trained independently, so fitting can run in parallel across all CPU cores (`n_jobs=-1`).

In [37]:
# 100 trees, depth capped at 100, trained on all available cores (n_jobs=-1), fixed seed.
forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=100,
    n_jobs=-1,
    random_state=4,
).fit(tfidf_train_vectors, y_train)
y_pred_forest = forest.predict(tfidf_test_vectors)

# Per-class precision/recall/F1 followed by overall accuracy on the held-out split.
print(classification_report(y_test, y_pred_forest))
print('score: ', accuracy_score(y_test, y_pred_forest))

             precision    recall  f1-score   support

        neg       0.82      0.62      0.70      9070
        pos       0.68      0.86      0.76      8779

avg / total       0.75      0.74      0.73     17849

score:  0.73634377276


### SVM

In [None]:
# Disabled experiment: linear-kernel SVC on the same train/test split.
# classifier_linear = svm.SVC(kernel='linear')
# classifier_linear.fit(tfidf_train_vectors, y_train)
# prediction_linear = classifier_linear.predict(tfidf_test_vectors)

# print(classification_report(y_test, prediction_linear))
# print('score: ',accuracy_score(y_test,prediction_linear))  # was prediction_bayes (copy-paste slip)

## K-fold cross-validation

### Naive Bayes

In [None]:
# cv = cross_validation_KFold(tweets.shape[0], shuffle = True, n_folds=10, random_state=4)
# tfidf_train_vectors = tfidf_vectorizer.fit_transform(tweets['tweet'])
# clfkfold = MultinomialNB()
# avg_test_accuracy = np.mean(cross_val_score(clfkfold, tfidf_train_vectors, tweets['sentiment'], cv=cv, scoring='accuracy'))
# print('avg score: ',avg_test_accuracy)

### Random Forests (Model Selection)

In [None]:
# `sklearn.cross_validation` (the source of cross_validation_KFold / cross_val_score in the
# import cell) was deprecated in scikit-learn 0.18 and removed in 0.20. Use its replacement,
# `sklearn.model_selection`, which this notebook already relies on for train_test_split.
from sklearn import model_selection

# 10 shuffled folds with a fixed seed. The model_selection KFold takes `n_splits` and derives
# the number of samples from the data handed to cross_val_score.
cv = model_selection.KFold(n_splits=10, shuffle=True, random_state=4)

# NOTE(review): the vectorizer is fitted on all tweets before the folds are formed, so each
# fold's vocabulary and IDF weights have seen its validation tweets; wrap the vectorizer and
# classifier in a Pipeline if a strictly held-out estimate is needed.
tfidf_train_vectors = tfidf_vectorizer.fit_transform(tweets['tweet'])

forest_clf = RandomForestClassifier(n_estimators=100, max_depth=100, n_jobs=-1, random_state=4)
fold_accuracies = model_selection.cross_val_score(
    forest_clf,
    tfidf_train_vectors,
    tweets['sentiment'],
    cv=cv,
    scoring='accuracy',
)

# Mean accuracy over the 10 validation folds.
avg_test_accuracy = np.mean(fold_accuracies)
print('avg score: ', avg_test_accuracy)

## Learning curves

### Naive Bayes

In [None]:
# plot_learning_curve(clfkfold, 'Learning Curve - Naive Bayes', tfidf_train_vectors, tweets['sentiment'], cv=cv)
# plt.show()

## Write results to file

In [None]:
# Run the unlabeled test tweets through the project's preprocessing() helper (train=False).
# NOTE(review): `test_tweets` is replaced by its own processed version, so running this cell
# twice presumably preprocesses already-cleaned text — reload the test data before re-running.
test_tweets = preprocessing(test_tweets,train=False)
test_tweets.head()

In [None]:
# Final model: refit the vectorizer on every labelled tweet and train Naive Bayes on the result.
tfidf_train_vectors = tfidf_vectorizer.fit_transform(tweets['tweet'])
final_clf = MultinomialNB().fit(tfidf_train_vectors, tweets['sentiment'])

# Project the test tweets onto that fitted vocabulary and predict their sentiment.
test_data = tfidf_vectorizer.transform(test_tweets['tweet'])
pred = final_clf.predict(test_data)

In [None]:
# Hand the predictions to the project's create_csv_submission helper; the target path is
# inside the data directory.
# NOTE(review): `pred` presumably still holds the 'pos'/'neg' strings assigned when the
# training data was loaded — confirm the helper maps them to the expected submission format.
pred_file = 'pred_submission.csv'
create_csv_submission(pred, data_path+pred_file)

## Test Code & Useful stuff

In [None]:
# Scratch: count the word tokens in NLTK's Brown corpus.
# NOTE(review): presumably requires the corpus to be downloaded beforehand — confirm.
from nltk.corpus import brown
len(brown.words())

In [None]:
# Scratch: show the fitted term -> column-index mapping.
# NOTE(review): this renders the entire vocabulary dict, which can be very large.
tfidf_vectorizer.vocabulary_

In [None]:
# Scratch: column index of the term 'follow' in the fitted vocabulary.
# Raises KeyError if the term was filtered out (min_df / max_df / stop words) or never seen.
print(tfidf_vectorizer.vocabulary_['follow'])

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Scratch: compare Porter stemming with WordNet lemmatisation on one word, first with the
# lemmatiser's default part of speech and then with the word explicitly treated as a verb.
word = "studying"
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

print("Stem %s: %s" % (word, stemmer.stem(word)))
print("Lemmatise %s: %s" % (word, lemmatiser.lemmatize(word)))
print("Lemmatise %s: %s" % (word, lemmatiser.lemmatize(word, pos="v")))