# Twitter Sentiment Analysis

### Parameters

In [1]:
from options import *

## Import Libraries

In [2]:
import numpy as np
import pandas as pd
%matplotlib inline

#### Utils

In [3]:
from utils import *

#### Plots

In [4]:
from plots import *

#### Data Preprocessing

In [5]:
from preprocessing import *

#### Feature Extraction

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split

from vectorizer import init_tfidf_vectorizer

#### Cross validation

In [7]:
from cross_validation import *



#### Machine Learning

In [8]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

#### Options

In [9]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): this also hides genuine chained-assignment mistakes — confirm it is still needed.
pd.options.mode.chained_assignment = None

## Load Data

### Train Data

In [10]:
# Read each raw tweet file with the column names 'tweet' and 'sentiment', then
# fill 'sentiment' with the constant class label belonging to that file.
# NOTE(review): read_table applies its default separator/quote handling; tweets that
# contain '"' could be merged across lines — confirm the row counts against the raw files.
pos_tweets = pd.read_table(
    DATA_PATH+POS_TWEETS_FILE,
    names=['tweet','sentiment'],
).assign(sentiment='pos')
neg_tweets = pd.read_table(
    DATA_PATH+NEG_TWEETS_FILE,
    names=['tweet','sentiment'],
).assign(sentiment='neg')

In [11]:
# pos_tweets.head()

In [12]:
# neg_tweets.head()

In [13]:
# Report (rows, columns) for each class-specific frame.
for _label, _frame in (
    ('positive tweets shape: ', pos_tweets),
    ('negative tweets shape: ', neg_tweets),
):
    print(_label, _frame.shape)

positive tweets shape:  (97902, 2)
negative tweets shape:  (99068, 2)


In [14]:
# Stack the positive rows on top of the negative rows (row-wise is pd.concat's default axis).
# NOTE(review): row labels are not reset here, so index values presumably repeat
# across the two halves — verify nothing downstream relies on a unique index.
tweets = pd.concat([pos_tweets, neg_tweets])
tweets.shape

(196970, 2)

In [15]:
# tweets.head()

In [16]:
# tweets.tail()

### Test Data

In [17]:
# Read the unlabelled test tweets with the same two column names as the training data;
# 'sentiment' stays empty here (see the preview below).
test_tweets = pd.read_table(DATA_PATH+TEST_TWEETS_FILE, names=['tweet','sentiment'])
# Apply remove_tweet_id to every value of the 'tweet' column. Mapping over the column
# passes the same values as the previous row-wise apply(axis=1), without building a
# Series object per row.
test_tweets['tweet'] = test_tweets['tweet'].map(remove_tweet_id)
test_tweets.head()

Unnamed: 0,tweet,sentiment
0,sea doo pro sea scooter ( sports with the port...,
1,<user> shucks well i work all week so now i ca...,
2,i cant stay away from bug thats my baby,
3,<user> no ma'am ! ! ! lol im perfectly fine an...,
4,"whenever i fall asleep watching the tv , i alw...",


## Tweets Preprocessing

### Part of speech tagging

In [18]:
# tweets['tagged'] = tweets.apply(lambda tweet: pos_tag(tweet['tweet']), axis=1)

In [19]:
# Run the project's preprocessing step on the training tweets using preprocessing_params.
# NOTE(review): `tweets` is rebound to the returned frame, so re-running this cell passes
# the already-processed frame back in — re-run the loading cells first if that matters.
tweets = preprocessing(tweets,train=True, params=preprocessing_params)

Preprocessing Settings:

fhashtag :	 True
fuser :	 True
fduplicates :	 True
frepeated_chars :	 True
furl :	 True
fdigits :	 True
fsmall_words :	 True
save :	 True
fpunctuation :	 True
-

Tweets Preprocessing for the Training set started


Tweets have been successfully loaded!


### Tweets final representation

In [20]:
# Preview only the first rows instead of asking the notebook to render the whole frame.
tweets.head(10)

Unnamed: 0,tweet,sentiment
0,dunno justin read mention not only justin and ...,pos
1,because your logic dumb won even crop out your...,pos
2,just put casper box looved the battle crakkbitch,pos
3,thanks sir don trip lil mama just keep doin thang,pos
4,visiting brother tmr the bestest birthday gift...,pos
5,yay lifecompleted tweet facebook let know please,pos
6,dnextalbumtitle feel for you rollercoaster lif...,pos
7,workin hard hardly workin hardee with future c...,pos
8,saw replying bit,pos
9,this were belong,pos


In [21]:
# pd.isnull(tweets).any(1).nonzero()[0]

## Feature Extraction

### Bag of words

#### Term frequencies (TF-IDF)

Initialize the bag-of-words (TF-IDF) vectorizer.

In [None]:
# Build the TF-IDF vectorizer through the project helper (imported from `vectorizer`);
# the helper is called with no arguments, so its settings come from the helper itself.
tfidf_vectorizer = init_tfidf_vectorizer()

tf-idf Vectorizer settings

use_idf :	 True
min_df :	 5
number_of_stopwords :	 153
max_df :	 0.8
ngram_range :	 (1, 1)
tokenizer :	 True
max_features :	 5000
sublinear_tf :	 True
-

stopwords:
 frozenset({'nevertheless', 'via', 'though', 'which', 'whoever', 'anyone', 'anyhow', 'did', 'because', 'there', 'un', 'name', 'never', 'wherein', 'eight', 'some', 'not', 'fill', 'throughout', 'yet', 'about', 'from', 'ourselves', 'toward', 'whatever', 'she', 'someone', 'll', 'seemed', 'nine', 'somehow', 'be', 'does', 'etc', 'else', 'every', 'least', 'always', 'fifteen', 'ours', 'own', 'having', 'isn', 'last', 'third', 'him', 'mightn', 'enough', 'very', 'afterwards', 'fire', 'didn', 'whereby', 'nor', 'been', 'our', 'each', 'o', 'wherever', 'a', 'now', 'her', 'you', 'other', 'around', 'still', 'cant', 'below', 'have', 'mostly', 'rather', 'bottom', 've', 'ma', 'ain', 'moreover', 'alone', 'although', 'has', 'hereby', 'former', 'even', 'per', 'sincere', 'indeed', 'something', 'am', 'nobody', 'here', 'w

## Feature Expansion

### polynomial expansion

### standardization

## Classification (simple training)

In [None]:
# Hold out part of the labelled tweets for evaluation; the hold-out fraction and
# the random seed are both taken from split_params.
X_train, X_test, y_train, y_test = train_test_split(
    tweets['tweet'],
    tweets['sentiment'],
    test_size=split_params['test_size'],
    random_state=split_params['random_state'],
)
# Fit the vectorizer on the training split only, then reuse the fitted
# vocabulary/weights to transform the held-out split.
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)
# resulting matrix shape: (number_of_tweets, number_of_features)

### Naive Bayes

In [None]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training matrix
# (fit() returns the estimator itself, so construction and fitting can be chained).
nbclf = MultinomialNB().fit(tfidf_train_vectors, y_train)
prediction_bayes = nbclf.predict(tfidf_test_vectors)

# Report on the held-out split: number of predictions, per-class metrics, accuracy.
print(prediction_bayes.shape)
print(classification_report(y_test, prediction_bayes))
print('score: ', accuracy_score(y_test, prediction_bayes))

### Random Forests

Random forests are a convenient choice here because the individual trees can be trained in parallel (`n_jobs=-1` uses all available cores).

In [None]:
# forest = RandomForestClassifier(n_estimators=100,max_depth=100,n_jobs=-1,random_state=4)
# forest.fit(tfidf_train_vectors, y_train)
# y_pred_forest = forest.predict(tfidf_test_vectors)

# print(classification_report(y_test, y_pred_forest))
# print('score: ',accuracy_score(y_test,y_pred_forest))

### SVM

In [None]:
# Disabled experiment: linear-kernel SVM trained and evaluated on the same split as above.
# classifier_linear = svm.SVC(kernel='linear')
# classifier_linear.fit(tfidf_train_vectors, y_train)
# prediction_linear = classifier_linear.predict(tfidf_test_vectors)

# print(classification_report(y_test, prediction_linear))
# print('score: ',accuracy_score(y_test,prediction_linear))

### Get top k most important features

#### Naive Bayes

In [None]:
# Print what the project helper returns for the fitted vectorizer and Naive Bayes model with k=10.
# NOTE(review): presumably the 10 highest-ranked features — confirm against the helper's definition.
print(topk_most_important_features(tfidf_vectorizer, nbclf, k=10))

In [None]:
# Call the project helper for the fitted vectorizer and Naive Bayes model with n=1000.
# NOTE(review): with n=1000 the output is presumably long — consider a smaller n when sharing the notebook.
show_most_informative_features(tfidf_vectorizer, nbclf, n=1000)

## K-fold Cross-Validation & Model Selection

In [None]:
# tfidf_train_vectors = tfidf_vectorizer.fit_transform(tweets['tweet'])

### Naive Bayes

In [None]:
# TODO: loop over candidate alpha values to find the best one (once server access is available)
# naivebayesclf = MultinomialNB()
# avg_test_accuracy, cv_bayes = cross_validation(naivebayesclf , tweets.shape[0], tfidf_train_vectors, tweets['sentiment'], n_folds=kfold['naive_bayes'])
# print('avg score: ',avg_test_accuracy)

### Random Forests (Model Selection)

In [None]:
# TODO: loop over candidate hyper-parameters to find the best combination (once server access is available)
# forest_clf = RandomForestClassifier(n_estimators=100,max_depth=100,n_jobs=-1,random_state=4)
# avg_test_accuracy, cv_forest = cross_validation(forest_clf , tweets.shape[0], tfidf_train_vectors, tweets['sentiment'], n_folds=kfold['random_forest'])
# print('avg score: ',avg_test_accuracy)

## Learning curves

### Naive Bayes

In [None]:
# plot_learning_curve(naivebayesclf, 'Learning Curve - Naive Bayes', tfidf_train_vectors, tweets['sentiment'], cv=cv_bayes)
# plt.show()

### Random Forests

In [None]:
# plot_learning_curve(forest_clf, 'Learning Curve - Random Forest', tfidf_train_vectors, tweets['sentiment'], cv=cv_forest)
# plt.show()

### Vocabulary

In [None]:
# tfidf_vectorizer.vocabulary_
# List the feature names of the fitted vectorizer.
# scikit-learn 1.0 deprecated get_feature_names() in favour of get_feature_names_out()
# and 1.2 removed it, so prefer the new accessor and fall back on older releases.
# .tolist() keeps the displayed value a plain list of Python strings, as before.
(
    tfidf_vectorizer.get_feature_names_out().tolist()
    if hasattr(tfidf_vectorizer, 'get_feature_names_out')
    else tfidf_vectorizer.get_feature_names()
)

## Write results to file

In [None]:
# Run the same preprocessing step on the test tweets, with train=False and the same params.
# NOTE(review): `test_tweets` is rebound to the returned frame, so re-running this cell
# passes the already-processed frame back in.
test_tweets = preprocessing(test_tweets,train=False, params=preprocessing_params)
test_tweets.head()

In [None]:
# Refit the shared vectorizer on every labelled tweet (not just the earlier training
# split) and project the test tweets into that feature space. This rebinds
# tfidf_train_vectors and changes the fitted state of tfidf_vectorizer.
tfidf_train_vectors = tfidf_vectorizer.fit_transform(tweets['tweet'])
test_data = tfidf_vectorizer.transform(test_tweets['tweet'])

# Train the final Naive Bayes model on all labelled data and predict the test labels.
final_clf = MultinomialNB().fit(tfidf_train_vectors, tweets['sentiment'])
pred = final_clf.predict(test_data)

In [None]:
# Hand the predicted test labels to the project helper.
# NOTE(review): presumably writes the submission CSV — output path and format are set inside the helper.
create_csv_submission(pred)

## Test Code & Useful stuff

In [None]:
# Scratch check: number of word tokens in NLTK's Brown corpus.
# NOTE(review): presumably requires the corpus data to be installed locally (e.g. via nltk.download('brown')).
from nltk.corpus import brown
len(brown.words())

In [None]:
# Look up the feature index assigned to the token 'follow' in the fitted vocabulary.
# .get() prints None instead of raising KeyError when the token was not kept in the vocabulary.
print(tfidf_vectorizer.vocabulary_.get('follow'))

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Scratch comparison of stemming and lemmatisation on a single word; the word is
# lemmatised twice, first with the default part of speech and then as a verb.
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

word = "studying"
print("Stem %s: %s" % (word, stemmer.stem(word)))
for pos_kwargs in ({}, {"pos": "v"}):
    print("Lemmatise %s: %s" % (word, lemmatiser.lemmatize(word, **pos_kwargs)))

### Word Embeddings (WE)

In [None]:
# Load the saved array from DATA_PATH + 'embeddings.npy'; the result is only displayed,
# not bound to a name.
np.load(DATA_PATH+'embeddings.npy')