# Twitter Sentiment Analysis

## Import Libraries

In [1]:
import numpy as np
import pandas as pd

#### Helpers

In [2]:
from utils import *
from preprocessing import *
from plots import *

#### Data Preprocessing

In [3]:
from nltk.probability import FreqDist
from nltk.corpus import stopwords

#### Feature Extraction

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split

#### Cross validation

In [5]:
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold as cross_validation_KFold

from sklearn.model_selection import learning_curve



#### Machine Learning

In [6]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn import svm

#### Options

In [7]:
pd.options.mode.chained_assignment = None

## Load Data

In [8]:
data_path = '../data/'
pos_tweets_file = 'train_pos_full.txt'
neg_tweets_file = 'train_neg_full.txt'
test_tweets_file = 'test_data.txt'

### Train Data

In [9]:
pos_tweets = pd.read_table(data_path+pos_tweets_file, names=['tweet','sentiment'])
pos_tweets['sentiment'] = 'pos'
neg_tweets = pd.read_table(data_path+neg_tweets_file ,names=['tweet','sentiment'])
neg_tweets['sentiment'] = 'neg'

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
pos_tweets.head()

Unnamed: 0,tweet,sentiment
0,<user> i dunno justin read my mention or not ....,pos
1,"because your logic is so dumb , i won't even c...",pos
2,<user> just put casper in a box ! looved the...,pos
3,<user> <user> thanks sir > > don't trip lil ma...,pos
4,visiting my brother tmr is the bestest birthda...,pos


In [11]:
neg_tweets.head()

Unnamed: 0,tweet,sentiment
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,neg
1,glad i dot have taks tomorrow ! ! #thankful #s...,neg
2,1-3 vs celtics in the regular season = were fu...,neg
3,<user> i could actually kill that girl i'm so ...,neg
4,<user> <user> <user> i find that very hard to ...,neg


In [12]:
print('positive tweets shape: ',pos_tweets.shape)
print('negative tweets shape: ',neg_tweets.shape)

positive tweets shape:  (1218655, 2)
negative tweets shape:  (1239642, 2)


In [13]:
tweets = pd.concat([pos_tweets, neg_tweets], axis=0)
tweets.shape

(2458297, 2)

In [14]:
tweets.head()

Unnamed: 0,tweet,sentiment
0,<user> i dunno justin read my mention or not ....,pos
1,"because your logic is so dumb , i won't even c...",pos
2,<user> just put casper in a box ! looved the...,pos
3,<user> <user> thanks sir > > don't trip lil ma...,pos
4,visiting my brother tmr is the bestest birthda...,pos


In [15]:
tweets.tail()

Unnamed: 0,tweet,sentiment
1239637,im so sorry ! <user> & to <user> & <user> u gu...,neg
1239638,i can't find food coloring anywhere,neg
1239639,<user> same here ! ! but tort ! ! wonder why y...,neg
1239640,keyless entry remote fob clicker for 2005 buic...,neg
1239641,<user> yeap . doctor don't know what's wrong w...,neg


### Test Data

In [16]:
test_tweets = pd.read_table(data_path+test_tweets_file, names=['tweet','sentiment'])
test_tweets['tweet'] = test_tweets.apply(lambda tweet: remove_tweet_id(tweet['tweet']), axis=1)
test_tweets.head()

Unnamed: 0,tweet,sentiment
0,sea doo pro sea scooter ( sports with the port...,
1,<user> shucks well i work all week so now i ca...,
2,i cant stay away from bug thats my baby,
3,<user> no ma'am ! ! ! lol im perfectly fine an...,
4,"whenever i fall asleep watching the tv , i alw...",


## Tweets Preprocessing

### Remove Duplicate Tweets

Duplicates are removed to avoid putting extra weight on any particular tweet.

In [17]:
# print('number of tweets before duplicates removal:\t', tweets.shape[0])
# tweets.drop_duplicates(subset='tweet', inplace=True)
# print('number of tweets after duplicates removal:\t', tweets.shape[0])

### Fix repeated letters

We use preprocessing so that any letter occurring more than two times in a row is replaced with two occurrences.
As an example, the words haaaaaaaaappy and haaaaappy should be converted to haappy

In [18]:
# tweets['tweet'] = tweets.apply(lambda tweet: filter_repeated_chars_on_tweet(tweet['tweet']), axis=1)

### Filter Punctuation

In [19]:
# tweets['tweet'] = tweets.apply(lambda tweet: filter_punctuation(tweet['tweet']), axis=1)

### Filter user & url etc..

In [20]:
# tweets['tweet'] = filter_user(tweets['tweet'])
# tweets['tweet'] = filter_url(tweets['tweet'])
# tweets['tweet'] = filter_hashtag(tweets['tweet'])
# tweets['tweet'] = tweets.apply(lambda tweet: filter_digits(tweet['tweet']), axis=1)
# tweets['tweet'] = tweets.apply(lambda tweet: filter_small_words(tweet['tweet']), axis=1)

### filter Stopwords

In [21]:
# stoplist = stopwords.words('english')
# fdist = FreqDist(stoplist)
# top = fdist.most_common(1000)
# top = [x[0] for x in top] 

# stop_words = set(top)
# my_stop_words = text.ENGLISH_STOP_WORDS.union(stop_words)

### Part of speech tagging

In [22]:
# tweets['tagged'] = tweets.apply(lambda tweet: pos_tag(tweet['tweet']), axis=1)

In [23]:
def preprocessing(tweets,train=True):
    if train:
        print('number of tweets before duplicates removal:\t', tweets.shape[0])
        tweets.drop_duplicates(subset='tweet', inplace=True)
        print('number of tweets after duplicates removal:\t', tweets.shape[0])

    tweets['tweet'] = tweets.apply(lambda tweet: filter_repeated_chars_on_tweet(tweet['tweet']), axis=1)
    print('repeated chars DONE')
    
    tweets['tweet'] = tweets.apply(lambda tweet: filter_punctuation(tweet['tweet']), axis=1)
    print('punctuation DONE')

    tweets['tweet'] = filter_user(tweets['tweet'])
    print('user DONE')
    tweets['tweet'] = filter_url(tweets['tweet'])
    print('url DONE')
    tweets['tweet'] = filter_hashtag(tweets['tweet'])
    print('hashtag DONE')
    tweets['tweet'] = tweets.apply(lambda tweet: filter_digits(tweet['tweet']), axis=1)
    print('digits DONE')
    tweets['tweet'] = tweets.apply(lambda tweet: filter_small_words(tweet['tweet']), axis=1)
    print('small words DONE')
    
    stoplist = stopwords.words('english')
    fdist = FreqDist(stoplist)
    top = fdist.most_common(1000)
    top = [x[0] for x in top] 
    stop_words = set(top)
    my_stop_words = text.ENGLISH_STOP_WORDS.union(stop_words)
    print('stopwords DONE')
    
    return tweets, my_stop_words

tweets, my_stop_words = preprocessing(tweets,train=True)

number of tweets before duplicates removal:	 2458297
number of tweets after duplicates removal:	 2230333
repeated chars DONE
punctuation DONE
user DONE
url DONE
hashtag DONE
digits DONE
small words DONE
stopwords DONE


### Tweets final representation

In [24]:
tweets

Unnamed: 0,tweet,sentiment
0,dunno justin read mention not only justin and ...,pos
1,because your logic dumb won even crop out your...,pos
2,just put casper box looved the battle crakkbitch,pos
3,thanks sir don trip lil mama just keep doin thang,pos
4,visiting brother tmr the bestest birthday gift...,pos
5,yay lifecompleted tweet facebook let know please,pos
6,dnextalbumtitle feel for you rollercoaster lif...,pos
7,workin hard hardly workin hardee with future c...,pos
8,saw replying bit,pos
9,this were belong,pos


## Feature Extraction

In [25]:
# X_train, X_test, y_train, y_test = train_test_split(tweets['tweet'], tweets['sentiment'], test_size=0.10, random_state=4)

### bag of words

#### frequencies TF-IDF

In [26]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [27]:
#initialize bag of words (tf-idf)
#ngram_range=(1, 2)
tfidf_vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer(),min_df=5, max_df = 0.8,
                                   sublinear_tf=True, use_idf=True, stop_words=my_stop_words)

## Feature Expansion

### polynomial expansion

### standardization

## Classification

### Naive Bayes

In [28]:
# tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
# tfidf_test_vectors = tfidf_vectorizer.transform(X_test)
# #shape: (number_of_tweets, all_words)

# clf = MultinomialNB()
# clf.fit(tfidf_train_vectors, y_train)
# prediction_bayes = clf.predict(tfidf_test_vectors)
# print(prediction_bayes.shape)
# print(classification_report(y_test, prediction_bayes))
# print('score: ',accuracy_score(y_test,prediction_bayes))

#### Get top k most important features

In [29]:
# print(topk_most_important_features(tfidf_vectorizer, clf, k=10))

In [30]:
# show_most_informative_features(tfidf_vectorizer, clf, n=1000)

## K fold Cross validation

In [31]:
# cv = cross_validation_KFold(tweets.shape[0], shuffle = True, n_folds=5, random_state=4)
# tfidf_train_vectors = tfidf_vectorizer.fit_transform(tweets['tweet'])
# clfkfold = MultinomialNB()
# avg_test_accuracy = np.mean(cross_val_score(clfkfold, tfidf_train_vectors, tweets['sentiment'], cv=cv, scoring='accuracy'))
# print('avg score: ',avg_test_accuracy)

## Learning curves

### Naive Bayes

In [32]:
# plot_learning_curve(clfkfold, 'Learning Curve - Naive Bayes', tfidf_train_vectors, tweets['sentiment'], cv=cv)
# plt.show()

### SVM

In [33]:
# tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
# tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

In [34]:
# classifier_linear = svm.SVC(kernel='linear')
# classifier_linear.fit(tfidf_train_vectors, y_train)
# prediction_linear = classifier_linear.predict(tfidf_test_vectors)
# prediction_linear.shape

In [35]:
# print(classification_report(y_test, prediction_linear))
# print(accuracy_score(y_test,prediction_bayes))

## Write results to file

In [36]:
test_tweets, _ = preprocessing(test_tweets,train=False)
test_tweets.head()

repeated chars DONE
punctuation DONE
user DONE
url DONE
hashtag DONE
digits DONE
small words DONE
stopwords DONE


Unnamed: 0,tweet,sentiment
0,sea doo pro sea scooter sports with the portab...,
1,shucks well work all week now can come cheer y...,
2,cant stay away from bug thats baby,
3,lol perfectly fine and not contagious anymore ...,
4,whenever fall asleep watching the always wake ...,


In [37]:
final_clf = MultinomialNB()
tfidf_train_vectors = tfidf_vectorizer.fit_transform(tweets['tweet'])
test_data = tfidf_vectorizer.transform(test_tweets['tweet'])
final_clf.fit(tfidf_train_vectors, tweets['sentiment'])
pred = final_clf.predict(test_data)

In [38]:
pred_file = 'pred_submission.csv'
create_csv_submission(pred, data_path+pred_file)

## Test Code

In [39]:
from nltk.corpus import brown
len(brown.words())

1161192

In [40]:
ff = tfidf_vectorizer.fit(X_train)
# tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
# tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

NameError: name 'X_train' is not defined

In [None]:
ff.vocabulary_

In [None]:
print(ff.vocabulary_['follow'])

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
 
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
 
print("Stem %s: %s" % ("studying", stemmer.stem("studying")))
print("Lemmatise %s: %s" % ("studying", lemmatiser.lemmatize("studying")))
print("Lemmatise %s: %s" % ("studying", lemmatiser.lemmatize("studying", pos="v")))