# Sentiment Analysis Notebook

## Dataset

The data given is in the form of a comma-separated values files with tweets and their corresponding sentiments. The training dataset is a csv file of type tweet_id,sentiment,tweet where the
tweet_id is a unique integer identifying the tweet, sentiment is either 1 (positive) or 0 (negative), and tweet is the tweet enclosed in "". Similarly, the test dataset is a csv file of type
tweet_id,tweet.

In [186]:
import pandas as pd
import dill
import sys
import nltk
sys.path.insert(0, 'src')
sys.path.insert(1, 'airline')

Save or restore notebook session

In [180]:
dill.dump_session('notebook_env.db')

In [2]:
dill.load_session('notebook_env.db')

NameError: name 'dill' is not defined

In [16]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [7]:
dataset = pd.read_csv('dataset/train.csv', encoding = "ISO-8859-1")
dataset.head(5)

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


## Preprocessing

### Preprocessing functions

In [8]:
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

def handle_emojis(tweet):
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    tweet = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' EMO_POS ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMO_POS ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', tweet)
    return tweet

def preprocess_word(word):
    # remove punctation
    word = word.strip('\'"?!,.():;*')
    # more than 3 letter repetition removed
    word = re.sub(r'(.)\1\1+', r'\1\1\1', word)
    # remove - & '
    word = word.strip('-&\'')
    return word

def is_valid_word(word):
    # Check if word begins with an alphabet
    return (re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None)

def preprocess_tweet(tweet, use_stemmer=False, use_lemmatizer=False):
    # convert tweet to lowercase
    tweet = tweet.lower()
    # replace urls with 'URL'
    tweet = re.sub(r'((www.[\S]+)|(https?://.[\S]+))', 'URL', tweet)
    # replace user mentions @user with 'USER_MENTION'
    tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    # replace #hashtag with hastag
    tweet = re.sub(r'#(\S+)', r' \1', tweet)
    # remove retweet RT
    tweet = re.sub(r'\brt\b', '', tweet)
    # replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    # remove space, " and ' 
    tweet.strip('" \'')
    # handle emojis. Use only EMO_POS and EMO_NEG
    tweet = handle_emojis(tweet)
    # replace multiple spaces with only one space
    tweet = re.sub(r'\s+', ' ', tweet)
    # preprocess words
    words = tweet.split()

    processed_words = []
    porter_stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()
    for word in words:
        word = preprocess_word(word)
        if is_valid_word(word):
            if use_stemmer:
                # use stemmer
                word = str(porter_stemmer.stem(word))
            elif use_lemmatizer:
                word = str(wordnet_lemmatizer.lemmatize(word))
            processed_words.append(word)
    return ' '.join(processed_words)

In [9]:
#tweet = "Hi, how is going on? :) CIAO COME vaaaaa? :) :( https://tartarus.org/martin/PorterStemmer/  bella @ivocerti    #sentimentanalysis"
for i in range(4):
    tweet = dataset.loc[i]['SentimentText']
    print('ORIGINAL: ' + tweet)
    print('STEMMER: ' + preprocess_tweet(tweet, use_stemmer=True))
    print('LEMMATIZER: ' + preprocess_tweet(tweet, use_lemmatizer=True))
    print('----------------------------------------------------------------')

ORIGINAL:                      is so sad for my APL friend.............
STEMMER: is so sad for my apl friend
LEMMATIZER: is so sad for my apl friend
----------------------------------------------------------------
ORIGINAL:                    I missed the New Moon trailer...
STEMMER: i miss the new moon trailer
LEMMATIZER: i missed the new moon trailer
----------------------------------------------------------------
ORIGINAL:               omg its already 7:30 :O
STEMMER: omg it alreadi o
LEMMATIZER: omg it already o
----------------------------------------------------------------
ORIGINAL:           .. Omgaga. Im sooo  im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...
STEMMER: omgaga im sooo im gunna cri been at thi dentist sinc i wa supos just get a crown put on
LEMMATIZER: omgaga im sooo im gunna cry been at this dentist since i wa suposed just get a crown put on
----------------------------------------------------------------


### Preprocess the whole dataset

In [10]:
preprocessed_train_file = open('dataset/preprocessed_train.csv', 'w')
preprocessed_train_file.write('ItemID,Sentiment,SentimentText\n')
for index, row in dataset.iterrows():
    res = preprocess_tweet(row['SentimentText'])
    preprocessed_train_file.write(str(index+1) + ',' + str(row['Sentiment']) + ',' + preprocess_tweet(row['SentimentText']) + '\n')
    sys.stdout.write('\r')
    sys.stdout.write('Processing ' + str(index+1) + '/' + str(dataset.shape[0]))
    sys.stdout.flush()
preprocessed_train_file.close()

Processing 99989/99989

In [11]:
preprocessed_dataset = pd.read_csv('dataset/preprocessed_train.csv', encoding = "ISO-8859-1")
preprocessed_dataset.head(5)

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my apl friend
1,2,0,i missed the new moon trailer
2,3,1,omg its already o
3,4,0,omgaga im sooo im gunna cry been at this denti...
4,5,0,i think mi bf is cheating on me t_t


### Feature parameters (Word, SentiWordNet, POS + POS_Word, Semantic)

In [12]:
from nltk.corpus import sentiwordnet as swn

def word_to_SWNt(word):
    # take the first result from sentiwordnet (if there is)
    list_sent = list(swn.senti_synsets(word))
    if len(list_sent) > 0:
        sent = list_sent[0]
        pos = round(sent.pos_score()*10)
        neg = round(sent.neg_score()*10)
        obj = round(sent.obj_score()*10)
        # return the word if the word itself is mostly object
        # otherwise return POS-X or NEG-X if the word is 
        # mostly positive or negative and X is the rounded score*10 (from 0 to 10)
        if pos > neg:
            return 'POS-' + str(pos)
        elif neg > pos:
            return 'NEG-' + str(neg)
    return word

def tweet_to_feature_word(tweet):
    # just return the tweet as it is
    return str(tweet)

def tweet_to_feature_SWNt(tweet):
    words = str(tweet).split()
    processed_words = []
    for word in words:
        processed_words.append(word_to_SWNt(word))
    return str(' '.join(processed_words))

def tweet_to_feature_POS(tweet):
    # the tweet is composed by the POS (parts of speech) of the words
    # first just the list of pos, then the list of the pos and the words
    # in the format pos_word
    words = str(tweet).split()
    words_POS = nltk.pos_tag(words)
    processed_words = []
    pos = []
    for word in words_POS:
        pos.append(word[1])
    processed_words = pos
    size = len(processed_words)
    for i in range(size):
        processed_words.append(processed_words[i] + '_' + words[i])
    return str(' '.join(processed_words))

def tweet_to_feature_semantic(tweet):
    return tweet

In [17]:
for i in range(4):
    tweet = preprocessed_dataset.loc[i]['SentimentText']
    print('WORD: ' + tweet_to_feature_word(tweet))
    print('SWNt: ' + tweet_to_feature_SWNt(tweet))
    print('POS+POS_word: ' + tweet_to_feature_POS(tweet))
    #print('Semantic: ' + tweet_to_feature_semantic(tweet))
    print('----------------------------------------------------------------')

WORD: is so sad for my apl friend
SWNt: POS-2 so NEG-8 for my apl POS-1
POS+POS_word: VBZ RB JJ IN PRP$ NN NN VBZ_is RB_so JJ_sad IN_for PRP$_my NN_apl NN_friend
----------------------------------------------------------------
WORD: i missed the new moon trailer
SWNt: i NEG-2 the POS-4 moon trailer
POS+POS_word: NN VBD DT JJ NN NN NN_i VBD_missed DT_the JJ_new NN_moon NN_trailer
----------------------------------------------------------------
WORD: omg its already o
SWNt: omg its POS-1 o
POS+POS_word: VB PRP$ RB VB VB_omg PRP$_its RB_already VB_o
----------------------------------------------------------------
WORD: omgaga im sooo im gunna cry been at this dentist since i was suposed just get a crown put on
SWNt: omgaga im sooo im gunna cry POS-2 at this dentist since i was suposed POS-6 get a crown put on
POS+POS_word: JJ NN NN NN NN NN VBN IN DT NN IN NN VBD VBN RB VB DT NN NN IN JJ_omgaga NN_im NN_sooo NN_im NN_gunna NN_cry VBN_been IN_at DT_this NN_dentist IN_since NN_i VBD_was VBN

### Save the whole datasets

In [189]:
# WORD
features_train_file = open('dataset/word_features_train.csv', 'w')
features_train_file.write('ItemID,Sentiment,SentimentText\n')
print('WORD')
for index, row in preprocessed_dataset.iterrows():
    features_train_file.write(str(index+1) + ',' + str(row['Sentiment']) + ',' + tweet_to_feature_word(row['SentimentText']) + '\n')
    sys.stdout.write('\r')
    sys.stdout.write('Processing ' + str(index+1) + '/' + str(preprocessed_dataset.shape[0]))
    sys.stdout.flush()
preprocessed_train_file.close()

# SWNt
features_train_file = open('dataset/swnt_features_train.csv', 'w')
features_train_file.write('ItemID,Sentiment,SentimentText\n')
print('\nSWNt')
for index, row in preprocessed_dataset.iterrows():
    features_train_file.write(str(index+1) + ',' + str(row['Sentiment']) + ',' + tweet_to_feature_SWNt(row['SentimentText']) + '\n')
    sys.stdout.write('\r')
    sys.stdout.write('Processing ' + str(index+1) + '/' + str(preprocessed_dataset.shape[0]))
    sys.stdout.flush()
preprocessed_train_file.close()

# POS+POS_word
features_train_file = open('dataset/pos_features_train.csv', 'w')
features_train_file.write('ItemID,Sentiment,SentimentText\n')
print('\nSWNt')
for index, row in preprocessed_dataset.iterrows():
    features_train_file.write(str(index+1) + ',' + str(row['Sentiment']) + ',' + tweet_to_feature_POS(row['SentimentText']) + '\n')
    sys.stdout.write('\r')
    sys.stdout.write('Processing ' + str(index+1) + '/' + str(preprocessed_dataset.shape[0]))
    sys.stdout.flush()
preprocessed_train_file.close()

WORD
Processing 99989/99989Processing 11364/99989Processing 12648/99989Processing 32953/99989Processing 63347/99989Processing 64747/99989Processing 67747/99989Processing 73962/99989
SWNt
Processing 99989/99989
SWNt
Processing 99989/99989

In [18]:
# WORD
features_word_dataset = pd.read_csv('dataset/word_features_train.csv', encoding = "ISO-8859-1")
print('WORD')
features_word_dataset.head(5)

WORD


Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my apl friend
1,2,0,i missed the new moon trailer
2,3,1,omg its already o
3,4,0,omgaga im sooo im gunna cry been at this denti...
4,5,0,i think mi bf is cheating on me t_t


In [19]:
# SWNt
features_swnt_dataset = pd.read_csv('dataset/swnt_features_train.csv', encoding = "ISO-8859-1")
print('SWNt')
features_swnt_dataset.head(5)

SWNt


Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,POS-2 so NEG-8 for my apl POS-1
1,2,0,i NEG-2 the POS-4 moon trailer
2,3,1,omg its POS-1 o
3,4,0,omgaga im sooo im gunna cry POS-2 at this dent...
4,5,0,i think mi bf POS-2 POS-6 on me t_t


In [20]:
# POS+POS_word
features_pos_dataset = pd.read_csv('dataset/pos_features_train.csv', encoding = "ISO-8859-1")
print('POS+POS_word')
features_pos_dataset.head(5)

POS+POS_word


Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,VBZ RB JJ IN PRP$ NN NN VBZ_is RB_so JJ_sad IN...
1,2,0,NN VBD DT JJ NN NN NN_i VBD_missed DT_the JJ_n...
2,3,1,VB PRP$ RB VB VB_omg PRP$_its RB_already VB_o
3,4,0,JJ NN NN NN NN NN VBN IN DT NN IN NN VBD VBN R...
4,5,0,NN VBP NN NN VBZ VBG IN PRP VB NN_i VBP_think ...


## Vectorization

### Dictionary

In [21]:
from nltk import FreqDist
import pickle
from collections import Counter

def analyze_tweet(tweet):
    result = {}
    result['MENTIONS'] = tweet.count('USER_MENTION')
    result['URLS'] = tweet.count('URL')
    result['POS_EMOS'] = tweet.count('EMO_POS')
    result['NEG_EMOS'] = tweet.count('EMO_NEG')
    tweet = tweet.replace('USER_MENTION', '').replace(
        'URL', '')
    words = tweet.split()
    result['WORDS'] = len(words)
    bigrams = get_bigrams(words)
    result['BIGRAMS'] = len(bigrams)
    return result, words, bigrams

def get_bigrams(tweet_words):
    bigrams = []
    num_words = len(tweet_words)
    for i in range(num_words - 1):
        bigrams.append((tweet_words[i], tweet_words[i + 1]))
    return bigrams

def get_bigram_freqdist(bigrams):
    freq_dict = {}
    for bigram in bigrams:
        if freq_dict.get(bigram):
            freq_dict[bigram] += 1
        else:
            freq_dict[bigram] = 1
    counter = Counter(freq_dict)
    return counter

In [22]:
# CALCULATE FREQUENCY DISTRIBUTIONS (for each features dataset)
num_tweets, num_pos_tweets, num_neg_tweets = 0, 0, 0
num_mentions, max_mentions = 0, 0
num_emojis, num_pos_emojis, num_neg_emojis, max_emojis = 0, 0, 0, 0
num_urls, max_urls = 0, 0
num_words, num_unique_words, min_words, max_words = 0, 0, 1e6, 0
num_bigrams, num_unique_bigrams = 0, 0
all_words = []
all_bigrams = []
datasets = ['word', 'swnt', 'pos']
for d in datasets:
    print(d)
    with open('dataset/' + d + '/' + d + '_features_train.csv', 'r') as csv:
        lines = csv.readlines()
        num_tweets = len(lines)
        for i, line in enumerate(lines):
            if i == 0:
                continue
            t_id, if_pos, tweet = line.strip().split(',')
            if_pos = int(if_pos)
            if if_pos:
                num_pos_tweets += 1
            else:
                num_neg_tweets += 1
            result, words, bigrams = analyze_tweet(tweet)
            num_mentions += result['MENTIONS']
            max_mentions = max(max_mentions, result['MENTIONS'])
            num_pos_emojis += result['POS_EMOS']
            num_neg_emojis += result['NEG_EMOS']
            max_emojis = max(
                max_emojis, result['POS_EMOS'] + result['NEG_EMOS'])
            num_urls += result['URLS']
            max_urls = max(max_urls, result['URLS'])
            num_words += result['WORDS']
            min_words = min(min_words, result['WORDS'])
            max_words = max(max_words, result['WORDS'])
            all_words.extend(words)
            num_bigrams += result['BIGRAMS']
            all_bigrams.extend(bigrams)
            sys.stdout.write('\r')
            sys.stdout.write('Processing %d/%d' % (i+1, num_tweets))
            sys.stdout.flush()
    num_emojis = num_pos_emojis + num_neg_emojis
    unique_words = list(set(all_words))
    with open('dataset/' + d + '/' + d + '_features_train_unique.txt', 'w') as uwf:
        uwf.write('\n'.join(unique_words))
    num_unique_words = len(unique_words)
    num_unique_bigrams = len(set(all_bigrams))
    print('\nCalculating frequency distribution')
    # Unigrams
    freq_dist = FreqDist(all_words)
    pkl_file_name = 'dataset/' + d + '/' + d + '_features_train_freqdist.pkl'
    with open(pkl_file_name, 'wb') as pkl_file:
        pickle.dump(freq_dist, pkl_file)
    print('Saved uni-frequency distribution to %s' % pkl_file_name)
    # Bigrams
    bigram_freq_dist = get_bigram_freqdist(all_bigrams)
    bi_pkl_file_name = 'dataset/' + d + '/' + d + '_features_train_freqdistbi.pkl'
    with open(bi_pkl_file_name, 'wb') as pkl_file:
        pickle.dump(bigram_freq_dist, pkl_file)
    print('Saved bi-frequency distribution to %s' % bi_pkl_file_name)
    print('\n[Analysis Statistics]')
    print('Tweets => Total: %d, Positive: %d, Negative: %d' % (num_tweets, num_pos_tweets, num_neg_tweets))
    print('User Mentions => Total: %d, Avg: %.4f, Max: %d' % (num_mentions, num_mentions / float(num_tweets), max_mentions))
    print('URLs => Total: %d, Avg: %.4f, Max: %d' % (num_urls, num_urls / float(num_tweets), max_urls))
    print('Emojis => Total: %d, Positive: %d, Negative: %d, Avg: %.4f, Max: %d' % (num_emojis, num_pos_emojis, num_neg_emojis, num_emojis / float(num_tweets), max_emojis))
    print('Words => Total: %d, Unique: %d, Avg: %.4f, Max: %d, Min: %d' % (num_words, num_unique_words, num_words / float(num_tweets), max_words, min_words))
    print('Bigrams => Total: %d, Unique: %d, Avg: %.4f' % (num_bigrams, num_unique_bigrams, num_bigrams / float(num_tweets)))
    print('\n')

word
Processing 99784/99990Processing 50664/99990Saved uni-frequency distribution to dataset/word/word_features_train_freqdist.pkl
Saved bi-frequency distribution to dataset/word/word_features_train_freqdistbi.pkl

[Analysis Statistics]
Tweets => Total: 99990, Positive: 56457, Negative: 43532
User Mentions => Total: 88955, Avg: 0.8896, Max: 12
URLs => Total: 4237, Avg: 0.0424, Max: 4
Emojis => Total: 1184, Positive: 997, Negative: 187, Avg: 0.0118, Max: 5
Words => Total: 1154343, Unique: 48477, Avg: 11.5446, Max: 93, Min: 0
Bigrams => Total: 1054840, Unique: 381798, Avg: 10.5495


swnt
Processing 96715/99990Saved uni-frequency distribution to dataset/swnt/swnt_features_train_freqdist.pkl
Saved bi-frequency distribution to dataset/swnt/swnt_features_train_freqdistbi.pkl

[Analysis Statistics]
Tweets => Total: 99990, Positive: 112914, Negative: 87064
User Mentions => Total: 177910, Avg: 1.7793, Max: 12
URLs => Total: 8474, Avg: 0.0847, Max: 4
Emojis => Total: 2368, Positive: 1994, Negati

### Create features vectors

In [4]:
import numpy as np
import random
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import TfidfTransformer


def split_data(tweets, validation_split=0.1):
    random.seed(1118)
    index = int((1 - validation_split) * len(tweets))
    random.shuffle(tweets)
    return tweets[:index], tweets[index:]

def top_n_words(pkl_file_name, N, shift=0):
    with open(pkl_file_name, 'rb') as pkl_file:
        freq_dist = pickle.load(pkl_file)
    most_common = freq_dist.most_common(N)
    words = {p[0]: i + shift for i, p in enumerate(most_common)}
    return words

def top_n_bigrams(pkl_file_name, N, shift=0):
    with open(pkl_file_name, 'rb') as pkl_file:
        freq_dist = pickle.load(pkl_file)
    most_common = freq_dist.most_common(N)
    bigrams = {p[0]: i for i, p in enumerate(most_common)}
    return bigrams

def apply_tf_idf(X):
    transformer = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
    transformer.fit(X)
    return transformer


class FeatureVectorGenerator():
    
    def __init__(self, freq_dist_file, bi_freq_dist_file, train_processed_file, use_bigrams=True):
        self.FREQ_DIST_FILE = freq_dist_file
        self.BI_FREQ_DIST_FILE = bi_freq_dist_file
        self.TRAIN_PROCESSED_FILE = train_processed_file
        self.USE_BIGRAMS = use_bigrams
        self.UNIGRAM_SIZE = 15000
        self.VOCAB_SIZE = self.UNIGRAM_SIZE
        if use_bigrams:
            self.BIGRAM_SIZE = 10000
            self.VOCAB_SIZE = self.UNIGRAM_SIZE + self.BIGRAM_SIZE
    
    def get_feature_vector(self, tweet):
        # get the unigrams and bigrams contained in the vocabolary
        uni_feature_vector = []
        bi_feature_vector = []
        words = tweet.split()
        for i in range(len(words) - 1):
            word = words[i]
            next_word = words[i + 1]
            if self.unigrams.get(word):
                uni_feature_vector.append(word)
            if self.USE_BIGRAMS:
                if self.bigrams.get((word, next_word)):
                    bi_feature_vector.append((word, next_word))
        if len(words) >= 1:
            if self.unigrams.get(words[-1]):
                uni_feature_vector.append(words[-1])
        return uni_feature_vector, bi_feature_vector\
    
    def extract_features(self, tweets, test_file=True, feat_type='presence'):
        # feat_type can be 'presence' for bag of words or frequency for tf-idf
        features = lil_matrix((len(tweets), self.VOCAB_SIZE))
        labels = np.zeros(len(tweets))
        for j, tweet in enumerate(tweets):
            sys.stdout.write('\r')
            sys.stdout.write('Processing %d/%d' % (j+1, len(tweets)))
            sys.stdout.flush()
            if test_file:
                tweet_words = tweet[1][0]
                tweet_bigrams = tweet[1][1]
            else:
                tweet_words = tweet[2][0]
                tweet_bigrams = tweet[2][1]
                labels[j] = tweet[1]
            if feat_type == 'presence':
                tweet_words = set(tweet_words)
                tweet_bigrams = set(tweet_bigrams)
            for word in tweet_words:
                idx = self.unigrams.get(word)
                if idx:
                    features[j, idx] += 1
            if self.USE_BIGRAMS:
                for bigram in tweet_bigrams:
                    idx = self.bigrams.get(bigram)
                    if idx:
                        features[j, self.UNIGRAM_SIZE + idx] += 1
        if feat_type == 'frequency':
            if not test_file:
                self.tfidf = apply_tf_idf(features)
            features = self.tfidf.transform(features)
        return features, labels
    
    def extract_features_single(self, tweet, feat_type='presence'):
        features = lil_matrix((1, self.VOCAB_SIZE))
        tweet_words = tweet[0]
        tweet_bigrams = tweet[1]
        if feat_type == 'presence':
            tweet_words = set(tweet_words)
            tweet_bigrams = set(tweet_bigrams)
        for word in tweet_words:
            idx = self.unigrams.get(word)
            if idx:
                features[0, idx] += 1
        if self.USE_BIGRAMS:
            for bigram in tweet_bigrams:
                idx = self.bigrams.get(bigram)
                if idx:
                    features[0, self.UNIGRAM_SIZE + idx] += 1
        if feat_type == 'frequency':
            features = self.tfidf.transform(features)
        return features

    def process_tweets(self, csv_file, test_file=True):
        tweets = []
        print('Generating feature vectors')
        with open(csv_file, 'r') as csv:
            lines = csv.readlines()
            total = len(lines)
            for i, line in enumerate(lines):
                if i == 0:
                    # ignore header
                    continue
                if test_file:
                    tweet_id, tweet = line.split(',')
                else:
                    tweet_id, sentiment, tweet = line.split(',')
                feature_vector = self.get_feature_vector(tweet)
                if test_file:
                    tweets.append((tweet_id, feature_vector))
                else:
                    tweets.append((tweet_id, int(sentiment), feature_vector))
                sys.stdout.write('\r')
                sys.stdout.write('Processing %d/%d' % (i+1, total))
                sys.stdout.flush()
        print('\n')
        return tweets
    
    def fit(self):
        self.unigrams = top_n_words(self.FREQ_DIST_FILE, self.UNIGRAM_SIZE)
        if self.USE_BIGRAMS:
            self.bigrams = top_n_bigrams(self.BI_FREQ_DIST_FILE, self.BIGRAM_SIZE)

### Word model

In [104]:
FREQ_DIST_FILE = 'dataset/word/word_features_train_freqdist.pkl'
BI_FREQ_DIST_FILE = 'dataset/word/word_features_train_freqdistbi.pkl'
TRAIN_PROCESSED_FILE = 'dataset/word/word_features_train.csv'
FEAT_TYPE = 'frequency'
#TEST_PROCESSED_FILE = 'dataset/test-processed.csv'

word_feature_generator = FeatureVectorGenerator(FREQ_DIST_FILE, BI_FREQ_DIST_FILE, TRAIN_PROCESSED_FILE)
word_feature_generator.fit()
tweets = word_feature_generator.process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
train_tweets, val_tweets = split_data(tweets)

print('Extracting features & training batches')
word_training_set_X, word_training_set_y = word_feature_generator.extract_features(train_tweets, test_file=False, feat_type=FEAT_TYPE)
word_validation_set_X, word_validation_set_y = word_feature_generator.extract_features(val_tweets, test_file=False, feat_type=FEAT_TYPE)

Generating feature vectors
Processing 99990/99990Processing 29526/99990Processing 33925/99990Processing 49918/99990Processing 90305/99990

Extracting features & training batches
Processing 9999/999990Processing 89068/89990

In [105]:
print(train_tweets[0])
print(word_training_set_X[0])

('84378', 0, (['wish', 'were', 'at', 'the', 'beach'], [('i', 'wish'), ('wish', 'i'), ('i', 'were'), ('were', 'at'), ('at', 'the'), ('the', 'beach')]))
  (0, 20860)	0.45624438783837346
  (0, 16267)	0.3824592958334005
  (0, 15898)	0.37465163017086883
  (0, 15040)	0.26960692354169286
  (0, 15037)	0.2702038648270284
  (0, 15034)	0.26684255364065795
  (0, 658)	0.34700312000734956
  (0, 120)	0.2470853353381329
  (0, 112)	0.24243535882803996
  (0, 29)	0.18780180279567585
  (0, 1)	0.1131034164249352


### SWNt model

In [107]:
FREQ_DIST_FILE = 'dataset/swnt/swnt_features_train_freqdist.pkl'
BI_FREQ_DIST_FILE = 'dataset/swnt/swnt_features_train_freqdistbi.pkl'
TRAIN_PROCESSED_FILE = 'dataset/swnt/swnt_features_train.csv'
FEAT_TYPE = 'frequency'
#TEST_PROCESSED_FILE = 'dataset/test-processed.csv'

swnt_feature_generator = FeatureVectorGenerator(FREQ_DIST_FILE, BI_FREQ_DIST_FILE, TRAIN_PROCESSED_FILE)
swnt_feature_generator.fit()
tweets = swnt_feature_generator.process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
train_tweets, val_tweets = split_data(tweets)

print('Extracting features & training batches')
swnt_training_set_X, swnt_training_set_y = swnt_feature_generator.extract_features(train_tweets, test_file=False, feat_type=FEAT_TYPE)
swnt_validation_set_X, swnt_validation_set_y = swnt_feature_generator.extract_features(val_tweets, test_file=False, feat_type=FEAT_TYPE)

Generating feature vectors
Processing 99862/99990Processing 40992/99990Processing 53382/99990

Extracting features & training batches
Processing 9999/999990Processing 34767/89990Processing 41519/89990

In [108]:
print(train_tweets[0])
print(swnt_training_set_X[0])

('84378', 0, (['POS-5', 'POS-2', 'at', 'the', 'beach'], [('i', 'POS-5'), ('POS-5', 'i'), ('i', 'POS-2'), ('POS-2', 'at'), ('at', 'the'), ('the', 'beach')]))
  (0, 16851)	0.43853622575927786
  (0, 16017)	0.4441484939367343
  (0, 15057)	0.2937529457503533
  (0, 15052)	0.2909081151635931
  (0, 15033)	0.31634112539074666
  (0, 15017)	0.2554330607283537
  (0, 611)	0.4113712599417268
  (0, 32)	0.2226385291110598
  (0, 15)	0.14748695554162722
  (0, 6)	0.12512761180078183
  (0, 1)	0.13408379416719343


### POS+POS_word model

In [109]:
FREQ_DIST_FILE = 'dataset/pos/pos_features_train_freqdist.pkl'
BI_FREQ_DIST_FILE = 'dataset/pos/pos_features_train_freqdistbi.pkl'
TRAIN_PROCESSED_FILE = 'dataset/pos/pos_features_train.csv'
FEAT_TYPE = 'frequency'
#TEST_PROCESSED_FILE = 'dataset/test-processed.csv'

pos_feature_generator = FeatureVectorGenerator(FREQ_DIST_FILE, BI_FREQ_DIST_FILE, TRAIN_PROCESSED_FILE)
pos_feature_generator.fit()
tweets = pos_feature_generator.process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
train_tweets, val_tweets = split_data(tweets)

print('Extracting features & training batches')
pos_training_set_X, pos_training_set_y = pos_feature_generator.extract_features(train_tweets, test_file=False, feat_type=FEAT_TYPE)
pos_validation_set_X, pos_validation_set_y = pos_feature_generator.extract_features(val_tweets, test_file=False, feat_type=FEAT_TYPE)

Generating feature vectors
Processing 9164/999990Processing 74291/99990

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [110]:
print(train_tweets[0])
print(pos_training_set_X[0])

('84378', 0, (['JJ', 'VBP', 'NNS', 'VBD', 'IN', 'DT', 'JJ_i', 'VBP_wish', 'NNS_i', 'VBD_were', 'IN_at', 'DT_the', 'NN_beach'], [('JJ', 'VBP'), ('VBP', 'NNS'), ('NNS', 'VBD'), ('VBD', 'IN'), ('IN', 'DT'), ('DT', 'NN'), ('NN', 'JJ_i'), ('JJ_i', 'VBP_wish'), ('IN_at', 'DT_the'), ('DT_the', 'NN_beach')]))
  (0, 19149)	0.3578309102866506
  (0, 18666)	0.3556783871388087
  (0, 17317)	0.33213732860379186
  (0, 15361)	0.25332901676051495
  (0, 15206)	0.22984293470666342
  (0, 15142)	0.21189358496122349
  (0, 15105)	0.197060377692977
  (0, 15043)	0.1694604797492354
  (0, 15004)	0.11416514609807148
  (0, 15002)	0.09369969703889273
  (0, 1800)	0.3315213711236042
  (0, 792)	0.2805652080828856
  (0, 346)	0.23457235063580986
  (0, 147)	0.18612134162457872
  (0, 121)	0.17829107613828402
  (0, 77)	0.15669087500633502
  (0, 22)	0.10737559240183435
  (0, 17)	0.0988938248318331
  (0, 14)	0.08787082538150585
  (0, 7)	0.07689983442482862
  (0, 6)	0.07402091733507168
  (0, 2)	0.06650848940908176
  (0, 1)	0.0

### preprocess on test tweets

In [43]:
sentence = 'Hi, this is an awesome sample tweet to try how testing wooooorks! :) car'
prep = preprocess_tweet(sentence)
sentence_prep = word_feature_generator.get_feature_vector(prep)
print(sentence_prep)
print(word_feature_generator.extract_features_single(sentence_prep, feat_type='frequency'))

(['hi', 'this', 'is', 'an', 'awesome', 'sample', 'tweet', 'to', 'try', 'how', 'testing', 'EMO_POS', 'car'], [('this', 'is'), ('is', 'an'), ('an', 'awesome'), ('tweet', 'to'), ('to', 'try')])
  (0, 24259)	0.3480045586192851
  (0, 16125)	0.2973134832617887
  (0, 15769)	0.28248729415910595
  (0, 15723)	0.27956046199395196
  (0, 15095)	0.22615956330715867
  (0, 5461)	0.32265902094053694
  (0, 2179)	0.31449958028172165
  (0, 411)	0.24231556737767498
  (0, 204)	0.2185262595172075
  (0, 201)	0.21047296082411993
  (0, 195)	0.20569671871002487
  (0, 157)	0.21090063682832816
  (0, 131)	0.19453016949482776
  (0, 92)	0.17817831127895933
  (0, 54)	0.166919353923811
  (0, 34)	0.14907699015634612
  (0, 9)	0.11581074529912529
  (0, 2)	0.08792113847315476


## Machine Learning Algorithms

Now that I have the pipeline from tweet to features vector, train machine learning algorithms and see what happen. All the examples with "word" dataset

In [44]:
def calculate_accuracy(val_set_labels, predictions):
    print('Calculate accuracy')
    acc = np.sum(predictions == val_set_labels) / len(val_set_labels) 
    return acc

### SVM

In [45]:
from sklearn import svm

clf_svm = svm.LinearSVC(C=0.1, max_iter=1000)
clf_svm.fit(word_training_set_X, word_training_set_y)
svm_prediction = clf_svm.predict(word_validation_set_X)
svm_acc = calculate_accuracy(word_validation_set_y, svm_prediction)
print('Accuracy: ' + str(svm_acc))

Calculate accuracy
Accuracy: 0.7753775377537754


### Logistic regression

In [46]:
from sklearn.linear_model import LogisticRegression

clf_logreg = LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000)
clf_logreg.fit(word_training_set_X, word_training_set_y)
logreg_prediction = clf_logreg.predict(word_validation_set_X)
logreg_acc = calculate_accuracy(word_validation_set_y, logreg_prediction)
print('Accuracy: ' + str(logreg_acc))

Calculate accuracy
Accuracy: 0.7746774677467747


### Naive Bayes

In [47]:
from sklearn.naive_bayes import MultinomialNB

clf_nabay = MultinomialNB()
clf_nabay.partial_fit(word_training_set_X, word_training_set_y, classes=[0, 1])
nabay_prediction = clf_nabay.predict(word_validation_set_X)
nabay_acc = calculate_accuracy(word_validation_set_y, nabay_prediction)
print('Accuracy: ' + str(nabay_acc))

Calculate accuracy
Accuracy: 0.7718771877187719


### Neural Network

In [142]:
from keras.models import Sequential, load_model
from keras.layers import Dense

model = Sequential()
model.add(Dense(500, input_dim=VOCAB_SIZE, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

ModuleNotFoundError: No module named 'keras'

### Random Forest

In [48]:
from sklearn.ensemble import RandomForestClassifier

clf_rndfrs = RandomForestClassifier(n_estimators=100, n_jobs=2, random_state=0)
clf_rndfrs.fit(word_training_set_X, word_training_set_y)
rndfrs_prediction = clf_rndfrs.predict(word_validation_set_X)
rndfrs_acc = calculate_accuracy(word_validation_set_y, rndfrs_prediction)
print('Accuracy: ' + str(rndfrs_acc))

Calculate accuracy
Accuracy: 0.753975397539754


### Ensemble

In [167]:
class Ensemble:
    """
                 | word | swnt | pos
    ___________________________________
    SVM          |      |      |
    ___________________________________
    logistic_reg |      |      |
    ___________________________________
    naive_bayes  |      |      |
    ___________________________________
    rnd_forest   |      |      |
    """
    
    def __init__(self):
        self.matrix = []
    
    def fit(self, word_features, word_labels, swnt_features, swnt_labels, pos_features, pos_labels):
        # SVM
        clf_svm1 = svm.LinearSVC(C=0.1, max_iter=1000)
        sys.stdout.write('\r')
        sys.stdout.write('Fitting 1/12')
        sys.stdout.flush()
        clf_svm1.fit(word_features, word_labels)
        clf_svm2 = svm.LinearSVC(C=0.1, max_iter=1000)
        sys.stdout.write('\r')
        sys.stdout.write('Fitting 2/12')
        sys.stdout.flush()
        clf_svm2.fit(swnt_features, swnt_labels)
        clf_svm3 = svm.LinearSVC(C=0.1, max_iter=1000)
        sys.stdout.write('\r')
        sys.stdout.write('Fitting 3/12')
        sys.stdout.flush()
        clf_svm3.fit(pos_features, pos_labels)
        self.matrix.append((clf_svm1, clf_svm2, clf_svm3))
        # Logistic regression
        clf_logreg1 = LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000)
        sys.stdout.write('\r')
        sys.stdout.write('Fitting 4/12')
        sys.stdout.flush()
        clf_logreg1.fit(word_features, word_labels)
        clf_logreg2 = LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000)
        sys.stdout.write('\r')
        sys.stdout.write('Fitting 5/12')
        sys.stdout.flush()
        clf_logreg2.fit(swnt_features, swnt_labels)
        clf_logreg3 = LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000)
        sys.stdout.write('\r')
        sys.stdout.write('Fitting 6/12')
        sys.stdout.flush()
        clf_logreg3.fit(pos_features, pos_labels)
        self.matrix.append((clf_logreg1, clf_logreg2, clf_logreg3))
        # Naive Bayes
        clf_nabay1 = MultinomialNB()
        sys.stdout.write('\r')
        sys.stdout.write('Fitting 7/12')
        sys.stdout.flush()
        clf_nabay1.partial_fit(word_features, word_labels, classes=[0, 1])
        clf_nabay2 = MultinomialNB()
        sys.stdout.write('\r')
        sys.stdout.write('Fitting 8/12')
        sys.stdout.flush()
        clf_nabay2.partial_fit(swnt_features, swnt_labels, classes=[0, 1])
        clf_nabay3 = MultinomialNB()
        sys.stdout.write('\r')
        sys.stdout.write('Fitting 9/12')
        sys.stdout.flush()
        clf_nabay3.partial_fit(pos_features, pos_labels, classes=[0, 1])
        self.matrix.append((clf_nabay1, clf_nabay2, clf_nabay3))
        # Random Forest
        clf_rndfrs1 = RandomForestClassifier(n_estimators=100, n_jobs=2, random_state=0)
        sys.stdout.write('\r')
        sys.stdout.write('Fitting 10/12')
        sys.stdout.flush()
        clf_rndfrs1.fit(word_features, word_labels)
        clf_rndfrs2 = RandomForestClassifier(n_estimators=100, n_jobs=2, random_state=0)
        sys.stdout.write('\r')
        sys.stdout.write('Fitting 11/12')
        sys.stdout.flush()
        clf_rndfrs2.fit(swnt_features, swnt_labels)
        clf_rndfrs3 = RandomForestClassifier(n_estimators=100, n_jobs=2, random_state=0)
        sys.stdout.write('\r')
        sys.stdout.write('Fitting 12/12')
        sys.stdout.flush()
        clf_rndfrs3.fit(pos_features, pos_labels)
        self.matrix.append((clf_rndfrs1, clf_rndfrs2, clf_rndfrs3))
        return
    
    # pass one vector a time, this because word, swnt, pos are not ordered in the same way
    def predict(self, word_features, swnt_features, pos_features):
        features = (word_features, swnt_features, pos_features)
        predictions = []
        for i in range(len(m)):
            for j in range(len(m[0])):
                pred = m[i][j].predict(features[j])
                predictions.append(pred[0])
        return predictions
    
    def save_matrix(self, path):
        with open(path, 'wb') as pkl_file:
            pickle.dump(self.matrix, pkl_file)
            
    def load(self, path):
        with open(path, 'rb') as matrix_file:
            self.matrix = pickle.load(matrix_file)
            
#     def evaluate(self, word_features, swnt_features, pos_features, labels):
        

In [168]:
a = []
a.append((clf_svm, clf_logreg))
a.append((clf_nabay, clf_rndfrs))
a[1][0].predict(word_validation_set_X)

array([1, 1, 1, ..., 0, 1, 1])

In [169]:
e = Ensemble()

In [170]:
e.fit(word_training_set_X, word_training_set_y, swnt_training_set_X, swnt_training_set_y, pos_training_set_X, pos_training_set_y)

Fitting 12/12

In [171]:
e.save_matrix('dataset/matrix.pkl')

In [132]:
e.load('dataset/matrix.pkl')

In [175]:
m = e.matrix
print(m[0][0])
print(len(m[0]))
print(len(m))

LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
3
4


In [176]:
# check if correctly shuffled
for i in range(1000):
    if word_validation_set_y[i] != swnt_validation_set_y[i] or swnt_validation_set_y[i] != pos_validation_set_y[i]:
        print('ahia')

### Verify performance of ensembled model

In [177]:
correct = 0
total = word_validation_set_X.shape[0]
final_predictions = []
for i in range(total):
    sys.stdout.write('\r')
    sys.stdout.write('Predicting %d/%d - Correct %d/%d - Accuracy %.3f' % (i+1, total, correct, total, correct/(i+1)))
    sys.stdout.flush()
    p = e.predict(word_validation_set_X[i], swnt_validation_set_X[i], pos_validation_set_X[i])
    counts = np.bincount(p)
    final_pred = np.argmax(counts)
    final_predictions.append(final_pred)
    if final_pred == word_validation_set_y[i]:
        correct += 1
print('Correct: ' + str(correct) + ' / ' + str(total))

Predicting 9999/9999 - Correct 7751/9999 - Accuracy 0.775Correct: 7752 / 9999


In [178]:
print(word_validation_set_X.shape)

(9999, 25000)
