In [3]:
%load_ext autoreload
%autoreload 2 
%matplotlib inline

In [4]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

### 1. Loading the tweets

Read the tweets from the files into pandas dataframes

In [28]:
train_pos = pd.read_table("../twitter-datasets/train_pos_full.txt", names=["tweets"])

In [5]:
train_neg = pd.read_table("../twitter-datasets/train_neg_full.txt", names=["tweets"])

In [23]:
unsup_data = pd.read_table("../twitter-datasets/test_data.txt", names=["tweets"])

Check if there are any null values (just in case)

In [7]:
train_pos['tweets'].isnull().sum()

0

In [8]:
train_neg['tweets'].isnull().sum()

0

See how many tweets we are dealing with

In [9]:
len(train_pos)

1218655

In [10]:
len(train_neg)

1239642

In [11]:
train_pos.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218655 entries, 0 to 1218654
Data columns (total 1 columns):
tweets    1218655 non-null object
dtypes: object(1)
memory usage: 147.1 MB


In [12]:
train_neg.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1239642 entries, 0 to 1239641
Data columns (total 1 columns):
tweets    1239642 non-null object
dtypes: object(1)
memory usage: 167.6 MB


### 2. Construct train and test datasets

In [13]:
# Step 1. shuffle imported tweets
# A dedicated, seeded generator makes the row order (and therefore the
# train/test split derived from it) identical on every fresh kernel run.
shuffle_rng = np.random.RandomState(42)
train_pos = train_pos.reindex(shuffle_rng.permutation(train_pos.index))
train_pos = train_pos.reset_index(drop=True)
train_neg = train_neg.reindex(shuffle_rng.permutation(train_neg.index))
train_neg = train_neg.reset_index(drop=True)

In [33]:
# Step 2. Pick ratio to be train, respectively test
ratio = 0.8
# A row goes to the train part when its index is at or below the cutoff,
# otherwise it goes to the test part.
pos_in_train = train_pos.index <= int(ratio * len(train_pos))
neg_in_train = train_neg.index <= int(ratio * len(train_neg))
pos_train_part = train_pos[pos_in_train]
pos_test_part = train_pos[~pos_in_train]
size_x_pos_train = len(pos_train_part)
size_x_pos_test = len(pos_test_part)
# Positive tweets are stacked first and negative tweets second; the index is
# then renumbered from zero.
x_train = pd.concat([pos_train_part, train_neg[neg_in_train]], axis=0).reset_index(drop=True)
x_test = pd.concat([pos_test_part, train_neg[~neg_in_train]], axis=0).reset_index(drop=True)
# check if lengths are as expected
len(x_train) + len(x_test) == len(train_pos) + len(train_neg)

True

In [30]:
ratio = 0.8
# Count how many positive tweets fall at or below the cutoff index (train)
# and how many fall above it (test).
pos_train_mask = train_pos.index <= int(ratio * len(train_pos))
size_x_pos_train = len(train_pos[pos_train_mask])
size_x_pos_test = len(train_pos[~pos_train_mask])

In [31]:
# Step 3. Create the labels
# The first size_x_pos_* entries get label 1 and all remaining entries get
# label 0, so each label array is as long as its data frame.
n_neg_train = len(x_train) - size_x_pos_train
n_neg_test = len(x_test) - size_x_pos_test
y_train = np.concatenate([np.ones(size_x_pos_train), np.zeros(n_neg_train)])
y_test = np.concatenate([np.ones(size_x_pos_test), np.zeros(n_neg_test)])
# check if lengths are as expected
(len(y_train), len(y_test)) == (len(x_train), len(x_test))

True

### 3. Preprocessing functionality

In [16]:
import nltk
from nltk.corpus import stopwords # Import the stop word list
len(stopwords.words("english")) 

153

In [31]:
import re
from bs4 import BeautifulSoup

# Compiled once: matches every character that is not an ASCII letter.
_NON_LETTERS = re.compile("[^a-zA-Z]")
# Holds the English stop-word set after the first call so it is built once.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a set, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        # Searching a set is much faster than searching a list.
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocess_tweet(tweet_body):
    """Convert a raw tweet into a cleaned, space-separated string of words.

    Processing steps:
      1. Strip HTML tags or markup with BeautifulSoup (just in case).
      2. Replace every non-letter character with a space.
      3. Lower-case the text and split it into words.
      4. Drop English stop words and words of length 1.
      5. Join the remaining words with single spaces.

    :param tweet_body: a single string (a raw tweet)
    :return: a single string (the preprocessed tweet); empty when no word
             survives the filtering
    """
    # 1. Remove the HTML tags or markup (just in case).
    review_text = BeautifulSoup(tweet_body, "lxml").get_text()

    # 2. Remove non-letters
    letters_only = _NON_LETTERS.sub(" ", review_text)

    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 4. The stop-word set is cached: rebuilding it for every tweet would
    #    reload the word list on each call.
    stops = _english_stopwords()

    # 5. Remove stop words and words with length 1
    words = [w for w in words if w not in stops and len(w) > 1]

    ### Try Stemming and Lemmatization later to check if the results improve
    # 7. Stemming
    # porter = nltk.PorterStemmer()
    # words = [porter.stem(t) for t in words]
    #
    # 8.The WordNet lemmatizer only removes affixes if the resulting word is in its dictionary
    # wnl = nltk.WordNetLemmatizer()
    # words = [wnl.lemmatize(t) for t in words]

    # 9. Join the words back into one string separated by space, and return the result.
    return " ".join(words)

In [35]:
x_train['tweets'][0]

"i've been reading one direction fan fics all day . it's <user> fault"

In [36]:
preprocess_tweet(train_pos['tweets'][0])

'reading one direction fan fics day fault'

Note: preprocessing should take roughly 1.5-2 minutes in total for the small positive and negative tweet datasets,
and around 20-25 minutes for the full datasets.

In [37]:
x_train['tweets'] = x_train['tweets'].apply(preprocess_tweet)

In [40]:
x_test['tweets'] = x_test['tweets'].apply(preprocess_tweet)

In [41]:
unsup_data['tweets'] = unsup_data['tweets'].apply(preprocess_tweet)

In [38]:
x_train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1966639 entries, 0 to 1966638
Data columns (total 1 columns):
tweets    object
dtypes: object(1)
memory usage: 191.2 MB


In [42]:
x_test.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491658 entries, 0 to 491657
Data columns (total 1 columns):
tweets    491658 non-null object
dtypes: object(1)
memory usage: 47.8 MB


In [43]:
unsup_data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 1 columns):
tweets    10000 non-null object
dtypes: object(1)
memory usage: 979.5 KB


In [39]:
x_train.to_pickle("./preprocessed_full_train_2.pkl")

In [44]:
x_test.to_pickle("./preprocessed_full_test_2.pkl")

In [33]:
np.save("./train_labels_2", y_train)

In [34]:
np.save("./test_labels_2", y_test)

In [45]:
x_train.head(3)

Unnamed: 0,tweets
0,reading one direction fan fics day fault
1,welcomee thought let know
2,wat said hey jas


In [3]:
x_train = pd.read_pickle("./preprocessed_full_train_2.pkl")

In [4]:
x_train.head(3)

Unnamed: 0,tweets
0,reading one direction fan fics day fault
1,welcomee thought let know
2,wat said hey jas


In [48]:
x_test.head(3)

Unnamed: 0,tweets
0,flowery
1,watching toughlove getting tips help
2,oh yes would come


In [5]:
x_test = pd.read_pickle("./preprocessed_full_test_2.pkl")

In [6]:
x_test.head(3)

Unnamed: 0,tweets
0,flowery
1,watching toughlove getting tips help
2,oh yes would come


### 4. Trying out Doc2Vec

In [10]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

In [11]:
from sklearn.model_selection import train_test_split

#### 4.1 Convert our inputs into Doc2Vec input structure (LabeledSentence)

In [12]:
def labelize_tweets(texts, label_type):
    """Wrap every tweet of a data frame in a LabeledSentence for Doc2Vec.

    Each row's ``tweets`` string is split on whitespace into words and tagged
    with a single label of the form ``<label_type>_<row index>``.

    :param texts: data frame with a ``tweets`` column of strings
    :param label_type: prefix used for the tags (e.g. 'TRAIN')
    :return: list of LabeledSentence objects, one per row, in row order
    """
    return [
        LabeledSentence(record.tweets.split(), ['%s_%s' % (label_type, record.Index)])
        for record in texts.itertuples()
    ]

In [13]:
x_train_lab = labelize_tweets(x_train, 'TRAIN')

In [57]:
x_test_lab = labelize_tweets(x_test, 'TEST')

In [58]:
unsup_data_lab = labelize_tweets(unsup_data, 'UNSUP')

In [59]:
from collections.abc import Container, Mapping
from sys import getsizeof


def deep_getsizeof(o, ids=None):
    """Find the memory footprint of a Python object

    This is a recursive function that drills down a Python object graph
    like a dictionary holding nested dictionaries with lists of lists
    and tuples and sets.

    The sys.getsizeof function does a shallow size of only. It counts each
    object inside a container as pointer only regardless of how big it
    really is.

    :param o: the object
    :param ids: set of ``id()`` values of objects that were already counted;
                it is updated in place so shared objects are counted once.
                When omitted, a fresh set is used.
    :return: the size in bytes of ``o`` plus everything reachable from it
             that was not already recorded in ``ids``
    """
    if ids is None:
        ids = set()

    d = deep_getsizeof
    if id(o) in ids:
        return 0

    r = getsizeof(o)
    ids.add(id(o))

    # Strings and bytes are containers of themselves / of ints; their shallow
    # size is already complete, so do not iterate into them.
    if isinstance(o, (str, bytes)):
        return r

    if isinstance(o, Mapping):
        return r + sum(d(k, ids) + d(v, ids) for k, v in o.items())

    if isinstance(o, Container):
        return r + sum(d(x, ids) for x in o)

    return r

In [60]:
deep_getsizeof(x_train, set()) / (1024**2)

191.22897243499756

In [61]:
x_train_lab[0]

LabeledSentence(words=['reading', 'one', 'direction', 'fan', 'fics', 'day', 'fault'], tags=['TRAIN_0'])

In [62]:
x_test_lab[5]

LabeledSentence(words=['hahahaha', 'yeahhh'], tags=['TEST_5'])

#### 4.2 Build vocab

In [63]:
model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=4)

model.build_vocab(x_train_lab + x_test_lab + unsup_data_lab)

In [64]:
import random
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#### 4.3 Train Doc2Vec 

In [65]:
# Train on every tagged tweet: the labelled train and test tweets plus the
# unlabelled ones, concatenated into one list.
sentences = x_train_lab + x_test_lab + unsup_data_lab

# Ten passes over the corpus, announcing each pass before it starts.
for epoch in range(10):
    print("Epoch " + str(epoch))
    # Reorder the list in place so every pass sees the documents in a new order.
    # NOTE(review): the shuffle is unseeded, so runs are not repeatable.
    random.shuffle(sentences)
    # NOTE(review): only the corpus is passed here; newer gensim releases
    # presumably need explicit total_examples/epochs arguments - confirm
    # against the installed version.
    model.train(sentences)

Epoch 0


2016-12-19 19:03:27,987 : INFO : training model with 4 workers on 430870 vocabulary and 100 features, using sg=0 hs=1 sample=0.0001 negative=5 window=10
2016-12-19 19:03:28,915 : INFO : expecting 2468297 sentences, matching count from corpus used for vocabulary survey
2016-12-19 19:03:54,061 : INFO : PROGRESS: at 0.01% examples, 400 words/s, in_qsize 7, out_qsize 0
2016-12-19 19:04:02,970 : INFO : PROGRESS: at 0.06% examples, 1416 words/s, in_qsize 8, out_qsize 0
2016-12-19 19:04:09,197 : INFO : PROGRESS: at 0.10% examples, 2112 words/s, in_qsize 7, out_qsize 0
2016-12-19 19:04:14,579 : INFO : PROGRESS: at 0.15% examples, 2654 words/s, in_qsize 7, out_qsize 0
2016-12-19 19:04:17,729 : INFO : PROGRESS: at 0.17% examples, 2845 words/s, in_qsize 6, out_qsize 1
2016-12-19 19:04:22,085 : INFO : PROGRESS: at 0.19% examples, 2935 words/s, in_qsize 8, out_qsize 0
2016-12-19 19:04:26,869 : INFO : PROGRESS: at 0.23% examples, 3304 words/s, in_qsize 7, out_qsize 0
2016-12-19 19:04:28,178 : INFO :

Epoch 1


2016-12-19 19:33:43,203 : INFO : training model with 4 workers on 430870 vocabulary and 100 features, using sg=0 hs=1 sample=0.0001 negative=5 window=10
2016-12-19 19:33:43,218 : INFO : expecting 2468297 sentences, matching count from corpus used for vocabulary survey
2016-12-19 19:33:44,891 : INFO : PROGRESS: at 0.06% examples, 28163 words/s, in_qsize 7, out_qsize 0
2016-12-19 19:33:46,339 : INFO : PROGRESS: at 0.14% examples, 37219 words/s, in_qsize 7, out_qsize 0
2016-12-19 19:33:47,815 : INFO : PROGRESS: at 0.23% examples, 40091 words/s, in_qsize 7, out_qsize 0
2016-12-19 19:33:49,266 : INFO : PROGRESS: at 0.32% examples, 41725 words/s, in_qsize 8, out_qsize 0
2016-12-19 19:33:50,780 : INFO : PROGRESS: at 0.41% examples, 42315 words/s, in_qsize 8, out_qsize 0
2016-12-19 19:33:52,230 : INFO : PROGRESS: at 0.50% examples, 42967 words/s, in_qsize 7, out_qsize 0
2016-12-19 19:33:53,760 : INFO : PROGRESS: at 0.58% examples, 43159 words/s, in_qsize 7, out_qsize 0
2016-12-19 19:33:55,278 

Epoch 2


2016-12-19 20:03:01,670 : INFO : training model with 4 workers on 430870 vocabulary and 100 features, using sg=0 hs=1 sample=0.0001 negative=5 window=10
2016-12-19 20:03:01,688 : INFO : expecting 2468297 sentences, matching count from corpus used for vocabulary survey
2016-12-19 20:03:03,338 : INFO : PROGRESS: at 0.06% examples, 28617 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:03:04,900 : INFO : PROGRESS: at 0.15% examples, 36174 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:03:06,373 : INFO : PROGRESS: at 0.23% examples, 39363 words/s, in_qsize 8, out_qsize 0
2016-12-19 20:03:07,888 : INFO : PROGRESS: at 0.32% examples, 40619 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:03:09,343 : INFO : PROGRESS: at 0.41% examples, 41719 words/s, in_qsize 8, out_qsize 0
2016-12-19 20:03:10,850 : INFO : PROGRESS: at 0.50% examples, 42263 words/s, in_qsize 8, out_qsize 0
2016-12-19 20:03:12,353 : INFO : PROGRESS: at 0.59% examples, 42636 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:03:13,354 

Epoch 3


2016-12-19 20:30:51,429 : INFO : training model with 4 workers on 430870 vocabulary and 100 features, using sg=0 hs=1 sample=0.0001 negative=5 window=10
2016-12-19 20:30:51,430 : INFO : expecting 2468297 sentences, matching count from corpus used for vocabulary survey
2016-12-19 20:30:53,039 : INFO : PROGRESS: at 0.06% examples, 27000 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:30:54,532 : INFO : PROGRESS: at 0.14% examples, 35922 words/s, in_qsize 8, out_qsize 0
2016-12-19 20:30:55,983 : INFO : PROGRESS: at 0.23% examples, 39255 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:30:57,000 : INFO : PROGRESS: at 0.31% examples, 42708 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:30:58,304 : INFO : PROGRESS: at 0.37% examples, 40739 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:30:59,352 : INFO : PROGRESS: at 0.43% examples, 41732 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:31:00,631 : INFO : PROGRESS: at 0.50% examples, 41481 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:31:01,721 

Epoch 4


2016-12-19 20:58:49,836 : INFO : training model with 4 workers on 430870 vocabulary and 100 features, using sg=0 hs=1 sample=0.0001 negative=5 window=10
2016-12-19 20:58:49,850 : INFO : expecting 2468297 sentences, matching count from corpus used for vocabulary survey
2016-12-19 20:58:51,556 : INFO : PROGRESS: at 0.06% examples, 27105 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:58:53,124 : INFO : PROGRESS: at 0.14% examples, 35152 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:58:54,637 : INFO : PROGRESS: at 0.23% examples, 38231 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:58:56,184 : INFO : PROGRESS: at 0.32% examples, 39618 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:58:57,776 : INFO : PROGRESS: at 0.41% examples, 40176 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:58:58,795 : INFO : PROGRESS: at 0.49% examples, 42256 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:59:00,149 : INFO : PROGRESS: at 0.54% examples, 40777 words/s, in_qsize 7, out_qsize 0
2016-12-19 20:59:01,200 

Epoch 5


2016-12-19 21:27:22,632 : INFO : training model with 4 workers on 430870 vocabulary and 100 features, using sg=0 hs=1 sample=0.0001 negative=5 window=10
2016-12-19 21:27:22,646 : INFO : expecting 2468297 sentences, matching count from corpus used for vocabulary survey
2016-12-19 21:27:24,743 : INFO : PROGRESS: at 0.06% examples, 26570 words/s, in_qsize 7, out_qsize 0
2016-12-19 21:27:26,402 : INFO : PROGRESS: at 0.15% examples, 33716 words/s, in_qsize 7, out_qsize 0
2016-12-19 21:27:28,050 : INFO : PROGRESS: at 0.23% examples, 36145 words/s, in_qsize 8, out_qsize 0
2016-12-19 21:27:29,542 : INFO : PROGRESS: at 0.32% examples, 38270 words/s, in_qsize 7, out_qsize 0
2016-12-19 21:27:31,196 : INFO : PROGRESS: at 0.41% examples, 38793 words/s, in_qsize 7, out_qsize 0
2016-12-19 21:27:32,759 : INFO : PROGRESS: at 0.50% examples, 39504 words/s, in_qsize 7, out_qsize 0
2016-12-19 21:27:34,437 : INFO : PROGRESS: at 0.59% examples, 39607 words/s, in_qsize 8, out_qsize 0
2016-12-19 21:27:35,943 

Epoch 6


2016-12-19 21:57:31,961 : INFO : training model with 4 workers on 430870 vocabulary and 100 features, using sg=0 hs=1 sample=0.0001 negative=5 window=10
2016-12-19 21:57:31,968 : INFO : expecting 2468297 sentences, matching count from corpus used for vocabulary survey
2016-12-19 21:57:33,194 : INFO : PROGRESS: at 0.01% examples, 8011 words/s, in_qsize 7, out_qsize 0
2016-12-19 21:57:34,195 : INFO : PROGRESS: at 0.05% examples, 20462 words/s, in_qsize 7, out_qsize 0
2016-12-19 21:57:35,241 : INFO : PROGRESS: at 0.10% examples, 24492 words/s, in_qsize 7, out_qsize 0
2016-12-19 21:57:36,280 : INFO : PROGRESS: at 0.14% examples, 26530 words/s, in_qsize 7, out_qsize 0
2016-12-19 21:57:37,286 : INFO : PROGRESS: at 0.19% examples, 27871 words/s, in_qsize 7, out_qsize 0
2016-12-19 21:57:38,316 : INFO : PROGRESS: at 0.23% examples, 28668 words/s, in_qsize 7, out_qsize 0
2016-12-19 21:57:39,333 : INFO : PROGRESS: at 0.27% examples, 29278 words/s, in_qsize 8, out_qsize 0
2016-12-19 21:57:40,338 :

Epoch 7


2016-12-19 22:34:05,753 : INFO : training model with 4 workers on 430870 vocabulary and 100 features, using sg=0 hs=1 sample=0.0001 negative=5 window=10
2016-12-19 22:34:05,776 : INFO : expecting 2468297 sentences, matching count from corpus used for vocabulary survey
2016-12-19 22:34:06,994 : INFO : PROGRESS: at 0.02% examples, 16678 words/s, in_qsize 7, out_qsize 0
2016-12-19 22:34:07,997 : INFO : PROGRESS: at 0.08% examples, 29266 words/s, in_qsize 7, out_qsize 0
2016-12-19 22:34:09,002 : INFO : PROGRESS: at 0.13% examples, 33506 words/s, in_qsize 7, out_qsize 0
2016-12-19 22:34:10,019 : INFO : PROGRESS: at 0.15% examples, 29196 words/s, in_qsize 7, out_qsize 0
2016-12-19 22:34:11,045 : INFO : PROGRESS: at 0.20% examples, 29929 words/s, in_qsize 7, out_qsize 0
2016-12-19 22:34:12,053 : INFO : PROGRESS: at 0.25% examples, 31896 words/s, in_qsize 7, out_qsize 0
2016-12-19 22:34:13,938 : INFO : PROGRESS: at 0.32% examples, 30701 words/s, in_qsize 7, out_qsize 0
2016-12-19 22:34:14,941 

Epoch 8


2016-12-19 23:12:04,477 : INFO : training model with 4 workers on 430870 vocabulary and 100 features, using sg=0 hs=1 sample=0.0001 negative=5 window=10
2016-12-19 23:12:04,504 : INFO : expecting 2468297 sentences, matching count from corpus used for vocabulary survey
2016-12-19 23:12:06,018 : INFO : PROGRESS: at 0.01% examples, 7752 words/s, in_qsize 7, out_qsize 0
2016-12-19 23:12:07,033 : INFO : PROGRESS: at 0.06% examples, 20151 words/s, in_qsize 7, out_qsize 0
2016-12-19 23:12:08,119 : INFO : PROGRESS: at 0.10% examples, 23791 words/s, in_qsize 7, out_qsize 0
2016-12-19 23:12:09,157 : INFO : PROGRESS: at 0.14% examples, 25896 words/s, in_qsize 7, out_qsize 0
2016-12-19 23:12:10,202 : INFO : PROGRESS: at 0.19% examples, 27140 words/s, in_qsize 7, out_qsize 0
2016-12-19 23:12:11,282 : INFO : PROGRESS: at 0.24% examples, 29089 words/s, in_qsize 8, out_qsize 0
2016-12-19 23:12:12,333 : INFO : PROGRESS: at 0.29% examples, 29542 words/s, in_qsize 7, out_qsize 0
2016-12-19 23:12:13,368 :

Epoch 9


2016-12-19 23:48:52,220 : INFO : training model with 4 workers on 430870 vocabulary and 100 features, using sg=0 hs=1 sample=0.0001 negative=5 window=10
2016-12-19 23:48:52,311 : INFO : expecting 2468297 sentences, matching count from corpus used for vocabulary survey
2016-12-19 23:48:53,755 : INFO : PROGRESS: at 0.01% examples, 6956 words/s, in_qsize 7, out_qsize 0
2016-12-19 23:48:54,946 : INFO : PROGRESS: at 0.06% examples, 17622 words/s, in_qsize 7, out_qsize 0
2016-12-19 23:48:56,215 : INFO : PROGRESS: at 0.10% examples, 20669 words/s, in_qsize 7, out_qsize 0
2016-12-19 23:48:57,437 : INFO : PROGRESS: at 0.14% examples, 22322 words/s, in_qsize 7, out_qsize 0
2016-12-19 23:48:58,679 : INFO : PROGRESS: at 0.19% examples, 23281 words/s, in_qsize 7, out_qsize 0
2016-12-19 23:48:59,689 : INFO : PROGRESS: at 0.24% examples, 25917 words/s, in_qsize 7, out_qsize 0
2016-12-19 23:49:00,728 : INFO : PROGRESS: at 0.30% examples, 27782 words/s, in_qsize 7, out_qsize 0
2016-12-19 23:49:01,763 :

In [66]:
model.most_similar('good')

2016-12-20 00:25:17,466 : INFO : precomputing L2-norms of word weight vectors


[('know', 0.9985889792442322),
 ('like', 0.9983980059623718),
 ('got', 0.9982538819313049),
 ('lol', 0.9981919527053833),
 ('go', 0.9981678128242493),
 ('get', 0.9981188178062439),
 ('going', 0.9980956315994263),
 ('see', 0.9979281425476074),
 ('want', 0.9979101419448853),
 ('love', 0.9978447556495667)]

In [68]:
model.docvecs['TRAIN_0']

array([ -2.12079706e-03,  -1.98014960e-01,  -2.48720986e-03,
        -2.71376371e-01,  -9.85759050e-02,  -1.04363382e-01,
        -1.07281543e-01,   3.55507672e-01,  -4.15954292e-02,
        -6.88320324e-02,   2.28305131e-01,   2.55579859e-01,
        -1.30307645e-01,  -1.28493786e-01,   1.65455848e-01,
        -4.50260080e-02,  -5.48016608e-01,  -7.53123909e-02,
         3.15562151e-02,   2.58958787e-01,   2.25779310e-01,
         4.03748068e-04,  -4.30437475e-01,  -4.80997264e-02,
        -1.99790895e-01,   2.06255503e-02,   1.35858670e-01,
         6.15042169e-03,   8.39654803e-02,   8.75710696e-02,
         1.98080033e-01,  -6.24778681e-02,  -3.70153482e-03,
        -4.27143961e-01,   4.36700396e-02,   2.85156835e-02,
        -1.68132454e-01,  -1.76691383e-01,  -2.57204305e-02,
         1.05877124e-01,  -2.63668690e-02,  -1.32778376e-01,
        -3.34169790e-02,  -5.51636703e-02,  -4.83408868e-01,
        -3.84195238e-01,  -2.58479536e-01,   6.71078563e-02,
        -1.74949512e-01,

In [67]:
### Save model ###
model.save('./tweets_model_2.d2v')

2016-12-20 00:25:46,737 : INFO : saving Doc2Vec object under ./tweets_model_2.d2v, separately None
2016-12-20 00:25:46,950 : INFO : storing numpy array 'doctag_syn0' to ./tweets_model_2.d2v.docvecs.doctag_syn0.npy
2016-12-20 00:26:08,911 : INFO : not storing attribute syn0norm
2016-12-20 00:26:08,913 : INFO : storing numpy array 'syn0' to ./tweets_model_2.d2v.syn0.npy
2016-12-20 00:26:10,978 : INFO : storing numpy array 'syn1neg' to ./tweets_model_2.d2v.syn1neg.npy
2016-12-20 00:26:14,858 : INFO : not storing attribute cum_table
2016-12-20 00:26:15,105 : INFO : storing numpy array 'syn1' to ./tweets_model_2.d2v.syn1.npy
2016-12-20 00:33:30,261 : INFO : saved ./tweets_model_2.d2v


In [14]:
### Load model ###
model = Doc2Vec.load('./tweets_model_2.d2v')

In [16]:
model.docvecs['TRAIN_0']

array([ -2.12079706e-03,  -1.98014960e-01,  -2.48720986e-03,
        -2.71376371e-01,  -9.85759050e-02,  -1.04363382e-01,
        -1.07281543e-01,   3.55507672e-01,  -4.15954292e-02,
        -6.88320324e-02,   2.28305131e-01,   2.55579859e-01,
        -1.30307645e-01,  -1.28493786e-01,   1.65455848e-01,
        -4.50260080e-02,  -5.48016608e-01,  -7.53123909e-02,
         3.15562151e-02,   2.58958787e-01,   2.25779310e-01,
         4.03748068e-04,  -4.30437475e-01,  -4.80997264e-02,
        -1.99790895e-01,   2.06255503e-02,   1.35858670e-01,
         6.15042169e-03,   8.39654803e-02,   8.75710696e-02,
         1.98080033e-01,  -6.24778681e-02,  -3.70153482e-03,
        -4.27143961e-01,   4.36700396e-02,   2.85156835e-02,
        -1.68132454e-01,  -1.76691383e-01,  -2.57204305e-02,
         1.05877124e-01,  -2.63668690e-02,  -1.32778376e-01,
        -3.34169790e-02,  -5.51636703e-02,  -4.83408868e-01,
        -3.84195238e-01,  -2.58479536e-01,   6.71078563e-02,
        -1.74949512e-01,

### 5. Tweet Classification

#### 5.1 Training vectors

In [17]:
# One 100-column row per training tweet, filled with the document vector
# stored in the model under the tag 'TRAIN_<row number>'.
n_train_rows = len(x_train)
train_arrays = np.zeros((n_train_rows, 100))
for row in range(n_train_rows):
    train_arrays[row] = model.docvecs['TRAIN_%d' % row]

In [19]:
np.save("./vectors/train_arrays_2", train_arrays)

#### 5.2 Testing vectors

In [20]:
# One 100-column row per test tweet, filled with the document vector
# stored in the model under the tag 'TEST_<row number>'.
n_test_rows = len(x_test)
test_arrays = np.zeros((n_test_rows, 100))
for row in range(n_test_rows):
    test_arrays[row] = model.docvecs['TEST_%d' % row]

In [21]:
np.save("./vectors/test_arrays_2", test_arrays)

#### 5.3 Unsupervised vectors 

In [24]:
# One 100-column row per unlabelled tweet, filled with the document vector
# stored in the model under the tag 'UNSUP_<row number>'.
n_unsup_rows = len(unsup_data)
unsup_arrays = np.zeros((n_unsup_rows, 100))
for row in range(n_unsup_rows):
    unsup_arrays[row] = model.docvecs['UNSUP_%d' % row]

In [25]:
np.save("./vectors/unsup_arrays_2", unsup_arrays)

If the previous results have been saved (or can be made available), they only need to be loaded before running the different classification models.

In [5]:
train_arrays = np.load("./vectors/train_arrays_2.npy")

In [6]:
test_arrays = np.load("./vectors/test_arrays_2.npy")

In [7]:
unsup_arrays = np.load("./vectors/unsup_arrays_2.npy")

In [8]:
y_train = np.load("./train_labels_2.npy")

In [9]:
y_test = np.load("./test_labels_2.npy")

#### 5.4 Classification

In [10]:
# classifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier 

In [12]:
# Random Forest
classifier = RandomForestClassifier(n_estimators = 100, n_jobs = 4, verbose=3)
classifier.fit(train_arrays, y_train)

building tree 1 of 100building tree 2 of 100
building tree 3 of 100
building tree 4 of 100

building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100


[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed: 11.3min


building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70

[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 54.5min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=4, oob_score=False, random_state=None,
            verbose=3, warm_start=False)

In [13]:
print("Random Forest Classifier:", classifier.score(test_arrays, y_test))

[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    2.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   13.8s finished


Random Forest Classifier: 0.687060924464


In [14]:
rfc_predict = classifier.predict(unsup_arrays)
rfc_predict = rfc_predict.astype(int)
rfc_predict

[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


array([0, 0, 1, ..., 0, 1, 0])

In [None]:
# Logistic Regression
classifier2 = LogisticRegression(verbose=3)
classifier2.fit(train_arrays, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [37]:
print("Logistic Regression Classifier:", classifier2.score(test_arrays, y_test))

Logistic Regression Classifier: 0.690364033536


In [38]:
log_reg_predict = classifier2.predict(unsup_arrays)
log_reg_predict = log_reg_predict.astype(int)
log_reg_predict

array([0, 0, 0, ..., 0, 1, 1])

In [None]:
# Support Vector Machines
### IMPORTANT: not practical according to the documentation: 
# "The implementation is based on libsvm. The fit time complexity is more than quadratic 
# with the number of samples which makes it hard to scale to dataset with more than a 
# couple of 10000 samples."

classifier3 = svm.SVC()
classifier3.fit(train_arrays, y_train)
#svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
#    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
#    max_iter=-1, probability=False, random_state=None, shrinking=True,
#    tol=0.001, verbose=False)

In [None]:
print("Support Vector Machines Classifier:", classifier3.score(test_arrays, y_test))

In [29]:
# Naive Bayes
classifier4 = GaussianNB()
classifier4.fit(train_arrays, y_train)
GaussianNB()

GaussianNB(priors=None)

In [30]:
print("Naive Bayes Classifier:", classifier4.score(test_arrays, y_test))

Naive Bayes Classifier: 0.610209935775


### 6. Submission

In [15]:
import csv

In [16]:
def get_predictions(target_pred):
    """Map raw predictions onto the -1 / 1 label convention.

    Entries that are less than or equal to 0 become -1 and entries greater
    than 0 become 1. The result is a new float array with the same shape as
    the input; the input itself is not modified.

    :param target_pred: numpy array of predicted values
    :return: numpy array of -1 / 1 values
    """
    labels = np.zeros(target_pred.shape)
    is_positive = target_pred > 0
    labels[target_pred <= 0] = -1
    labels[is_positive] = 1
    return labels

In [17]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)

    The file starts with an 'Id,Prediction' header row, followed by one row
    per (id, prediction) pair; both values are written as integers. Pairs are
    formed with zip, so output stops at the shorter of the two inputs.
    """
    # newline='' leaves line endings to the csv writer; without it, platforms
    # that translate '\n' end up with an extra blank line after every row.
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id': int(r1), 'Prediction': int(r2)})

In [18]:
# Convert the random-forest predictions to integer -1 / 1 labels, number the
# rows starting at 1, and write the submission file.
pred = get_predictions(rfc_predict).astype(int)
ids = range(1, len(pred) + 1)
create_csv_submission(ids, pred, 'prediction_test_3.csv')