In [1]:
import tarfile
import pyprind
import os
import multiprocessing
import requests
import pprint

import gensim as gs
import numpy as np
import pandas as pd
import sklearn as sk

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import to_unicode
from gensim.utils import tokenize
from collections import namedtuple
from collections import OrderedDict
from random import shuffle

In [2]:
cores = multiprocessing.cpu_count()
dirname = 'aclImdb'
filename = 'aclImdb_v1.tar.gz'

In [3]:
def download_extract_data(dirname, filename):
    if not os.path.isfile('aclImdb/alldata-id.txt'):
        if not os.path.isdir(dirname):
            if not os.path.isfile(filename):
                print("Downloading IMDB archive...")
                url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename
                r = requests.get(url)
                with open(filename, 'wb') as f:
                    f.write(r.content)
            tar = tarfile.open(filename, mode='r')
            tar.extractall()
            tar.close()
            print("Data extracted...")
            
def normalize(text):
    punctuations = ['<br /><br />']
    
    for punctuation in punctuations:
        text = text.replace(punctuation, "")
    return text

In [4]:
# download_extract_data(dirname, filename)

In [5]:
dirname = 'aclImdb'
labels = {'pos' : 1, 'neg' : 0, 'unsup' : -1}
pbar = pyprind.ProgBar(100000)
df = pd.DataFrame()

for s in ('test', 'train'):       
    for l in ('pos', 'neg', 'unsup'):
        path = os.path.join(dirname, s, l)
      
        if s=='test' and l=='unsup':
            break
        
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding = 'utf-8') as infile:
                txt = to_unicode(infile.read())
                txt = normalize(txt)
                txt = list(tokenize(txt))
                df = df.append([[txt, labels[l], l]], ignore_index=True)
            pbar.update() 

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:10:38


In [6]:
df.columns=['token', 'label', 'tags']
df['tags'] = df.index
df.head()

Unnamed: 0,token,label,tags
0,"[I, ve, seen, the, movie, Lost, Horizons, and,...",1,0
1,"[This, short, deals, with, a, severely, critic...",1,1
2,"[this, is, one, of, the, finest, movies, i, ha...",1,2
3,"[Are, You, in, the, House, Alone, belongs, to,...",1,3
4,"[I, saw, this, movie, in, Santa, Monica, on, A...",1,4


In [7]:
sentences = list(zip(df['token'].values, df['tags'].values))
taggeddoc = []
for i,item in enumerate(sentences):
    td = TaggedDocument(words = item[0], tags = [str(item[1])])
    taggeddoc.append(td)

In [8]:
model_dm = Doc2Vec(size = 400, window = 10, min_count = 1, workers = 6, 
                alpha = 0.025, min_alpha = 0.0050, sample = 1e-4, negative = 5, dm=1)
model_dbow = Doc2Vec(size = 400, window = 10, min_count = 1, workers = 6, 
                alpha = 0.025, min_alpha = 0.0050, sample = 1e-4, negative = 5, dm=0)

model_dm.build_vocab(taggeddoc)
model_dbow.build_vocab(taggeddoc)

print('vocabulary built')


vocabulary built


In [9]:
for epoch in range(20):
    print("epoch: ", epoch)
    model_dm.train(taggeddoc, total_examples=model_dm.corpus_count, epochs = model_dm.iter)    
    model_dbow.train(taggeddoc, total_examples=model_dbow.corpus_count, epochs = model_dbow.iter)    
    
    if (epoch + 1) % 10 == 0:
        model_dm.save('./imdb_dm.d2v')
        model_dbow.save('./imdb_dbow.d2v')        

print('models trained')

epoch:  0
epoch:  1
epoch:  2
epoch:  3
epoch:  4
epoch:  5
epoch:  6
epoch:  7
epoch:  8
epoch:  9
epoch:  10
epoch:  11
epoch:  12
epoch:  13
epoch:  14
epoch:  15
epoch:  16
epoch:  17
epoch:  18
epoch:  19
models trained


In [10]:
for word in ['brilliant', 'clever', 'stupid']:
    print('DM')
    pprint.pprint(model_dm.wv.most_similar(word))
    print('DBOW')
    pprint.pprint(model_dbow.wv.most_similar(word))
    print('*'*75)

DM
[('wonderful', 0.5328915119171143),
 ('superb', 0.5063442587852478),
 ('terrible', 0.4888131022453308),
 ('fantastic', 0.48425111174583435),
 ('terrific', 0.47444143891334534),
 ('great', 0.47082194685935974),
 ('excellent', 0.45563891530036926),
 ('horrible', 0.43797510862350464),
 ('amazing', 0.4295591115951538),
 ('fine', 0.41918909549713135)]
DBOW
[('orinirique', 0.2208365797996521),
 ('Straights', 0.22082586586475372),
 ('uomo', 0.21756674349308014),
 ('Melbournehes', 0.2153024524450302),
 ('Repentful', 0.2029258757829666),
 ('kidnappable', 0.20245155692100525),
 ('repulse', 0.20046527683734894),
 ('Acolytes', 0.2001710683107376),
 ('trope', 0.19949795305728912),
 ('communities', 0.1984124779701233)]
***************************************************************************
DM
[('funny', 0.3479025363922119),
 ('witty', 0.3396616280078888),
 ('intelligent', 0.339173287153244),
 ('weak', 0.3217778205871582),
 ('bad', 0.31999388337135315),
 ('stupid', 0.31319567561149597),
 ('sma

In [11]:
df = df[df['label']!=-1]
tags = df['tags'].values

In [12]:
X_raw = np.zeros((50000, 800))

for i in tags:
    X_raw[i] = np.hstack([model_dm.docvecs[str(i)], model_dbow.docvecs[str(i)]])

y_raw = df['label'].values
target_names = ['positive', 'negative']
X_raw_train, X_test, y_raw_train, y_test = train_test_split(X_raw, y_raw, test_size = 0.5)

In [13]:
def train(X_raw_train, y_raw_train, n_splits, clf):

    kfold = sk.model_selection.StratifiedKFold(n_splits=n_splits, random_state=101).split(X= X_raw_train, y=y_raw_train)
    for i, (train_idx, valid_idx) in enumerate(kfold):
        X_train, y_train = X_raw_train[train_idx, :], y_raw_train[train_idx]    
        X_valid, y_valid = X_raw_train[valid_idx, :], y_raw_train[valid_idx]    

        clf.fit(X_train, y_train)    
        score_train = clf.score(X_train, y_train)
        score_valid = clf.score(X_valid, y_valid)
        print('Validation- Train: {:.4f} Test: {:.4f}'.format(score_train, score_valid)) 

    clf.fit(X_raw_train, y_raw_train)
    score_train = clf.score(X_raw_train, y_raw_train)
    score_test = clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    
    print()
    print('Final- Train: {:.4f} Test: {:.4f}'.format(score_train, score_test)) 
    print()
    print(classification_report(y_test, y_pred, target_names=target_names))


In [14]:
lr_clf = LogisticRegression(C=1.0, fit_intercept=True, intercept_scaling=1, 
                                 penalty='l2', random_state=101, tol=1e-5)
train(X_raw_train, y_raw_train, 5, lr_clf)

Validation- Train: 0.9245 Test: 0.9050
Validation- Train: 0.9247 Test: 0.9092
Validation- Train: 0.9251 Test: 0.9024
Validation- Train: 0.9247 Test: 0.9072
Validation- Train: 0.9249 Test: 0.9080

Final- Train: 0.9226 Test: 0.9082

             precision    recall  f1-score   support

   positive       0.91      0.91      0.91     12484
   negative       0.91      0.91      0.91     12516

avg / total       0.91      0.91      0.91     25000



In [15]:
svm_clf = SVC(C=1.0, kernel='rbf')
train(X_raw_train, y_raw_train, 5, svm_clf)

Validation- Train: 0.9695 Test: 0.8644
Validation- Train: 0.9690 Test: 0.8732
Validation- Train: 0.9703 Test: 0.8698
Validation- Train: 0.9695 Test: 0.8684
Validation- Train: 0.9699 Test: 0.8734

Final- Train: 0.9714 Test: 0.8802

             precision    recall  f1-score   support

   positive       0.88      0.88      0.88     12484
   negative       0.88      0.88      0.88     12516

avg / total       0.88      0.88      0.88     25000

