## Import section

In [2]:
import os
import numpy
import datetime
import re
import math
# Logger
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# nltk
import nltk
from nltk.corpus import stopwords
# Scipy
from scipy import sparse
# Pickle
import cPickle as pkl
# BeautifulSoup
from bs4 import BeautifulSoup
# Gensim
import gensim
# To print all outputs, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [73]:
# Locations of the IMDB review dataset: the vocabulary file and the two
# training folders (positive / negative reviews), all under one root.
_IMDB_ROOT = "/home/debojyoti/aclImdb"
dir_vocab = _IMDB_ROOT + "/imdb.vocab"
dir_pos = _IMDB_ROOT + "/train/pos/"
dir_neg = _IMDB_ROOT + "/train/neg/"

In [74]:
# Build the word -> column-index mapping from the vocabulary file.
# The first whitespace-separated field of every line is taken as the word;
# words are numbered in sorted order.
vocab_words = set()
with open(dir_vocab) as vocab_file:  # closed deterministically once read
    for line in vocab_file:
        fields = str(line).split()
        if fields:  # skip blank lines, which carry no word
            vocab_words.add(fields[0])
vocab = dict((word, indx) for indx, word in enumerate(sorted(vocab_words)))
# Persist the mapping so later cells can reload it as "word2indx.bin".
with open("word2indx.bin","wb") as outfile:
    pkl.dump(vocab,outfile,pkl.HIGHEST_PROTOCOL)
n_vocab = len(vocab)

## Computing BoW feature vectors

In [98]:
def _fill_bow_matrix(dirname, fvec):
    """Fill ``fvec`` with binary bag-of-words rows for the reviews in ``dirname``.

    Row ``i`` corresponds to the ``i``-th entry of ``os.listdir(dirname)``.
    Each review is stripped of markup, tokenized with nltk, filtered to drop
    tokens that start with a non-word character, and lower-cased. Every
    resulting token found in the global ``vocab`` sets its column to 1.

    Returns the number of reviews that contained at least one token that
    could not be converted with ``str()``; such tokens are skipped while the
    rest of the review is still used (and still lower-cased).
    """
    exc_count = 0
    files = [os.path.join(dirname, f) for f in os.listdir(dirname)]
    for index, fname in enumerate(files):
        with open(fname) as myfile:
            # Explicit parser, matching the ImdbReview reader in this notebook.
            text = BeautifulSoup(myfile.read(), "lxml").get_text()
        had_bad_token = False
        for token in nltk.word_tokenize(text):
            if re.match(r'\W+', token):
                continue
            try:
                word = str(token).lower()
            except UnicodeError:
                # Just ignore the exceptional token, read the rest
                had_bad_token = True
                continue
            col = vocab.get(word)
            if col is not None:  # out-of-vocabulary words are ignored
                fvec[index, col] = 1
        if had_bad_token:
            exc_count += 1
    return exc_count


fvec_pos = numpy.zeros((len(os.listdir(dir_pos)), n_vocab), dtype=int)
fvec_neg = numpy.zeros((len(os.listdir(dir_neg)), n_vocab), dtype=int)
logging.info("train pos size: %s",str(fvec_pos.shape))
logging.info("train neg size: %s",str(fvec_neg.shape))
datetime.datetime.now()
# BoW for positive reviews
exc_ct_p = _fill_bow_matrix(dir_pos, fvec_pos)
logging.info("Feature vectors for positive reviews constructed. Exception Count: %d",exc_ct_p)
# BoW for negative reviews
exc_ct_n = _fill_bow_matrix(dir_neg, fvec_neg)
logging.info("Feature vectors for negative reviews constructed. Exception Count: %d",exc_ct_n)
datetime.datetime.now()
# Store the matrices in sparse (CSR) form before pickling them.
fvec_pos = sparse.csr_matrix(fvec_pos)
fvec_neg = sparse.csr_matrix(fvec_neg)
with open('pos_BoW_pkl.bin','wb') as outfile:
    pkl.dump(fvec_pos,outfile,pkl.HIGHEST_PROTOCOL)
with open('neg_BoW_pkl.bin','wb') as outfile:
    pkl.dump(fvec_neg,outfile,pkl.HIGHEST_PROTOCOL)

INFO:root:Feature vectors for positive reviews constructed.
INFO:root:Feature vectors for negative reviews constructed.


## Function to compute the number of documents in which each token occurs

In [28]:
def compute_doc_count(dirname,indctr):
    """Count, for every vocabulary word, the number of documents containing it.

    Parameters
    ----------
    dirname : str
        Directory whose files are the documents to scan.
    indctr : int
        1 for positive reviews, -1 for negative reviews. It selects the cache
        file ('pos_doc_count.bin' / 'neg_doc_count.bin'); any other value
        disables caching.

    Returns
    -------
    dict
        Maps each key of the global ``vocab`` to its document count.

    NOTE(review): the cache is selected by ``indctr`` only, so an existing
    cache file is returned even if ``dirname`` differs from the directory it
    was computed from.
    """
    if indctr==1:
        cache_fname = 'pos_doc_count.bin'
    elif indctr==-1:
        cache_fname = 'neg_doc_count.bin'
    else:
        cache_fname = None
    if cache_fname is not None and os.path.isfile(cache_fname):
        with open(cache_fname,'rb') as myfile:
            return pkl.load(myfile)
    doc_count = dict.fromkeys(vocab, 0)
    for f in os.listdir(dirname):
        fname = os.path.join(dirname,f)
        with open(fname) as myfile:
            # Explicit parser, matching the ImdbReview reader in this notebook.
            text = BeautifulSoup(myfile.read(),"lxml").get_text()
        # A set, so each word is counted at most once per document.
        seen = set()
        for token in nltk.word_tokenize(text):
            if re.match(r'\W+',token):
                continue
            try:
                seen.add(str(token).lower())
            except UnicodeError:
                # Skip only the token that cannot be converted; the remaining
                # tokens are still lower-cased so they match the vocabulary.
                continue
        for word in seen:
            if word in doc_count:  # out-of-vocabulary words are ignored
                doc_count[word] += 1
    # save count_dict in a file
    if cache_fname is not None:
        with open(cache_fname,'wb') as outfile:
            pkl.dump(doc_count,outfile,pkl.HIGHEST_PROTOCOL)
    return doc_count

# compute_doc_count()

## Computing tf-idf feature vectors

In [35]:
def _tfidf_matrix(dirname, indctr):
    """Build the tf-idf feature matrix for the reviews in ``dirname``.

    Row ``i`` corresponds to the ``i``-th entry of ``os.listdir(dirname)``.
    For each review, the term frequency of a vocabulary word is its number of
    occurrences divided by the review's token count; it is then multiplied by
    ``log2(n_doc / doc_count[word])``.

    NOTE: document counts come from ``compute_doc_count(dirname, indctr)``,
    i.e. they are taken over this directory only, so the positive and
    negative matrices use idf values from different document sets.

    Returns ``(csr_matrix, exc_count)`` where ``exc_count`` is the number of
    reviews that contained at least one token ``str()`` could not convert.
    """
    files = [os.path.join(dirname, f) for f in os.listdir(dirname)]
    # Get dictionary mapping word to doc_count
    doc_count = compute_doc_count(dirname, indctr)
    n_doc = len(files)
    fvec = numpy.zeros((n_doc, n_vocab), dtype=float)
    exc_count = 0
    for index, fname in enumerate(files):
        with open(fname) as myfile:
            # Explicit parser, matching the ImdbReview reader in this notebook.
            text = BeautifulSoup(myfile.read(), "lxml").get_text()
        tokens = []
        had_bad_token = False
        for token in nltk.word_tokenize(text):
            if re.match(r'\W+', token):
                continue
            try:
                tokens.append(str(token).lower())
            except UnicodeError:
                # Just ignore the exceptional token, read the rest
                had_bad_token = True
        if had_bad_token:
            exc_count += 1
        n_tokens = len(tokens)
        # Term frequency: each occurrence contributes 1/len(tokens).
        for token in tokens:
            col = vocab.get(token)
            if col is not None:
                fvec[index, col] += 1 / float(n_tokens)
        # Inverse document frequency, applied once per distinct word.
        for token in set(tokens):
            col = vocab.get(token)
            count = doc_count.get(token, 0)
            if col is not None and count > 0:
                fvec[index, col] *= math.log(float(n_doc) / count, 2)
    return sparse.csr_matrix(fvec), exc_count


datetime.datetime.now()

# BoW using tf-idf for positive reviews
fvec_pos, exc_ct_p = _tfidf_matrix(dir_pos, 1)
logging.info("Feature vectors for positive reviews constructed. Exception Count: %d",exc_ct_p)

# BoW using tf-idf for negative reviews
fvec_neg, exc_ct_n = _tfidf_matrix(dir_neg, -1)
logging.info("Feature vectors for negative reviews constructed. Exception Count: %d",exc_ct_n)

datetime.datetime.now()
with open('pos_BoW_tfidf_pkl.bin','wb') as outfile:
    pkl.dump(fvec_pos,outfile,pkl.HIGHEST_PROTOCOL)
with open('neg_BoW_tfidf_pkl.bin','wb') as outfile:
    pkl.dump(fvec_neg,outfile,pkl.HIGHEST_PROTOCOL)

datetime.datetime(2016, 10, 3, 2, 57, 1, 72651)

INFO:root:Feature vectors for positive reviews constructed. Exception Count: 849
INFO:root:Feature vectors for negative reviews constructed. Exception Count: 986


datetime.datetime(2016, 10, 3, 2, 58, 59, 344864)

## Reading sentences for word2vec

In [68]:
class ImdbReview(object):
    """Iterable over the sentences of all reviews under ``dirname``.

    ``dirname`` must contain "pos" and "neg" sub-directories; every file in
    them is treated as one review. Iterating yields one list of lower-cased
    tokens per sentence. The object can be iterated more than once, since
    each iteration re-reads the files.
    """
    def __init__(self,dirname):
        self.dirname_pos = os.path.join(dirname,"pos")
        self.dirname_neg = os.path.join(dirname,"neg")
        self.files = [os.path.join(self.dirname_pos,x) for x in os.listdir(self.dirname_pos)] + \
            [os.path.join(self.dirname_neg,x) for x in os.listdir(self.dirname_neg)]
        # English stop words; stored here but not used by __iter__.
        self.stops = set(stopwords.words("english"))
    def __iter__(self):
        for fname in self.files:
            # Read and close the file before yielding anything from it.
            with open(fname) as myfile:
                text = BeautifulSoup(myfile.read(),"lxml").get_text()
            for sentence in nltk.tokenize.sent_tokenize(text):
                words = []
                for elem in sentence.split():
                    # Drop tokens that start with a character outside
                    # letters, digits, underscore and quotes.
                    if re.match(r'[^a-zA-Z0-9_\"\']+',elem):
                        continue
                    try:
                        words.append(str(elem).lower())
                    except UnicodeError:
                        # Skip only the token that cannot be converted; the
                        # rest of the sentence keeps the same filtering and
                        # lower-casing as every other sentence.
                        continue
                yield words

In [69]:
# Train word2vec on the training reviews; the timestamps before and after
# show how long the run took. gensim is imported in the notebook's import cell.
datetime.datetime.now()
sentences = ImdbReview('/home/debojyoti/aclImdb/train/')
# Phrases is fitted on the sentences first, and the model is trained on the
# phrase-transformed stream rather than on the raw sentences.
bigram_transformer = gensim.models.Phrases(sentences)
model = gensim.models.Word2Vec(bigram_transformer[sentences], min_count=10, size=300, window=5, workers=4)
model.save('word2vec.bin')
datetime.datetime.now()



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


## Bag of Vector

In [5]:
def _load_pickle(fname):
    """Return the object pickled in ``fname``.

    NOTE: unpickling can execute arbitrary code; only load files produced by
    this notebook.
    """
    with open(fname,"rb") as myfile:
        return pkl.load(myfile)


pos_bow = _load_pickle("pos_BoW_pkl.bin")
neg_bow = _load_pickle("neg_BoW_pkl.bin")
# The model file is written with gensim's own save(), so it is read back with
# the matching load() instead of a raw unpickle.
word2vec_model = gensim.models.Word2Vec.load("word2vec.bin")
pos_bow_tfidf = _load_pickle("pos_BoW_tfidf_pkl.bin")
neg_bow_tfidf = _load_pickle("neg_BoW_tfidf_pkl.bin")
word2indx = _load_pickle("word2indx.bin")

In [None]:
def _build_bov(bow, bow_tfidf):
    """Build the bag-of-vectors matrix and its tf-idf weighted variant.

    For every document (row of ``bow``) and every word present in it, the
    word's vector from the global ``word2vec_model`` is placed in columns
    ``[word_indx*vec_dim, (word_indx+1)*vec_dim)``. The weighted variant
    multiplies that vector by the word's tf-idf value in the same document.
    Words that the word2vec model does not know are left as zeros.

    Uses the globals ``word2vec_model``, ``indx2word``, ``vec_dim`` and
    ``n_vocab``. Returns ``(bov, bov_tfidf)`` as CSR matrices of shape
    ``(n_docs, n_vocab*vec_dim)``.
    """
    n_docs = bow.shape[0]
    # Pieces are collected in lists and concatenated once at the end;
    # growing a numpy array per word would copy everything on each step.
    indptr = [0]
    indices_parts = []
    data_parts = []
    data_tfidf_parts = []
    offsets = numpy.arange(vec_dim, dtype=numpy.int64)
    n_stored = 0
    for doc_indx in range(n_docs):
        if doc_indx%100 == 0:
            print(doc_indx)
        word_indices = bow[doc_indx,:].nonzero()[1]
        # tf-idf weights of this document (not of document 0).
        tfidf_row = bow_tfidf[doc_indx,:].toarray().ravel()
        for word_indx in word_indices:
            try:
                vec = word2vec_model[indx2word[word_indx]]
            except KeyError:
                # Word absent from the word2vec model: store nothing for it.
                continue
            data_parts.append(vec)
            data_tfidf_parts.append(tfidf_row[word_indx] * vec)
            indices_parts.append(int(word_indx)*vec_dim + offsets)
            n_stored += vec_dim
        indptr.append(n_stored)
    if data_parts:
        data = numpy.asarray(numpy.concatenate(data_parts), dtype=numpy.float64)
        data_tfidf = numpy.asarray(numpy.concatenate(data_tfidf_parts), dtype=numpy.float64)
        indices = numpy.concatenate(indices_parts)
    else:
        data = numpy.zeros(0, dtype=numpy.float64)
        data_tfidf = numpy.zeros(0, dtype=numpy.float64)
        indices = numpy.zeros(0, dtype=numpy.int64)
    # CSR index arrays must be integers.
    indptr = numpy.asarray(indptr, dtype=numpy.int64)
    shape = (n_docs, n_vocab*vec_dim)
    return (sparse.csr_matrix((data,indices,indptr),shape=shape),
            sparse.csr_matrix((data_tfidf,indices,indptr),shape=shape))


datetime.datetime.now()
vec_dim = word2vec_model.layer1_size
n_pos,n_neg = pos_bow.shape[0],neg_bow.shape[0]
n_vocab = pos_bow.shape[1]
# dtype=object keeps whole words; a fixed-width string array built from
# empty strings would truncate every word to a single character.
indx2word = numpy.array([""]*n_vocab, dtype=object)
for word in word2indx.keys():
    indx2word[word2indx[word]] = word
logging.info("starting BoV creation for pos examples...")
# Creating BoV, weighted BoV matrix for pos
bov, bov_tfidf = _build_bov(pos_bow, pos_bow_tfidf)
with open('pos_bov.bin','wb') as outfile:
    pkl.dump(bov,outfile,pkl.HIGHEST_PROTOCOL)
with open('pos_bov_tfidf.bin','wb') as outfile:
    pkl.dump(bov_tfidf,outfile,pkl.HIGHEST_PROTOCOL)
logging.info("BoV created for pos examples.")
logging.info("starting BoV creation for neg examples...")
datetime.datetime.now()
# Creating BoV, weighted BoV matrix for neg
bov, bov_tfidf = _build_bov(neg_bow, neg_bow_tfidf)
with open('neg_bov.bin','wb') as outfile:
    pkl.dump(bov,outfile,pkl.HIGHEST_PROTOCOL)
with open('neg_bov_tfidf.bin','wb') as outfile:
    pkl.dump(bov_tfidf,outfile,pkl.HIGHEST_PROTOCOL)
logging.info("BoV created for neg examples.")
datetime.datetime.now()

datetime.datetime(2016, 10, 4, 12, 41, 8, 587226)

INFO:root:starting BoV creation for pos examples...


0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


## Liblinear: prepare dataset

In [None]:
# NOTE(review): this cell looks unfinished. It opens 'imdb_train.tr' for
# writing (truncating any existing file) but never writes to it.
with open('imdb_train.tr','w') as tr_file:
    # Reload the pickled positive BoW matrix.
    with open("pos_BoW_pkl.bin","rb") as myfile:
        pos_bow = pkl.load(myfile)
    # NOTE(review): this rebinds the builtin name `dict` for the rest of the
    # session; the values look like scratch data. TODO confirm and rename.
    dict = {1:23,2:30}
    # NOTE(review): `tr_f` is not defined in the visible notebook (presumably
    # a half-typed `tr_file`); evaluating it would raise NameError.
    tr_f
    