## Import section

In [23]:
import os
import numpy
import datetime
import re
import math
# Logger
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# nltk
import nltk
# Scipy
from scipy import sparse
# Pickle
import cPickle as pkl
# BeautifulSoup
from bs4 import BeautifulSoup
# To print all outputs, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Root of the extracted aclImdb dataset. Set the ACLIMDB_DIR environment
# variable to run the notebook on another machine; the default keeps the
# original location.
_ACLIMDB_ROOT = os.environ.get("ACLIMDB_DIR", "/home/debojyoti/aclImdb")
# Vocabulary file (one word per line) and the two training review folders.
dir_vocab = os.path.join(_ACLIMDB_ROOT, "imdb.vocab")
# The trailing "" keeps the trailing slash the original paths carried.
dir_pos = os.path.join(_ACLIMDB_ROOT, "train", "pos", "")
dir_neg = os.path.join(_ACLIMDB_ROOT, "train", "neg", "")

In [3]:
# Build the vocabulary lookup: word -> column index. Indices follow sorted
# word order, so the mapping is identical on every run.
vocab_words = set()
with open(dir_vocab) as vocab_file:  # 'with' closes the handle the bare open() leaked
    for line in vocab_file:
        fields = line.split()
        if fields:  # skip blank lines, which used to raise IndexError on [0]
            vocab_words.add(fields[0])
vocab = dict((word, column) for column, word in enumerate(sorted(vocab_words)))
n_vocab = len(vocab)

## Computing BoW feature vectors

In [98]:
def _build_bow_matrix(paths):
    """Build a binary bag-of-words matrix for the review files in ``paths``.

    Row i describes ``paths[i]``; column ``vocab[w]`` is 1 when the lower-cased
    token ``w`` occurs in that review and is part of ``vocab``.

    Returns a ``(matrix, n_bad_docs)`` pair: a ``scipy.sparse.csr_matrix`` of
    shape ``(len(paths), n_vocab)`` with integer entries, and the number of
    reviews that contained at least one token ``str()`` could not convert.
    """
    row_idx, col_idx = [], []
    n_bad_docs = 0
    for row, path in enumerate(paths):
        with open(path) as review_file:
            text = BeautifulSoup(review_file.read()).get_text()
        # Drop tokens that start with a non-word character (punctuation etc.).
        raw_tokens = [tok for tok in nltk.word_tokenize(text)
                      if not re.match(r'\W+', tok)]
        tokens = []
        had_bad_token = False
        for tok in raw_tokens:
            try:
                # Lower-case every token. The old fallback path skipped
                # .lower(), so reviews with one bad token were matched
                # case-sensitively against the vocabulary.
                tokens.append(str(tok).lower())
            except UnicodeError:
                # str() cannot encode this token (non-ASCII text under
                # Python 2): ignore it and keep the rest of the review.
                had_bad_token = True
        if had_bad_token:
            n_bad_docs += 1
        # Out-of-vocabulary tokens are skipped; each column appears once per row.
        columns = sorted(set(vocab[tok] for tok in tokens if tok in vocab))
        row_idx.extend([row] * len(columns))
        col_idx.extend(columns)
    # Assemble the sparse matrix directly instead of filling a dense
    # (n_docs x n_vocab) array first and converting it afterwards.
    matrix = sparse.csr_matrix(
        (numpy.ones(len(row_idx), dtype=int),
         (numpy.asarray(row_idx, dtype=numpy.intp),
          numpy.asarray(col_idx, dtype=numpy.intp))),
        shape=(len(paths), n_vocab), dtype=int)
    return matrix, n_bad_docs


files_pos = [os.path.join(dir_pos, f) for f in os.listdir(dir_pos)]
files_neg = [os.path.join(dir_neg, f) for f in os.listdir(dir_neg)]
logging.info("train pos size: %s", str((len(files_pos), n_vocab)))
# This line used to report the positive matrix shape under the "neg" label.
logging.info("train neg size: %s", str((len(files_neg), n_vocab)))
datetime.datetime.now()
# BoW for positive reviews
fvec_pos, exc_ct_p = _build_bow_matrix(files_pos)
logging.info("Feature vectors for positive reviews constructed. Exception Count: %d",exc_ct_p)
# BoW for negative reviews
fvec_neg, exc_ct_n = _build_bow_matrix(files_neg)
logging.info("Feature vectors for negative reviews constructed. Exception Count: %d",exc_ct_n)
datetime.datetime.now()
with open('pos_BoW_pkl.bin','wb') as outfile:
    pkl.dump(fvec_pos,outfile,pkl.HIGHEST_PROTOCOL)
with open('neg_BoW_pkl.bin','wb') as outfile:
    pkl.dump(fvec_neg,outfile,pkl.HIGHEST_PROTOCOL)

INFO:root:Feature vectors for positive reviews constructed.
INFO:root:Feature vectors for negative reviews constructed.


## Function to compute the number of documents in which each token occurs

In [28]:
def compute_doc_count(dirname,indctr):
    """Count, for every vocabulary word, the documents in ``dirname`` containing it.

    Parameters:
        dirname: directory whose files are read as review documents.
        indctr:  1 for the positive set, -1 for the negative set. It selects
                 the cache file ('pos_doc_count.bin' / 'neg_doc_count.bin');
                 any other value disables caching.

    Returns:
        dict mapping each key of the global ``vocab`` to the number of
        documents in which that (lower-cased) word occurs.

    If the cache file for ``indctr`` already exists it is loaded and returned
    without reading ``dirname``; delete the file to force a recount.
    """
    cache_path = {1: 'pos_doc_count.bin', -1: 'neg_doc_count.bin'}.get(indctr)
    if cache_path is not None and os.path.isfile(cache_path):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this notebook wrote itself.
        with open(cache_path, 'rb') as cache_file:
            return pkl.load(cache_file)

    doc_count = dict.fromkeys(vocab, 0)
    for name in os.listdir(dirname):
        with open(os.path.join(dirname, name)) as review_file:
            text = BeautifulSoup(review_file.read()).get_text()
        # Drop tokens that start with a non-word character (punctuation etc.).
        raw_tokens = [tok for tok in nltk.word_tokenize(text)
                      if not re.match(r'\W+', tok)]
        seen = set()
        for tok in raw_tokens:
            try:
                # Always lower-case: the old fallback path skipped .lower(),
                # so documents with one bad token were counted case-sensitively.
                seen.add(str(tok).lower())
            except UnicodeError:
                # str() cannot encode this token (non-ASCII text under
                # Python 2): ignore it and keep the rest of the document.
                continue
        # 'seen' is a set, so each word is counted at most once per document.
        for tok in seen:
            if tok in doc_count:
                doc_count[tok] += 1

    # Save the counts so later calls can skip the directory scan.
    if cache_path is not None:
        with open(cache_path, 'wb') as cache_file:
            pkl.dump(doc_count, cache_file, pkl.HIGHEST_PROTOCOL)
    return doc_count

# compute_doc_count()

## Computing tf-idf feature vectors

In [35]:
def _build_tfidf_matrix(paths, doc_count):
    """Build a tf-idf matrix for the review files in ``paths``.

    For a vocabulary word w in document d the entry is
        (occurrences of w in d / number of tokens in d) * log2(n_doc / df(w))
    where n_doc is ``len(paths)`` and df(w) is ``doc_count[w]``.

    Parameters:
        paths:     list of review file paths; row i describes ``paths[i]``.
        doc_count: dict mapping word -> number of documents containing it.

    Returns a ``(matrix, n_bad_docs)`` pair: a ``scipy.sparse.csr_matrix`` of
    shape ``(len(paths), n_vocab)`` with float entries, and the number of
    reviews that contained at least one token ``str()`` could not convert.
    """
    n_docs = len(paths)
    row_idx, col_idx, weights = [], [], []
    n_bad_docs = 0
    for row, path in enumerate(paths):
        with open(path) as review_file:
            text = BeautifulSoup(review_file.read()).get_text()
        # Drop tokens that start with a non-word character (punctuation etc.).
        raw_tokens = [tok for tok in nltk.word_tokenize(text)
                      if not re.match(r'\W+', tok)]
        tokens = []
        had_bad_token = False
        for tok in raw_tokens:
            try:
                # Always lower-case: the old fallback path skipped .lower(),
                # so reviews with one bad token were matched case-sensitively.
                tokens.append(str(tok).lower())
            except UnicodeError:
                # str() cannot encode this token (non-ASCII text under
                # Python 2): ignore it and keep the rest of the review.
                had_bad_token = True
        if had_bad_token:
            n_bad_docs += 1

        # Occurrences per in-vocabulary word; other tokens only count
        # towards the document length used as the tf denominator.
        term_count = {}
        for tok in tokens:
            if tok in vocab:
                term_count[tok] = term_count.get(tok, 0) + 1
        n_tokens = float(len(tokens))
        for tok in sorted(term_count, key=vocab.get):
            # The word occurs in this very document, so its document
            # frequency is at least 1. Clamping guards against a stale or
            # incomplete doc_count, which would otherwise divide by zero.
            doc_freq = max(doc_count.get(tok, 0), 1)
            idf = math.log(float(n_docs) / doc_freq, 2)
            row_idx.append(row)
            col_idx.append(vocab[tok])
            weights.append(term_count[tok] / n_tokens * idf)
    # Assemble the sparse matrix directly instead of filling a dense
    # (n_docs x n_vocab) array first and converting it afterwards.
    matrix = sparse.csr_matrix(
        (numpy.asarray(weights, dtype=float),
         (numpy.asarray(row_idx, dtype=numpy.intp),
          numpy.asarray(col_idx, dtype=numpy.intp))),
        shape=(n_docs, n_vocab), dtype=float)
    # Words present in every document get idf == 0; drop those stored zeros
    # so the result matches what a dense -> CSR conversion would keep.
    matrix.eliminate_zeros()
    return matrix, n_bad_docs


datetime.datetime.now()

# BoW using tf-idf for positive reviews
files=[os.path.join(dir_pos,f) for f in os.listdir(dir_pos)]
n_doc = len(files)
# Get dictionary mapping word to doc_count
doc_count = compute_doc_count(dir_pos,1)
fvec_pos, exc_ct_p = _build_tfidf_matrix(files, doc_count)
logging.info("Feature vectors for positive reviews constructed. Exception Count: %d",exc_ct_p)

# BoW using tf-idf for negative reviews
# (the old loop wrote the idf weight to the undefined name `fneg_pos`; the
# resulting NameError was swallowed, so these vectors never got idf-scaled)
files=[os.path.join(dir_neg,f) for f in os.listdir(dir_neg)]
n_doc = len(files)
# Get dictionary mapping word to doc_count
doc_count = compute_doc_count(dir_neg,-1)
fvec_neg, exc_ct_n = _build_tfidf_matrix(files, doc_count)
logging.info("Feature vectors for negative reviews constructed. Exception Count: %d",exc_ct_n)

datetime.datetime.now()
with open('pos_BoW_tfidf_pkl.bin','wb') as outfile:
    pkl.dump(fvec_pos,outfile,pkl.HIGHEST_PROTOCOL)
with open('neg_BoW_tfidf_pkl.bin','wb') as outfile:
    pkl.dump(fvec_neg,outfile,pkl.HIGHEST_PROTOCOL)

datetime.datetime(2016, 10, 3, 2, 57, 1, 72651)

INFO:root:Feature vectors for positive reviews constructed. Exception Count: 849
INFO:root:Feature vectors for negative reviews constructed. Exception Count: 986


datetime.datetime(2016, 10, 3, 2, 58, 59, 344864)