## Part 1: Parsing the dataset

In [1]:
import os.path
import tarfile
from urllib.request import urlretrieve

# Download the movie-review polarity archive and unpack it into the working
# directory. The two steps are guarded separately so that re-running the cell
# repairs a half-finished setup (archive present but never extracted).
url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz"
archive_name = url.split("/")[-1]

if not os.path.exists(archive_name):
    # Fetch under a temporary name first: an interrupted download must not
    # leave a truncated file that passes the existence check on the next run.
    partial_name = archive_name + ".part"
    urlretrieve(url, partial_name)
    os.replace(partial_name, archive_name)
    print("Downloaded", url)

# 'txt_sentoken' is the folder the loading cell further down reads from.
if not os.path.isdir("txt_sentoken"):
    with tarfile.open(archive_name, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Refuse members that would land outside the target directory,
            # as well as special files such as devices.
            tar.extractall(filter="data")
        else:
            # Older Python without extraction filters.
            tar.extractall()
    print("Extracted archive")

Extracted archive


In [1]:
import os
import numpy as np

def get_reviews(path):
    """Read every regular file directly inside ``path`` and return the contents.

    Args:
        path: Directory holding one text file per review.

    Returns:
        list[str]: One string per file, ordered by file name. Empty if the
        directory contains no files.
    """
    reviews_lst = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the result
    # (and any positional train/test split built on it) identical across runs
    # and machines.
    for filename in sorted(os.listdir(path)):
        path_full = os.path.join(path, filename)
        if not os.path.isfile(path_full):
            # Sub-directories cannot be read with open(); skip them.
            continue
        with open(path_full, 'r') as f:
            reviews_lst.append(f.read())
    return reviews_lst

# Build the path of each class folder inside the extracted corpus, then read
# the negative reviews followed by the positive ones.
folder_name = 'txt_sentoken'
neg_path, pos_path = (os.path.join(folder_name, label) for label in ('neg', 'pos'))
neg_reviews, pos_reviews = get_reviews(neg_path), get_reviews(pos_path)

In [2]:
# both classes must hold the same number of reviews: 1000 each
assert len(neg_reviews) == len(pos_reviews) == 1000

In [3]:
# create the train and test lists
# The first 80% of each class (by position) is used for training, the rest for testing.
split_point = int(len(neg_reviews)*0.8)
neg_train, neg_test = neg_reviews[:split_point], neg_reviews[split_point:]
pos_train, pos_test = pos_reviews[:split_point], pos_reviews[split_point:]

X_train = neg_train + pos_train
X_test = neg_test + pos_test
# Labels (-1 = negative, +1 = positive) are sized from the slices themselves, so
# they stay aligned with the documents even if the two classes differ in size.
y_train = [-1] * len(neg_train) + [1] * len(pos_train)
y_test = [-1] * len(neg_test) + [1] * len(pos_test)

In [4]:
# Sanity checks on the split: one label per document, every document is a
# string, and the labels present are exactly the two classes -1 and +1.
assert len(y_train) == len(X_train)
assert len(y_test) == len(X_test)

assert all(isinstance(doc, str) for doc in X_train)
assert all(isinstance(doc, str) for doc in X_test)

assert set(y_train) == {-1, 1}
assert set(y_test) == {-1, 1}

## Part 2: Feature extraction

In [22]:
# get the list of all unique tokens from all the reviews
# I include both train and test sets for building the vocabulary, in order to avoid having to handle unseen words from the test set

def get_token_lst(txt):
    """Split a review into its whitespace-separated tokens.

    Args:
        txt: Review text as a single string.

    Returns:
        list[str]: Tokens in order of appearance, duplicates kept. Never
        contains the empty string; empty or whitespace-only input gives [].
    """
    # str.split() with no separator splits on runs of any whitespace and drops
    # empty pieces. Splitting on a literal ' ' / '\n' instead yields '' for
    # every trailing newline or doubled space, and that '' would then end up
    # as a token in its own right.
    return txt.split()

# Pool the train and test documents so every token seen in either gets an entry.
X_full = X_train + X_test
vocab_set = set()
for review in X_full:
    vocab_set.update(get_token_lst(review))
# Sort instead of list(): string hashing is randomised per process, so the
# iteration order of a set of str (and with it the column order of every
# feature matrix built from `vocab`) would otherwise change between kernel starts.
vocab = sorted(vocab_set)

In [26]:
# get a binary bag-of-words matrix from a dataset
def get_bog(x_in, vocabulary):
    """Build a binary bag-of-words matrix for a list of documents.

    Args:
        x_in: Sequence of document strings.
        vocabulary: Sequence of tokens; a token's position is its column index.

    Returns:
        numpy.ndarray of shape (len(x_in), len(vocabulary)) with float entries:
        x_out[i, j] is 1 when document i contains vocabulary[j], else 0.
        Tokens that are not in ``vocabulary`` are ignored.
    """
    # Columns are resolved against the `vocabulary` argument, not a global, so
    # the function works with whatever vocabulary the caller passes in.
    # A one-off token -> column dict replaces a linear list.index() scan per
    # token. setdefault keeps the first position if a token is listed twice,
    # which is the position list.index() would report.
    column_of = {}
    for col, token in enumerate(vocabulary):
        column_of.setdefault(token, col)

    x_out = np.zeros((len(x_in), len(vocabulary)))
    for i, txt in enumerate(x_in):
        # Only presence matters, so each distinct token is handled once.
        for token in set(get_token_lst(txt)):
            col = column_of.get(token)
            if col is not None:
                x_out[i, col] = 1
    return x_out

In [27]:
# Encode the training split, then the test split, against the same vocabulary.
X_train_bog, X_test_bog = (get_bog(split, vocab) for split in (X_train, X_test))

In [None]:
class Vectorizer:
    """Stub under 'Part 2: Feature extraction'; nothing is implemented yet."""
    def __init__(self):
        # No state is set up on construction.
        pass

## Part 3: Learning framework

In [None]:
class Classifier:
    """Stub under 'Part 3: Learning framework'; nothing is implemented yet."""

    def __init__(self):
        # Previously spelled `__init`, which Python name-mangles to
        # `_Classifier__init` and never runs when an instance is created.
        pass