# Question 1

In [1]:
# import the necessary libraries
import random
import string
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# constants
# Upper bound on the number of most-frequent words kept in each vocabulary.
MAX_FEATURES = 10000
# Truthy -> write the "<group>-vocab.txt" file for each corpus.
SAVE_VOCAB_FILE = 1
# Truthy -> write the ID-encoded "<group>-<split>.txt" files for each corpus.
SAVE_DATA_FILE = 1

### Read datasets

In [3]:
# import datasets
def _load_reviews(path):
    """Read one header-less, tab-separated file into a ('review', 'label') DataFrame."""
    return pd.read_csv(
        path,
        sep='\t',
        lineterminator='\n',
        header=None,
        names=['review', 'label'],
    )


# Yelp splits
yelp_train = _load_reviews("datasets/raw/yelp-train.txt")
yelp_valid = _load_reviews("datasets/raw/yelp-valid.txt")
yelp_test = _load_reviews("datasets/raw/yelp-test.txt")
# IMDB splits
IMDB_train = _load_reviews("datasets/raw/IMDB-train.txt")
IMDB_valid = _load_reviews("datasets/raw/IMDB-valid.txt")
IMDB_test = _load_reviews("datasets/raw/IMDB-test.txt")

In [4]:
# group the six frames as datasets[<corpus>][<split>]
datasets = dict(
    yelp=dict(train=yelp_train, valid=yelp_valid, test=yelp_test),
    IMDB=dict(train=IMDB_train, valid=IMDB_valid, test=IMDB_test),
)

In [5]:
# print the row count of every split of every corpus
for group_name, group in datasets.items():
    for dataset_name in group:
        dataset = group[dataset_name]
        print(group_name, dataset_name, 'size:', len(dataset))

yelp train size: 7000
yelp valid size: 1000
yelp test size: 2000
IMDB train size: 15000
IMDB valid size: 10000
IMDB test size: 25000


In [6]:
# preview the first rows of each corpus' training split (raw text)
for group_name, group in datasets.items():
    train_preview = group['train'].head()
    print(group_name, 'train:')
    print(train_preview, '\n')

yelp train:
                                              review  label
0  I can't believe I haven't yelped about the pla...      5
1  Best nights to go to Postino's are Mondays and...      5
2  Went here tonight with the padres and husband....      5
3  I must be spoiled and realize that this is not...      3
4  Normally, love this store & have been a member...      2 

IMDB train:
                                              review  label
0  For a movie that gets no respect there sure ar...      1
1  Bizarre horror movie filled with famous faces ...      1
2  A solid, if unremarkable film. Matthau, as Ein...      1
3  It's a strange feeling to sit alone in a theat...      1
4  You probably all already know this by now, but...      1 



### Preprocess data

In [7]:
# Normalize the review text of every split in place: replace HTML line-break
# tags with a space, drop every character that is neither a word character
# nor whitespace, then lowercase the result.
for group in datasets.values():
    for dataset in group.values():
        dataset['review'] = (
            dataset['review']
            # Match <br>, <br/> and <br /> (single or repeated) so the words on
            # either side stay separated and no stray "br" token is left behind.
            .str.replace(r'<br\s*/?>', ' ', regex=True)
            # regex=True must be explicit: since pandas 2.0 the default is
            # regex=False, which would treat the pattern as a literal string
            # and leave all punctuation in place. Raw string avoids the
            # invalid-escape warning for \w and \s.
            .str.replace(r'[^\w\s]', '', regex=True)
            .str.lower()
        )

In [8]:
# preview the first rows of each corpus' training split (after cleaning)
for group_name in datasets:
    group = datasets[group_name]
    print(group_name, 'train:')
    cleaned_preview = group['train'].head()
    print(cleaned_preview, '\n')

yelp train:
                                              review  label
0  i cant believe i havent yelped about the place...      5
1  best nights to go to postinos are mondays and ...      5
2  went here tonight with the padres and husband ...      5
3  i must be spoiled and realize that this is not...      3
4  normally love this store  have been a member f...      2 

IMDB train:
                                              review  label
0  for a movie that gets no respect there sure ar...      1
1  bizarre horror movie filled with famous faces ...      1
2  a solid if unremarkable film matthau as einste...      1
3  its a strange feeling to sit alone in a theate...      1
4  you probably all already know this by now but ...      1 



### Create vocabulary

In [9]:
# Build one vocabulary per corpus from its training split only, then optionally
# write the vocabulary file and the ID-encoded train/valid/test files.
vocab = {}
for group_name, group in datasets.items():
    # flatten the whitespace-tokenized training reviews into one list of words
    list_all_words = [word for sentence in group['train']['review'].str.split().tolist() for word in sentence]
    # (word, count) pairs, most frequent first, at most MAX_FEATURES of them
    list_freq_words = Counter(list_all_words).most_common(MAX_FEATURES)
    # word -> ID, where the ID is the word's frequency rank (0 = most frequent)
    word_to_id = {word: i for i, (word, _) in enumerate(list_freq_words)}
    vocab[group_name] = word_to_id
    # save "-vocab.txt" file
    if SAVE_VOCAB_FILE:
        file_vocab = pd.DataFrame(list_freq_words)
        # word IDs; sized from the actual vocabulary so a corpus with fewer
        # than MAX_FEATURES distinct words does not raise a length mismatch
        file_vocab[2] = np.arange(len(list_freq_words))
        # columns are written in the order: word, ID, frequency
        file_vocab.to_csv('./datasets/' + group_name + '-vocab.txt', sep='\t', header=False, index=False, columns=[0, 2, 1])
    # save "-train.txt", "-valid.txt", "-test.txt" file
    if SAVE_DATA_FILE:
        for dataset_name, dataset in group.items():
            with open('./datasets/' + group_name + '-' + dataset_name + '.txt', 'w') as file:
                # iterate the columns directly instead of per-row .iloc lookups
                for review, label in zip(dataset['review'], dataset['label']):
                    # out-of-vocabulary words are dropped from the encoded review
                    word_ids = [str(word_to_id[word]) for word in review.split() if word in word_to_id]
                    file.write(' '.join(word_ids) + '\t' + str(label) + '\n')

### Convert to BoW representations