In [165]:
# import essential libraries

import random
import string
import numpy as np
import pandas as pd
import operator as op
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
# ...
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.metrics import f1_score
# for classifiers
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
import ast
from collections import Counter
from tqdm import tqdm

In [166]:
def _read_reviews(path):
    """Read one tab-separated split file into a ('review', 'label') DataFrame.

    The files carry no header row, so the two column names are supplied here.
    """
    return pd.read_csv(
        path,
        sep='\t',
        lineterminator='\n',
        header=None,
        names=['review', 'label'],
    )


# Yelp splits
yelp_tr = _read_reviews("hwk3_datasets/yelp-train.txt")
yelp_te = _read_reviews("hwk3_datasets/yelp-test.txt")
yelp_va = _read_reviews("hwk3_datasets/yelp-valid.txt")
# IMDB splits
imdb_tr = _read_reviews("hwk3_datasets/IMDB-train.txt")
imdb_te = _read_reviews("hwk3_datasets/IMDB-test.txt")
imdb_va = _read_reviews("hwk3_datasets/IMDB-valid.txt")

In [167]:
# Registry of both corpora: corpus name -> {split name -> DataFrame}.
# Split order (train, valid, test) is the iteration order later cells see.
_SPLIT_NAMES = ('train', 'valid', 'test')
hw3_datasets = {
    'Yelp': dict(zip(_SPLIT_NAMES, (yelp_tr, yelp_va, yelp_te))),
    'IMDB': dict(zip(_SPLIT_NAMES, (imdb_tr, imdb_va, imdb_te))),
}

In [171]:
# Pre-processing: normalise the review text in place, for every split of
# every corpus in `hw3_datasets`:
#   1. lower-case the text;
#   2. replace the HTML double line break '<br /><br />' with a single space;
#   3. delete every character that is neither a word character nor
#      whitespace (i.e. strip punctuation).
#
# `regex=` is given explicitly on both replacements: the default of
# Series.str.replace changed from True to False in pandas 2.0, and without it
# step 3 would search for the literal text '[^\w\s]' and silently leave all
# punctuation in place. The pattern is a raw string so that \w and \s reach
# the regex engine instead of being (invalid) Python string escapes.
#
# All three steps are idempotent, so re-running this cell is harmless.
for dataset in hw3_datasets.values():
    for df in dataset.values():
        df['review'] = (
            df['review']
            .str.lower()
            .str.replace('<br /><br />', ' ', regex=False)
            .str.replace(r'[^\w\s]', '', regex=True)
        )
    
    


In [163]:
# Report how many rows each split of each corpus contains.
for corpus_name, splits in hw3_datasets.items():
    for split_name, frame in splits.items():
        n_rows = frame.shape[0]
        print(corpus_name, split_name, ' is of size:', n_rows)
# Preview the first rows of every training split.
for corpus_name in hw3_datasets:
    train_frame = hw3_datasets[corpus_name]['train']
    print(corpus_name, 'train:')
    print(train_frame.head(), '\n')

Yelp train  is of size: 7000
Yelp valid  is of size: 1000
Yelp test  is of size: 2000
IMDB train  is of size: 15000
IMDB valid  is of size: 10000
IMDB test  is of size: 25000
Yelp train:
                                              review  label
0  i cant believe i havent yelped about the place...      5
1  best nights to go to postinos are mondays and ...      5
2  went here tonight with the padres and husband ...      5
3  i must be spoiled and realize that this is not...      3
4  normally love this store  have been a member f...      2 

IMDB train:
                                              review  label
0  for a movie that gets no respect there sure ar...      1
1  bizarre horror movie filled with famous faces ...      1
2  a solid if unremarkable film matthau as einste...      1
3  its a strange feeling to sit alone in a theate...      1
4  you probably all already know this by now but ...      1 



In [174]:
# Build a per-corpus vocabulary from the training split and export every
# split in word-ID form.
#
# For each corpus in `hw3_datasets`:
#   * count the whitespace-separated tokens of the training reviews, skipping
#     stop words, and keep the (at most) VOCAB_SIZE most frequent ones;
#   * vocab[corpus] maps each kept word to its ID, which is its frequency
#     rank (0 = most frequent);
#   * "<corpus>-vocab.txt" gets one line per word:  word <TAB> ID <TAB> count;
#   * "<corpus>-<split>.txt" gets one line per review: the space-separated
#     IDs of its in-vocabulary words <TAB> label. Out-of-vocabulary words
#     (stop words included) are dropped.
VOCAB_SIZE = 10000
SUBMISSION_DIR = './hwk3_datasets/submission/'  # must already exist

vocab = {}
# Words excluded from the vocabulary because they carry little semantic value
# (e.g. "the"). Per the original note this is based on NLTK's stop-word list.
# The capitalised entries can only match text that has not been lower-cased.
stops = {'the','a','i','me', 'youre', 'not', 'my', 'myself','we','our','ours','ourselves','you','your','yours','yourself','yourselves','he','him','his','himself','she','her','hers','herself','it','its','itself','they','them','their','theirs','themselves','what','which','who','whom','this','that','these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once', 'there','when','where','why','how','all','any','both','each','most','other','some','such','nor','only','so','than','too','very','s','t','can','will','just','don','should','now', 'an', 'They', 'So'}
for group_name, group in hw3_datasets.items():
    # Count tokens straight from a generator; no need to materialise one
    # flat list holding every token of the training split.
    word_counts = Counter(
        word
        for sentence in group['train']['review'].str.split()
        for word in sentence
        if word not in stops
    )
    # (word, count) pairs, most frequent first.
    list_freq_words = word_counts.most_common(VOCAB_SIZE)
    word_to_id = {word: i for i, (word, _count) in enumerate(list_freq_words)}
    vocab[group_name] = word_to_id

    # Save "-vocab.txt". Column 0 = word, 1 = count, 2 = ID; written in the
    # order word, ID, count.
    file_vocab = pd.DataFrame(list_freq_words, columns=[0, 1])
    # Size the ID column from the actual vocabulary: a corpus with fewer than
    # VOCAB_SIZE distinct words would otherwise raise a length-mismatch error.
    file_vocab[2] = np.arange(len(file_vocab))  # word IDs
    file_vocab.to_csv(SUBMISSION_DIR + group_name + '-vocab.txt', sep='\t', header=False, index=False, columns=[0, 2, 1])

    # Save "-train.txt", "-valid.txt", "-test.txt".
    for dataset_name, dataset in group.items():
        with open(SUBMISSION_DIR + group_name + '-' + dataset_name + '.txt', 'w') as file:
            # Walk the two columns once (0 = review text, 1 = label) rather
            # than doing a scalar .iloc lookup per cell.
            for review, label in zip(dataset.iloc[:, 0], dataset.iloc[:, 1]):
                word_ids = [str(word_to_id[word]) for word in review.split() if word in word_to_id]
                file.write(' '.join(word_ids) + '\t' + str(label) + '\n')
        