In [33]:
import pandas as pd
import numpy as np
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegressionCV

## Load data and preprocess

In [57]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = LancasterStemmer()

def load_and_clean_data(file): 
    fstream = open(file,'r')
    x = []
    y = []
    for line in fstream:
        line = line.split('\t')
        line[1] = line[1].rstrip()
        y.append(line[1])
        
        # lowercase all of the words and get rid of punctuations
        line[0] = line[0].lower().translate(str.maketrans('', '', string.punctuation)).split()
        
        # lemmatize all the words and get rid of stop words
        line[0] = [stemmer.stem(word) for word in line[0] if not word in stop_words]
        
        x.append(line[0])
    
    fstream.close()
    return x, y


x_yelp, y_yelp = load_and_clean_data('data/yelp_labelled.txt')
x_amazon, y_amazon = load_and_clean_data('data/amazon_cells_labelled.txt')
x_imdb, y_imdb = load_and_clean_data('data/imdb_labelled.txt')

print(len(x_yelp), len(y_yelp))
print(len(x_amazon), len(y_amazon))
print(len(x_imdb), len(y_imdb))

print(x_yelp[:10], y_yelp[:10])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ctc316/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
1000 1000
1000 1000
1000 1000
[['wow', 'lov', 'plac'], ['crust', 'good'], ['tasty', 'text', 'nasty'], ['stop', 'lat', 'may', 'bank', 'holiday', 'rick', 'stev', 'recommend', 'lov'], ['select', 'menu', 'gre', 'pric'], ['get', 'angry', 'want', 'damn', 'pho'], ['honesl', 'didnt', 'tast', 'fresh'], ['potato', 'lik', 'rub', 'could', 'tel', 'mad', 'ahead', 'tim', 'kept', 'warm'], ['fri', 'gre'], ['gre', 'touch']] ['1', '0', '0', '1', '1', '0', '0', '0', '1', '1']


## Split into 2 half set and both have test and training data for 1 : 4

In [38]:
def data_split(xs, ys):
    x_train = []
    x_test = []
    y_train = []
    y_test = []
    for i in range(len(xs)):
        x_tra, x_tes, y_tra, y_tes = train_test_split(xs[i], ys[i], test_size=0.2)
        x_train.extend(x_tra)
        y_train.extend(y_tra)
        x_test.extend(x_tes)
        y_test.extend(y_tes)
        
    return x_train, x_test, y_train, y_test

half_size = int(len(x_yelp) / 2)

xs = [x_yelp[:half_size], x_amazon[:half_size], x_imdb[:half_size]]
ys = [y_yelp[:half_size], y_amazon[:half_size], y_imdb[:half_size]]
x_train1, x_test1, y_train1, y_test1 = data_split(xs, ys)

xs = [x_yelp[half_size:], x_amazon[half_size:], x_imdb[half_size:]]
ys = [y_yelp[half_size:], y_amazon[half_size:], y_imdb[half_size:]]
x_train2, x_test2, y_train2, y_test2 = data_split(xs, ys)

print("data set 1:", len(x_train1), len(x_test1), len(y_train1), len(y_test1))
print("data set 2:", len(x_train2), len(x_test2), len(y_train2), len(y_test2))

data set 1: 1200 300 1200 300
data set 2: 1200 300 1200 300


## Bag of Words Model

#### Use training set to build dictionary of unique words (bags)

In [51]:
def createBags(x):
    bags = set()
    for line in x:
        bags |= set(line)
    return list(bags)

bags = createBags(x_train1)
print("bags num:", len(bags), bags[:10])

bags num: 2206 ['wild', 'creamy', 'repres', 'friday', 'surv', 'retard', 'driv', 'ingredy', 'destin', 'sev']


#### Transform traning data and test data into bag of words model

In [60]:
def bag_of_words(data, bags, bags_idx_map):
    n = len(bags)
    result = []
    for line in data:
        occurence = [0 for i in range(n)]
        for word in line:
            if word not in bags_idx_map: 
                continue
            occurence[bags_idx_map[word]] += 1
        result.append(occurence)
    return result


bags_idx_map = {}
for i, word in enumerate(bags):
    bags_idx_map[word] = i

    
train_x = bag_of_words(x_train1, bags, bags_idx_map)
test_x = bag_of_words(x_test1, bags, bags_idx_map)


print(train_x[0][:500])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 