### 5.2 {-}

In [10]:
import math
import random


def preprocess():
    '''
    creates two lists:
    reviews: list of dictionaries, each dictionary maps to a review 
    where the keys are the index of a word in the vocabulary and the value is 1
    rev_sent: list of review sentiment, elements are either 1 or 0
    '''
    vocab_dict = {}
    with open('dict.txt') as f:
        for line in f:
            vocab_dict[line.split(" ")[0].strip("\n")] = int(line.split(" ")[1].strip("\n"))
    vocab_dict # vocab dictionary

    reviews = [] # list of dictionaries of word indecies per review
    rev_sent = [] # list of review sentiments
    i = 0
    with open("moviereview.tsv") as f:
        for line in f:
            feature_vector = {}
            l=line.split('\t')
            for word in l[1].split(' '):
                if word in vocab_dict.keys():
                    feature_vector[vocab_dict[word]] = 1
            reviews.append(feature_vector)
            i += 1
            for review in l[0]:
                rev_sent.append(int(review))
                
    return reviews, len(vocab_dict.keys()), rev_sent


### 5.3 {-}

In [11]:
def train_test_split(reviews, rev_sent):
    '''
    split data into train and test subsets
    '''
    X_test = []
    y_test = []
    test_perc = .2 # test percentage
    num_obs = len(rev_sent)
    while len(y_test) < num_obs*test_perc:
        index = random.randint(0,len(reviews)-1)
        X_test.append(reviews.pop(index))
        y_test.append(rev_sent.pop(index))
    return reviews , rev_sent, X_test, y_test

In [12]:
def weight_update(data_points, epochs, feature_vec_len, review_list):
    '''
    iterate through t epochs
    update weight at each epoch
    '''
    feature_vector = {i:0 for i in range(feature_vec_len)}
    for t in range(epochs): # iterate through epochs t
        vector_update = []
        for i in range(len(feature_vector.keys())): # iterate through theta vector i
            grad_sum = 0 # sum of gradients for all observations for each theta
            for j in range(len(data_points)): # iterate through observations j
                if i in data_points[j].keys(): # if feature i is in review j
                    vec_i_sum = 0 # dot product sum
                    for k in data_points[j].keys(): # iterate through words in (observation j) k
                        vec_i_sum += feature_vector[int(k)]

                    grad = (-1)*review_list[j] + (1/(1+math.e**(-vec_i_sum))) # gradient
                    grad_sum += grad
            vector_update.append(.1*(1/len(data_points))*grad_sum)
        for theta in range(len(vector_update)): # update thetas at end of epoch
            feature_vector[theta] = feature_vector[theta] - vector_update[theta] 
                
    return feature_vector

In [13]:
def predict(theta, x):
    '''
    create list of predicted y values 
    thershold is set at .5
    '''
    prob = []
    pred_y = []
    for words in x:
        sum = 0
        for i in words:
            sum += theta[i]
        p = 1/(1+math.e**(-sum))
        prob.append(p)
        py = 0
        if p >= .5:
            py = 1
        pred_y.append(py)
    return pred_y

In [14]:
reviews, feature_vec_len, rev_sent = preprocess()
X_train , y_train, X_test, y_test = train_test_split(reviews, rev_sent)
epochs = 30
feature_vector = weight_update(X_train, epochs, feature_vec_len, rev_sent)
pred_y = predict(feature_vector, X_train)

# get number of correct training data predictions
correct_count = 0
for i in range(len(pred_y)):
    if pred_y[i] == y_train[i]:
        correct_count += 1


training_accuracy =  correct_count/len(pred_y)*100

print('#################')
print('training accuracy:', training_accuracy )
print('#################')

pred_y = predict(feature_vector, X_test)

correct_count = 0
for i in range(len(pred_y)):
    if pred_y[i] == y_test[i]:
        correct_count += 1

testing_accuracy =  correct_count/len(pred_y)*100

print('#################')
print('testing accuracy:', testing_accuracy )
print('#################')




#################
training accuracy: 95.625
#################
#################
testing accuracy: 86.25
#################
