In [None]:
import numpy as np
import string
from nltk.tokenize import word_tokenize
from collections import OrderedDict
import codecs
import random
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.model_selection import ParameterGrid

def data_loader(v):
    """Load and tokenize the train, validation and test splits of a dataset.

    Reads ``hwk3_datasets/<v>-train.txt``, ``hwk3_datasets/<v>-valid.txt`` and
    ``hwk3_datasets/<v>-test.txt``. Each file is lower-cased, every character
    in ``string.punctuation`` is deleted, and the text is split on newlines
    before being passed to ``tokenize``.

    Args:
        v: Dataset name used as the file-name prefix (``main`` uses 'yelp'
            and 'IMDB').

    Returns:
        A ``(train, valid, test)`` tuple holding the result of ``tokenize``
        for each split.
    """
    punctuation = frozenset(string.punctuation)

    def read_lines(split):
        # ``with`` closes the handle even if reading fails; the original
        # left all three files open.
        with open('hwk3_datasets/' + v + '-' + split + '.txt', 'r') as handle:
            text = handle.read().lower()
        # Character filtering behaves the same on Python 2 and Python 3,
        # unlike the Python-2-only ``str.translate(None, chars)`` call.
        cleaned = ''.join(ch for ch in text if ch not in punctuation)
        return cleaned.split("\n")

    train_lines = read_lines('train')
    valid_lines = read_lines('valid')
    test_lines = read_lines('test')
    return tokenize(train_lines), tokenize(valid_lines), tokenize(test_lines)

def tokenize(data):
    """Convert raw review lines into ``(tokens, score)`` pairs.

    Every non-empty line is split with ``word_tokenize``. The last token is
    parsed as the integer score and the tokens before it form the review.
    Empty lines are skipped.

    Args:
        data: Iterable of text lines.

    Returns:
        A list of ``(list_of_tokens, int_score)`` tuples in input order.
    """
    pairs = []
    for line in data:
        if len(line) == 0:
            continue
        words = word_tokenize(line)
        # The score is the trailing token of each line.
        pairs.append((words[:-1], int(words[-1])))
    return pairs

def get_vocab(data):
    """Assign each distinct word an integer id in order of first appearance.

    Args:
        data: Iterable of items whose first element is a sequence of words.

    Returns:
        A dict mapping word -> id, where ids start at 0 and increase by one
        for every previously unseen word.
    """
    index_by_word = {}
    for entry in data:
        for token in entry[0]:
            # len() is evaluated before insertion, so a new word receives
            # the next unused id and a known word keeps its existing one.
            index_by_word.setdefault(token, len(index_by_word))
    return index_by_word

def get_frequency(data):
    """Count word occurrences and return the most frequent words.

    Args:
        data: Iterable of items whose first element is a sequence of words.

    Returns:
        A list of at most 10000 ``(word, count)`` tuples, sorted by count in
        descending order; ties are broken by word in descending order.
    """
    counts = {}
    for entry in data:
        for word in entry[0]:
            counts[word] = counts.get(word, 0) + 1
    # ``items()`` plus an indexable key argument runs on Python 2 and 3;
    # the previous ``iteritems()`` / ``lambda (k, v)`` form was a syntax
    # error on Python 3.
    ranked = sorted(counts.items(), key=lambda item: (item[1], item[0]), reverse=True)
    return ranked[:10000]

def get_feature_set(data):
    """Return the words selected by ``get_frequency`` as a list.

    Args:
        data: Dataset passed straight through to ``get_frequency``.

    Returns:
        The first element of every entry returned by ``get_frequency``, in
        the same order.
    """
    return [entry[0] for entry in get_frequency(data)]
        
def generate_binary_feature_vector(data, feature_set):
    """Build a presence/absence (0/1) row for every item in ``data``.

    Args:
        data: Iterable of items whose first element is a sequence of words
            (assumed hashable, e.g. strings).
        feature_set: Sequence of words; column ``j`` corresponds to
            ``feature_set[j]``.

    Returns:
        A list with one row per item; each row is a list of ints where 1
        means the feature word occurs in the item and 0 means it does not.
    """
    vector = []
    for element in data:
        # Build the set once per item so each feature lookup is O(1)
        # instead of a linear scan of the token list.
        present = set(element[0])
        vector.append([1 if word in present else 0 for word in feature_set])
    return vector

def binary_bag_of_words(data, feature_set):
    """Return the binary feature rows for ``data`` and report progress.

    Delegates to ``generate_binary_feature_vector`` and prints a status
    line once the rows have been built.
    """
    features = generate_binary_feature_vector(data, feature_set)
    print("Vector has been produced")
    return features

def generate_frequency_feature_vector(data, feature_set):
    """Build a word-count row for every item in ``data``.

    Args:
        data: Iterable of items whose first element is a sequence of words
            (assumed hashable, e.g. strings).
        feature_set: Sequence of words; column ``j`` counts occurrences of
            ``feature_set[j]``.

    Returns:
        A list with one float NumPy array per item. Rows are 10000 wide, as
        before, and only grow if ``feature_set`` is longer than that. Words
        not in ``feature_set`` are ignored.
    """
    # Precompute word -> column once instead of scanning the list twice
    # (``in`` and ``index``) per token. ``setdefault`` keeps the first
    # position of a duplicate, matching ``list.index``.
    column_of = {}
    for position, word in enumerate(feature_set):
        column_of.setdefault(word, position)

    # Keep the historical minimum width of 10000 so row shapes are
    # unchanged, but never smaller than the feature set itself.
    width = max(10000, len(feature_set))

    vector = []
    for element in data:
        row = np.zeros(width)
        for word in element[0]:
            column = column_of.get(word)
            if column is not None:
                row[column] += 1
        vector.append(row)
    return vector

def normalize(vector):
    """Scale every row in place so that its entries sum to one.

    Rows whose entries sum to zero are left untouched.

    Args:
        vector: Sequence of mutable, indexable numeric rows.

    Returns:
        The same ``vector`` object, with its rows modified in place.
    """
    for row in vector:
        row_sum = sum(row)
        if row_sum == 0:
            continue
        for position, value in enumerate(row):
            row[position] = float(value) / row_sum
    return vector

def frequency_bag_of_words(data, feature_set):
    """Return normalized word-frequency rows for ``data``.

    Builds count rows with ``generate_frequency_feature_vector``, prints a
    status line, then passes the rows through ``normalize``.
    """
    counts = generate_frequency_feature_vector(data, feature_set)
    print("Vector has been produced")
    return normalize(counts)

def write_vocab(version, f):
    """Write the training vocabulary of a dataset to a UTF-8 text file.

    One line is written per entry returned by ``get_frequency`` for the
    training split, in the form ``<word> <id> <count>``, where ``id`` comes
    from ``get_vocab``.

    Args:
        version: Dataset name forwarded to ``data_loader``.
        f: Path of the file to create or overwrite.
    """
    train, valid, test = data_loader(version)
    vocab = get_vocab(train)
    frequency = get_frequency(train)
    # ``with`` guarantees the file is flushed and closed; the original
    # never closed it.
    with codecs.open(f, 'w', 'utf-8') as to_write:
        for element in frequency:
            line = str(element[0]) + ' ' + str(vocab[element[0]]) + ' ' + str(element[1]) + '\n'
            # On Python 2 ``line`` is a byte string: decode it while
            # dropping non-ASCII bytes, as ``unicode(line, errors='ignore')``
            # did. On Python 3 it is already text and is written as-is.
            if isinstance(line, bytes):
                line = line.decode('ascii', 'ignore')
            to_write.write(line)

def datasets(data, f):
    """Write ``data`` to ``f`` as lines of word ids followed by the label.

    Each line has the form ``<id> <id> ... <id>\\t<label>\\n``. The ids come
    from ``get_vocab(data)``, i.e. the vocabulary is built from the data
    being written.

    Args:
        data: Sequence of items whose first element is a sequence of words
            and whose second element is the label.
        f: Writable file-like object.
    """
    vocab = get_vocab(data)
    for element in data:
        # The original compared ``i == len(...)`` inside ``range(len(...))``,
        # which is never true, so every line carried a stray trailing space
        # before the tab. ``join`` puts separators only between ids.
        ids = ' '.join(str(vocab[word]) for word in element[0])
        f.write(ids + '\t' + str(element[1]) + '\n')
    
def write_datasets(version, f1, f2, f3):
    """Write the id-encoded train, valid and test splits to three files.

    Args:
        version: Dataset name forwarded to ``data_loader``.
        f1: Output path for the training split.
        f2: Output path for the validation split.
        f3: Output path for the test split.
    """
    train, valid, test = data_loader(version)
    for split, path in ((train, f1), (valid, f2), (test, f3)):
        # Each file is closed (and therefore flushed) as soon as its split
        # has been written; the original left all three open.
        with codecs.open(path, 'w', 'utf-8') as handle:
            datasets(split, handle)

def random_classifier(data, version, classes, avg):
    """Print the F1 score of uniformly random predictions on ``data``.

    Args:
        data: Labelled items; true labels are extracted with ``get_y``.
        version: Label inserted into the printed message.
        classes: Sequence of class labels to draw predictions from.
        avg: Value passed as ``average`` to ``f1_score``.
    """
    # One independent draw per item.
    guesses = [random.choice(classes) for _ in data]
    truth = get_y(data)
    score = f1_score(truth, guesses, average=avg)
    print("The F1 for the random classifier for " + version + " is " + str(score))

def majority_classifier(train_data, test_data, version, avg):
    """Print the F1 score of always predicting the most common training class.

    Args:
        train_data: Labelled items used to find the majority class (the
            label is the second element of each item).
        test_data: Labelled items to score; true labels are extracted with
            ``get_y``.
        version: Label inserted into the printed message.
        avg: Value passed as ``average`` to ``f1_score``.

    Raises:
        ValueError: If ``train_data`` is empty, since no majority class
            exists.
    """
    class_counts = {}
    for element in train_data:
        class_counts[element[1]] = class_counts.get(element[1], 0) + 1
    if not class_counts:
        raise ValueError("majority_classifier needs at least one training example")
    # Highest count wins; ties go to the larger class label. This is the
    # same choice as the previous descending sort by (count, label), without
    # sorting and without Python-2-only ``iteritems`` / tuple-parameter
    # lambda syntax.
    majority = max(class_counts.items(), key=lambda item: (item[1], item[0]))[0]
    output = [majority for _ in test_data]
    correct = get_y(test_data)
    print("The F1 for the majority classifier for " + version + " is " + str(f1_score(correct, output, average=avg)))

def get_y(data):
    """Return the label (second element) of every item in ``data`` as a list."""
    return [entry[1] for entry in data]

def _tuning_candidates(classifier):
    """Return ``(param_grid, estimator_factory)`` for a classifier name."""
    if classifier == 'naive':
        # alpha=0 is part of the original grid and is kept so the search
        # space is unchanged.
        grid = {'alpha': [0, 0.01, 0.03, 0.05, 0.07, 0.09, 0.1], 'fit_prior': [True, False]}
        return grid, BernoulliNB
    if classifier == 'tree':
        grid = {'splitter': ['best', 'random'], 'max_depth': [1, 500, 1000], 'min_samples_split': [2, 500, 1000], 'min_samples_leaf': [1, 5, 10], 'max_features': [1000, 5000, 10000]}
        return grid, tree.DecisionTreeClassifier
    if classifier == 'svm':
        grid = {'C': [1, 50, 100], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
        return grid, svm.SVC
    raise ValueError("Unknown classifier: " + repr(classifier))


def tune(x, y, valid_vector, valid_y, classifier, avg):
    """Grid-search hyper-parameters using F1 on the validation split.

    Every parameter combination is fitted on ``(x, y)`` and scored with
    ``f1_score`` on the validation data. The first combination that
    achieves the highest score is kept.

    Args:
        x: Training feature rows.
        y: Training labels.
        valid_vector: Validation feature rows.
        valid_y: Validation labels.
        classifier: One of 'naive', 'tree' or 'svm'.
        avg: Value passed as ``average`` to ``f1_score``.

    Returns:
        The best parameter combination as a dict of keyword arguments.

    Raises:
        ValueError: If ``classifier`` is not a supported name.
    """
    param_grid, make_estimator = _tuning_candidates(classifier)
    # Start below any possible F1 so the first candidate is always
    # accepted. The result is then never empty, even if every candidate
    # scores 0 (previously an empty list/dict was returned, which broke
    # the caller).
    best_score = -1.0
    best_params = None
    for param in ParameterGrid(param_grid):
        clf = make_estimator(**param)
        clf.fit(x, y)
        predicted_valid = clf.predict(valid_vector)
        f1 = f1_score(valid_y, predicted_valid, average=avg)
        if f1 > best_score:
            best_score = f1
            best_params = param
        print("tuning...")
    print("The best parameters are " + str(best_params))
    return best_params

def predict(x, y, valid_vector, valid_y, test_vector, test_y, binary, model, avg):
    """Fit the requested model and print its F1 on train, valid and test.

    For 'tree', 'svm' and binary-feature 'naive', hyper-parameters are first
    selected with ``tune``. 'naive' on non-binary features uses an untuned
    ``GaussianNB``.

    Args:
        x: Training feature rows.
        y: Training labels.
        valid_vector: Validation feature rows.
        valid_y: Validation labels.
        test_vector: Test feature rows.
        test_y: Test labels.
        binary: Truthy when the features are binary bag-of-words rows.
        model: One of 'naive', 'tree' or 'svm'.
        avg: Value passed as ``average`` to ``f1_score``.
    """
    if model == 'naive':
        if binary:
            chosen = tune(x, y, valid_vector, valid_y, model, avg)
            clf = BernoulliNB(alpha=chosen['alpha'], fit_prior=chosen['fit_prior'])
        else:
            clf = GaussianNB()
    elif model == 'tree':
        chosen = tune(x, y, valid_vector, valid_y, model, avg)
        clf = tree.DecisionTreeClassifier(
            splitter=chosen['splitter'],
            max_depth=chosen['max_depth'],
            min_samples_split=chosen['min_samples_split'],
            min_samples_leaf=chosen['min_samples_leaf'],
            max_features=chosen['max_features'],
        )
    elif model == 'svm':
        chosen = tune(x, y, valid_vector, valid_y, model, avg)
        clf = svm.SVC(C=chosen['C'], kernel=chosen['kernel'])

    clf.fit(x, y)
    print("I've fit my model")

    # Score the splits in the fixed order train, valid, test.
    reports = (
        ("The F1 for train for ", x, y),
        ("The F1 for valid for ", valid_vector, valid_y),
        ("The F1 for test for ", test_vector, test_y),
    )
    for prefix, features, labels in reports:
        predicted = clf.predict(features)
        print(prefix + model + " is " + str(f1_score(labels, predicted, average=avg)))
    
def tester(train, valid, test, binary, avg):
    """Vectorize the three splits and evaluate all three models on them.

    The feature set is derived from ``train`` only. ``binary`` selects
    between binary and frequency bag-of-words features.

    Args:
        train: Labelled training items.
        valid: Labelled validation items.
        test: Labelled test items.
        binary: Truthy for binary features, falsy for frequency features.
        avg: Value passed through to ``predict``.
    """
    feature_set = get_feature_set(train)
    vectorize = binary_bag_of_words if binary else frequency_bag_of_words

    x = vectorize(train, feature_set)
    to_classify_valid = vectorize(valid, feature_set)
    to_classify_test = vectorize(test, feature_set)

    y = get_y(train)
    correct_y_valid = get_y(valid)
    correct_y_test = get_y(test)

    for model in ('naive', 'tree', 'svm'):
        predict(x, y, to_classify_valid, correct_y_valid, to_classify_test, correct_y_test, binary, model, avg)
def run_tests(version, train, valid, test, binary, classes, avg):
    """Run the baseline classifiers and then the trained models.

    The random and majority baselines are reported for the train, valid and
    test splits (the majority class is always taken from ``train``), after
    which ``tester`` evaluates the learned models.

    Args:
        version: Dataset name; not used inside this function.
        train: Labelled training items.
        valid: Labelled validation items.
        test: Labelled test items.
        binary: Forwarded to ``tester``.
        classes: Class labels the random baseline draws from.
        avg: Value passed as ``average`` to the F1 computations.
    """
    splits = (('train', train), ('valid', valid), ('test', test))

    for label, split in splits:
        random_classifier(split, label, classes, avg)

    for label, split in splits:
        majority_classifier(train, split, label, avg)

    tester(train, valid, test, binary, avg)

def tests(version, binary):
    """Load a dataset and run the full evaluation with matching settings.

    'yelp' is scored with classes 1-5 and macro-averaged F1. Any other
    version is treated as IMDB, with classes 0/1 and binary F1.

    Args:
        version: Dataset name forwarded to ``data_loader``.
        binary: Forwarded to ``run_tests`` to choose the feature type.
    """
    train, valid, test = data_loader(version)
    if version == 'yelp':
        name, classes, avg = 'yelp', [1, 2, 3, 4, 5], 'macro'
    else:
        name, classes, avg = 'IMDB', [0, 1], 'binary'
    run_tests(name, train, valid, test, binary, classes, avg)

def main():
    """Run every dataset with binary features first, then frequency features."""
    for version in ('yelp', 'IMDB'):
        for binary in (True, False):
            tests(version, binary)

# Run the whole experiment suite when this cell/script is executed.
main()

The F1 for the random classifier for train is 0.1851455543
The F1 for the random classifier for valid is 0.189607003066
The F1 for the random classifier for test is 0.182934662295
The F1 for the majority classifier for train is 0.104267004647
The F1 for the majority classifier for valid is 0.105014749263
The F1 for the majority classifier for test is 0.103923019985


  'precision', 'predicted', average, warn_for)


Vector has been produced
Vector has been produced
Vector has been produced


  'setting alpha = %.1e' % _ALPHA_MIN)


tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
The best parameters are {'alpha': 0.01, 'fit_prior': False}
I've fit my model
The F1 for train for naive is 0.773283909627
The F1 for valid for naive is 0.386017630302
The F1 for test for naive is 0.372590712913
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning..

The best parameters are {'kernel': 'rbf', 'C': 50}
I've fit my model
The F1 for train for svm is 0.931373195333
The F1 for valid for svm is 0.878193701723
The F1 for test for svm is 0.874227294341
The F1 for the random classifier for train is 0.500798722045
The F1 for the random classifier for valid is 0.500798722045
The F1 for the random classifier for test is 0.496231005464
The F1 for the majority classifier for train is 0.666666666667
The F1 for the majority classifier for valid is 0.666666666667
The F1 for the majority classifier for test is 0.666666666667
Vector has been produced
Vector has been produced
Vector has been produced
I've fit my model
The F1 for train for naive is 0.85661971831
The F1 for valid for naive is 0.747273689783
The F1 for test for naive is 0.663398984048
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning...
tuning.