In [37]:
import pandas as pd
import numpy as np
import scipy as sp
import string

#read file
def readFile(path):
    """Load a labelled training file.

    Every non-blank line is split on whitespace and its first token is
    parsed as the integer class label.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    docs_raw : list of list of str
        Whitespace-separated tokens of each non-blank line. The label token
        is kept as the first element of every list.
    res : list of int
        Integer label of each non-blank line, aligned with ``docs_raw``.

    Raises
    ------
    ValueError
        If the first token of a non-blank line cannot be parsed by ``int``.
    """
    docs_raw = []
    res = []
    with open(path, "r") as fh:
        for line in fh:
            tokens = line.split()
            # A blank line (e.g. a trailing newline) has no label token;
            # indexing tokens[0] on it would raise IndexError, so skip it.
            if not tokens:
                continue
            docs_raw.append(tokens)
            res.append(int(tokens[0]))
    return docs_raw, res

def readTest(path):
    """Read an unlabelled file and tokenise each line on whitespace.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    list of list of str
        One token list per line of the file, in file order. A blank line
        yields an empty list, so the result has one entry per input line.
    """
    with open(path, "r") as handle:
        tokenised = [line.split() for line in handle]
    return tokenised


#drop short tokens and strip punctuation
def filterLen(docs, minlen):
    """Drop short tokens, re-join each document and strip punctuation.

    Parameters
    ----------
    docs : iterable of iterable of str
        Tokenised documents.
    minlen : int
        Tokens shorter than this many characters are discarded. The length
        is measured before punctuation is removed.

    Returns
    -------
    list of str
        One space-joined string per input document with every character in
        ``string.punctuation`` deleted. A token made only of punctuation
        that passes the length filter leaves its separating space behind.
    """
    # The two-argument form str.translate(None, chars) exists only on
    # Python 2 byte strings; Python 3 needs a table from str.maketrans.
    if hasattr(str, "maketrans"):
        table = str.maketrans("", "", string.punctuation)

        def strip_punct(text):
            return text.translate(table)
    else:
        def strip_punct(text):
            return text.translate(None, string.punctuation)

    return [
        strip_punct(" ".join(tok for tok in doc if len(tok) >= minlen))
        for doc in docs
    ]

# Split into train/test sets (the notebook uses an 80/20 split)
def splitSet(docs, res, testSize, stateNum):
    """Split documents and labels into train and test partitions.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    docs : sequence
        Documents to split.
    res : sequence
        Labels aligned with ``docs``.
    testSize : float or int
        Forwarded as ``test_size``.
    stateNum : int
        Forwarded as ``random_state`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    parts = train_test_split(docs, res,
                             test_size=testSize,
                             random_state=stateNum)
    docs_train, docs_test, labels_train, labels_test = parts
    return docs_train, docs_test, labels_train, labels_test

#make a lemmatizer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    """Callable that tokenises a document and lemmatises every token.

    Instances can be used wherever a ``doc -> list of tokens`` callable is
    expected.
    """

    def __init__(self):
        # One lemmatizer instance is created up front and reused per call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the WordNet lemma of each ``word_tokenize`` token of ``doc``."""
        lemmas = []
        for token in word_tokenize(doc):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

#convert into a matrix of token counts
def makeVectorizer(idf,ngram, max_df,min_df,norm,stopwords,lemmatizer):
    """Build an unfitted text vectorizer.

    Parameters
    ----------
    idf : bool
        Truthy -> ``TfidfVectorizer`` (with ``use_idf=idf``); falsy ->
        ``CountVectorizer``.
    ngram : tuple of (int, int)
        Forwarded as ``ngram_range``.
    max_df, min_df : float or int
        Document-frequency cut-offs, forwarded unchanged.
    norm : str or None
        Row normalisation for the TF-IDF vectorizer. ``CountVectorizer`` has
        no such option, so it is ignored when ``idf`` is falsy.
    stopwords : str, list or None
        Forwarded as ``stop_words``.
    lemmatizer : callable or None
        Forwarded as ``tokenizer``; ``None`` keeps scikit-learn's default
        tokenisation.

    Returns
    -------
    TfidfVectorizer or CountVectorizer
        The configured, not yet fitted, vectorizer.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Options understood by both vectorizer classes.
    shared = dict(lowercase = True,
                  stop_words = stopwords,
                  max_df = max_df,
                  min_df = min_df,
                  ngram_range = ngram,
                  # Previously only the TF-IDF branch received the tokenizer,
                  # so the count branch silently ignored `lemmatizer`.
                  tokenizer = lemmatizer)

    if idf:
        vectorizer = TfidfVectorizer(use_idf = idf, norm = norm, **shared)
    else:
        vectorizer = CountVectorizer(**shared)

    return vectorizer

#l2 norm
def l2Norm(data):
    """Return ``data`` transformed by scikit-learn's ``Normalizer(norm="l2")``.

    Parameters
    ----------
    data : array-like or sparse matrix
        Samples to normalise; it is passed to both ``fit`` and ``transform``.

    Returns
    -------
    The output of ``Normalizer.transform`` for ``data``.
    """
    from sklearn.preprocessing import Normalizer

    scaler = Normalizer(norm = "l2")
    scaler.fit(data)
    return scaler.transform(data)


# compute the similarity of the test set data to all the data in the training set
# save the pairwise similarities into a similarity matrix for later knn use
def cosSim(test, train):
    """Pairwise similarity between the rows of ``test`` and ``train``.

    Delegates to ``sklearn.metrics.pairwise.linear_kernel`` and returns its
    result unchanged.

    NOTE(review): a linear kernel is a plain dot product, so the values are
    cosine similarities only if the rows are already L2-normalised -
    confirm at the call site.

    Parameters
    ----------
    test, train : array-like or sparse matrix
        Feature matrices passed as the first and second kernel argument.
    """
    from sklearn.metrics.pairwise import linear_kernel

    return linear_kernel(test, train)

#select the indices of the top k similarities in each row (not sorted)
def sortInd(cosSim,k):
    """Return the column indices of the ``k`` largest entries of each row.

    Uses ``np.argpartition``, so the ``k`` indices of a row are not
    guaranteed to be ordered by similarity.

    Parameters
    ----------
    cosSim : 2-D ndarray
        Similarity matrix; partitioning happens along the last axis.
    k : int
        Number of indices kept per row.

    Returns
    -------
    ndarray
        The last ``k`` columns of the partitioned index array.
    """
    kth = -k
    partitioned = np.argpartition(cosSim, kth)
    return partitioned[:, kth:]

#knn
def knnDis(topInd_mat, cosSim_mat,threshold, labels=None):
    """Classify each test row by a similarity-weighted vote of its neighbours.

    For test row ``i`` the score is::

        sum(int(labels[j]) * cosSim_mat[i][j]) / sum(cosSim_mat[i][j])

    taken over the neighbour indices ``j`` in ``topInd_mat[i]``. The row is
    labelled +1 when the score is at least ``threshold`` and -1 otherwise.

    Parameters
    ----------
    topInd_mat : 2-D array of int
        Row ``i`` holds the training-row indices voting for test row ``i``.
    cosSim_mat : 2-D array
        ``cosSim_mat[i][j]`` is the similarity of test row ``i`` to
        training row ``j``.
    threshold : float
        Minimum score for a +1 prediction.
    labels : sequence, optional
        Training labels indexed by training row. When omitted, the
        module-level ``y_train`` is used so existing three-argument calls
        keep working.

    Returns
    -------
    y_test_dis : list of int
        Predicted label (+1 or -1) per test row.
    res_sim : list
        The score each prediction was based on.
    """
    if labels is None:
        # Backward-compatible fallback to the notebook global; passing
        # `labels` explicitly avoids depending on kernel state.
        labels = y_train

    y_test_dis =[]
    res_sim = []
    for i, neighbours in enumerate(topInd_mat):
        row_sims = cosSim_mat[i]
        weights = [row_sims[j] for j in neighbours]
        total = sum(weights)
        if total == 0:
            # No neighbour has any similarity, so the weighted mean is
            # undefined (0/0). Report a neutral score instead of NaN/error.
            score = 0.0
        else:
            score = sum(int(labels[j]) * w
                        for j, w in zip(neighbours, weights)) / total

        res_sim.append(score)
        y_test_dis.append(+1 if score >= threshold else -1)
    return y_test_dis, res_sim

#write the results to a file
def writeToFile(testRes,filename):
    """Write one prediction per line to ``filename``.

    A value equal to 1 is written as ``+1``; any other value is written
    as ``-1``. An existing file is overwritten.

    Parameters
    ----------
    testRes : iterable
        Predicted labels, in output order.
    filename : str
        Path of the file to create.
    """
    # The context manager guarantees the handle is flushed and closed, even
    # if a write fails; the previous version never closed the file.
    with open(filename, 'w') as resFile:
        for label in testRes:
            s = "+1" if label == 1 else "-1"
            resFile.write(s +'\n')


In [21]:
# Load the labelled training file: token lists plus one integer label per line.
docs_raw, res = readFile("train.dat")


In [38]:
# Keep only tokens of at least 3 characters, then preview the first document.
docs = filterLen(docs_raw, minlen=3)
docs[0]

'This book such life saver has been helpful able back track trends answer pediatrician questions communicate with each other when you are different times the night with newborn think one those things that everyone should required have before they leave the hospital went through all the pages the newborn version then moved the infant version and will finish the second infant book third total right our baby turns See other things that are must haves for baby '

In [27]:
# Hold out 20% of the documents for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = splitSet(docs, res, testSize=0.2, stateNum=5)

In [28]:
# Number of training labels after the split.
len(y_train)

14804

In [29]:
# Tokenise-and-lemmatise callable, handed to the vectorizer as its tokenizer.
lemmatizer = LemmaTokenizer()

In [93]:
# Build the TF-IDF vectorizer (1- to 3-grams, l2 norm), fit it on the
# training split only, then project both splits with the fitted vocabulary.
vectorizer_idf = makeVectorizer(idf=True,
                                ngram=(1, 3),
                                max_df=0.05,
                                min_df=0.0008,
                                norm='l2',
                                stopwords='english',
                                lemmatizer=lemmatizer)
vectorizer_idf.fit(X_train)
X_test_idf_l2_dtm = vectorizer_idf.transform(X_test)
X_train_idf_l2_dtm = vectorizer_idf.transform(X_train)


In [94]:
# Pairwise similarity of every held-out document to every training document.
cosine_sim_idf = cosSim(X_test_idf_l2_dtm, X_train_idf_l2_dtm)

In [95]:
# For each held-out document, indices of its 850 most similar training documents.
ind_cosine_sim_idf = sortInd(cosine_sim_idf, k=850)


In [96]:
# Similarity-weighted vote over the selected neighbours; scores >= 0.08 map to +1.
y_test_dis_idf, res_idf = knnDis(topInd_mat=ind_cosine_sim_idf,
                                 cosSim_mat=cosine_sim_idf,
                                 threshold=0.08)



In [97]:
# Evaluate on the held-out split: collect the positions where the prediction
# disagrees with the true label, then report the error count and error rate.
mis_ind = [i for i in range(len(y_test))
           if int(y_test[i]) != y_test_dis_idf[i]]

# Count mismatches directly. The earlier sum(|true - pred|) / 2 only equals
# the number of errors when labels are +1/-1, and its `/ 2` produces a float
# under Python 3 true division.
diff_dis_idf = len(mis_ind)

print(diff_dis_idf)
print(diff_dis_idf/(len(y_test)*1.0))

719
0.194219340897
