In [102]:
import pandas as pd
import numpy as np
import scipy as sp

#read file
def readFile(path):
    """Read a whitespace-tokenised, labelled document file.

    Every non-blank line is split on whitespace and its first token is parsed
    as the integer label of that line.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    docs_raw : list of list of str
        Token list of every non-blank line. The label token is kept as the
        first element of each list.
    res : list of int
        Integer label of each line, aligned index-by-index with ``docs_raw``.

    Raises
    ------
    ValueError
        If the first token of a non-blank line is not an integer literal.
    """
    docs_raw = []
    res = []
    with open(path, "r") as fh:
        # Iterate the handle directly so the whole file is not held in memory twice.
        for line in fh:
            tokens = line.split()
            if not tokens:
                # Blank / whitespace-only lines have no label token; indexing
                # tokens[0] would raise IndexError, so skip them in both outputs.
                continue
            docs_raw.append(tokens)
            res.append(int(tokens[0]))
    return docs_raw, res

#filter tokens by length, then rebuild every document as one string
def filterLen(docs, minlen):
    """Drop short tokens and join the remaining ones with single spaces.

    docs   -- iterable of token sequences, one per document
    minlen -- minimum number of characters a token needs to be kept

    Returns a list with one string per input document; a document whose
    tokens are all shorter than ``minlen`` becomes the empty string.
    """
    joined_docs = []
    for tokens in docs:
        kept = [tok for tok in tokens if len(tok) >= minlen]
        joined_docs.append(" ".join(kept))
    return joined_docs

# Split documents and labels into train/test parts (testSize=0.2 gives an 80/20 split)
def splitSet(docs,res , testSize, stateNum):
    """Wrap sklearn's ``train_test_split`` for documents and their labels.

    docs     -- the documents
    res      -- the labels, aligned with ``docs``
    testSize -- forwarded as ``test_size``
    stateNum -- forwarded as ``random_state``

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    parts = train_test_split(docs, res, test_size=testSize, random_state=stateNum)
    train_docs, test_docs, train_labels, test_labels = parts
    return train_docs, test_docs, train_labels, test_labels

#build the (unfitted) text vectorizer: tf-idf weights or raw token counts
def makeVectorizer(idf,ngram, max_df,min_df,norm,stopwords):
    """Construct a TfidfVectorizer or a CountVectorizer with shared settings.

    idf       -- truthy: build a TfidfVectorizer (value is passed as ``use_idf``);
                 falsy: build a CountVectorizer
    ngram     -- forwarded as ``ngram_range``
    max_df    -- forwarded as ``max_df``
    min_df    -- forwarded as ``min_df``
    norm      -- forwarded as ``norm`` to TfidfVectorizer only; unused for counts
    stopwords -- forwarded as ``stop_words``

    Both variants are created with ``lowercase=True``. The vectorizer is
    returned without being fitted.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Options that both vectorizer classes receive.
    shared = dict(lowercase=True,
                  stop_words=stopwords,
                  max_df=max_df,
                  min_df=min_df,
                  ngram_range=ngram)

    if not idf:
        return CountVectorizer(**shared)
    return TfidfVectorizer(use_idf=idf, norm=norm, **shared)

#l2 norm
def l2Norm(data):
    """Return ``data`` transformed by sklearn's ``Normalizer(norm="l2")``.

    The normalizer is fitted on ``data`` and then applied to that same
    ``data``; the input object is passed to both calls.
    """
    from sklearn.preprocessing import Normalizer

    # fit() returns the estimator itself, so the two steps can be chained.
    return Normalizer(norm="l2").fit(data).transform(data)


# compute the similarity of every test-set document to every training-set document
# and return the pairwise similarity matrix for the later kNN steps
def cosSim(test, train):
    """Pairwise similarity between the rows of ``test`` and the rows of ``train``.

    Delegates to sklearn's ``linear_kernel(test, train)``. A linear kernel is a
    plain dot product, so the values are cosine similarities only when both
    inputs have already been L2-normalised.
    """
    from sklearn.metrics.pairwise import linear_kernel

    return linear_kernel(test, train)

#select (not sort) the column indices of the k largest similarities in every row
def sortInd(cosSim,k):
    """Return, for each row, the column indices of its ``k`` largest values.

    Parameters
    ----------
    cosSim : 2-D array-like
        Similarity matrix, one row per query. (The parameter name shadows the
        ``cosSim`` function inside this body only.)
    k : int
        Number of neighbours to keep per row. Values larger than the number of
        columns are clamped to the number of columns.

    Returns
    -------
    numpy.ndarray
        Integer array with one row per input row and ``min(k, n_columns)``
        columns. ``np.argpartition`` is used, so the indices within a row are
        NOT ordered by similarity.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1. Without this check ``k=0`` would slice
        ``[:, -0:]`` and silently return every column.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    sims = np.asarray(cosSim)
    # argpartition rejects a kth outside the axis, so never ask for more
    # neighbours than there are columns.
    k = min(k, sims.shape[1])
    return np.argpartition(sims, -k)[:, -k:]

#knn: similarity-weighted vote over the selected neighbours
def knnDis(topInd_mat, cosSim_mat,threshold, labels=None):
    """Classify each test row by a similarity-weighted vote of its neighbours.

    For row ``i`` the score is
    ``sum(label[j] * sim[i][j]) / sum(sim[i][j])`` over the neighbour indices
    ``j`` in ``topInd_mat[i]``; the prediction is ``1`` when the score is at
    least ``threshold`` and ``-1`` otherwise.

    Parameters
    ----------
    topInd_mat : numpy.ndarray
        One row of neighbour (training) indices per test row.
    cosSim_mat : 2-D indexable
        Similarities, addressed as ``cosSim_mat[i][j]``.
    threshold : float
        Decision threshold applied to the score.
    labels : sequence, optional
        Training labels addressed by neighbour index. When omitted, the
        notebook-level ``y_train`` is used, which keeps existing calls working.

    Returns
    -------
    y_test_dis : list of int
        Predicted label (1 or -1) per test row.
    res_sim : list
        The score of every test row.
    """
    if labels is None:
        # Backward-compatible fallback to the notebook global.
        labels = y_train

    y_test_dis = []
    res_sim = []
    for i in range(topInd_mat.shape[0]):
        neighbours = topInd_mat[i]
        weights = [cosSim_mat[i][j] for j in neighbours]
        total = sum(weights)
        if total == 0:
            # No neighbour has any similarity mass: dividing would give nan
            # (numpy) or ZeroDivisionError (plain floats). Treat it as a
            # neutral score of 0.0 instead.
            score = 0.0
        else:
            score = sum(int(labels[j]) * w for j, w in zip(neighbours, weights)) / total
        res_sim.append(score)
        y_test_dis.append(1 if score >= threshold else -1)
    return y_test_dis, res_sim


In [100]:
#prepare doc data
# Read the labelled documents: readFile returns the token lists and the integer labels.
docs_raw, res = readFile("train.dat")
# Drop tokens shorter than 3 characters and re-join every document into one string.
# NOTE(review): readFile keeps the label as the first token of each document; it is
# removed here only when that token is shorter than 3 characters -- confirm for train.dat.
docs = filterLen(docs_raw, 3)
# NOTE(review): a bare expression in the middle of a cell is not displayed (only the
# last expression of a cell is echoed), so this line has no visible effect.
docs[0]
# Hold out 20% of the documents for testing (test_size=0.2, random_state=5).
# y_train is also read as a global by knnDis and knnMajor.
X_train, X_test,y_train,y_test = splitSet(docs, res, 0.2,5)

#make csr matrix
# NOTE(review): the non-idf variant is commented out throughout this cell, yet later
# cells read cosine_sim_Nonidf / ind_cosine_sim_Nonidf. The commented lines also never
# assign X_test_Nonidf_dtm / X_train_Nonidf_dtm, so they cannot simply be re-enabled.
#vectorizer_Nonidf = makeVectorizer(False,(1,3),1.0,0.0005,'l2','english') # idf, n-gram_range, maxdf, mindf,norm, stopwords
#vectorizer_Nonidf.fit(X_train)
# tf-idf vectorizer: 1- to 3-grams, max_df=1.0, min_df=0.0005, l2 norm, English stop words.
vectorizer_idf = makeVectorizer(True,(1,3),1.0,0.0005,'l2','english')
# The vocabulary is fitted on the training documents only, then applied to both splits.
vectorizer_idf.fit(X_train)
X_test_idf_l2_dtm = vectorizer_idf.transform(X_test)
X_train_idf_l2_dtm =vectorizer_idf.transform(X_train)

#do the norm (this is only for nonidf)
#X_test_Nonidf_dtm_l2 = l2Norm(X_test_Nonidf_dtm)
#X_train_Nonidf_dtm_l2 = l2Norm(X_train_Nonidf_dtm)

#compute the cosine sim
# Rows index test documents, columns index training documents (argument order of cosSim).
#cosine_sim_Nonidf = cosSim(X_test_Nonidf_dtm_l2, X_train_Nonidf_dtm_l2)
cosine_sim_idf = cosSim(X_test_idf_l2_dtm, X_train_idf_l2_dtm)

#sort and get top k sim
# sortInd uses np.argpartition, so the 200 neighbour indices per row are selected but not ordered.
#ind_cosine_sim_Nonidf= sortInd(cosine_sim_Nonidf,500)
ind_cosine_sim_idf = sortInd(cosine_sim_idf,200)

#get the results
# Similarity-weighted vote with decision threshold 0.12; res_idf holds the per-row scores.
#y_test_dis_Nonidf,red_non = knnDis(ind_cosine_sim_Nonidf,cosine_sim_Nonidf)
y_test_dis_idf,res_idf = knnDis(ind_cosine_sim_idf,cosine_sim_idf,0.12)



In [101]:
# Per-sample |truth - prediction|: 0 for a correct prediction; a wrong one contributes 2
# assuming the true labels are +1/-1 like the predictions -- hence the division by 2 below.
abs_diff_dis_idf = [abs(int(y_test[i]) - y_test_dis_idf[i]) for i in range(len(y_test))]

# Positions of the misclassified test documents.
mis_ind = [i for i, gap in enumerate(abs_diff_dis_idf) if gap > 0]

# Number of misclassified test documents.
diff_dis_idf = sum(abs_diff_dis_idf)/2

print(diff_dis_idf)
print(diff_dis_idf/(len(y_test)*1.0))



750
0.202593192869


In [None]:
#knn

In [174]:
def knnMajor(topInd_mat, labels=None):
    """Classify each test row by an unweighted vote of its neighbours' labels.

    Parameters
    ----------
    topInd_mat : iterable of index rows
        One row of neighbour (training) indices per test row.
    labels : sequence, optional
        Training labels addressed by neighbour index. When omitted, the
        notebook-level ``y_train`` is used, which keeps existing calls working.

    Returns
    -------
    list of int
        ``1`` when the sum of the neighbours' labels is >= 0 (so a tie goes to
        ``1``), otherwise ``-1``.
    """
    if labels is None:
        # Backward-compatible fallback to the notebook global.
        labels = y_train

    y_test_res = []
    for neighbours in topInd_mat:
        vote = sum(int(labels[j]) for j in neighbours)
        y_test_res.append(1 if vote >= 0 else -1)
    return y_test_res

In [353]:
# Majority-vote predictions from the neighbour indices of the tf-idf representation.
y_test_res_idf = knnMajor(ind_cosine_sim_idf)
# NOTE(review): in the visible part of this notebook ind_cosine_sim_Nonidf is assigned only
# in a commented-out line, so this raises NameError unless the name is still in kernel memory.
y_test_res_Nonidf = knnMajor(ind_cosine_sim_Nonidf)

In [354]:
# Misclassified-document count for the tf-idf majority vote: the per-sample
# |truth - prediction| values are summed and halved (a mismatch contributes 2,
# assuming +1/-1 true labels).
diff_idf = sum(abs(int(y_test[i]) - y_test_res_idf[i]) for i in range(len(y_test)))/2

In [355]:
# Misclassified-document count for the non-idf majority vote: the per-sample
# |truth - prediction| values are summed and halved (a mismatch contributes 2,
# assuming +1/-1 true labels).
diff_Nonidf = sum(abs(int(y_test[i]) - y_test_res_Nonidf[i]) for i in range(len(y_test)))/2

In [56]:
def knnDis(topInd_mat, cosSim_mat, threshold=0.12, labels=None):
    """Classify each test row by a similarity-weighted vote of its neighbours.

    For row ``i`` the score is
    ``sum(label[j] * sim[i][j]) / sum(sim[i][j])`` over the neighbour indices
    ``j`` in ``topInd_mat[i]``; the prediction is ``1`` when the score is at
    least ``threshold`` and ``-1`` otherwise.

    ``threshold`` is optional, so this definition accepts both call styles
    used in the notebook: with two arguments (threshold 0.12) and with an
    explicit third threshold argument.

    Parameters
    ----------
    topInd_mat : numpy.ndarray
        One row of neighbour (training) indices per test row.
    cosSim_mat : 2-D indexable
        Similarities, addressed as ``cosSim_mat[i][j]``.
    threshold : float, optional
        Decision threshold applied to the score. Defaults to 0.12, the value
        that was previously hard-coded here.
    labels : sequence, optional
        Training labels addressed by neighbour index. When omitted, the
        notebook-level ``y_train`` is used, which keeps existing calls working.

    Returns
    -------
    y_test_dis : list of int
        Predicted label (1 or -1) per test row.
    res_sim : list
        The score of every test row.
    """
    if labels is None:
        # Backward-compatible fallback to the notebook global.
        labels = y_train

    y_test_dis = []
    res_sim = []
    for i in range(topInd_mat.shape[0]):
        neighbours = topInd_mat[i]
        weights = [cosSim_mat[i][j] for j in neighbours]
        total = sum(weights)
        if total == 0:
            # No neighbour has any similarity mass: dividing would give nan
            # (numpy) or ZeroDivisionError (plain floats). Treat it as a
            # neutral score of 0.0 instead.
            score = 0.0
        else:
            score = sum(int(labels[j]) * w for j, w in zip(neighbours, weights)) / total
        res_sim.append(score)
        y_test_dis.append(1 if score >= threshold else -1)
    return y_test_dis, res_sim

In [57]:
# Similarity-weighted predictions and per-row scores for the tf-idf representation.
# These two-argument calls match the knnDis definition in the cell directly above.
y_test_dis_idf,res_idf = knnDis(ind_cosine_sim_idf,cosine_sim_idf)
# NOTE(review): in the visible part of this notebook ind_cosine_sim_Nonidf and
# cosine_sim_Nonidf are assigned only in commented-out lines, so this raises NameError
# unless those names are still in kernel memory.
y_test_dis_Nonidf,red_non = knnDis(ind_cosine_sim_Nonidf,cosine_sim_Nonidf)



In [58]:
# Per-sample |truth - prediction|: 0 for a correct prediction; a wrong one contributes 2
# assuming the true labels are +1/-1 like the predictions -- hence the division by 2 below.
abs_diff_dis_idf = [abs(int(y_test[i]) - y_test_dis_idf[i]) for i in range(len(y_test))]

# Positions of the misclassified test documents.
mis_ind = [i for i, gap in enumerate(abs_diff_dis_idf) if gap > 0]

# Number of misclassified test documents.
diff_dis_idf = sum(abs_diff_dis_idf)/2

In [59]:
# Misclassified-document count for the non-idf weighted vote: the per-sample
# |truth - prediction| values are summed and halved (a mismatch contributes 2,
# assuming +1/-1 true labels).
diff_dis_Nonidf = sum(abs(int(y_test[i]) - y_test_dis_Nonidf[i]) for i in range(len(y_test)))/2

In [60]:
# Report the non-idf weighted vote: misclassified count, then count / number of test documents.
print(diff_dis_Nonidf)
# The * 1.0 forces a float division.
print(diff_dis_Nonidf/(len(y_test)*1.0))

817
0.220691518098


In [61]:
# Report the tf-idf weighted vote: misclassified count, then count / number of test documents.
print(diff_dis_idf)
# The * 1.0 forces a float division.
print(diff_dis_idf/(len(y_test)*1.0))

706
0.190707725554
