Data preprocessing

In [1]:
import pandas as pd
import numpy as np
import string
from collections import Counter

# Read both tab-separated files; neither one has a header row.
train_frame = pd.read_csv('train.dat', sep='\t', header=None)
test_frame = pd.read_csv('test.dat', sep='\t', header=None)

# Column 0 of the training file is kept as the category, column 1 as the text.
traincategory = train_frame.iloc[:, 0]

# Translation table that deletes every character in string.punctuation.
_drop_punctuation = str.maketrans('', '', string.punctuation)


def _normalize(text):
    """Lower-case text, turn hyphens into spaces, then strip punctuation."""
    return text.lower().replace('-', ' ').translate(_drop_punctuation)


# Both corpora end up as numpy arrays of cleaned strings; the test text is
# taken from column 0 of the test file.
traindata = train_frame.iloc[:, 1].apply(_normalize).to_numpy()
testdata = test_frame[0].apply(_normalize).to_numpy()

In [2]:
# Character-level shingling: slide a window of c characters (three by
# default) across a string and collect every window.
def cmer(row, c=3):
    """Return the list of overlapping length-c substrings (c-mers) of row.

    A row shorter than c is returned whole, as a single-element list.
    """
    if len(row) < c:
        return [row]
    last_start = len(row) - c
    return [row[start:start + c] for start in range(last_start + 1)]

In [3]:
def wmer(row, w=3):
    """Return the overlapping w-word windows (w-mers) of a row.

    The row is split on whitespace and every run of w consecutive words is
    returned as its own list, in order of appearance.

    Parameters:
        row: the text to split.
        w:   number of words per window.

    Returns:
        A list of word lists. A row with fewer than w words yields a single
        window holding all of its words. A row with no words at all yields an
        empty list rather than one empty window, so callers that index the
        first word of each window never see an empty one.
    """
    words = row.split()
    if not words:
        return []
    if len(words) < w:
        return [words]
    return [words[start:start + w] for start in range(len(words) - w + 1)]

In [4]:
from scipy.sparse import csr_matrix
# idx is a dictionary, passed in as input, that maps each term to its column index
def build_train_matrix(data, idx=None, tokenizer=None):
    r""" Build a sparse term-count matrix from raw documents, growing the
    vocabulary as new terms are met.

    Parameters:
        data:      iterable of document strings.
        idx:       optional dict mapping term -> column id. It is extended in
                   place with every unseen term; new ids continue after the
                   largest id already present. A fresh dict is created when
                   omitted, so nothing is shared between calls.
        tokenizer: optional callable turning one document into an iterable of
                   hashable terms. Defaults to whitespace splitting; pass a
                   c-mer / w-mer based callable to use other features.

    Returns:
        (mat, idx): mat is a CSR matrix with one row per document and one
        column per term id, holding term counts as doubles; idx is the
        vocabulary dict. Documents without terms produce an all-zero row.
    """
    if idx is None:
        idx = {}
    if tokenizer is None:
        tokenizer = str.split

    # One Counter per document; Counter keeps first-occurrence order, so ids
    # are handed out in the order terms first appear in the corpus.
    doc_counts = [Counter(tokenizer(row)) for row in data]

    next_id = max(idx.values(), default=-1) + 1
    for counts in doc_counts:
        for term in counts:
            if term not in idx:
                idx[term] = next_id
                next_id += 1

    nrows = len(doc_counts)
    ncols = next_id
    nnz = sum(len(counts) for counts in doc_counts)

    # set up memory (np.int was removed from NumPy; use a concrete dtype)
    ind = np.zeros(nnz, dtype=np.int64)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows + 1, dtype=np.int64)

    # transfer values
    n = 0  # non-zero counter
    for i, counts in enumerate(doc_counts):
        for term, count in counts.items():
            ind[n] = idx[term]
            val[n] = count
            n += 1
        ptr[i + 1] = n

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()
    return mat, idx

In [5]:
def build_test_matrix(data, idx=None, tokenizer=None):
    r""" Build a sparse term-count matrix for documents against a fixed
    vocabulary.

    Parameters:
        data:      iterable of document strings.
        idx:       dict mapping term -> column id. It is only read, never
                   modified; terms missing from it are ignored. When omitted
                   the vocabulary is empty and the matrix has no columns.
        tokenizer: optional callable turning one document into an iterable of
                   hashable terms. Defaults to whitespace splitting and should
                   match whatever produced idx.

    Returns:
        A CSR matrix with one row per document and one column per term id
        (the width is the largest id in idx plus one), holding term counts as
        doubles. Documents with no known term produce an all-zero row.
    """
    if idx is None:
        idx = {}
    if tokenizer is None:
        tokenizer = str.split

    # Count only in-vocabulary terms, one Counter per document.
    doc_counts = [
        Counter(term for term in tokenizer(row) if term in idx)
        for row in data
    ]

    nrows = len(doc_counts)
    ncols = max(idx.values(), default=-1) + 1
    # One stored entry per distinct known term, not per token occurrence.
    nnz = sum(len(counts) for counts in doc_counts)

    # set up memory (np.int was removed from NumPy; use a concrete dtype)
    ind = np.zeros(nnz, dtype=np.int64)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows + 1, dtype=np.int64)

    # transfer values
    n = 0  # non-zero counter
    for i, counts in enumerate(doc_counts):
        for term, count in counts.items():
            ind[n] = idx[term]
            val[n] = count
            n += 1
        ptr[i + 1] = n

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()
    return mat

In [6]:
# term frequency - inverse document frequency
def tfidf(tfv):
    """Re-weight a sparse term-frequency matrix by inverse document frequency.

    Parameters:
        tfv: scipy sparse matrix with one row per document and one column per
             term, holding non-negative term frequencies.

    Returns:
        A new CSR matrix of doubles in which every entry is
        tf * log(N / df), where N is the number of documents (rows) and df is
        the number of documents in which the term has a positive count.
        Columns whose df is 0 get weight 0 instead of dividing by zero, and
        entries whose weight becomes 0 are dropped from the sparse structure.
        The input matrix is left untouched.
    """
    tfv = tfv.tocsr()
    n_docs, n_terms = tfv.shape

    # Document frequency per column, flattened to a 1-D array.
    df = np.asarray((tfv > 0).sum(axis=0)).ravel()

    idf = np.zeros(n_terms, dtype=np.double)
    seen = df > 0
    idf[seen] = np.log(n_docs / df[seen])

    # astype copies, so scaling the stored values does not touch the input.
    weighted = tfv.astype(np.double)
    weighted.data *= idf[weighted.indices]
    weighted.eliminate_zeros()
    return weighted

k-NN classification

In [7]:
# Count-vectorize the training documents; idx is the vocabulary dictionary
# that build_train_matrix returns alongside the matrix.
traintfv, idx = build_train_matrix(traindata)
# Vectorize the test documents, passing in that same idx dictionary.
testtfv = build_test_matrix(testdata, idx)

def csr_l2normalize(mat, copy=False, **kargs):
    r""" Normalize the rows of a CSR matrix by their L-2 norm.

    Parameters:
        mat:   CSR matrix holding floating-point values. Scaling an integer
               matrix in place is not meaningful and raises numpy's casting
               error instead of silently truncating.
        copy:  when truthy, work on a copy and leave mat untouched; otherwise
               mat is normalized in place.
        kargs: accepted for call compatibility and ignored.

    Returns:
        The normalized matrix: the copy when copy is truthy, mat itself
        otherwise. Rows whose norm is zero (including empty rows) are left
        unchanged.
    """
    if copy:
        mat = mat.copy()

    nrows = mat.shape[0]
    ptr = mat.indptr
    # View over the entries the row pointers actually cover, so the in-place
    # update below writes straight into mat.data.
    val = mat.data[:ptr[-1]]

    # Row number of every stored entry, in storage order.
    row_of = np.repeat(np.arange(nrows), np.diff(ptr))
    sq_norms = np.bincount(row_of, weights=val * val, minlength=nrows)

    scale = np.ones(nrows, dtype=np.double)
    nonzero = sq_norms > 0.0  # do not normalize empty / all-zero rows
    scale[nonzero] = 1.0 / np.sqrt(sq_norms[nonzero])

    val *= scale[row_of]
    return mat

# Rebind each matrix to the result of csr_l2normalize; with copy=True the
# function works on a copy and returns it.
traintfv = csr_l2normalize(traintfv, copy=True)
testtfv = csr_l2normalize(testtfv, copy=True)

In [8]:
def cosineSim(item, tf_idf):
    """Cosine similarity between one sparse row vector and every matrix row.

    Parameters:
        item:   scipy sparse matrix of shape (1, n_terms).
        tf_idf: scipy sparse matrix of shape (n_rows, n_terms).

    Returns:
        A pandas Series indexed by row number of tf_idf, holding the cosine
        similarity of item with that row, sorted from most to least similar.
        Rows with zero norm (or a zero-norm item) get similarity 0.0 instead
        of NaN/inf. The Series is named 0, matching what extracting column 0
        of a DataFrame produced before.
    """
    # One dot product per matrix row (the reduction must stay per-row).
    dots = np.asarray(tf_idf.dot(item.T).toarray(), dtype=np.double).ravel()

    item_norm = np.sqrt(item.multiply(item).sum())
    row_norms = np.sqrt(
        np.asarray(tf_idf.multiply(tf_idf).sum(axis=1), dtype=np.double).ravel()
    )

    denom = item_norm * row_norms
    sims = np.zeros_like(dots)
    np.divide(dots, denom, out=sims, where=denom > 0)

    return pd.Series(sims, name=0).sort_values(ascending=False)

In [9]:
def get_neighbors(dataset, traintfv, tfvtest, labels, k):
    """Find the k most similar training rows for every query row.

    Row i of tfvtest is compared, by sparse dot product, with every row of
    traintfv (for L2-normalized rows this is the cosine similarity).

    Parameters:
        dataset:  the query documents; only its length is used, and it must
                  line up row-for-row with tfvtest.
        traintfv: CSR matrix of candidate (training) rows.
        tfvtest:  CSR matrix of query rows. Pass the very same object as
                  traintfv to get leave-one-out neighbours; a row is then
                  never reported as its own neighbour.
        labels:   unused; kept so existing calls keep working.
        k:        maximum number of neighbours kept per query row.

    Returns:
        A DataFrame with one row per query. Column 'train row' holds the
        query row index (the column name is kept for compatibility) and
        column 'dist' holds a list of up to k (training_row, similarity)
        tuples, most similar first. Only training rows sharing at least one
        term with the query can appear, so the list may be shorter than k.
    """
    leave_one_out = tfvtest is traintfv
    # Transpose once instead of on every iteration.
    train_t = traintfv.T.tocsr()

    rows = []
    neighbours = []
    for index in range(len(dataset)):
        sims = tfvtest[index, :].dot(train_t).tocsr()
        ranked = [
            (int(j), float(s))
            for j, s in zip(sims.indices, sims.data)
            if not (leave_one_out and j == index)
        ]
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        rows.append(index)
        neighbours.append(ranked[:k])

    return pd.DataFrame({'train row': rows, 'dist': neighbours})

In [10]:
def getneighbors(row, tfv):
    """Rank the other rows of tfv by similarity to one of its rows.

    Parameters:
        row: index of the query row inside tfv.
        tfv: CSR matrix; similarity is the sparse dot product between rows
             (the cosine similarity when rows are L2-normalized).

    Returns:
        A list of (row_index, similarity) tuples sorted from most to least
        similar. Only rows sharing at least one stored term with the query
        appear, and the query row itself is left out entirely, so it can
        never be picked up as one of its own neighbours.
    """
    sims = tfv[row, :].dot(tfv.T).tocsr()
    # Filter the self match out instead of overwriting it inside the sparse
    # result, which would keep it in the list.
    ranked = [(j, s) for j, s in zip(sims.indices, sims.data) if j != row]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [11]:
# Neighbours of the training documents themselves. The third argument is the
# query matrix, so it has to be the training matrix: the loop runs over the
# rows of traindata, and those rows only exist in traintfv.
trainneighbors = get_neighbors(traindata, traintfv, traintfv, traincategory, 3)

KeyboardInterrupt: 

In [12]:
def vote(neighbors, labels=None):
    """Pick the majority class among the neighbours of one query row.

    Parameters:
        neighbors: one row of a neighbours table (anything indexable by
                   'dist'); neighbors['dist'] is a sequence of
                   (training_row, similarity) tuples.
        labels:    lookup from training_row to class. Defaults to the
                   notebook-level traincategory.

    Returns:
        The class that occurs most often among all listed neighbours. On a
        tie, the class that appears first in the list wins.

    Raises:
        IndexError: if the neighbour list is empty.
    """
    if labels is None:
        labels = traincategory
    # Iterate over the neighbour list itself; the length of the row object is
    # its number of fields, not the number of neighbours.
    class_counter = Counter(labels[pair[0]] for pair in neighbors['dist'])
    return class_counter.most_common(1)[0][0]

In [13]:
from sklearn.metrics import f1_score
def trainaccuracy(nearest):
    """Predict a class for every row of nearest by vote and score the result.

    Returns whatever f1_score(traincategory, predictions, average=None)
    yields for the voted predictions.
    """
    n_rows = len(nearest.values)
    # one vote per row, in row order
    trainpredict = [vote(nearest.iloc[pos]) for pos in range(n_rows)]
    return f1_score(traincategory, trainpredict, average=None)

In [None]:
# Print the value trainaccuracy returns for the training-set neighbours.
print(trainaccuracy(trainneighbors))

In [14]:
# Collect up to 3 neighbours for each test document with get_neighbors.
testneighbors = get_neighbors(testdata, traintfv, testtfv, traincategory, 3)

In [None]:
def testpredictions(nearest):
    """Return the voted class for every row of a neighbours table.

    Parameters:
        nearest: DataFrame whose rows are each passed to vote().

    Returns:
        A list with one predicted class per row, in row order.
    """
    # No per-row printing here: dumping every row would flood the output.
    return [vote(nearest.iloc[pos]) for pos in range(len(nearest.values))]

In [None]:
# NOTE: despite the "_df" suffix, testpredictions returns a plain list.
predicted_df = testpredictions(testneighbors)

In [None]:
# Write one prediction per line, without an index column or a header row.
# The with-block guarantees the handle is closed; newline='' is what pandas
# expects when it is handed an already-open file object.
with open('output.dat', 'w', newline='', encoding='utf-8') as test_predictions_file:
    pd.Series(predicted_df).to_csv(test_predictions_file, index=False, header=False)