Data preprocessing

In [15]:
import pandas as pd
import numpy as np
import string
from collections import Counter

# potentially do cross validation here, start without

def _read_lines(path):
  # Using '\n' as the separator makes every physical line a single-column row (column 0).
  # NOTE(review): newer pandas releases reportedly reject '\n' as a separator -- confirm
  # against the pandas version this notebook targets.
  return pd.read_csv(
      filepath_or_buffer=path,
      header=None,
      sep='\n')


# Built once and shared by both datasets: deletes every character in string.punctuation.
_punctuation_table = str.maketrans('', '', string.punctuation)


def _depunctuate(text):
  # Hyphens become spaces *before* punctuation is stripped, so hyphenated
  # words are split into separate words instead of being glued together.
  return text.replace('-', ' ').translate(_punctuation_table)


# Load both files, lowercase every line and keep them as numpy object arrays.
traindata = _read_lines('train.dat')[0].map(lambda text: text.lower()).to_numpy()
testdata = _read_lines('test.dat')[0].map(lambda text: text.lower()).to_numpy()

# Training lines carry their label: the first character of the cleaned line is
# stored as the category and the document text starts at index 2
# (presumably "<label><separator><text>" -- verify against train.dat).
traincategory = []
for position, document in enumerate(traindata):
  cleaned = _depunctuate(document)
  traincategory.append(cleaned[0])
  traindata[position] = cleaned[2:]

# Test lines only need the punctuation clean-up; nothing is sliced off them.
for position, document in enumerate(testdata):
  testdata[position] = _depunctuate(document)


In [16]:
# create list of c-mers for the row
# this grabs three letters at a time
# cmer refers to a count of characters
def cmer(row, c=3):
  """Return the overlapping character c-mers of ``row``.

  Args:
    row: the text to slice.
    c: window length in characters (default 3).

  Returns:
    A list with one length-``c`` substring per start position, in order.
    When ``row`` is shorter than ``c`` the list holds ``row`` itself as its
    only element.
  """
  # Not enough characters for even one full window.
  if len(row) < c:
    return [row]

  window_count = len(row) - c + 1
  return [row[start:start + c] for start in range(window_count)]

In [17]:
def wmer(row, w=3):
  """Return the overlapping word w-mers of ``row`` as a flat list of strings.

  Each w-mer is the space-joined run of ``w`` consecutive whitespace-separated
  words. Returning plain strings (rather than lists of words) keeps every
  w-mer a single hashable term, mirroring what ``cmer`` returns; nested lists
  get flattened by ``np.unique`` and would be counted word by word.

  Args:
    row: the text to split on whitespace.
    w: number of words per window (default 3).

  Returns:
    A list of w-mer strings in order of appearance. A row with fewer than
    ``w`` words yields a single term made of all its words; a row with no
    words yields an empty list.
  """
  words = row.split()

  # No words at all: contribute no terms.
  if not words:
    return []

  # Fewer words than the window: the whole row is the only term.
  if len(words) < w:
    return [' '.join(words)]

  return [' '.join(words[start:start + w]) for start in range(len(words) - w + 1)]

k-NN classification

In [18]:
# build a term frequency vector and append to dataframe
def buildtfv(data, num, tokenizer=None):
  """Build a term-frequency table with one row per document.

  Args:
    data: iterable of document strings.
    num: window size handed to the tokenizer (characters for ``cmer``,
      words for ``wmer``).
    tokenizer: callable ``(row, num) -> list of terms``. Defaults to ``wmer``;
      pass ``cmer`` to count character n-grams instead.

  Returns:
    A DataFrame whose columns are the distinct terms seen across ``data`` and
    whose cells hold how often the term occurs in that document (0 when the
    term is absent). Empty ``data`` gives an empty DataFrame.
  """
  if tokenizer is None:
    tokenizer = wmer

  # Collect one {term: count} mapping per document and build the frame in a
  # single step; creating and concatenating a DataFrame per row is far slower
  # and fails outright when there are no rows.
  records = []
  for row in data:
    # np.unique flattens nested sequences, so terms should be a flat list of strings.
    unique, counts = np.unique(tokenizer(row, num), return_counts=True)
    records.append(dict(zip(unique, counts)))

  # Terms missing from a document come out as NaN; they mean "zero occurrences".
  return pd.DataFrame(records).fillna(0)

In [19]:
# term frequency inverse document frequency
def tfidf(tfv):
  """Weight a term-frequency table by inverse document frequency.

  Args:
    tfv: DataFrame of term counts, one row per document and one column per term.

  Returns:
    A DataFrame of the same shape where every count is multiplied by
    ``log(number_of_documents / document_frequency)`` of its term. Terms that
    occur in no document get weight 0.
  """
  # Document frequency: in how many rows does each term occur at least once?
  doc_freq = (tfv > 0).sum(axis=0)

  # Mask zero frequencies before dividing; otherwise log(n / 0) is inf and the
  # product 0 * inf would fill that column with NaN.
  idf = np.log(len(tfv) / doc_freq.where(doc_freq > 0)).fillna(0.0)

  return tfv * idf

# Raw term counts for the training documents (window size 1), followed by
# the tf-idf weighted version that the similarity search operates on.
tfvpre = buildtfv(data=traindata, num=1)
tfvpost = tfidf(tfv=tfvpre)

In [20]:
def cosineSim(tf_idf, row):
  """Rank every row of ``tf_idf`` by cosine similarity to one of its rows.

  Args:
    tf_idf: DataFrame of document vectors, one row per document.
    row: positional index of the query document inside ``tf_idf``.

  Returns:
    A Series named 0, indexed like ``tf_idf`` and sorted from most to least
    similar. The query row itself is included in the ranking. Rows with a
    zero-length vector produce NaN, which sorts to the end.
  """
  query = tf_idf.iloc[row]

  # Dot product of the query with every document (columns are aligned by label).
  dot = tf_idf.mul(query, axis=1).sum(axis=1)

  # Euclidean lengths of the query and of every document vector.
  query_norm = np.sqrt(query.mul(query).sum())
  row_norms = np.sqrt(tf_idf.mul(tf_idf).sum(axis=1))

  # Computed exactly once; the result keeps the Series name 0.
  cos_similarities = (dot / (query_norm * row_norms)).rename(0)
  return cos_similarities.sort_values(ascending=False)

In [21]:
def get_train_neighbors(training_set, labels, k):
  """Collect the k most similar tf-idf rows for every position in ``training_set``.

  Note that ``training_set`` only determines how many positions are visited
  and what is stored in the 'train row' column: the similarities themselves
  always come from ``cosineSim`` on the module-level ``tfvpost`` matrix, looked
  up by the same position.

  Args:
    training_set: indexable collection of documents.
    labels: indexable collection holding the category for each position.
    k: number of top-ranked similarities to keep per position.

  Returns:
    A DataFrame with the columns 'train row' (the document), 'dist' (the
    first ``k`` entries of the similarity ranking) and 'category' (the label).
  """
  # Gather (document, top-k similarities, label) for each position in turn.
  records = [
      (training_set[position], cosineSim(tfvpost, position)[:k], labels[position])
      for position in range(len(training_set))
  ]

  distances = pd.DataFrame(columns = ['train row', 'dist', 'category'])
  distances['train row'] = [document for document, _, _ in records]
  distances['dist'] = [top_k for _, top_k, _ in records]
  distances['category'] = [label for _, _, label in records]
  return distances

In [22]:
def vote(neighbors):
  """Pick the majority category among the neighbours of a single row.

  Args:
    neighbors: mapping or row with a 'dist' entry -- a Series whose index
      holds the positions of the neighbouring training documents.

  Returns:
    The category that occurs most often among those neighbours, looked up in
    the module-level ``traincategory``. On a tie the category encountered
    first wins.
  """
  similarities = neighbors['dist']
  # The similarity values are not used for weighting; only the index matters here.
  tally = Counter(traincategory[neighbor] for neighbor in similarities.index)
  winner, _ = tally.most_common(1)[0]
  return winner

In [23]:
from sklearn.metrics import f1_score
def trainaccuracy():
  """Score the classifier on the training data itself.

  Every training document is classified by majority vote over its 5 most
  similar training rows. Because ``cosineSim`` ranks a row against the whole
  matrix, the document itself is one of those neighbours.

  Returns:
    The per-class F1 scores (``average=None``) of the predicted categories
    against ``traincategory``.
  """
  nearest = get_train_neighbors(traindata, traincategory, 5)

  # Append each prediction as one item: `list += str` would spread a
  # multi-character label over several list entries.
  trainpredict = [vote(nearest.iloc[rindex]) for rindex in range(len(nearest))]

  return f1_score(traincategory, trainpredict, average=None)

In [24]:
# print(trainaccuracy())

In [25]:
def testpredictions(k=5):
  """Predict a category for every test document with k-nearest-neighbour voting.

  The test documents are vectorised with the same tokenisation as ``tfvpre``,
  projected onto the training vocabulary, weighted with the idf learned from
  the training corpus, and compared against every training row by cosine
  similarity. Ranking rows of ``tfvpost`` by position alone would ignore the
  test text completely.

  Args:
    k: number of nearest training documents that vote (default 5).

  Returns:
    A list with one predicted category per test document, in order.
  """
  # Term counts for the test set, restricted to and ordered like the training
  # vocabulary. Terms never seen in training are dropped; missing ones become 0.
  test_tfv = buildtfv(testdata, 1).reindex(columns=tfvpre.columns, fill_value=0)

  # idf comes from the training corpus only, so both sets share one weighting.
  doc_freq = (tfvpre > 0).sum(axis=0)
  idf = np.log(len(tfvpre) / doc_freq)
  test_matrix = (test_tfv * idf).to_numpy(dtype=float)

  train_matrix = tfvpost.to_numpy(dtype=float)
  train_norms = np.sqrt((train_matrix ** 2).sum(axis=1))  # loop-invariant

  testpredict = []
  for doc in test_matrix:
    # Zero-length vectors give NaN similarities, which sort_values places last.
    with np.errstate(divide='ignore', invalid='ignore'):
      cosine = (train_matrix @ doc) / (np.sqrt(doc @ doc) * train_norms)
    similarities = pd.Series(cosine, index=tfvpost.index)
    nearest = similarities.sort_values(ascending=False).iloc[:k]
    # vote() only reads the 'dist' entry, so a plain mapping is enough.
    testpredict.append(vote({'dist': nearest}))

  return testpredict

In [None]:
# Compute the predictions first, so a failure cannot leave behind a truncated
# output file, then write them one per line without index or header.
predicted_df = pd.Series(testpredictions())

# The context manager guarantees the handle is closed; newline='' lets pandas
# control the line terminator itself.
with open('output.dat', 'w', newline='', encoding='utf-8') as test_predictions_file:
  predicted_df.to_csv(test_predictions_file, index=False, header=False)