In [40]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from numpy.linalg import norm
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
import string
import nltk as nltk 
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Training data: tab-separated, no header row. Later cells read column 0 as the
# class label (clsTrue = data_df[0]) and column 1 as the raw text (clean(data_df, 1)).
data_df = pd.read_csv('train.dat', sep='\t', header=None)
# Test data: same tab-separated, header-less layout; only column 0 (the raw
# text) is read from it later (clean(testdata_df, 0)).
testdata_df = pd.read_csv('test.dat', sep='\t', header=None)

def cmer(name, c=3):
    r""" Return every contiguous substring of length c (c-mer) of the
    lower-cased name, in left-to-right order.

    name: string to decompose.
    c:    window length (default 3).

    A name shorter than c yields an empty list.
    """
    lowered = name.lower()
    # `end` is the exclusive end index of each window, so it runs from c up to
    # and including len(lowered).
    return [lowered[end - c:end] for end in range(c, len(lowered) + 1)]

def cmerwords(input, n):
    r""" Return all runs of n consecutive words from the input string.

    input: text whose words are separated by single spaces (split on ' ').
    n:     number of words per run.

    Each run is returned as a list of words; fewer than n words gives [].
    """
    tokens = input.split(' ')
    last_start = len(tokens) - n
    return [tokens[start:start + n] for start in range(last_start + 1)]

# Will use either cmerwords or TF-IDF with n-grams, then compare their performance.
# TODO: try Euclidean distance.


# Module-level resources shared by clean() below.
# Punctuation characters (string.punctuation) to strip from the text.
exclude = set(string.punctuation)
# NLTK English stop-word list, held as a set for membership tests.
stop = set(stopwords.words('english'))
# WordNet lemmatizer; clean() calls it with pos="v".
lemma = WordNetLemmatizer()
# Porter stemmer applied after lemmatization in clean().
stemmer = PorterStemmer()

def clean(data_df, index):
    r""" Normalize the text held in column `index` of `data_df`.

    Steps, applied in this order to every entry:
      1. lower-case the string;
      2. delete every character found in `exclude` (punctuation);
      3. drop whitespace-separated tokens found in `stop` (stop words);
      4. lemmatize each token as a verb (pos="v");
      5. stem each token with the Porter stemmer.

    Returns a Series of cleaned strings whose tokens are joined by one space.
    """
    def strip_punctuation(text):
        return ''.join(ch for ch in text if ch not in exclude)

    def drop_stopwords(text):
        return ' '.join(tok for tok in text.split() if tok not in stop)

    def lemmatize_verbs(text):
        return ' '.join(lemma.lemmatize(tok, pos="v") for tok in text.split())

    def stem_tokens(text):
        return ' '.join(stemmer.stem(tok) for tok in text.split())

    return (
        data_df[index]
        .str.lower()
        .apply(strip_punctuation)
        .apply(drop_stopwords)
        .apply(lemmatize_verbs)
        .apply(stem_tokens)
    )

def preprocessing(train_df, test_df, ngram_min, ngram_max):
    r""" Clean both data sets and turn them into TF-IDF matrices that share
    one vocabulary.

    train_df:  frame whose column 1 holds the training text.
    test_df:   frame whose column 0 holds the test text.
    ngram_min, ngram_max: bounds passed to TfidfVectorizer as ngram_range.

    Returns (trainedVec, testVec): two sparse matrices with the same columns,
    one row per training / test document.

    The vectorizer is created with norm=None, so the rows are NOT normalized
    here; callers that need unit-length rows must normalize them afterwards.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Use the frames that were passed in. The previous version ignored its
    # parameters and always read the notebook globals data_df / testdata_df.
    cleaned_train = clean(train_df, 1)
    train_size = len(cleaned_train)
    cleaned_test = clean(test_df, 0)

    # Fit on train + test together so both matrices share one vocabulary and
    # one set of IDF weights (test documents therefore influence the IDF).
    combined_arr = np.append(cleaned_train, cleaned_test)

    vectorizer = TfidfVectorizer(norm=None, ngram_range=(ngram_min, ngram_max))
    combinedVectorize = vectorizer.fit_transform(combined_arr)

    # The first train_size rows belong to the training set, the rest to the test set.
    trainedVec = combinedVectorize[:train_size]
    testVec = combinedVectorize[train_size:]

    return trainedVec, testVec


def csr_l2normalize(mat, copy=False, **kargs):
    r""" Normalize the rows of a CSR matrix by their L-2 norm.

    mat:   scipy CSR matrix; mat.data should have a floating dtype, because
           the scaled values are written back into that array.
    copy:  if truthy, work on and return a copy, leaving `mat` untouched;
           otherwise scale `mat` in place and return None.
    kargs: accepted for call compatibility and ignored.

    Rows whose squared norm is 0 (empty or all-zero rows) are left unchanged.
    """
    if copy:
        mat = mat.copy()
    val, ptr = mat.data, mat.indptr
    nrows = mat.shape[0]

    # Row index of every stored value: row i owns ptr[i+1] - ptr[i] entries.
    row_of = np.repeat(np.arange(nrows), np.diff(ptr))
    # Sum of squared stored values per row, computed without a Python loop.
    sq_norms = np.bincount(row_of, weights=val**2, minlength=nrows)

    # Scale factor 1/||row||; rows with zero norm keep a factor of 1.
    scale = np.ones(nrows)
    nonzero = sq_norms != 0.0
    scale[nonzero] = 1.0/np.sqrt(sq_norms[nonzero])

    # In-place multiply so that mat.data itself is updated.
    val *= scale[row_of]

    if copy:
        return mat

# Build TF-IDF features for both data sets using unigrams only (ngram_min = ngram_max = 1).
trainedVec, testVec = preprocessing(data_df, testdata_df, 1, 1)

# With the default copy=False, csr_l2normalize scales the matrix it is given
# in place, so both matrices have L2-normalized rows after these two calls.
csr_l2normalize(trainedVec)
csr_l2normalize(testVec)
# Class labels come from column 0 of the training frame.
clsTrue = data_df[0]
# print(type(cls.values))
# Sanity check: row/column counts of the feature matrices and the label vector.
print(trainedVec.shape)
print(testVec.shape)
print(clsTrue.shape)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/carora/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/carora/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
(14438, 61252)
(14442, 61252)
(14438,)


In [42]:
def splitData(mat, clsTrue, fold=1, d=10):
    r""" Partition the rows of `mat` and the labels `clsTrue` into d contiguous
    folds and hold out fold number `fold` (1-based) as the test set.

    Returns (train, clstr, test, clste):
      train: CSR matrix stacking the rows of every fold except the held-out one;
      clstr: list with the labels of those rows;
      test:  rows of the held-out fold;
      clste: the matching slice of clsTrue.
    """
    n_rows = mat.shape[0]
    # Rows per fold, rounded up; the final fold may therefore be shorter.
    fold_len = int(np.ceil(n_rows * 1.0 / d))

    def bounds(k):
        # Half-open row range [lo, hi) of the 0-based fold k, clipped to n_rows.
        return k * fold_len, min((k + 1) * fold_len, n_rows)

    held_out = fold - 1
    train_parts = []
    train_labels = []
    for k in range(d):
        if k == held_out:
            continue
        lo, hi = bounds(k)
        train_parts.append(mat[lo:hi])
        train_labels.extend(clsTrue[lo:hi])

    # Stack all non-held-out folds into one training matrix.
    train = sp.vstack(train_parts, format='csr')

    lo, hi = bounds(held_out)
    return train, train_labels, mat[lo:hi, :], clsTrue[lo:hi]


from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
def classification(mat, clsTrue, k=3, d=10):
    r""" Estimate kNN classification quality with d-fold cross validation.

    mat:     sparse feature matrix, one row per sample.
    clsTrue: class labels aligned with the rows of mat.
    k:       number of neighbours passed to KNeighborsClassifier.
    d:       number of folds.

    Returns the mean micro-averaged F1 score over the d folds.

    KNeighborsClassifier is used with its default distance metric (Minkowski
    with p=2, i.e. Euclidean); no cosine metric is configured here. For rows
    that have unit L2 norm, ranking neighbours by Euclidean distance gives the
    same order as ranking them by cosine similarity.
    """
    total_f1 = 0.0
    for f in range(d):
        # Hold out fold f+1 for testing and train on the remaining folds.
        train, clstr, test, clste = splitData(mat, clsTrue, f+1, d)
        classifier = KNeighborsClassifier(n_neighbors=k)
        classifier.fit(train, clstr)
        # Predict the whole held-out fold in one call. This yields a flat 1-D
        # label array instead of a list of 1-element arrays built row by row.
        clspr = classifier.predict(test)
        total_f1 += f1_score(clste, clspr, average='micro')

    return total_f1/d


fileName = "run.dat"
# Evaluate each candidate neighbour count with cross validation and append the
# score to the run log. The range currently covers the single value k=52.
for k in range(52, 53):
    key = "k="+str(k)
    print(key)
    val = classification(trainedVec, clsTrue.values, k)
    message = "highest F1 parameters: " + key + " F1:"  + str(val)
    # The context manager closes the log file even if the write raises.
    with open(fileName,"a+") as f:
        f.write(message+'\n')
    print(message)


k=52
highest F1 parameters: k=52 F1:0.6203754788094406


In [45]:
k=52
fileName = "output_1_52_classification.dat"

# Fit on the complete training set with the chosen k.
classifier = KNeighborsClassifier(n_neighbors=k)
classifier.fit(trainedVec, clsTrue.values)
# Predict every test row in one call. The result is a flat 1-D array of labels,
# so each element converts cleanly with int(); calling int() on the 1-element
# arrays produced by row-by-row prediction is deprecated in recent NumPy.
clsFinal = classifier.predict(testVec)

print(len(clsFinal))
# Write one integer label per line; the context manager closes the file even
# if a write raises.
with open(fileName,"w+") as f:
    for label in clsFinal:
        f.write(str(int(label))+'\n')

# Read the file back to confirm what was written.
oup = pd.read_csv(fileName, sep='\t', header=None)

print(oup)

14442
       0
0      5
1      5
2      1
3      2
4      1
5      5
6      5
7      5
8      1
9      5
10     5
11     4
12     4
13     5
14     5
15     1
16     5
17     1
18     5
19     4
20     3
21     1
22     4
23     3
24     1
25     5
26     4
27     4
28     1
29     1
...   ..
14412  4
14413  3
14414  1
14415  5
14416  5
14417  3
14418  2
14419  5
14420  1
14421  4
14422  5
14423  5
14424  3
14425  1
14426  1
14427  3
14428  4
14429  5
14430  5
14431  2
14432  1
14433  4
14434  3
14435  5
14436  5
14437  3
14438  5
14439  1
14440  5
14441  5

[14442 rows x 1 columns]
