In [2]:
import numpy as np
import pandas as pd

**Getting the Data**

In [3]:
# Read every line of the labelled training file into a list of strings.
with open('train.dat', 'r') as train_file:
    lines = list(train_file)

In [4]:
# The first character of each training line is kept as that line's label.
labels = [line[:1] for line in lines]

In [5]:
# Document text of every training line: everything from the third character
# onward (the label and the character that follows it are skipped).
docs = [line[2:] for line in lines]

In [6]:
# creating dataframe from training data to split into training and test sets
# (one row per line of train.dat: the document text next to its label, both
# taken from the lists built above, so row i pairs docs[i] with labels[i])
corpus = pd.DataFrame({'document': docs, 'labels': labels})
# corpus

Unnamed: 0,document,labels
0,Catheterization laboratory events and hospital...,4
1,Renal abscess in children. Three cases of rena...,5
2,Hyperplastic polyps seen at sigmoidoscopy are ...,2
3,Subclavian artery to innominate vein fistula a...,5
4,Effect of local inhibition of gamma-aminobutyr...,4
...,...,...
14433,Quadricuspid aortic valve and aortic regurgita...,4
14434,Mammographic measurements before and after aug...,1
14435,Use of leukocyte-depleted platelet concentrate...,1
14436,Complications of Tenckhoff catheters post remo...,2


In [7]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the shuffle performed by train_test_split is the same on
# every run; without it each kernel restart produces a different split and the
# validation score computed later cannot be reproduced.
SPLIT_SEED = 42

# Hold out 20% of the labelled corpus for validation, stratified on the label
# column so both parts keep the same class proportions.
X_train, X_test = train_test_split(
    corpus,
    test_size=0.2,
    stratify=corpus['labels'],
    random_state=SPLIT_SEED,
)

(11550, 2)
(2888, 2)


In [8]:
#X_train

Unnamed: 0,document,labels
10227,Induction of in vitro graft-versus-leukemia ac...,1
13477,Idiopathic sclerosing peritonitis in a man. Id...,5
11839,Buschke-Loewenstein tumor: verrucous carcinoma...,1
456,A phase II trial of carboplatin and vinblastin...,1
14359,Circumferential quantitative analysis of plana...,4
...,...,...
1347,Nonpeptide angiotensin II receptor antagonists...,4
403,Antibiotic use among children in an urban Braz...,2
3089,Gastrointestinal tuberculosis. Report of four ...,2
8340,Effect of the topical anesthetic EMLA on the e...,1


In [9]:
# Flatten the two parts of the split into plain Python lists, training rows
# first and held-out rows after them. Documents and labels are concatenated in
# the same order, so position i in one list matches position i in the other.
training_docs = X_train['document'].tolist() + X_test['document'].tolist()
training_labels = X_train['labels'].tolist() + X_test['labels'].tolist()

14438
14438


In [10]:
# Read the testing data (which doesn't have labels) and add it to the end of
# the list of all documents, after the training documents already in `docs`.
with open('test.dat', 'r') as test_file:
    lines2 = test_file.readlines()
docs.extend(lines2)

In [11]:
# printing sizes of what we've extracted to ensure we got all the data
# print(len(labels))
# print(len(lines2))
# print(len(docs))
# print(len(labels) + len(lines2))

14438
14442
28880
28880


**Preprocessing the Data**

In [12]:
# Lower-case the text of every training document, updating the list in place.
training_docs[:] = [document.lower() for document in training_docs]

In [13]:
# Lower-case the text of every document in the combined list, in place.
docs[:] = [document.lower() for document in docs]

In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Characters stripped from every document. Written as a raw string so that the
# backslash is a literal character to remove rather than the start of the
# invalid escape sequence "\," (which newer Python versions warn about).
punctuation = r'''!()[]{};:'"\,+=<>./?@#$%^&*_~'''
# Translation table that deletes all of the characters above in a single pass
# per document, instead of one full str.replace() scan for every punctuation
# character encountered.
punctuation_table = str.maketrans('', '', punctuation)

# English stop words; the set copy gives constant-time membership tests.
cached_words = stopwords.words('english')
stop_word_set = set(cached_words)

for i in range(len(training_docs)):
    # removing punctuation from the document
    text = training_docs[i].translate(punctuation_table)
    # removing stopwords and purely numeric tokens, tokenizing only once
    kept_tokens = [
        word for word in word_tokenize(text)
        if word not in stop_word_set and not word.isdigit()
    ]
    training_docs[i] = " ".join(kept_tokens)

induction of in vitro graft-versus-leukemia activity following bone marrow transplantation for chronic myeloid leukemia. we studied the in vitro effects of lymphokine-activated killer (lak) cells from the peripheral blood of chronic myeloid leukemia (cml) patients after allogeneic and syngeneic bone marrow transplantation (bmt). lak cells were generated by incubating peripheral blood mononuclear cells from patients post-bmt with recombinant interleukin-2 (il-2) (500 u/ml) in 10% ab serum for 7 days. they were phenotyped and tested for activity in a standard 4-hour 51cr release assay (n = 37) and in a cfu-gm assay (n = 24). we found that the lak cells were mainly activated natural killer cells, but some were cd3+ t cells. in the 51cr release assay lak cells from 20 of 33 (61%) allogeneic and 2 of 4 syngeneic recipients killed recipient cml cells and in 22 of 37 (60%) cases also killed the hla disparate cml cells. in the cfu-gm assay the lak cells incubated together with the cml cells in

In [15]:
# doing same steps for docs

# Characters stripped from every document. Written as a raw string so that the
# backslash is a literal character to remove rather than the start of the
# invalid escape sequence "\," (which newer Python versions warn about).
punctuation = r'''!()[]{};:'"\,+=<>./?@#$%^&*_~'''
# Translation table that deletes all of the characters above in a single pass
# per document, instead of one full str.replace() scan for every punctuation
# character encountered.
punctuation_table = str.maketrans('', '', punctuation)

# English stop words; the set copy gives constant-time membership tests.
cached_words = stopwords.words('english')
stop_word_set = set(cached_words)

for i in range(len(docs)):
    # removing punctuation from the document
    text = docs[i].translate(punctuation_table)
    # removing stopwords and purely numeric tokens, tokenizing only once
    kept_tokens = [
        word for word in word_tokenize(text)
        if word not in stop_word_set and not word.isdigit()
    ]
    docs[i] = " ".join(kept_tokens)

done


In [16]:
from nltk.stem import PorterStemmer

# Replace each training document with the space-joined Porter stems of its
# tokens. `stems` is a scratch buffer that is emptied after every document.
ps = PorterStemmer()
stems = []
for idx, document in enumerate(training_docs):
    stems.extend(ps.stem(token) for token in word_tokenize(document))
    training_docs[idx] = " ".join(stems)
    del stems[:]
print(training_docs[0])
print("done")

induction vitro graft-versus-leukemia activity following bone marrow transplantation chronic myeloid leukemia studied vitro effects lymphokine-activated killer lak cells peripheral blood chronic myeloid leukemia cml patients allogeneic syngeneic bone marrow transplantation bmt lak cells generated incubating peripheral blood mononuclear cells patients post-bmt recombinant interleukin-2 il-2 uml ab serum days phenotyped tested activity standard 4-hour 51cr release assay n cfu-gm assay n found lak cells mainly activated natural killer cells cd3 cells 51cr release assay lak cells allogeneic syngeneic recipients killed recipient cml cells cases also killed hla disparate cml cells cfu-gm assay lak cells incubated together cml cells liquid culture plating inhibited p less colony growth allogeneic syngeneic recipients cell-cell contact necessary optimal effect little inhibition proliferation donor marrow cfu-gm vitro graft-versus-leukemia gvl effect could also demonstrated lak effectors deplet

In [17]:
# doing the same for docs

# Replace each document in the combined list with the space-joined Porter
# stems of its tokens. `stems` is a scratch buffer emptied after every document.
ps = PorterStemmer()
stems = []
for idx, document in enumerate(docs):
    stems.extend(ps.stem(token) for token in word_tokenize(document))
    docs[idx] = " ".join(stems)
    del stems[:]
print("done")

done


In [18]:
# making dictionary with all words in documents as keys and assigning them ID values (ended up not using but kept here anyway)

# dictionary = {}
# word_ID = 0
# # creating dictionary of all words in the documents
# for d in docs:
#     words = d.split()
#     for word in words:
#         if word not in docs:
#             dictionary[word] = word_ID
#             word_ID = word_ID + 1
#     words.clear()

**Creating Term Frequency Matrix for Words in Documents**

In [20]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer


# Fit a vocabulary on the training documents and count term occurrences, then
# expand the counts into a dense table (rows = documents, columns = terms).
cv = CountVectorizer()
training_counts = cv.fit_transform(training_docs)
dense = training_counts.todense()
df = pd.DataFrame(data=dense)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29750,29751,29752,29753,29754,29755,29756,29757,29758,29759
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14433,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14434,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14435,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14436,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# rows = documents, columns = words
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old name only on versions that predate it.
if hasattr(cv, 'get_feature_names_out'):
    df.columns = cv.get_feature_names_out()
else:
    df.columns = cv.get_feature_names()
df

Unnamed: 0,00001abstract,00005,0001,0005,0006,0008,0009,001,0018,0021,...,zygomycet,zygomycosi,zygos,zylor,zymodem,zymogen,zymogram,zymographi,zymosan,zzygo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14433,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14434,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14435,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14436,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# doing the same for docs

# A second vectorizer, fitted on the combined list of documents, produces the
# dense term-count table used for the final predictions.
count_vect = CountVectorizer()
all_counts = count_vect.fit_transform(docs)
dense_matrix = all_counts.todense()
data_frame = pd.DataFrame(data=dense_matrix)
data_frame

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41932,41933,41934,41935,41936,41937,41938,41939,41940,41941
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28875,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28876,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28877,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28878,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# rows = documents, columns = words
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old name only on versions that predate it.
if hasattr(count_vect, 'get_feature_names_out'):
    data_frame.columns = count_vect.get_feature_names_out()
else:
    data_frame.columns = count_vect.get_feature_names()
data_frame

Unnamed: 0,00,000,00001abstract,00005,000054,000057min,0001,0001abstract,00027min,00038,...,zygomycosi,zygos,zygot,zylor,zymodem,zymogen,zymogram,zymographi,zymosan,zzygo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28875,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28876,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28877,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28878,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Converting Term Frequency Matrix to 2D Array**
(used for calculating cosine similarity of 2 vectors/documents)

In [24]:
# Term counts of the training documents as a plain NumPy array (one row per
# document, one column per vocabulary term), so rows can be sliced and passed
# to the similarity functions below.
training_vector = df.to_numpy()
training_vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [25]:
# Term counts of the combined document list as a plain NumPy array (one row
# per document, one column per vocabulary term).
test_vector = data_frame.to_numpy()
test_vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [26]:
# separating the training and testing sets
# The split sizes are derived from the data instead of being hard-coded, so the
# slices stay aligned if the input files or the split ratio ever change.
n_train = X_train.shape[0]   # rows in the training part of the validation split
n_labelled = len(labels)     # the labelled documents come first in `docs`

# training_docs and training_labels were built together, so the matrix must
# have exactly one row per label.
assert training_vector.shape[0] == len(training_labels)

# training_docs lists the training rows first and the held-out rows after them.
X_train = training_vector[:n_train]
print(X_train.shape)

X_test = training_vector[n_train:]
print(X_test.shape)


# docs lists every labelled document first and the unlabelled ones after them.
X_train_complete = test_vector[:n_labelled]
print(X_train_complete.shape)

X_test_complete = test_vector[n_labelled:]
print(X_test_complete.shape)

(11550, 29760)
(2888, 29760)
(14438, 41942)
(14442, 41942)


**K Nearest Neighbor Algorithm**

In [28]:
from collections import Counter

# def euclidean_distance(x1, x2):
#      return np.sqrt(np.sum((x1 - x2)**2))

def L2_norm(v):
    """Return the Euclidean (L2) length of ``v``.

    The elements are squared, summed, and the square root of that sum is
    returned.
    """
    squared = np.square(v)
    total = np.sum(squared)
    return np.sqrt(total)

# computes cosine similarity of 2 vectors
def cosine_similarity(x1, x2):
    """Return the cosine similarity of vectors ``x1`` and ``x2``.

    The result is the dot product of the two vectors divided by the product
    of their L2 norms.

    If either vector has zero length the quotient is undefined (0/0). Such a
    pair is reported as similarity 0.0 instead of NaN: ``np.argsort`` places
    NaN after every real number, so a NaN score would be ranked as the most
    similar neighbour by ``KNN``.
    """
    norm_product = L2_norm(x1) * L2_norm(x2)
    if norm_product == 0:
        # At least one vector is all zeros; there is no direction to compare.
        return 0.0
    return np.dot(x1, x2) / norm_product
    
def KNN(X_train, y_train, x, k):
    """Predict the label of one sample by majority vote of its nearest neighbours.

    Parameters
    ----------
    X_train : iterable of vectors; each one is compared with ``x``.
    y_train : sequence of labels, indexable by the positions of ``X_train``.
    x       : the vector to classify.
    k       : number of most-similar training vectors that vote (must be >= 1).

    Returns
    -------
    The label that occurs most often among the ``k`` most similar training
    vectors. When several labels are tied, the one belonging to the most
    similar neighbour among them is returned.

    Raises
    ------
    ValueError if ``k`` is smaller than 1 (a slice of ``[-0:]`` would silently
    select every training sample).
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # compute the similarity of x to every training vector
    similarities = [cosine_similarity(x, x_train) for x_train in X_train]

    # argsort is ascending, so the last k positions are the k most similar;
    # reverse them so the closest neighbour comes first
    k_indicies = np.argsort(similarities)[-k:][::-1]
    k_nearest_labels = [y_train[i] for i in k_indicies]

    # use majority vote to choose label; Counter orders equal counts by first
    # occurrence, so with nearest-first input a tie goes to the closest neighbour
    most_common = Counter(k_nearest_labels).most_common(1)
    return most_common[0][0]

def predict(X_train, y_train, X_test, k):
    """Classify every sample in ``X_test`` with ``KNN``.

    Each element of ``X_test`` is passed to ``KNN`` together with
    ``X_train``, ``y_train`` and ``k``; the resulting labels are returned as
    a NumPy array in the same order as ``X_test``.
    """
    predicted_labels = []
    for sample in X_test:
        predicted_labels.append(KNN(X_train, y_train, sample, k))
    return np.array(predicted_labels)



**Validating KNN on the Held-Out Split**

In [None]:
# Number of nearest neighbours that vote on each prediction.
k = 3
# Classify the held-out rows using the training rows of the validation split.
# The full label list is passed as y_train; KNN only looks up positions that
# index into X_train, which are the leading entries of that list.
predicted = predict(X_train=X_train, y_train=training_labels, X_test=X_test, k=k)

In [None]:
from sklearn.metrics import f1_score

# training_labels lists the training part of the split first, so the labels of
# the held-out rows are its trailing len(predicted) entries (derived from the
# data rather than a hard-coded row count).
true_train_labels = training_labels[len(training_labels) - len(predicted):]

# The labels span more than two classes. f1_score defaults to average='binary',
# which raises a ValueError for multiclass targets, so an averaging mode has to
# be chosen explicitly; 'macro' gives every class equal weight.
print(f1_score(true_train_labels, predicted, average='macro'))

**Testing KNN**

In [None]:
# Predict a label for every unlabelled test document, using all labelled
# documents as neighbour candidates.
test_predictions = predict(X_train_complete, labels, X_test_complete, k)

# The context manager closes the file even if a write fails part-way through.
# write() emits each label as one string; writelines() expects an iterable of
# lines and only worked here because a string iterates character by character.
with open('predictions.txt', 'w') as file:
    for p in test_predictions:
        file.write(p + '\n')