In [173]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import operator
import math

# Read the data
file_path = 'gene_files/Genes_relation.data'

# Import the data and specify the column names
column_names = ['GeneID', 'Essential', 'Class', 'Complex', 'Phenotype', 'Motif', 'Chromosome', 'Function', 'Localization']
df = pd.read_csv(file_path, names=column_names, header=0, na_values='?')

# Drop function 
df = df.drop(['Function'], axis=1)

df.head()



Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Localization
0,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,,PS00824,1.0,cytoplasm
1,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,,PS00825,1.0,cytoplasm
2,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,,PS00825,1.0,cytoplasm
3,G234065,Non-Essential,ATPases,,,,1.0,cytoplasm
4,G234065,Non-Essential,ATPases,,,,1.0,cytoplasm


## 1. Label Encode the Categorical data

In [174]:
le = LabelEncoder()
df_label_encoded = df.copy()
df_label_encoded['GeneID'] = le.fit_transform(df_label_encoded['GeneID'])
df_label_encoded['Essential'] = le.fit_transform(df_label_encoded['Essential'])
df_label_encoded['Class'] = le.fit_transform(df_label_encoded['Class'])
df_label_encoded['Complex'] = le.fit_transform(df_label_encoded['Complex'])
df_label_encoded['Phenotype'] = le.fit_transform(df_label_encoded['Phenotype'])
df_label_encoded['Motif'] = le.fit_transform(df_label_encoded['Motif'])
df_label_encoded['Chromosome'] = le.fit_transform(df_label_encoded['Chromosome'])
df_label_encoded['Localization'] = le.fit_transform(df_label_encoded['Localization'])

df_label_encoded.head(10)

Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Localization
0,0,1,6,46,12,175,0,2
1,0,1,6,46,12,176,0,2
2,0,1,6,46,12,176,0,2
3,1,2,0,51,12,235,0,2
4,1,2,0,51,12,235,0,2
5,1,2,0,51,12,235,0,2
6,1,2,0,51,12,235,0,2
7,1,2,10,51,12,235,0,2
8,1,2,10,51,12,235,0,2
9,1,2,10,51,12,235,0,2


## 2. Fill in empty data

In [175]:
# Fill missing values with the mean column values
df_label_encoded.fillna(df_label_encoded.mean(), inplace=True)

df_label_encoded.head(10)

Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Localization
0,0,1,6,46,12,175,0,2
1,0,1,6,46,12,176,0,2
2,0,1,6,46,12,176,0,2
3,1,2,0,51,12,235,0,2
4,1,2,0,51,12,235,0,2
5,1,2,0,51,12,235,0,2
6,1,2,0,51,12,235,0,2
7,1,2,10,51,12,235,0,2
8,1,2,10,51,12,235,0,2
9,1,2,10,51,12,235,0,2


## 3. Run coorelation analysis to find important attributes

In [176]:
# Find the correlation between the features and the target by using information gain
df_label_encoded.corr()

Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Localization
GeneID,1.0,0.029503,0.662908,0.183476,0.04989,0.246065,0.214724,-0.095205
Essential,0.029503,1.0,-0.010133,0.10675,0.091305,0.036457,0.019697,-0.004759
Class,0.662908,-0.010133,1.0,0.013874,0.054108,0.212992,-0.020329,-0.129265
Complex,0.183476,0.10675,0.013874,1.0,0.114788,-0.040084,-0.047835,0.027184
Phenotype,0.04989,0.091305,0.054108,0.114788,1.0,-0.005322,-0.022047,0.001903
Motif,0.246065,0.036457,0.212992,-0.040084,-0.005322,1.0,0.078439,0.020144
Chromosome,0.214724,0.019697,-0.020329,-0.047835,-0.022047,0.078439,1.0,0.015491
Localization,-0.095205,-0.004759,-0.129265,0.027184,0.001903,0.020144,0.015491,1.0


## 4. We will now split the dataset into a training set and a test set with a 20% hold out

In [177]:
# Split the data into a training set and a validation set
train_data, test_data, train_labels, test_labels = train_test_split(df_label_encoded.drop('Localization', axis=1), df_label_encoded['Localization'], test_size=0.2)

# Print the number of rows in each set
print("Training set: {} samples".format(train_data.shape[0]))
print("Test set: {} samples".format(test_data.shape[0]))

Training set: 3476 samples
Test set: 869 samples


## 5. Now that the data has been prepared we will implement the K-NN algorithm from scratch 

[Reference Algorithm](https://colab.research.google.com/github/akshayrb22/playing-with-data/blob/master/supervised_learning/KNN/KNN.ipynb#scrollTo=jqJptfIChiwY)

First create the Node class that creates nodes for the Decision Tree

In [178]:
def euclideanDistance(instance1, instance2, length):
    distance = 0
    for x in range(length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)

## 6. Calculate K-NN

In [179]:
def getNeighbors(train_set, test_set, k):
    distances = []
    length = len(test_set)-1
    for x in range(len(train_set)):
        dist = euclideanDistance(test_set, train_set[x], length)
        distances.append((train_set[x], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors

In [180]:
def getResponse(neighbors):
    classVotes = {}  #Dictionary to store the votes of each class
    for x in range(len(neighbors)):
        response = neighbors[x][-1]  #The last column of the dataset is the class
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)  #Sorting the dictionary based on the votes
    return sortedVotes[0][0]  #Returning the class with the highest votes

## 6. Make a function to get the accuracy

In [181]:
def getAccuracy(test_set, predictions):
    correct = 0
    for x in range(len(test_set)):
        if test_set[x][-1] == predictions[x]:
            correct += 1
    return (correct / float(len(test_set))) * 100.0

## 7. Implement on current dataset

In [182]:
predictions = []  #List to store the predicted values
k = 2  # 1-Nearest Neighbors
training_set = train_data.values.tolist()  #List containing training data
test_set = test_data.values.tolist()  #List containing test data
for x in range(len(test_set)):
    neighbors = getNeighbors(training_set, test_set[x], k)
    result = getResponse(neighbors)
    predictions.append(result)  # Storing the predicted values
    print('> predicted=' + repr(result) + ', actual=' + repr(test_set[x][-1]))

> predicted=12, actual=12
> predicted=12, actual=12
> predicted=12, actual=12
> predicted=9, actual=9
> predicted=7, actual=7
> predicted=13, actual=13
> predicted=1, actual=1
> predicted=3, actual=3
> predicted=14, actual=14
> predicted=9, actual=9
> predicted=11, actual=14
> predicted=4, actual=4
> predicted=13, actual=13
> predicted=9, actual=9
> predicted=11, actual=11
> predicted=11, actual=11
> predicted=6, actual=6
> predicted=3, actual=3
> predicted=7, actual=7
> predicted=13, actual=13
> predicted=10, actual=10
> predicted=13, actual=13
> predicted=4, actual=4
> predicted=14, actual=14
> predicted=8, actual=8
> predicted=7, actual=7
> predicted=4, actual=4
> predicted=14, actual=14
> predicted=1, actual=1
> predicted=12, actual=12
> predicted=3, actual=3
> predicted=3, actual=1
> predicted=7, actual=7
> predicted=11, actual=11
> predicted=11, actual=11
> predicted=10, actual=10
> predicted=2, actual=2
> predicted=1, actual=1
> predicted=14, actual=14
> predicted=10, actual=10


## 8. Print the accuracy of the current model

In [183]:
# Accuracy of the model
accuracy = getAccuracy(test_set, predictions)
print('Accuracy: ' + repr(accuracy) + '%')

Accuracy: 96.66283084004603%


## 9. Export the result

In [184]:
# Export of the results <GeneID, Localization>
df_results = pd.DataFrame({'GeneID': test_data['GeneID'], 'Localization': predictions})
df_results.to_csv('results.csv', index=False)