In [184]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import operator
import math

# Read the data
file_path_train = 'gene_files/Genes_relation.data'

# Import the data and specify the column names
column_names = ['GeneID', 'Essential', 'Class', 'Complex', 'Phenotype', 'Motif', 'Chromosome', 'Function', 'Localization']
df_train = pd.read_csv(file_path_train, names=column_names, header=0, na_values='?')

# join interaction data iff interaction[0] = df_train[geneID], then df_train[Interactions] = interaction[1]
df_train_interactions = pd.read_csv('gene_files/Interactions_relation.data', names=['GeneID1', 'GeneID2', 'Type', 'ExpressionCorr'], header=0, na_values='?')
df_train_interactions.drop(["Type", "ExpressionCorr"], axis=1, inplace=True)
df_train_interactions = df_train_interactions.groupby('GeneID1')['GeneID2'].apply(list).reset_index(name='Interactions')

df_train = df_train.merge(df_train_interactions, left_on='GeneID', right_on='GeneID1', how='left')

# Drop function 
df_train = df_train.drop(['Function', 'GeneID1'], axis=1)

# Move Localization to the end
cols = list(df_train.columns.values)
cols.pop(cols.index('Localization'))
df_train = df_train[cols+['Localization']]

df_train.head()



Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Interactions,Localization
0,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,,PS00824,1.0,"[G234126, G235065]",cytoplasm
1,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,,PS00825,1.0,"[G234126, G235065]",cytoplasm
2,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,,PS00825,1.0,"[G234126, G235065]",cytoplasm
3,G234065,Non-Essential,ATPases,,,,1.0,"[G234854, G234371]",cytoplasm
4,G234065,Non-Essential,ATPases,,,,1.0,"[G234854, G234371]",cytoplasm


## 1. Label Encode the training and test data

In [185]:
# Label encode the training data set
label_encoder = LabelEncoder()
for column in df_train.columns:
    df_train[column] = label_encoder.fit_transform(df_train[column].astype(str))

df_train.head()

Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Interactions,Localization
0,0,1,6,46,12,175,0,14,2
1,0,1,6,46,12,176,0,14,2
2,0,1,6,46,12,176,0,14,2
3,1,2,0,51,12,235,0,124,2
4,1,2,0,51,12,235,0,124,2


## 2. We will now specify the training and test set

In [186]:
# Split the data into a training set and a validation set
train_data, test_data, train_labels, test_labels = train_test_split(df_train.drop('Localization', axis=1), df_train['Localization'], test_size=0.2)

# Print the number of rows in each set
print("Training set: {} samples".format(train_data.shape[0]))
print("Test set: {} samples".format(test_data.shape[0]))

test_set = test_data.values.tolist() 
print(test_set[3])

Training set: 3476 samples
Test set: 869 samples
[481, 1, 23, 35, 12, 235, 3, 370]


## 3. Now that the data has been prepared we will implement the K-NN algorithm from scratch 

[Reference Algorithm](https://colab.research.google.com/github/akshayrb22/playing-with-data/blob/master/supervised_learning/KNN/KNN.ipynb#scrollTo=jqJptfIChiwY)

### Create custom weighted distance measure

In [187]:
# Custom distance function for the KNN algorithm where Complex=1000, Class=100, Motif=10, Interactions=1
# If the test gene and the training gene have the same value for an attribute, then the weight is added to the distance
# In this case the larger the weight, the smaller the distance
def distance(test_gene, train_gene):
    distance = math.inf
    for i in range(len(test_gene)):
        if test_gene[i] == train_gene[i]:
            if i == 3:
                distance -= 1000
            elif i == 2:
                distance -= 100
            elif i == 4:
                distance -= 10
            elif i == 5:
                distance -= 1
    return distance

## 4. Calculate K-NN

The function getNeighbors takes in three inputs: train_set, test_set, and k. train_set is a 2D numpy array where each row represents a gene in the training dataset, test_set is a 1D numpy array representing the test gene, and k is the number of nearest neighbors you want to find.

The function first calculates the distance between the test gene and each gene in the training dataset using the distance function. The distances are stored in a list of tuples, where each tuple contains the training gene and its corresponding distance from the test gene.

Next, the list of tuples is sorted in ascending order based on the distances using the sort function and the key parameter set to operator.itemgetter(1). This means that the sorting will be based on the second element of each tuple, i.e., the distances.

Finally, the function returns a list of the k nearest neighbors by selecting the first k elements of the sorted list of tuples.

In [188]:
def getNeighbors(train_set, test_set, k):
    distances = []
    length = len(test_set)-1
    for x in range(len(train_set)):
        dist = distance(test_set, train_set[x])
        distances.append((train_set[x], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors

In [189]:
def getResponse(neighbors):
    classVotes = {}  #Dictionary to store the votes of each class
    for x in range(len(neighbors)):
        response = neighbors[x][-1]  #The last column of the dataset is the class
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)  #Sorting the dictionary based on the votes
    return sortedVotes[0][0]  #Returning the class with the highest votes

## 5. Make a function to get the accuracy

In [190]:
def getAccuracy(test_set, predictions):
    correct = 0
    for x in range(len(test_set)):
        if test_set[x][-1] == predictions[x]:
            correct += 1
    return (correct / float(len(test_set))) * 100.0

## 6. Implement on current dataset

In [191]:
predictions = []  #List to store the predicted values
k = 2  # 1-Nearest Neighbors
training_set = train_data.values.tolist()  #List containing training data
test_set = test_data.values.tolist()  #List containing test data
for x in range(len(test_set)):
    neighbors = getNeighbors(training_set, test_set[x], k)
    result = getResponse(neighbors)
    predictions.append(result)  # Storing the predicted values
    print('> predicted=' + repr(result) + ', actual=' + repr(test_set[x][-1]))

> predicted=393, actual=337
> predicted=393, actual=393
> predicted=393, actual=23
> predicted=393, actual=370
> predicted=393, actual=226
> predicted=393, actual=393
> predicted=393, actual=94
> predicted=393, actual=169
> predicted=393, actual=393
> predicted=393, actual=393
> predicted=393, actual=116
> predicted=393, actual=17
> predicted=393, actual=393
> predicted=393, actual=374
> predicted=393, actual=169
> predicted=393, actual=25
> predicted=393, actual=393
> predicted=393, actual=23
> predicted=393, actual=68
> predicted=393, actual=393
> predicted=393, actual=250
> predicted=393, actual=393
> predicted=393, actual=393
> predicted=393, actual=39
> predicted=393, actual=393
> predicted=393, actual=393
> predicted=393, actual=296
> predicted=393, actual=393
> predicted=393, actual=6
> predicted=393, actual=393
> predicted=393, actual=393
> predicted=393, actual=386
> predicted=393, actual=90
> predicted=393, actual=256
> predicted=393, actual=393
> predicted=393, actual=94
> p

## 7. Print the accuracy of the current model

In [192]:
# Accuracy of the model
accuracy = getAccuracy(test_set, predictions)
print('Accuracy: ' + repr(accuracy) + '%')

Accuracy: 45.109321058688145%


## 8. Export the result

In [193]:
# Export the results to a csv file
df = pd.DataFrame(predictions)
df.to_csv('predictions.csv', index=False, header=False)