In [232]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import operator
import math

# Read the data
file_path_train = 'gene_files/Genes_relation.data'

# Import the data and specify the column names
column_names = ['GeneID', 'Essential', 'Class', 'Complex', 'Phenotype', 'Motif', 'Chromosome', 'Function', 'Localization']
df_train = pd.read_csv(file_path_train, names=column_names, header=0, na_values='?')

# Drop function 
df_train = df_train.drop(['Function'], axis=1)

# convert nan to 0
df_train = df_train.fillna(0)

df_train.head(10)



Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Localization
0,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,0,PS00824,1.0,cytoplasm
1,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,0,PS00825,1.0,cytoplasm
2,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,0,PS00825,1.0,cytoplasm
3,G234065,Non-Essential,ATPases,0,0,0,1.0,cytoplasm
4,G234065,Non-Essential,ATPases,0,0,0,1.0,cytoplasm
5,G234065,Non-Essential,ATPases,0,0,0,1.0,cytoplasm
6,G234065,Non-Essential,ATPases,0,0,0,1.0,cytoplasm
7,G234065,Non-Essential,Molecular chaperones,0,0,0,1.0,cytoplasm
8,G234065,Non-Essential,Molecular chaperones,0,0,0,1.0,cytoplasm
9,G234065,Non-Essential,Molecular chaperones,0,0,0,1.0,cytoplasm


## 1. Label Encode the training and test data

In [233]:
label_encoder = LabelEncoder()
for column in df_train.columns:
    df_train[column] = label_encoder.fit_transform(df_train[column].astype(str))

df_train.head()

Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Localization
0,0,2,7,47,0,176,1,2
1,0,2,7,47,0,177,1,2
2,0,2,7,47,0,177,1,2
3,1,3,1,0,0,0,1,2
4,1,3,1,0,0,0,1,2


## 2. We will now specify the training and test set

In [234]:
# Split the data into a training set and a test set
# Localization is the target
train_data, test_data = train_test_split(df_train, test_size=0.2, random_state=42)


# Print the number of rows in each set
print("Training set: {} samples".format(train_data.shape[0]))
print("Test set: {} samples".format(test_data.shape[0]))

test_set = test_data.values.tolist() 


train_data.head()

Training set: 3476 samples
Test set: 869 samples


Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Localization
3666,682,3,0,0,0,0,11,10
2281,387,3,0,48,0,0,14,0
736,111,3,10,0,0,69,15,12
3416,618,3,0,0,0,0,14,5
3434,624,3,0,0,0,23,5,2


## 3. Now that the data has been prepared we will implement the K-NN algorithm from scratch 

[Reference Algorithm](https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/)

### Create custom weighted distance measure

In [235]:
# Custom distance function for the KNN algorithm where Complex=1000, Class=100, Motif=10, Interactions=1
# If the test gene and the training gene have the same value for an attribute, then the weight is added to the distance
# In this case the larger the weight, the smaller the distance
# def distance(test_gene, train_gene):
#     distance = math.inf
#     for i in range(len(test_gene)):
#         if test_gene[i] == train_gene[i]:
#             if i == 3:
#                 distance -= 1000
#             elif i == 2:
#                 distance -= 100
#             elif i == 4:
#                 distance -= 10
#             elif i == 5:
#                 distance -= 1
#     return distance

# Hamming distance
def distance(test_gene, train_gene):
    distance = 0
    for i in range(len(test_gene)):
        if test_gene[i] != train_gene[i]:
            distance += 1
    return distance

## 4. Calculate K-NN

The function getNeighbors takes in three inputs: train_set, test_set, and k. train_set is a 2D numpy array where each row represents a gene in the training dataset, test_set is a 1D numpy array representing the test gene, and k is the number of nearest neighbors you want to find.

The function first calculates the distance between the test gene and each gene in the training dataset using the distance function. The distances are stored in a list of tuples, where each tuple contains the training gene and its corresponding distance from the test gene.

Next, the list of tuples is sorted in ascending order based on the distances using the sort function and the key parameter set to operator.itemgetter(1). This means that the sorting will be based on the second element of each tuple, i.e., the distances.

Finally, the function returns a list of the k nearest neighbors by selecting the first k elements of the sorted list of tuples.

In [236]:
def getNeighbors(train_set, test_set, k):
    distances = []
    length = len(test_set)-1
    for x in range(len(train_set)):
        dist = distance(test_set, train_set[x])
        distances.append((train_set[x], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors

print(getNeighbors(train_data.values.tolist(), test_set[3], 5))

[[83, 2, 5, 0, 3, 0, 13, 6], [83, 2, 5, 0, 6, 0, 13, 6], [83, 2, 5, 0, 5, 0, 13, 6], [83, 2, 5, 0, 6, 0, 13, 6], [83, 2, 5, 0, 11, 0, 13, 6]]


" The most similar neighbors collected from the training dataset can be used to make predictions.

In the case of classification, we can return the most represented class among the neighbors.

We can achieve this by performing the max() function on the list of output values from the neighbors. Given a list of class values observed in the neighbors, the max() function takes a set of unique class values and calls the count on the list of class values for each class value in the set. "

In [237]:
# Predict the class of a test gene
# Convert from unhashable type: 'list' to hashable type: 'tuple'
def predict(train_set, test_set, k):
    neighbors = getNeighbors(train_set, test_set, k)
    output_values = [row[-1] for row in neighbors]
    output_values = [int(i) for i in output_values]
    prediction = max(set(output_values), key=output_values.count)
    return prediction

## 5. Make a function to get the accuracy

In [238]:
def getAccuracy(test_set, predictions):
    correct = 0
    for x in range(len(test_set)):
        if test_set[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(test_set))) * 100.0

## 6. Implement on current dataset

In [239]:
predictions = []  #List to store the predicted values
k = 5 # 1-Nearest Neighbors
training_set = train_data.values.tolist()  #List containing training data
test_set = test_data.values.tolist()  #List containing test data
for x in range(len(test_set)):
    result = predict(training_set, test_set[x], k)
    predictions.append(result)  # Storing the predicted values
    # Convert labelEncoder to original value
    result = label_encoder.inverse_transform([result]).item()
    actual = label_encoder.inverse_transform([test_set[x][-1]]).item()
    print('> predicted=' + repr(result) + ', actual=' + repr(actual))

> predicted='nucleus', actual='nucleus'
> predicted='mitochondria', actual='mitochondria'
> predicted='nucleus', actual='nucleus'
> predicted='golgi', actual='golgi'
> predicted='nucleus', actual='nucleus'
> predicted='golgi', actual='golgi'
> predicted='nucleus', actual='nucleus'
> predicted='cytoskeleton', actual='cytoskeleton'
> predicted='cytoplasm', actual='cytoplasm'
> predicted='cytoskeleton', actual='cytoskeleton'
> predicted='ER', actual='ER'
> predicted='mitochondria', actual='mitochondria'
> predicted='nucleus', actual='peroxisome'
> predicted='golgi', actual='golgi'
> predicted='ER', actual='ER'
> predicted='cytoplasm', actual='cytoplasm'
> predicted='nucleus', actual='nucleus'
> predicted='cytoskeleton', actual='cytoskeleton'
> predicted='cytoplasm', actual='cytoplasm'
> predicted='nucleus', actual='nucleus'
> predicted='nucleus', actual='nucleus'
> predicted='cytoplasm', actual='cytoplasm'
> predicted='nucleus', actual='nucleus'
> predicted='cytoskeleton', actual='cytoske

## 7. Print the accuracy of the current model

In [240]:
# Accuracy of the model
accuracy = getAccuracy(test_set, predictions)
print('Accuracy: ' + repr(accuracy) + '%')

Accuracy: 98.96432681242808%


## 8. Export the result

In [241]:
# Export the results to a csv file
df = pd.DataFrame(predictions)
df.to_csv('predictions.csv', index=False, header=False)