In [1094]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Read the data
file_path = 'gene_files/Genes_relation.data'

# Import the data and specify the column names
column_names = ['GeneID', 'Essential', 'Class', 'Complex', 'Phenotype', 'Motif', 'Chromosome', 'Function', 'Localization']
df = pd.read_csv(file_path, names=column_names, header=0, na_values='?')

# Drop function 
df = df.drop(['Function'], axis=1)

df.head()



Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Localization
0,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,,PS00824,1.0,cytoplasm
1,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,,PS00825,1.0,cytoplasm
2,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,,PS00825,1.0,cytoplasm
3,G234065,Non-Essential,ATPases,,,,1.0,cytoplasm
4,G234065,Non-Essential,ATPases,,,,1.0,cytoplasm


## 1. Label Encode the Categorical data

In [1095]:
le = LabelEncoder()
df_label_encoded = df.copy()
df_label_encoded['GeneID'] = le.fit_transform(df_label_encoded['GeneID'])
df_label_encoded['Essential'] = le.fit_transform(df_label_encoded['Essential'])
df_label_encoded['Class'] = le.fit_transform(df_label_encoded['Class'])
df_label_encoded['Complex'] = le.fit_transform(df_label_encoded['Complex'])
df_label_encoded['Phenotype'] = le.fit_transform(df_label_encoded['Phenotype'])
df_label_encoded['Motif'] = le.fit_transform(df_label_encoded['Motif'])
df_label_encoded['Chromosome'] = le.fit_transform(df_label_encoded['Chromosome'])
df_label_encoded['Localization'] = le.fit_transform(df_label_encoded['Localization'])

df_label_encoded.head(10)

Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Localization
0,0,1,6,46,12,175,0,2
1,0,1,6,46,12,176,0,2
2,0,1,6,46,12,176,0,2
3,1,2,0,51,12,235,0,2
4,1,2,0,51,12,235,0,2
5,1,2,0,51,12,235,0,2
6,1,2,0,51,12,235,0,2
7,1,2,10,51,12,235,0,2
8,1,2,10,51,12,235,0,2
9,1,2,10,51,12,235,0,2


## 2. Fill in empty data

In [1096]:
# Fill missing values with the mean column values
df_label_encoded.fillna(df_label_encoded.mean(), inplace=True)

df_label_encoded.head(10)

Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Localization
0,0,1,6,46,12,175,0,2
1,0,1,6,46,12,176,0,2
2,0,1,6,46,12,176,0,2
3,1,2,0,51,12,235,0,2
4,1,2,0,51,12,235,0,2
5,1,2,0,51,12,235,0,2
6,1,2,0,51,12,235,0,2
7,1,2,10,51,12,235,0,2
8,1,2,10,51,12,235,0,2
9,1,2,10,51,12,235,0,2


## 3. Run coorelation analysis to find important attributes

In [1097]:
# Find the correlation between the features and the target by using information gain
df_label_encoded.corr()

Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Localization
GeneID,1.0,0.029503,0.662908,0.183476,0.04989,0.246065,0.214724,-0.095205
Essential,0.029503,1.0,-0.010133,0.10675,0.091305,0.036457,0.019697,-0.004759
Class,0.662908,-0.010133,1.0,0.013874,0.054108,0.212992,-0.020329,-0.129265
Complex,0.183476,0.10675,0.013874,1.0,0.114788,-0.040084,-0.047835,0.027184
Phenotype,0.04989,0.091305,0.054108,0.114788,1.0,-0.005322,-0.022047,0.001903
Motif,0.246065,0.036457,0.212992,-0.040084,-0.005322,1.0,0.078439,0.020144
Chromosome,0.214724,0.019697,-0.020329,-0.047835,-0.022047,0.078439,1.0,0.015491
Localization,-0.095205,-0.004759,-0.129265,0.027184,0.001903,0.020144,0.015491,1.0


## 4. We will now split the dataset into a training set and a test set with a 20% hold out

In [1098]:
# Split the data into a training set and a validation set
train_data, test_data, train_labels, test_labels = train_test_split(df_label_encoded.drop('Localization', axis=1), df_label_encoded['Localization'], test_size=0.2)

# Print the number of rows in each set
print("Training set: {} samples".format(train_data.shape[0]))
print("Test set: {} samples".format(test_data.shape[0]))

Training set: 3476 samples
Test set: 869 samples


## 5. Now that the data has been prepared we will implement the decision tree algorithm from scratch 

[Reference Algorithm](https://towardsdatascience.com/decision-tree-algorithm-in-python-from-scratch-8c43f0e40173)

First create the Node class that creates nodes for the Decision Tree

In [1099]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None

Initialize the Decision Tree class

In [1100]:
class DecisionTree:
    def __init__(self, min_samples_split=100, max_depth=100, n_feats=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.root = None

    def fit(self, X, y):
        # grow tree
        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        return [self._traverse_tree(x, self.root) for x in X]

    def _gini(self, y):
        # count number of samples at each label
        num_samples_per_class = {label: np.sum(y == label) for label in np.unique(y)}
        # sum weighted Gini impurity for each label
        gini = 1.0
        for label in num_samples_per_class:
            prob_of_label = num_samples_per_class[label] / len(y)
            gini -= prob_of_label ** 2
        return gini

    def _best_split(self, X, y):
        # number of samples and features
        m, n = X.shape
        # we cannot make a split if there are not enough samples
        if m <= self.min_samples_split:
            return None, None
        # initial values
        best_gini = 1.0
        best_idx, best_thr = None, None
        # grow a decision tree
        for idx in range(n):
            # values of feature `idx`
            values = X[:, idx]
            # unique values of feature `idx`
            thresholds = np.unique(values)
            # try all thresholds
            for thr in thresholds:
                # split data
                indices_left = values <= thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                # skip if the split is not valid
                if len(y_left) == 0 or len(y_right) == 0:
                    continue
                # compute weighted Gini impurity
                gini = (len(y_left) * self._gini(y_left) + len(y_right) * self._gini(y_right)) / len(y)
                # update best
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = thr
        return best_idx, best_thr

    def _grow_tree(self, X, y, depth=0):
        # number of samples at each label
        num_samples_per_class = {label: np.sum(y == label) for label in np.unique(y)}
        # prediction is the label that has the most samples
        prediction = max(num_samples_per_class, key=num_samples_per_class.get)
        # create node
        node = Node(
            value=prediction
        )
        # split data
        idx, thr = self._best_split(X, y)
        # grow tree
        if idx is not None:
            # indices of data points going left and right
            indices_left = X[:, idx] <= thr
            # grow left and right subtrees
            if depth < self.max_depth:
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature = idx
                node.threshold = thr
                node.left = self._grow_tree(X_left, y_left, depth + 1)
                node.right = self._grow_tree(X_right, y_right, depth + 1)
        return node

    def _traverse_tree(self, x, node):
        # if we have reached a leaf node, return its value
        if node.is_leaf_node():
            return node.value
        # traverse left or right subtree
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)
    

## 6. Make a call to the Decision Tree and train it

In [1101]:
tree = DecisionTree(max_depth=5)
tree.fit(train_data.values, train_labels.values)

In [1102]:
# Predict the labels for the test data
preds = tree.predict(test_data.values)

# Calculate the accuracy
accuracy = np.sum(preds == test_labels.values) / len(preds)
print("The test accuracy is {:.1f}%".format(accuracy * 100))

# Export the the results to a CSV file
output = pd.DataFrame({'Id': test_data.index, 'Localization': preds}) 
output.to_csv('submission.csv', index=False)

The test accuracy is 41.1%
