In [731]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Read the data
file_path = 'gene_files/Genes_relation.data'

# Import the data and specify the column names
column_names = ['GeneID', 'Essential', 'Class', 'Complex', 'Phenotype', 'Motif', 'Chromosome', 'Function', 'Localization']
df = pd.read_csv(file_path, names=column_names, header=0, na_values='?')

# Print the first 5 rows of the data
df.head()



Unnamed: 0,GeneID,Essential,Class,Complex,Phenotype,Motif,Chromosome,Function,Localization
0,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,,PS00824,1.0,PROTEIN SYNTHESIS,cytoplasm
1,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,,PS00825,1.0,CELLULAR ORGANIZATION (proteins are localized ...,cytoplasm
2,G234064,Essential,GTP/GDP-exchange factors (GEFs),Translation complexes,,PS00825,1.0,PROTEIN SYNTHESIS,cytoplasm
3,G234065,Non-Essential,ATPases,,,,1.0,"CELL RESCUE, DEFENSE, CELL DEATH AND AGEING",cytoplasm
4,G234065,Non-Essential,ATPases,,,,1.0,CELLULAR ORGANIZATION (proteins are localized ...,cytoplasm


Before running any analysis we will run pre-processing on the data to make sure there are no duplicates and provide some normalization

In [732]:
print("Number of duplicated rows: {}.".format(df.duplicated().sum()))

Number of duplicated rows: 0.


There are no duplicates but there are many missing values

In [733]:
# Use the isna method to identify missing values
missing = df.isna()

# Count the number of missing values in each column
missing_counts = missing.sum()

# Print the number of missing values in each column
print(missing_counts[missing_counts > 0])

Essential      133
Class         2657
Complex       1890
Phenotype     1064
Motif         2239
Chromosome       2
dtype: int64


We will do Label encoding to allow for better normalization of the dataset

In [734]:
# Drop the rows with missing values
df = df.dropna()

# Print the number of rows and columns in the dataset
print("The dataset now contains {} rows and {} columns.".format(*df.shape))

# Print the number of unique values for each column
print(df.nunique())

The dataset now contains 438 rows and 9 columns.
GeneID          48
Essential        3
Class           15
Complex         22
Phenotype        9
Motif           34
Chromosome      15
Function        13
Localization     7
dtype: int64


We will now split the dataset into a training set and a test set with a 20% hold out

In [735]:
# Split the data into a training set and a validation set
train_data, test_data, train_labels, test_labels = train_test_split(df.drop('Localization', axis=1), df['Localization'], test_size=0.2)

# Print the number of rows in each set
print("Training set: {} samples".format(train_data.shape[0]))
print("Test set: {} samples".format(test_data.shape[0]))

Training set: 350 samples
Test set: 88 samples


## Now that the data has been prepared we will implement the decision tree algorithm from scratch 

[Reference Algorithm](https://towardsdatascience.com/decision-tree-algorithm-in-python-from-scratch-8c43f0e40173)

First create the Node class that creates nodes for the Decision Tree

In [736]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None

Initialize the Decision Tree class

In [737]:
class DecisionTree:
    def __init__(self, min_samples_split=50, max_depth=10, n_feats=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.root = None

    def fit(self, X, y):
        # grow tree
        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        return [self._traverse_tree(x, self.root) for x in X]

    def _gini(self, y):
        # count number of samples at each label
        num_samples_per_class = {label: np.sum(y == label) for label in np.unique(y)}
        # sum weighted Gini impurity for each label
        gini = 1.0
        for label in num_samples_per_class:
            prob_of_label = num_samples_per_class[label] / len(y)
            gini -= prob_of_label ** 2
        return gini

    def _best_split(self, X, y):
        # number of samples and features
        m, n = X.shape
        # we cannot make a split if there are not enough samples
        if m <= self.min_samples_split:
            return None, None
        # initial values
        best_gini = 1.0
        best_idx, best_thr = None, None
        # grow a decision tree
        for idx in range(n):
            # values of feature `idx`
            values = X[:, idx]
            # unique values of feature `idx`
            thresholds = np.unique(values)
            # try all thresholds
            for thr in thresholds:
                # split data
                indices_left = values <= thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                # skip if the split is not valid
                if len(y_left) == 0 or len(y_right) == 0:
                    continue
                # compute weighted Gini impurity
                gini = (len(y_left) * self._gini(y_left) + len(y_right) * self._gini(y_right)) / len(y)
                # update best
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = thr
        return best_idx, best_thr

    def _grow_tree(self, X, y, depth=0):
        # number of samples at each label
        num_samples_per_class = {label: np.sum(y == label) for label in np.unique(y)}
        # prediction is the label that has the most samples
        prediction = max(num_samples_per_class, key=num_samples_per_class.get)
        # create node
        node = Node(
            value=prediction
        )
        # split data
        idx, thr = self._best_split(X, y)
        # grow tree
        if idx is not None:
            # indices of data points going left and right
            indices_left = X[:, idx] <= thr
            # grow left and right subtrees
            if depth < self.max_depth:
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature = idx
                node.threshold = thr
                node.left = self._grow_tree(X_left, y_left, depth + 1)
                node.right = self._grow_tree(X_right, y_right, depth + 1)
        return node

    def _traverse_tree(self, x, node):
        # if we have reached a leaf node, return its value
        if node.is_leaf_node():
            return node.value
        # traverse left or right subtree
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)
    

Make a call to the Decision Tree and train it

In [738]:
tree = DecisionTree(max_depth=5)
tree.fit(train_data.values, train_labels.values)

In [739]:
# Predict the labels for the test data
preds = tree.predict(test_data.values)

# Calculate the accuracy
accuracy = np.sum(preds == test_labels.values) / len(preds)
print("The test accuracy is {:.1f}%".format(accuracy * 100))

The test accuracy is 48.9%
