In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter

In [27]:
df = pd.read_csv("cancer_data.csv")
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


# Data Preprocessing

In [28]:
data = df.drop('id', axis=1)

le = LabelEncoder()
data['diagnosis'] = le.fit_transform(data['diagnosis'])

X = data.drop('diagnosis', axis=1)
y = data['diagnosis']

scaler = StandardScaler()
X = scaler.fit_transform(X)

# SVM

In [29]:
class SVM:
    def __init__(self, learning_rate=0.01, lambda_param=0.01, n_iters=1000):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None
        
    def fit(self, X, y):
        n_samples, n_features = X.shape
        
        self.w = np.zeros(n_features)
        self.b = 0
        
        # gradient descent
        for _ in range(self.n_iters):
            for i, x_i in enumerate(X):
                condition = y[i] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    self.w -= self.lr * (2 * self.lambda_param * self.w)
                else:
                    self.w -= self.lr * (2 * self.lambda_param * self.w - np.dot(x_i, y[i]))
                    self.b -= self.lr * y[i]
                    
    def predict(self, X):
        linear_output = np.dot(X, self.w) - self.b
        return np.sign(linear_output)

In [30]:
Xk = data.iloc[:, 2:].values
yk = data.iloc[:, 0].values

X_train, X_test, y_train, y_test = train_test_split(Xk, yk, test_size=0.2, random_state=42)

svm = SVM()
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.37719298245614036


# Decision Tree

In [31]:
class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        
    def fit(self, X, y):
        self.n_features = X.shape[1]
        self.tree = self._grow_tree(X, y)
        
    def predict(self, X):
        return [self._predict_input(self.tree, x) for x in X]
        
    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))
        
        # Stopping criteria
        if (depth == self.max_depth or n_labels == 1 or n_samples < self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return {'type': 'leaf', 'value': leaf_value}
        
        # Splitting criteria
        feature_indices = np.random.choice(n_features, self.n_features, replace=False)
        best_feature, best_threshold = self._best_criteria(X, y, feature_indices)
        left_indices, right_indices = self._split(X[:, best_feature], best_threshold)
        left_tree = self._grow_tree(X[left_indices, :], y[left_indices], depth+1)
        right_tree = self._grow_tree(X[right_indices, :], y[right_indices], depth+1)
        return {'type': 'split', 'feature': best_feature, 'threshold': best_threshold, 'left': left_tree, 'right': right_tree}
    
    def _best_criteria(self, X, y, feature_indices):
        best_gain = -1
        split_idx, split_threshold = None, None
        for i in feature_indices:
            feature_values = X[:, i]
            for threshold in np.unique(feature_values):
                gain = self._information_gain(y, feature_values, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = i
                    split_threshold = threshold
        return split_idx, split_threshold
    
    def _information_gain(self, y, X, split_threshold):
        parent_entropy = self._entropy(y)
        left_indices, right_indices = self._split(X, split_threshold)
        if len(left_indices) == 0 or len(right_indices) == 0:
            return 0
        n = len(y)
        nl, nr = len(left_indices), len(right_indices)
        el, er = self._entropy(y[left_indices]), self._entropy(y[right_indices])
        child_entropy = (nl/n) * el + (nr/n) * er
        return parent_entropy - child_entropy
    
    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / np.sum(counts)
        entropy = np.sum(probabilities * -np.log2(probabilities))
        return entropy
    
    def _split(self, X, split_threshold):
        left_indices = np.argwhere(X <= split_threshold).flatten()
        right_indices = np.argwhere(X > split_threshold).flatten()
        return left_indices, right_indices
    
    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common
    
    def _predict_input(self, tree, x):
        if tree['type'] == 'leaf':
            return tree['value']
        feature_value = x[tree['feature']]
        if feature_value <= tree['threshold']:
            return self._predict_input(tree['left'], x)
        else:
            return self._predict_input(tree['right'], x)

In [32]:
tree = DecisionTree(max_depth=5, min_samples_split=5)

tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.9385964912280702
