In [27]:
# Import libraries
import pandas as pd
import numpy as np
from skimage import io
from skimage.io import imread
from skimage.feature import greycomatrix, greycoprops
from scipy.stats import mode, kurtosis, skew, entropy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter


"""
Features Extraction Function
"""
# Get an image and returns first and second order features of the pixels array 
def get_image_features(image_path):
    # Load the image and converts it into a 2-dimensional numpy array with the pixels values
    image = io.imread(image_path, as_gray = True) * 255
    # Converts the float values of the 2-dimensional pixels array into uint8
    image = image.astype(np.uint8)
    
    # Calculate the mean, variance and standard deviation of the 2-dimensional pixels array with numpy functions
    mean = np.mean(image)
    variance = np.var(image)
    std = np.std(image)
    
    # Converts the 2-dimensional pixels array into 1-dimensional
    image_1da = image.flatten()
    
    # Calculate the skewness, kurtosis and entropy of the 1-dimensional array with scipy.stats functions
    skewness = skew(image_1da)
    kurtos = kurtosis(image_1da)
    entro = entropy(image_1da)

    # Calculate the grey-level-co-ocurrence matrix with skimage functions
    # The pixel pair distance offset used is 1
    # The pixel pair angles used are 0, pi/4, pi/2 and 3pi/4
    GLCM = greycomatrix(image, [1], [0, np.pi/4, np.pi/2, 3*np.pi/4])

    # Calculate texture properties of the grey-level-co-ocurrence matrix 
    contrast = greycoprops(GLCM, 'contrast')[0, 0]
    dissimilarity = greycoprops(GLCM, 'dissimilarity')[0, 0]
    homogeneity = greycoprops(GLCM, 'homogeneity')[0, 0]
    asm = greycoprops(GLCM, 'ASM')[0, 0]
    energy = greycoprops(GLCM, 'energy')[0, 0]
    correlation = greycoprops(GLCM, 'correlation')[0, 0]
    
    # Returns all the features values of the image
    return mean, variance, std, skewness, kurtos, entro, contrast, dissimilarity, homogeneity, asm, energy, correlation


"""
K-Nearest Neighbors Algorithm Functions
"""
# Gets two points and calculate the euclidean distance between them
def euclidean_distance(p1, p2):
    ed = np.sqrt(np.sum((p1 - p2) ** 2))
    return ed


# Function to predict the class with knn model
def knn_predict(x_train, y_train, x_input, n_neighbors):
    # List to store the predictions
    predictions = []
     
    # Loop through the datapoints to be classified
    for i in x_input:   
        # List to store the distances
        distances = []
         
        # Loop through each training data
        for j in range(len(x_train)): 
            # Calculate the euclidean distance
            ed = euclidean_distance(np.array(x_train[j, :]), i) 
            
            # Add the calculated euclidean distance to the list
            distances.append(ed) 
            
        # Convert the list into a numpy array
        distances = np.array(distances) 
         
        # Sort the array while preserving the index
        # Keep the first n_neighbors datapoints
        dist_sorted = np.argsort(distances)[:n_neighbors] 
         
        # Labels of the n_neighbors datapoints from above
        labels = y_train[dist_sorted]
         
        # Determine the majority label in labels
        label = mode(labels).mode[0] 
        predictions.append(label)
        
    # Returns a list with the predictions
    return predictions


"""
Decision Trees Algorithm Functions
"""
def entropy(y):
    hist = np.bincount(y)
    ps = hist / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])


class Node:
    def __init__(
        self, feature=None, threshold=None, left=None, right=None, *, value=None
    ):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None


class DecisionTree:
    def __init__(self, min_samples_split=2, max_depth=100, n_feats=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.root = None

    def fit(self, X, y):
        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        # stopping criteria
        if (
            depth >= self.max_depth
            or n_labels == 1
            or n_samples < self.min_samples_split
        ):
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        feat_idxs = np.random.choice(n_features, self.n_feats, replace=False)

        # greedily select the best split according to information gain
        best_feat, best_thresh = self._best_criteria(X, y, feat_idxs)

        # grow the children that result from the split
        left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh)
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feat, best_thresh, left, right)

    def _best_criteria(self, X, y, feat_idxs):
        best_gain = -1
        split_idx, split_thresh = None, None
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)

                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_thresh = threshold

        return split_idx, split_thresh

    def _information_gain(self, y, X_column, split_thresh):
        # parent loss
        parent_entropy = entropy(y)

        # generate split
        left_idxs, right_idxs = self._split(X_column, split_thresh)

        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        # compute the weighted avg. of the loss for the children
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = entropy(y[left_idxs]), entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        # information gain is difference in loss before vs. after split
        ig = parent_entropy - child_entropy
        return ig

    def _split(self, X_column, split_thresh):
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common


"""
Naive Bayes Algorithm Functions
"""
def prior(df, class_column):
    classes = sorted(list(df[class_column].unique()))
    priors = []
    
    for i in classes:
        priors.append(len(df[df[class_column] == i]) / len(df))
    return priors

def likelihood_gaussian(df, feat_name, feat_val, class_column, label):
    feat = list(df.columns)
    df = df[df[class_column] == label]
    mean, std = df[feat_name].mean(), df[feat_name].std()
    p_x_given_y = (1 / (np.sqrt(2 * np.pi) * std)) * np.exp(-((feat_val - mean) ** 2 / (2 * std ** 2)))
    return p_x_given_y

def nb_predict(df, x_input, class_column):
    features = list(df.columns)[:-1]
    
    priors = prior(df, class_column)
    
    predictions = []
    
    for x in x_input:
        labels = sorted(list(df[class_column].unique()))
        likelihood = [1] * len(labels)
        for j in range(len(labels)):
            for i in range(len(features)):
                likelihood[j] *= likelihood_gaussian(df, features[i], x[i], class_column, labels[j])
                
        post_prob = [1] * len(labels)
        for j in range(len(labels)):
            post_prob[j] = likelihood[j] * priors[j]
            
        predictions.append(np.argmax(post_prob))
        
    return predictions


"""
Support Vector Machines Algorithm Functions
"""


'\nSupport Vector Machines Algorithm Functions\n'

In [28]:
dataset = pd.read_csv('data2/brain_tumor_dataset.csv', index_col = 0)

dataset = dataset.drop(['image_name', 'label_name'], axis = 1)

dataset

Unnamed: 0,mean,variance,std,skewness,kurtosis,entropy,contrast,dissimilarity,homogeneity,asm,energy,correlation,label
0,89.561561,9343.869199,96.663691,0.736733,-1.030931,10.861659,2699.608802,26.014204,0.342795,0.044369,0.210640,0.855638,1
1,179.895315,6163.545466,78.508251,-0.628850,-0.691907,13.739950,1141.569875,9.663990,0.499163,0.158578,0.398218,0.907401,1
2,98.602458,7601.647718,87.187429,0.426999,-1.113201,9.839692,5988.536370,46.487696,0.134679,0.001684,0.041042,0.606500,0
3,65.034851,4564.125810,67.558314,0.539535,-1.191914,11.910393,110.697566,4.444838,0.443628,0.023536,0.153414,0.987896,0
4,62.801068,4135.241987,64.305847,0.891289,0.375827,10.233660,981.757594,15.637248,0.355614,0.064710,0.254382,0.881328,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,62.721276,4139.500199,64.338948,0.887192,0.367179,10.230091,967.030336,15.461293,0.360820,0.067232,0.259292,0.883231,0
2996,32.039761,1132.826041,33.657481,1.339844,2.320698,12.359607,66.300139,2.533283,0.642299,0.035815,0.189248,0.970741,0
2997,74.909966,7029.327783,83.841086,0.992125,-0.160940,10.164796,479.430181,9.872701,0.436753,0.079691,0.282296,0.965930,1
2998,201.249399,3284.097731,57.307048,-1.297994,2.359920,13.602579,1794.020654,11.715118,0.501405,0.133459,0.365321,0.725175,1


In [29]:
data = dataset.iloc[:2565, :]
np_data = dataset.iloc[2565:, :].reset_index(drop = True)

In [30]:
train, test = train_test_split(data, test_size = 0.15)

x_train = train.iloc[:, :-1].values
y_train = train.iloc[:, -1].values

x_test = test.iloc[:, :-1].values
y_test = test.iloc[:, -1].values

In [31]:
x_np = np_data.iloc[:, :-1].values
y_np = np_data.iloc[:, -1].values

# KNN

In [32]:
knn_val = knn_predict(x_train, y_train, x_test, n_neighbors = 5)
knn_preds = knn_predict(x_train, y_train, x_np, n_neighbors = 5)

print(accuracy_score(y_test, knn_val))

0.8753246753246753


# NB

In [156]:
nb_val = nb_predict(train, x_test, class_column = 'label')
nb_preds = nb_predict(train, x_np, class_column = 'label')

print(accuracy_score(y_test, nb_val))

0.5922077922077922


In [164]:
nb_preds

[1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,


# DTS

In [34]:
dt = DecisionTree(max_depth = 10)
dt.fit(x_train, y_train)

dts_val = dt.predict(x_test)
dts_preds = dt.predict(x_np)

accuracy_score(y_test, dts_val)

0.9012987012987013

# Meta model df

In [35]:
meta_model_df = pd.DataFrame(columns = ('knn', 'dts', 'nb', 'true_label'))

meta_model_df['knn'] = knn_preds
meta_model_df['dts'] = dts_preds
meta_model_df['nb'] = nb_preds
meta_model_df['true_label'] = y_np

meta_model_df

Unnamed: 0,knn,dts,nb,true_label
0,1,1,1,1
1,1,0,1,0
2,1,1,1,1
3,0,0,0,0
4,1,1,1,1
...,...,...,...,...
430,0,0,0,0
431,0,0,0,0
432,0,1,1,1
433,1,1,1,1


In [36]:
stacking_test = pd.DataFrame(columns = ('knn', 'dts', 'nb', 'true_label'))

stacking_test['knn'] = knn_val
stacking_test['dts'] = dts_val
stacking_test['nb'] = nb_val
stacking_test['true_label'] = y_test

stacking_test

Unnamed: 0,knn,dts,nb,true_label
0,0,1,1,0
1,0,0,1,0
2,0,0,0,0
3,0,1,1,1
4,1,1,1,1
...,...,...,...,...
380,0,0,1,1
381,1,1,0,1
382,1,1,1,1
383,1,1,0,1


In [37]:
x_test_s = stacking_test.iloc[:, :-1].values
y_test_s = stacking_test.iloc[:, -1].values

# KNN Meta Model

In [38]:
train2, test2 = train_test_split(meta_model_df, test_size = 0.2)

x_train2 = train2.iloc[:, :-1].values
y_train2 = train2.iloc[:, -1].values

x_test2 = test2.iloc[:, :-1].values
y_test2 = test2.iloc[:, -1].values

In [39]:
from sklearn.linear_model import LogisticRegression

# define the multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
model.fit(x_train2, y_train2)
lrm = model.predict(x_test2)
lr_s = model.predict(x_test_s)

print(accuracy_score(y_test2, lrm))
print(accuracy_score(y_test_s, lr_s))

0.8620689655172413
0.9012987012987013


In [160]:
path = 'data2/Training/no/no333.jpg'

features = np.array([get_image_features(path)])
features

array([[3.85975145e+01, 2.52348609e+03, 5.02343119e+01, 1.86531611e+00,
        4.11410178e+00, 5.37373285e+00, 9.56961209e+02, 1.55562482e+01,
        3.80352525e-01, 9.50622927e-02, 3.08321736e-01, 8.10577985e-01]])

In [161]:
knn_x = knn_predict(x_train, y_train, features, n_neighbors = 5)
nb_x = nb_predict(train, features, class_column = 'label')
dts_x = dt.predict(features)

news = np.array([[knn_x[0], nb_x[0], dts_x[0]]])
news

array([[1, 0, 0]], dtype=int64)

In [162]:
model.fit(x_train2, y_train2)
model.predict(news)

array([0], dtype=int64)