In [1]:
# Import libraries
import pandas as pd
import numpy as np
from skimage import io
from skimage.io import imread
from skimage.transform import resize
from skimage.feature import greycomatrix, greycoprops
from scipy.stats import mode, kurtosis, skew, entropy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter


"""
Features Extraction Function
"""
# Get an image and returns first and second order features of the pixels array 
def get_image_features(image_path):
    # Load the image and converts it into a 2-dimensional numpy array with the pixels values
    image = io.imread(image_path, as_gray = True) * 255
    # Resize image to 512 x 512
    image = resize(image, (512, 512))
    # Converts the float values of the 2-dimensional pixels array into uint8
    image = image.astype(np.uint8)
    
    # Calculate the mean, variance and standard deviation of the 2-dimensional pixels array with numpy functions
    mean = np.mean(image)
    variance = np.var(image)
    std = np.std(image)
    
    # Converts the 2-dimensional pixels array into 1-dimensional
    image_1da = image.flatten()
    
    # Calculate the skewness, kurtosis and entropy of the 1-dimensional array with scipy.stats functions
    skewness = skew(image_1da)
    kurtos = kurtosis(image_1da)
    entro = entropy(image_1da)

    # Calculate the grey-level-co-ocurrence matrix with skimage functions
    # The pixel pair distance offset used is 1
    # The pixel pair angles used are 0, pi/4, pi/2 and 3pi/4
    GLCM = greycomatrix(image, [1], [0, np.pi/4, np.pi/2, 3*np.pi/4])

    # Calculate texture properties of the grey-level-co-ocurrence matrix 
    contrast = greycoprops(GLCM, 'contrast')[0, 0]
    dissimilarity = greycoprops(GLCM, 'dissimilarity')[0, 0]
    homogeneity = greycoprops(GLCM, 'homogeneity')[0, 0]
    asm = greycoprops(GLCM, 'ASM')[0, 0]
    energy = greycoprops(GLCM, 'energy')[0, 0]
    correlation = greycoprops(GLCM, 'correlation')[0, 0]
    
    # Returns all the features values of the image
    return mean, variance, std, skewness, kurtos, entro, contrast, dissimilarity, homogeneity, asm, energy, correlation


"""
K-Nearest Neighbors Algorithm Functions
"""
# Gets two points and calculate the euclidean distance between them
def euclidean_distance(p1, p2):
    ed = np.sqrt(np.sum((p1 - p2) ** 2))
    return ed

# Function to predict the class with knn model
def knn_predict(x_train, y_train, x_input, n_neighbors):
    # List to store the predictions
    predictions = []
     
    # Loop through the datapoints to be classified
    for i in x_input:   
        # List to store the distances
        distances = []
         
        # Loop through each training data
        for j in range(len(x_train)): 
            # Calculate the euclidean distance
            ed = euclidean_distance(np.array(x_train[j, :]), i) 
            
            # Add the calculated euclidean distance to the list
            distances.append(ed) 
            
        # Convert the list into a numpy array
        distances = np.array(distances) 
         
        # Sort the array while preserving the index
        # Keep the first n_neighbors datapoints
        dist_sorted = np.argsort(distances)[:n_neighbors] 
         
        # Labels of the n_neighbors datapoints from above
        labels = y_train[dist_sorted]
         
        # Determine the majority label in labels
        label = mode(labels).mode[0] 
        predictions.append(label)
        
    # Returns a list with the predictions
    return predictions


"""
Decision Trees Algorithm Functions
"""
def entropy(y):
    hist = np.bincount(y)
    ps = hist / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])


class Node:
    def __init__(
        self, feature=None, threshold=None, left=None, right=None, *, value=None
    ):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None


class DecisionTree:
    def __init__(self, min_samples_split=2, max_depth=100, n_feats=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_feats = n_feats
        self.root = None

    def fit(self, X, y):
        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])
        self.root = self._grow_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        # stopping criteria
        if (
            depth >= self.max_depth
            or n_labels == 1
            or n_samples < self.min_samples_split
        ):
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        feat_idxs = np.random.choice(n_features, self.n_feats, replace=False)

        # greedily select the best split according to information gain
        best_feat, best_thresh = self._best_criteria(X, y, feat_idxs)

        # grow the children that result from the split
        left_idxs, right_idxs = self._split(X[:, best_feat], best_thresh)
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feat, best_thresh, left, right)

    def _best_criteria(self, X, y, feat_idxs):
        best_gain = -1
        split_idx, split_thresh = None, None
        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self._information_gain(y, X_column, threshold)

                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_thresh = threshold

        return split_idx, split_thresh

    def _information_gain(self, y, X_column, split_thresh):
        # parent loss
        parent_entropy = entropy(y)

        # generate split
        left_idxs, right_idxs = self._split(X_column, split_thresh)

        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        # compute the weighted avg. of the loss for the children
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = entropy(y[left_idxs]), entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        # information gain is difference in loss before vs. after split
        ig = parent_entropy - child_entropy
        return ig

    def _split(self, X_column, split_thresh):
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def _most_common_label(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common


"""
Naive Bayes Algorithm Functions
"""

def prior(df, class_column):
    classes = sorted(list(df[class_column].unique()))
    priors = []
    
    for i in classes:
        priors.append(len(df[df[class_column] == i]) / len(df))
    return priors

def likelihood_gaussian(df, feat_name, feat_val, class_column, label):
    feat = list(df.columns)
    df = df[df[class_column] == label]
    mean, std = df[feat_name].mean(), df[feat_name].std()
    p_x_given_y = (1 / (np.sqrt(2 * np.pi) * std)) * np.exp(-((feat_val - mean) ** 2 / (2 * std ** 2)))
    return p_x_given_y

def nb_predict(df, x_input, class_column):
    features = list(df.columns)[:-1]
    
    priors = prior(df, class_column)
    
    predictions = []
    
    for x in x_input:
        labels = sorted(list(df[class_column].unique()))
        likelihood = [1] * len(labels)
        for j in range(len(labels)):
            for i in range(len(features)):
                likelihood[j] *= likelihood_gaussian(df, features[i], x[i], class_column, labels[j])
                
        post_prob = [1] * len(labels)
        for j in range(len(labels)):
            post_prob[j] = likelihood[j] * priors[j]
            
        predictions.append(np.argmax(post_prob))
        
    return predictions

In [2]:
dataset = pd.read_csv('data/brain_tumor_dataset.csv', index_col = 0)

dataset = dataset.drop(['image_name', 'label_name'], axis = 1)

dataset

Unnamed: 0,mean,variance,std,skewness,kurtosis,entropy,contrast,dissimilarity,homogeneity,asm,energy,correlation,label
0,47.957241,1534.028907,39.166681,0.591976,-0.146808,12.115296,54.121755,3.947006,0.408638,0.005599,0.074824,0.982349,3
1,53.239227,1702.933157,41.266611,1.525444,3.256895,12.212518,39.458877,2.382121,0.663735,0.071248,0.266923,0.988422,0
2,43.551712,1860.386608,43.132199,1.174871,1.526766,11.981298,42.525494,3.121216,0.510266,0.012075,0.109885,0.988572,1
3,70.573677,2225.436749,47.174535,0.879047,0.867972,12.257411,36.775501,3.006803,0.535195,0.026063,0.161442,0.991737,2
4,49.876183,2688.290875,51.848731,0.695864,-0.464646,11.865826,34.988839,2.842435,0.565111,0.036918,0.192142,0.993493,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3201,43.240494,1728.160768,41.571153,1.032431,1.358818,11.996204,35.655661,2.656304,0.561187,0.019953,0.141256,0.989684,2
3202,31.026756,2054.392251,45.325404,1.474903,1.355120,11.522565,42.386963,2.596273,0.676621,0.109545,0.330977,0.989695,2
3203,43.290009,1392.145868,37.311471,1.077291,2.438517,12.079829,42.669891,2.780008,0.536012,0.009523,0.097585,0.984667,1
3204,46.065979,2176.462016,46.652567,0.798026,-0.141716,11.928128,28.031548,2.543993,0.570965,0.021920,0.148054,0.993561,2


In [21]:
# Cross Validation
acc_knn = []
acc_nb = []
acc_dts = []
acc_s = []
for i in range(4):
    train, test = train_test_split(dataset, test_size = 0.2)

    x_train = train.iloc[:, :-1].values
    y_train = train.iloc[:, -1].values

    x_test = test.iloc[:, :-1].values
    y_test = test.iloc[:, -1].values
    
    knn_test = knn_predict(x_train, y_train, x_test, n_neighbors = 5)
    nb_test = nb_predict(train, x_test, class_column = 'label')
    
    dts_test = dt.predict(x_test)
    
    stacking_test = pd.DataFrame(columns = ('knn', 'dts', 'nb', 'true_label'))
    stacking_test['knn'] = knn_test
    stacking_test['dts'] = dts_test
    stacking_test['nb'] = nb_test
    stacking_test['true_label'] = y_test
    
    x_test_s = stacking_test.iloc[:, :-1].values
    
    s_test = knn_predict(x_train_mm, y_train_mm, x_test_s, n_neighbors = 5)

    acc_knn.append(accuracy_score(y_test, knn_test))
    acc_nb.append(accuracy_score(y_test, nb_test))
    acc_dts.append(accuracy_score(y_test, dts_test))
    acc_s.append(accuracy_score(y_test, s_test))
    
print('KNN: %.4f' % (np.mean(acc_knn)))
print('NB: %.4f' % (np.mean(acc_nb)))
print('DT: %.4f' % (np.mean(acc_dts)))
print('Stacking: %.4f' % (np.mean(acc_s)))

KNN: 0.5829
NB: 0.5565
DT: 0.7321
Stacking: 0.6698


In [3]:
train, test = train_test_split(dataset, test_size = 0.2)

x_train = train.iloc[:, :-1].values
y_train = train.iloc[:, -1].values

x_test = test.iloc[:, :-1].values
y_test = test.iloc[:, -1].values

# KNN

In [4]:
knn_preds = knn_predict(x_train, y_train, x_train, n_neighbors = 5)
knn_tests = knn_predict(x_train, y_train, x_test, n_neighbors = 5)

# NB

In [5]:
nb_preds = nb_predict(train, x_train, class_column = 'label')
nb_tests = nb_predict(train, x_test, class_column = 'label')

# DTS

In [7]:
dt = DecisionTree(max_depth = 7)
dt.fit(x_train, y_train)
dts_preds = dt.predict(x_train)
dts_tests = dt.predict(x_test)

# Meta model df

In [8]:
meta_model_df = pd.DataFrame(columns = ('knn', 'dts', 'nb', 'true_label'))

meta_model_df['knn'] = knn_preds
meta_model_df['dts'] = dts_preds
meta_model_df['nb'] = nb_preds
meta_model_df['true_label'] = y_train

meta_model_df.head()

Unnamed: 0,knn,dts,nb,true_label
0,1,1,1,1
1,3,3,3,3
2,3,3,3,3
3,3,0,0,0
4,1,1,1,1


In [10]:
stacking_test = pd.DataFrame(columns = ('knn', 'dts', 'nb', 'true_label'))

stacking_test['knn'] = knn_tests
stacking_test['dts'] = dts_tests
stacking_test['nb'] = nb_tests
stacking_test['true_label'] = y_test

stacking_test

Unnamed: 0,knn,dts,nb,true_label
0,1,1,1,1
1,1,2,1,2
2,2,0,3,2
3,3,3,3,3
4,1,2,1,2
...,...,...,...,...
637,1,1,1,1
638,2,2,2,1
639,2,2,2,1
640,1,3,0,3


In [20]:
x_test_s = stacking_test.iloc[:, :-1].values
y_test_s = stacking_test.iloc[:, -1].values

array([[0, 0, 1],
       [1, 1, 1],
       [1, 1, 1],
       ...,
       [1, 2, 1],
       [3, 3, 3],
       [2, 2, 2]], dtype=int64)

# KNN Meta Model

In [13]:
train_mm, test_mm = train_test_split(meta_model_df, test_size = 0.2)

x_train_mm = train_mm.iloc[:, :-1].values
y_train_mm = train_mm.iloc[:, -1].values

x_test_mm = test_mm.iloc[:, :-1].values
y_test_mm = test_mm.iloc[:, -1].values

In [16]:
mm_test = knn_predict(x_train_mm, y_train_mm, x_test_mm, n_neighbors = 5)
mm_preds = knn_predict(x_train_mm, y_train_mm, x_test_s, n_neighbors = 5)

print(accuracy_score(y_test_mm, mm_test))
print(accuracy_score(y_test_s, mm_preds))

0.7485380116959064
0.6401869158878505


In [None]:
path = 'data/Testing/glioma_tumor/gt_83.jpg'

features = np.array([get_image_features(path)])
features

In [None]:
knn_x = knn_predict(x_train, y_train, features, n_neighbors = 5)
dts_x = dt.predict(features)
nb_x = nb_predict(train, features, class_column = 'label')

news = np.array([[knn_x[0], dts_x[0], nb_x[0]]])
news

In [None]:
knn_predict(x_train2, y_train2, news, n_neighbors = 5)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.tree import export_graphviz

tree_clf = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = 7) 
tree_clf.fit(x_train, y_train)
tree.plot_tree(tree_clf)
[...]

In [None]:
import warnings 
import numpy as np
import pandas as pd
from pylab import rcParams
import matplotlib.pyplot as plt
from sklearn import neighbors
from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# filter warnings
warnings.filterwarnings("ignore")

def accuracy(k, X_train, y_train, X_test, y_test):
    '''
    compute accuracy of the classification based on k values 
    '''
    # instantiate learning model and fit data
    knn = KNeighborsClassifier(n_neighbors=k)    
    knn.fit(X_train, y_train)

    # predict the response
    pred = knn.predict(X_test)

    # evaluate and return  accuracy
    return accuracy_score(y_test, pred)

def classify_and_plot(X, y):
    ''' 
    split data, fit, classify, plot and evaluate results 
    '''
    # split data into training and testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 41)

    # init vars
    n_neighbors = 5
    h           = .02  # step size in the mesh

    # Create color maps
    cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
    cmap_bold  = ListedColormap(['#FF0000', '#0000FF'])

    rcParams['figure.figsize'] = 5, 5
    for weights in ['uniform', 'distance']:
        # we create an instance of Neighbours Classifier and fit the data.
        clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
        clf.fit(X_train, y_train)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        fig = plt.figure()
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

        # Plot also the training points, x-axis = 'Glucose', y-axis = "BMI"
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)   
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.title("0/1 outcome classification (k = %i, weights = '%s')" % (n_neighbors, weights))
        plt.show()
        fig.savefig(weights +'.png')

        # evaluate
        y_expected  = y_test
        y_predicted = clf.predict(X_test)

        # print results
        print('----------------------------------------------------------------------')
        print('Classification report')
        print('----------------------------------------------------------------------')
        print('\n', classification_report(y_expected, y_predicted))
        print('----------------------------------------------------------------------')
        print('Accuracy = %5s' % round(accuracy(n_neighbors, X_train, y_train, X_test, y_test), 3))
        print('----------------------------------------------------------------------')


# load your data 
data  = pd.read_csv('data/brain_tumor_dataset.csv')
dataset = data.drop(['image_name', 'label_name'], axis = 1)

names = list(dataset.columns)

# we only take the best two features and prepare them for the KNN classifier
rows_nbr = dataset.shape[0]
X_prime  = np.array(dataset.iloc[:rows_nbr, [1,5]])
X        = X_prime # preprocessing.scale(X_prime)
y        = np.array(dataset.iloc[:rows_nbr, 8])

# classify, evaluate and plot results
classify_and_plot(X, y)