In [34]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
import seaborn as sns

# classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [4]:
# Load in file

expr_data = pd.read_csv("cancer_cells_expression.csv")

cancer_types = list(expr_data["cancer_target"].unique())

In [16]:
# one hot encode target 

category_array = np.asarray([[cancer_type] for cancer_type in cancer_types]) 
ohe = OneHotEncoder(sparse=False)  
encoded_category = ohe.fit_transform(category_array) 
category_oh_encoded = dict(zip(cancer_types, 
                            encoded_category)) 

# label encode target
le = LabelEncoder()
le_encoded_category = le.fit_transform(cancer_types)
category_le_encoded = dict(zip(cancer_types, 
                           le_encoded_category)) 


In [11]:
# create input array and target array

target_array = expr_data["cancer_target"].values
target_array = np.array([category_le_encoded[cancer_type] for cancer_type in list(target_array)])

input_array = expr_data.drop(["cancer_target"], axis=1).values

In [24]:
# normalize with standard scaler

scaler = StandardScaler()
scaler.fit(input_array)
scaled_input = scaler.transform(input_array)
scaled_input.shape

(518, 11926)

In [27]:
# test_train split

X_train, X_test, y_train, y_test = train_test_split(scaled_input,
                                                    target_array.ravel(),
                                                    test_size=0.2,
                                                    random_state=42)

In [29]:
# Benchmark with dummy

dummy_clf = DummyClassifier(strategy="stratified")
dummy_clf.fit(X_train, y_train)
dummy_clf.score(X_test, y_test)

0.2403846153846154

In [31]:
# classifiers

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(),
    SVC(kernel="linear", C=0.025),
    SVC(gamma="scale", C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=40),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

In [35]:
# compare different classifiers

for name, clf in zip(names, classifiers):
    model = clf
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    report = classification_report(y_test, y_pred)
    print(name)
    print("accuracy: {}".format(accuracy))
    print("report:")
    print(report)


Nearest Neighbors
accuracy: 0.40384615384615385
report:
              precision    recall  f1-score   support

           0       1.00      0.17      0.29        29
           1       1.00      0.19      0.32        16
           2       0.36      1.00      0.53        34
           3       0.00      0.00      0.00        25

    accuracy                           0.40       104
   macro avg       0.59      0.34      0.28       104
weighted avg       0.55      0.40      0.30       104

Linear SVM
accuracy: 0.038461538461538464
report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        29
           1       0.00      0.00      0.00        16
           2       0.09      0.12      0.10        34
           3       0.00      0.00      0.00        25

    accuracy                           0.04       104
   macro avg       0.02      0.03      0.03       104
weighted avg       0.03      0.04      0.03       104



  'precision', 'predicted', average, warn_for)


RBF SVM
accuracy: 0.3269230769230769
report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        29
           1       0.00      0.00      0.00        16
           2       0.33      1.00      0.49        34
           3       0.00      0.00      0.00        25

    accuracy                           0.33       104
   macro avg       0.08      0.25      0.12       104
weighted avg       0.11      0.33      0.16       104



  'precision', 'predicted', average, warn_for)


Gaussian Process
accuracy: 0.3269230769230769
report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        29
           1       0.00      0.00      0.00        16
           2       0.33      1.00      0.50        34
           3       0.00      0.00      0.00        25

    accuracy                           0.33       104
   macro avg       0.08      0.25      0.12       104
weighted avg       0.11      0.33      0.16       104

Decision Tree
accuracy: 0.6634615384615384
report:
              precision    recall  f1-score   support

           0       1.00      0.24      0.39        29
           1       0.42      0.69      0.52        16
           2       0.71      0.88      0.79        34
           3       0.72      0.84      0.78        25

    accuracy                           0.66       104
   macro avg       0.72      0.66      0.62       104
weighted avg       0.75      0.66      0.63       104

Random Forest
accuracy: 0



QDA
accuracy: 0.3173076923076923
report:
              precision    recall  f1-score   support

           0       0.40      0.28      0.33        29
           1       0.12      0.19      0.15        16
           2       0.45      0.50      0.47        34
           3       0.24      0.20      0.22        25

    accuracy                           0.32       104
   macro avg       0.30      0.29      0.29       104
weighted avg       0.33      0.32      0.32       104

