In [180]:
import pandas as pd 
import numpy as np

pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Preparing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 

# Machine learning
from sklearn.neighbors import KNeighborsClassifier

# Evaluation
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, confusion_matrix

In [181]:
# Importing data
data = pd.read_csv('../data/processed/final_df.csv')

# columns that do not provide any useful information
data.drop(['Unnamed: 0', 'RGB1', 'RGB2', 'RGB3', 'RGB4'], axis = 1, inplace = True)
data.drop(['RGB1html', 'RGB2html', 'RGB3html', 'RGB4html'], axis = 1, inplace = True)

data.head()

Target = []

for i in data[['melanoma', 'seborrheic_keratosis']].itertuples():
    if int(i[1]) == 1:
        Target.append(1)
    elif int(i[2]) == 1:
        Target.append(2)
    elif int(i[1]) == 0 and i[2] == 0:
        Target.append(3)

data['Target'] = Target

data.drop(['seborrheic_keratosis', 'melanoma', 'help_int_for_colors'], axis = 1, inplace = True)

# Splitting data
data.set_index(['image_id'], inplace = True)

# 80# train 20% test

X_train = data.drop('Target', axis = 1)
y_train = data['Target']

data.head()

Unnamed: 0_level_0,RGB1_occ,RGB2_occ,RGB3_occ,RGB4_occ,border_diff_percent,image_symmetry,area,perimeter,Target
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ISIC_0001769,16,24,26,32,1.175166,1.098384,216160.0,2013.0,3
ISIC_0001852,15,19,19,45,1.14229,1.126816,130493.0,1372.0,3
ISIC_0001871,15,25,27,32,1.059655,1.070689,205116.0,1720.0,3
ISIC_0003462,18,19,30,31,1.167051,1.087893,161705.0,1344.0,3
ISIC_0003539,12,21,25,40,1.171263,1.071256,317040.0,2063.0,3


In [182]:
# Preprocessing

scaler = StandardScaler()


X_train[['RGB1_occ', 'RGB2_occ', 'RGB3_occ', 'RGB4_occ', 'border_diff_percent', 'area', 'perimeter', 'image_symmetry']] = scaler.fit_transform(X_train[['RGB1_occ', 'RGB2_occ', 'RGB3_occ', 'RGB4_occ', 'border_diff_percent', 'area', 'perimeter', 'image_symmetry']])

X_train.head()

Unnamed: 0_level_0,RGB1_occ,RGB2_occ,RGB3_occ,RGB4_occ,border_diff_percent,image_symmetry,area,perimeter
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ISIC_0001769,0.432246,0.748506,-0.562327,-0.47533,-0.519937,-0.26331,-0.493031,-0.674136
ISIC_0001852,0.248572,-0.515863,-2.442719,1.752778,-0.70473,0.118972,-0.507605,-0.795608
ISIC_0001871,0.248572,1.00138,-0.293699,-0.47533,-1.169212,-0.63568,-0.494909,-0.729661
ISIC_0003462,0.799594,-0.515863,0.512183,-0.646722,-0.565554,-0.404369,-0.502295,-0.800914
ISIC_0003539,-0.30245,-0.010115,-0.830954,0.895813,-0.541879,-0.628049,-0.475868,-0.664661


In [184]:
# Machine learning

# Initializing possible hyperparameters
param_grid_rbf = {'n_neighbors': [1, 2, 3, 5, 8, 10, 12, 14, 15, 16, 20, 25],  
              'p': [1, 2]}  

# p = 1, manhattan_distance
# p = 2, euclidean_distance

# Defining the GridSearch function by the Support Vector Machine method and initialized hyperparameters    
grid = GridSearchCV(KNeighborsClassifier(), param_grid_rbf, return_train_score = True)

# Fitting the grid
grid.fit(X_train, y_train) 

# Putting our grid results in a pandas dataframe to visualize
table = pd.DataFrame(grid.cv_results_)
table.sort_values(by = 'mean_test_score', ascending = False, inplace = True)
table.rename(columns = {'mean_test_score': 'mean_val_score'}, inplace = True)
table = table[['params', 'mean_train_score', 'mean_val_score', 'rank_test_score']].head()

table

Unnamed: 0,params,mean_train_score,mean_val_score,rank_test_score
18,"{'n_neighbors': 16, 'p': 1}",0.616667,0.6,1
14,"{'n_neighbors': 14, 'p': 1}",0.635,0.586667,2
22,"{'n_neighbors': 25, 'p': 1}",0.611667,0.586667,2
20,"{'n_neighbors': 20, 'p': 1}",0.62,0.586667,2
15,"{'n_neighbors': 14, 'p': 2}",0.621667,0.586667,2


In [189]:
# Machine learning

# Initializing possible hyperparameters
param_grid_poly = {'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 3, 5, 10],  
              'degree': [1, 2, 3, 4],
              'kernel': ['poly']}

# Defining the GridSearch function by the Support Vector Machine method and initialized hyperparameters    
grid = GridSearchCV(svm.SVC(), param_grid_rbf, return_train_score = True)

# Fitting the grid
grid.fit(X_train, y_train) 

# Putting our grid results in a pandas dataframe to visualize
table = pd.DataFrame(grid.cv_results_)
table.sort_values(by = 'mean_test_score', ascending = False, inplace = True)
table.rename(columns = {'mean_test_score': 'mean_val_score'}, inplace = True)
table = table[['params', 'mean_train_score', 'mean_val_score', 'rank_test_score']].head()

table

Unnamed: 0,params,mean_train_score,mean_val_score,rank_test_score
86,"{'C': 0.005, 'gamma': 0.5, 'kernel': 'poly'}",0.641667,0.593333,1
137,"{'C': 0.2, 'gamma': 0.15, 'kernel': 'poly'}",0.641667,0.593333,1
49,"{'C': 0.003, 'gamma': 0.6, 'kernel': 'poly'}",0.641667,0.593333,1
325,"{'C': 5, 'gamma': 0.05, 'kernel': 'poly'}",0.641667,0.593333,1
104,"{'C': 0.01, 'gamma': 0.4, 'kernel': 'poly'}",0.641667,0.593333,1
