In [2]:
import pandas as pd
#scaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
#kfold
from sklearn.model_selection import KFold
#model
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
#score
from sklearn.metrics import accuracy_score
    
#load data
colnames = ['Sample code number','Clump Thickness','Uniformity of Cell Size'
            ,'Uniformity of Cell Shape','Marginal Adhension','Single Epithelial Cell Size'
            ,'Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']
data = pd.read_csv('breast-cancer-wisconsin.data', names = colnames)
print('DATA------------------------------------')
print(data.head(5))

#dirty data cleaning
#drop all Missing attribute values(? -> null -> drop)
data = data.replace('?', None)
data = data.dropna(axis = 0)

#feature engineering
#drop 'Sample code number' column
data.drop(['Sample code number'], axis = 1, inplace = True)

#change target 2 > 0, 4 > 1
data['Class'] = data['Class'].replace(2,0)
data['Class'] = data['Class'].replace(4,1)

#divide dataset to non_target and target
dataset_non_target = data.drop(['Class'], axis = 1, inplace = False)
dataset_target = pd.DataFrame(data['Class'], columns = ['Class'])

#function ML : do dataset scaling, make model, calculate accuracy score and return that info using dataframe form
# input : scaling_type, algorithm_name, dataset_no_target, dataset_target
# output : function_reusult_dataframe
def ML(scaling_type, algorithm_name, dataset_no_target, dataset_target):
    function_result_df = pd.DataFrame(columns = ['scaling_type','algorithm_name','k_fold_parameter','algorithm_parameter','accuracy_score'])
    function_result_cnt = 0
    #do scaling using scaling type
    scaler = None
    scaled_data = dataset_no_target
    
    # 3 type of scaling : none, standard, minmax
    if scaling_type == 'none':
        pass #do nothing
    elif scaling_type == 'standard':
        scaler = StandardScaler()
    elif scaling_type == 'minmax':
        scaler = MinMaxScaler()
    else:
        print('ERROR : scaling type error')
        return
    
    if scaler != None:
        scaled_data = scaler.fit_transform(dataset_no_target)
    dataset_no_target = pd.DataFrame(scaled_data, columns = dataset_no_target.columns)
    
    #do k-fold
    for k_fold_param in [5,10]:#k_fold_parameter : 5,10
        kfold = KFold(n_splits = k_fold_param)
        k_fold_num = 0
        for train_index, test_index in kfold.split(dataset_no_target):
            x_train, x_test = dataset_no_target.iloc[train_index,:], dataset_no_target.iloc[test_index,:]
            y_train, y_test = dataset_target.iloc[train_index,:], dataset_target.iloc[test_index,:]
            
            #4 type of model : ['decision_tree(entropy)','decision_tree(gini_index)','logistic_reg','svm']
            if algorithm_name == 'decision_tree(entropy)':
                for max_depth_param in [1, 10]: #max_depth_param 1, 10
                    model = DecisionTreeClassifier(criterion = 'entropy', max_depth = max_depth_param, random_state = 1)
                    model.fit(x_train,y_train)
                    pred = model.predict(x_test)
                    accuracy = model.score(x_test,y_test)
                    
                    k_fold_param_str = 'k_fold : {} - {}'.format(str(k_fold_param), str(k_fold_num))
                    algorithm_param = 'max_depth : {}'.format(str(max_depth_param))
                    result_list = make_result_list(scaling_type, algorithm_name, k_fold_param_str, algorithm_param, accuracy)
                    function_result_df.loc[function_result_cnt]=result_list
                    function_result_cnt = function_result_cnt + 1
            elif algorithm_name == 'decision_tree(gini_index)':
                for max_depth_param in [1, 10]: #max_depth_param 1, 10
                    model = DecisionTreeClassifier(criterion = 'gini', max_depth = max_depth_param, random_state = 1)
                    model.fit(x_train,y_train)
                    pred = model.predict(x_test)
                    accuracy = model.score(x_test,y_test)
                    
                    k_fold_param_str = 'k_fold : {} - {}'.format(str(k_fold_param), str(k_fold_num))
                    algorithm_param = 'max_depth : {}'.format(str(max_depth_param))
                    result_list = make_result_list(scaling_type, algorithm_name, k_fold_param_str, algorithm_param, accuracy)
                    function_result_df.loc[function_result_cnt]=result_list
                    function_result_cnt = function_result_cnt + 1
            elif algorithm_name == 'logistic_reg':
                for C_param in  [0.0001]: #C parameter : 0.0001
                    model = LogisticRegression(solver = 'lbfgs', C = C_param)
                    model.fit(x_train,y_train.values.ravel())
                    pred = model.predict(x_test)
                    accuracy = model.score(x_test,y_test)
                    
                    k_fold_param_str = 'k_fold : {} - {}'.format(str(k_fold_param), str(k_fold_num))
                    algorithm_param = 'C : {}'.format(str(C_param))
                    #algorithm_param = 'none'
                    result_list = make_result_list(scaling_type, algorithm_name, k_fold_param_str, algorithm_param, accuracy)
                    function_result_df.loc[function_result_cnt]=result_list
                    function_result_cnt = function_result_cnt + 1
            elif algorithm_name == 'svm':
                for kernel_name in ['rbf']:#kernel_name #['linear','rbf']
                      #kernel_name == 'rbf'
                    for C_param in  [0.0001, 0.001]: #C parameter : 0.0001 0.001
                        model = SVC(kernel = 'rbf', C = C_param, random_state = 1)
                        model.fit(x_train,y_train.values.ravel())
                        pred = model.predict(x_test)
                        accuracy = model.score(x_test,y_test)
                                
                        k_fold_param_str = 'k_fold : {} - {}'.format(str(k_fold_param), str(k_fold_num))
                        algorithm_param = 'kernel : {}, C: {}'.format(str(kernel_name),str(C_param))
                        result_list = make_result_list(scaling_type, algorithm_name, k_fold_param_str, algorithm_param, accuracy)
                        function_result_df.loc[function_result_cnt]=result_list
                        function_result_cnt = function_result_cnt + 1               
            else:
                print('ERROR : algorithm name error')
                return
            k_fold_num = k_fold_num + 1
        
    return function_result_df
#function make_result_list : make one line(list) of function ML's return dataframe
# input : scaling_type, algorithm_name, k_fold_param, algorithm_param, accuracy
# output : result_list
def make_result_list(scaling_type, algorithm_name, k_fold_param, algorithm_param, accuracy):
    result_list = []
    result_list.append(scaling_type)
    result_list.append(algorithm_name)
    result_list.append(k_fold_param)
    result_list.append(algorithm_param)
    result_list.append(accuracy)
    return result_list

#make list to store all type of scaling or algorithm
algorithm_list = ['decision_tree(entropy)','decision_tree(gini_index)','logistic_reg','svm']
scaling_type = ['none','standard','minmax']

#make dataframe to store all result
result = pd.DataFrame(columns = ['scaling_type','algorithm_name','k_fold_parameter','algorithm_parameter','accuracy_score'])

for scaling_name in scaling_type:
    for algorithm_name in algorithm_list:
        temp_result = ML(scaling_name, algorithm_name, dataset_non_target, dataset_target)
        result = pd.concat([result, temp_result], ignore_index = True)


#sorted result and find best accuracy score cases
result_sorted = result.sort_values(by = ['accuracy_score'], ascending = False)
print('TOP 5 ACCURACY SCORE INFO------------------------------------')
print(result_sorted.head(5))

DATA------------------------------------
   Sample code number  Clump Thickness  Uniformity of Cell Size  \
0             1000025                5                        1   
1             1002945                5                        4   
2             1015425                3                        1   
3             1016277                6                        8   
4             1017023                4                        1   

   Uniformity of Cell Shape  Marginal Adhension  Single Epithelial Cell Size  \
0                         1                   1                            2   
1                         4                   5                            7   
2                         1                   1                            2   
3                         8                   1                            3   
4                         1                   3                            2   

  Bare Nuclei  Bland Chromatin  Normal Nucleoli  Mitoses  Class  
0        

## - Result Analysis
Among the TOP 5 accuracy scores,
the scaling type has a distribution of standard(2), none(2), and minmax(1),
the algorithm name has a distribution of decision_tree(entropy) (3), logistic_regression(1), and decision_tree(gini_index)(1),
and k_fold_parameter has a distribution of 10(5).
In particular, TOP1 ~ 3, all had different scaling_type, but they all had the same accuracy score as model = decision_tree(entropy)(max_depth = 10), k(in k-fold hyperparameter) = 10.
Therefore, which model was used had the most influence on accuracy than scaling type and k_fold_parameter.
It is appropriate to satisfy the conditions of TOP1-TOP3 case(model = decision_tree(entropy)(max_depth = 10), k(in k-fold hyperparameter) = 10) to derive the best accuracy score.
