# In [None]:  (notebook cell marker kept as a comment so the file parses as Python)
import pandas as pd

# Load the data.
data = pd.read_csv('housing.csv')
print(data.head(5))
target_attr = 'median_house_value'

# Data preprocessing: dirty data cleaning.
# Every row that contains a missing value is dropped. According to the original
# author's note the missing values are in 'total_bedrooms' (20640 -> 20433 rows).
original_length = len(data)
data = data.dropna(axis=0)
print('number of deleted data : ', original_length, '->', len(data))

# Feature engineering: draw a correlation heatmap.
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(16, 5))

# Correlation is only defined for numeric columns. The frame still contains a
# text column at this point, and DataFrame.corr() raises on non-numeric data
# in pandas >= 2.0, so restrict the computation to numeric columns explicitly.
cor_matrix = data.select_dtypes(include='number').corr()
ax = sns.heatmap(cor_matrix, annot=True, fmt=".2f")
plt.title('HeatMap', fontsize=20)
plt.show()

print('correlation with target_attr : best to worst')
# Rank the attributes by the absolute value of their correlation with the target.
cor_sorted = cor_matrix[target_attr].drop(target_attr).sort_values(key=abs, ascending=False)
print(cor_sorted)

# Attributes whose correlation with the target lies within about -0.05 ~ +0.05
# are dismissed; the original author identified total_bedrooms, longitude and
# population, so those three are dropped.
data = data.drop(['total_bedrooms', 'longitude', 'population'], axis=1)
# Renumber the rows 0..n-1; the clustering code below pairs rows by position.
data = data.reset_index(drop=True)

# Divide the dataset into the non-target attributes and the target column.
dataset_non_target = data.drop([target_attr], axis=1, inplace=False)
dataset_target = pd.DataFrame(data[target_attr], columns=[target_attr])
#scaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
#encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
#train_test_split
from sklearn.model_selection import train_test_split
#model
from sklearn.cluster import KMeans
from sklearn import mixture
from pyclustering.cluster.clarans import clarans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AffinityPropagation
#score
from sklearn.metrics import silhouette_score
from sklearn import metrics
import numpy as np
import math
#visualize
from sklearn.decomposition import PCA
_MIN_K = 2   # smallest cluster count for which a reference labelling is built
_MAX_K = 12  # largest cluster count for which a reference labelling is built


def _labels_from_groups(groups):
    """Turn a list of row-index collections into one label per row.

    ``groups[i]`` holds the row indices assigned to cluster ``i``. The result
    is the list of cluster numbers ordered by ascending row index.
    """
    pairs = sorted((idx, label) for label, members in enumerate(groups) for idx in members)
    return [label for _, label in pairs]


def _build_answer_df(dataset_target):
    """Build the reference ("answer") labelling for every k in _MIN_K.._MAX_K.

    Rows are ordered by the target value (first column of ``dataset_target``)
    and cut at the values found at positions ``len // k * i``; rows whose
    target is <= a cut value fall into the lower cluster. Returns a frame with
    columns ``k`` and ``answer`` (a list with one label per row, in row order).
    """
    target_col = dataset_target.columns[0]
    # Work with row positions so the labels line up with the feature rows even
    # if the caller's index is not 0..n-1.
    values = dataset_target.reset_index(drop=True)[target_col].sort_values()

    k_list = []
    answers = []
    for k in range(_MIN_K, _MAX_K + 1):
        step = len(values) // k
        remaining = values
        groups = []
        for i in range(1, k):
            split_val = values.iloc[step * i]
            groups.append(remaining.index[remaining <= split_val])
            remaining = remaining[remaining > split_val]
        groups.append(remaining.index)
        k_list.append(k)
        answers.append(_labels_from_groups(groups))
    return pd.DataFrame({'k': k_list, 'answer': answers})


def _answer_labels(answer_df, k):
    """Return the flat reference label list stored for ``k`` clusters."""
    matches = answer_df.loc[answer_df['k'] == k, 'answer']
    if matches.empty:
        raise ValueError('no reference labelling for k = ' + str(k)
                         + ' (supported range: ' + str(_MIN_K) + '..' + str(_MAX_K) + ')')
    return matches.iloc[0]


def _make_scaler(scaling_type):
    """Return a new scaler for ``scaling_type`` or None if the name is unknown."""
    scalers = {
        'standard': StandardScaler,
        'minmax': MinMaxScaler,
        'maxabs': MaxAbsScaler,
        'robust': RobustScaler,
        'norm': Normalizer,
    }
    scaler_cls = scalers.get(scaling_type)
    return None if scaler_cls is None else scaler_cls()


def _encode_categoricals(encoding_type, categorical_df):
    """Encode ``categorical_df`` with 'label', 'onehot' or 'ordinal' encoding.

    Returns a frame with a fresh 0..n-1 index, or None for an unknown type.
    """
    if encoding_type == 'onehot':
        # dtype=float keeps the dummy columns numeric on every pandas version
        # (the default changed from uint8 to bool).
        return pd.get_dummies(categorical_df, dtype=float)
    if encoding_type == 'label':
        encoded = {attr: LabelEncoder().fit_transform(categorical_df[attr])
                   for attr in categorical_df.columns}
    elif encoding_type == 'ordinal':
        encoded = {attr: OrdinalEncoder().fit_transform(categorical_df[attr].values.reshape(-1, 1)).ravel()
                   for attr in categorical_df.columns}
    else:
        return None
    return pd.DataFrame(encoded)


def _parse_hyperparameters(spec):
    """Return the hyperparameter dict described by ``spec``.

    ``spec`` may already be a dict, or a string holding a Python dict
    expression such as "{'n_clusters' : range(3, 7)}". Anything that does not
    evaluate to a dict yields an empty dict, so the model defaults are used.
    """
    if isinstance(spec, dict):
        return spec
    # SECURITY NOTE: eval executes arbitrary code. It is kept because the
    # specs use expressions such as range(...), but it must only ever be fed
    # trusted, locally written configuration strings.
    parsed = eval(str(spec))
    return parsed if isinstance(parsed, dict) else {}


def _param_strings(scaling_type, encoding_type, named_values):
    """Return (plot title text, result-table text) describing one model run."""
    plot_title = ('scaling : ' + scaling_type + '/encoding : ' + encoding_type
                  + ''.join('/' + name + ' : ' + str(value) for name, value in named_values))
    table_text = ' '.join(name + ' : ' + str(value) for name, value in named_values)
    return plot_title, table_text


def _silhouette_if_requested(measure, features, y_pred):
    """Return the silhouette score when ``measure`` asks for it, else None."""
    return silhouette_score(features, y_pred) if 'silhouette_score' in measure else None


def _evaluate_clustering(features, answer_df, k, y_pred, centers, measure, model_name, plot_title):
    """Plot one clustering and return (silhouette, purity, mean centre distance).

    ``centers[i]`` is the model's centre for cluster ``i``; the distance is the
    average Euclidean distance between it and the mean of reference cluster
    ``i``, taken over the ``k`` clusters.
    """
    y_true = _answer_labels(answer_df, k)
    pca_visualize(features, y_pred, k, model_name, plot_title)

    silhouette = _silhouette_if_requested(measure, features, y_pred)
    purity = None
    if 'purity' in measure:
        contingency = metrics.cluster.contingency_matrix(y_true, y_pred)
        purity = np.sum(np.amax(contingency, axis=0)) / np.sum(contingency)

    reference_centers = cal_cluster_mean(features, answer_df, k)
    distances = [math.dist(centers[i], reference_centers[i]) for i in range(k)]
    return silhouette, purity, sum(distances) / k


def _evaluate_discovered(features, answer_df, k_val, y_pred, centers, measure, model_name, plot_title):
    """Evaluate a model that chooses its own number of clusters.

    Returns None when fewer than _MIN_K clusters were found (nothing is
    recorded), only the silhouette score when more than _MAX_K were found
    (no reference labelling exists), and the full evaluation otherwise.
    """
    if k_val < _MIN_K:
        return None
    if k_val > _MAX_K:
        return _silhouette_if_requested(measure, features, y_pred), None, None
    return _evaluate_clustering(features, answer_df, k_val, y_pred, centers,
                                measure, model_name, plot_title)


def _run_kmeans(features, answer_df, hyperparameter_dict, measure, scaling_type, encoding_type):
    """Yield (parameters, knee, silhouette, purity, dist) per K-means setting."""
    for algorithm_param in hyperparameter_dict.get('algorithm', ['lloyd']):
        for k_param in hyperparameter_dict.get('n_clusters', [5]):
            model = KMeans(n_clusters=k_param, algorithm=algorithm_param)
            y_pred = model.fit_predict(features)
            plot_title, params = _param_strings(
                scaling_type, encoding_type,
                [('k_param', k_param), ('algorithm', algorithm_param)])
            silhouette, purity, dist = _evaluate_clustering(
                features, answer_df, k_param, y_pred, model.cluster_centers_,
                measure, 'K-means', plot_title)
            knee = model.inertia_ if 'knee-method' in measure else None
            yield params, knee, silhouette, purity, dist


def _run_em(features, answer_df, hyperparameter_dict, measure, scaling_type, encoding_type):
    """Yield (parameters, knee, silhouette, purity, dist) per Gaussian-mixture setting."""
    for type_param in hyperparameter_dict.get('covariance_type', ['full']):
        for n_param in hyperparameter_dict.get('n_component', range(5, 6)):
            model = mixture.GaussianMixture(n_components=n_param, covariance_type=type_param)
            y_pred = model.fit_predict(features)
            plot_title, params = _param_strings(
                scaling_type, encoding_type,
                [('n_component', n_param), ('covariance_type', type_param)])
            silhouette, purity, dist = _evaluate_clustering(
                features, answer_df, n_param, y_pred, model.means_,
                measure, 'EM', plot_title)
            yield params, None, silhouette, purity, dist


def _run_clarans(features, answer_df, hyperparameter_dict, measure, scaling_type, encoding_type):
    """Yield (parameters, knee, silhouette, purity, dist) per CLARANS setting."""
    for n_c_param in hyperparameter_dict.get('number_clusters', [3]):
        for m_n_param in hyperparameter_dict.get('maxneighbor', [0]):
            model = clarans(features.values.tolist(), n_c_param, 2, m_n_param)
            model.process()
            # get_clusters() returns one list of row indices per cluster.
            y_pred = _labels_from_groups(model.get_clusters())
            # The cluster "centre" is the medoid row itself.
            centers = [features.iloc[medoid].to_list() for medoid in model.get_medoids()]
            plot_title, params = _param_strings(
                scaling_type, encoding_type,
                [('number_clusters', n_c_param), ('maxneighbor', m_n_param)])
            silhouette, purity, dist = _evaluate_clustering(
                features, answer_df, n_c_param, y_pred, centers,
                measure, 'CLARANS', plot_title)
            yield params, None, silhouette, purity, dist


def _run_dbscan(features, answer_df, hyperparameter_dict, measure, scaling_type, encoding_type):
    """Yield (parameters, knee, silhouette, purity, dist) per DBSCAN setting."""
    for eps_param in hyperparameter_dict.get('eps', [0.5]):
        for min_sample_param in hyperparameter_dict.get('min_samples', [2]):
            model = DBSCAN(eps=eps_param, min_samples=min_sample_param)
            y_pred = model.fit_predict(features)
            # DBSCAN labels clusters 0..c-1 and noise as -1, so the number of
            # clusters is max label + 1 regardless of which scaler was used.
            k_val = int(y_pred.max()) + 1
            centers = None
            if _MIN_K <= k_val <= _MAX_K:
                # Select each cluster by its label so noise points (-1) cannot
                # shift the cluster/centre pairing.
                centers = [features[y_pred == i].mean().to_list() for i in range(k_val)]
            plot_title, params = _param_strings(
                scaling_type, encoding_type,
                [('eps_param', eps_param), ('min_samples', min_sample_param)])
            scores = _evaluate_discovered(features, answer_df, k_val, y_pred, centers,
                                          measure, 'DBSCAN', plot_title)
            if scores is not None:
                yield (params, None) + tuple(scores)


def _run_affinity_propagation(features, answer_df, hyperparameter_dict, measure, scaling_type, encoding_type):
    """Yield (parameters, knee, silhouette, purity, dist) per AffinityPropagation setting."""
    for preference_param in hyperparameter_dict.get('preference', [15]):
        for max_iter_param in hyperparameter_dict.get('max_iter', [200]):
            model = AffinityPropagation(damping=0.99, preference=preference_param,
                                        max_iter=max_iter_param)
            y_pred = model.fit_predict(features)
            k_val = int(y_pred.max()) + 1
            plot_title, params = _param_strings(
                scaling_type, encoding_type,
                [('preference_param', preference_param), ('max_iter', max_iter_param)])
            scores = _evaluate_discovered(features, answer_df, k_val, y_pred,
                                          model.cluster_centers_, measure,
                                          'AffinityPropagation', plot_title)
            if scores is not None:
                yield (params, None) + tuple(scores)


def AutoML(scaler_list, encoder_list, model_list, hyperparmeter_df, dataset_no_target, dataset_target, categorical_attr_list, measure_df):
    """Run every scaler x encoder x model x hyperparameter combination.

    Parameters:
        scaler_list: names out of 'standard', 'minmax', 'maxabs', 'robust', 'norm'.
        encoder_list: names out of 'label', 'onehot', 'ordinal'. Ignored when
            there are no categorical attributes (one pass labelled 'none' runs).
        model_list: names out of 'K-means', 'EM', 'CLARANS', 'DBSCAN',
            'AffinityPropagation'.
        hyperparmeter_df: one-row frame; column i holds the hyperparameter
            spec (dict, or string of a dict expression) of model_list[i].
        dataset_no_target: feature frame.
        dataset_target: frame whose first column is the target; it is cut into
            k value bands to produce the reference labelling for k = 2..12.
        categorical_attr_list: categorical column names; an empty list, None,
            'none' or ['none'] means there are none.
        measure_df: one-row frame; column i holds a string naming the measures
            wanted for model_list[i] ('knee-method', 'silhouette_score', 'purity').

    Returns:
        A DataFrame with one row per evaluated configuration (columns
        scaling_type, encoding_type, model_name, algorithm_parameters,
        knee_method, silhouette_score, purity, dist), or None after printing
        an 'ERROR : ...' line when a scaler, encoder or model name is unknown.
    """
    function_result_df = pd.DataFrame(columns=['scaling_type', 'encoding_type', 'model_name',
                                               'algorithm_parameters', 'knee_method',
                                               'silhouette_score', 'purity', 'dist'])
    answer_df = _build_answer_df(dataset_target)

    # Normalise the different ways of saying "no categorical attributes".
    if categorical_attr_list is None or isinstance(categorical_attr_list, str):
        categorical_attrs = [] if categorical_attr_list in (None, 'none') else [categorical_attr_list]
    else:
        categorical_attrs = list(categorical_attr_list)
        if categorical_attrs == ['none']:
            categorical_attrs = []

    # Separate the columns to be scaled from the columns to be encoded.
    if categorical_attrs:
        dt_bf_scaling = dataset_no_target.drop(categorical_attrs, axis=1).reset_index(drop=True)
        # Reset this index as well so every encoder output lines up row by row
        # with the scaled frame when the two are concatenated.
        dt_bf_encoding = dataset_no_target[categorical_attrs].reset_index(drop=True)
        encoders = encoder_list
    else:
        dt_bf_scaling = dataset_no_target
        dt_bf_encoding = None
        encoders = ['none']

    runners = {
        'K-means': _run_kmeans,
        'EM': _run_em,
        'CLARANS': _run_clarans,
        'DBSCAN': _run_dbscan,
        'AffinityPropagation': _run_affinity_propagation,
    }

    for scaling_type in scaler_list:
        scaler = _make_scaler(scaling_type)
        if scaler is None:
            print('ERROR : scaling type error')
            return None
        dt_after_scaling = pd.DataFrame(scaler.fit_transform(dt_bf_scaling),
                                        columns=dt_bf_scaling.columns)

        for encoding_type in encoders:
            if dt_bf_encoding is None:
                features = dt_after_scaling
            else:
                dt_after_encoding = _encode_categoricals(encoding_type, dt_bf_encoding)
                if dt_after_encoding is None:
                    print('ERROR : encoder type error')
                    return None
                # Merge the scaled and the encoded columns.
                features = pd.concat([dt_after_scaling, dt_after_encoding], axis=1)

            for model_cnt, model_name in enumerate(model_list):
                runner = runners.get(model_name)
                if runner is None:
                    print('ERROR : Model name error')
                    return None
                hyperparameter_dict = _parse_hyperparameters(hyperparmeter_df.iloc[0, model_cnt])
                measure = str(measure_df.iloc[0, model_cnt])

                for params, knee, silhouette, purity, dist in runner(
                        features, answer_df, hyperparameter_dict, measure,
                        scaling_type, encoding_type):
                    function_result_df.loc[len(function_result_df)] = make_result_list(
                        scaling_type, encoding_type, model_name, params,
                        knee, silhouette, purity, dist)
    return function_result_df
def cal_cluster_mean(dt_n_t, answer_df, k):
    """Return the per-column mean of every reference cluster.

    Parameters:
        dt_n_t: feature frame (numeric columns).
        answer_df: frame with a column ``k`` and a column ``answer``; the
            ``answer`` cell of the row whose ``k`` matches holds one cluster
            label (0..k-1) per row of ``dt_n_t``, in row order.
        k: number of clusters to look up.

    Returns:
        A list of ``k`` lists; entry ``i`` holds the column means of the rows
        labelled ``i`` (NaN values if no row carries that label).

    Raises:
        KeyError: if ``answer_df`` has no row for ``k``.
        ValueError: if the stored labelling does not have one label per row.
    """
    matches = answer_df.loc[answer_df['k'] == k, 'answer']
    if matches.empty:
        raise KeyError('no reference labelling stored for k = ' + str(k))

    # Pair labels with rows by position (not by index value), so the result
    # does not depend on how either frame happens to be indexed.
    labels = pd.Series(matches.iloc[0], index=dt_n_t.index)
    return [dt_n_t[labels == i].mean().to_list() for i in range(k)]
def pca_visualize(dataset_no_target, y_pred,k, model_name,params):
    """Show a 2-D PCA scatter plot of a clustering result.

    Parameters:
        dataset_no_target: feature frame that was clustered.
        y_pred: one cluster label per row of ``dataset_no_target``.
        k: number of clusters to draw; labels 0..k-1 are plotted, any other
            label (for example -1) is left out.
        model_name: model name, shown in the title.
        params: text describing the configuration, shown in the title.
    """
    cluster_colors = ['#FF9999', '#FFCC99', '#FFFF99', '#CCFF99', '#99FF99', '#99FFCC','#99FFFF', '#99CCFF', '#9999FF',
                      '#CC99FF', '#FF99FF', '#FF99CC','#FFFFFF','#BE3559']
    labels = np.asarray(y_pred)

    # Fit a single projection on the whole dataset so that all clusters are
    # drawn in the same coordinate system. Fitting PCA separately per cluster
    # gives every cluster its own axes and centres each one at the origin,
    # which makes the clusters impossible to compare (and fails outright for a
    # cluster with a single point).
    projected = PCA(n_components=2, random_state=42).fit_transform(dataset_no_target)

    fig = plt.figure(figsize=(5,5))
    ax2 = fig.add_subplot(111)
    for i in range(0, k):
        members = projected[labels == i]
        # Attach the label to the artist so the legend stays correct even when
        # a cluster is empty.
        ax2.scatter(members[:, 0], members[:, 1],
                    color=cluster_colors[i % len(cluster_colors)],
                    alpha=0.5, marker='o', label="cluster " + str(i))
    ax2.set_title(str(model_name) + ' : ' + str(params))
    fig.legend(loc='lower center', ncol=max(k, 1), borderaxespad=-0.5)
    plt.show()
def make_result_list(scaling_type, encoding_type, model_name, parameters, knee, score1, score2 , dist):
    """Pack the fields of one result row into a list.

    The order is: scaling type, encoding type, model name, parameter
    description, knee value, first score, second score, distance.
    """
    return [scaling_type, encoding_type, model_name, parameters,
            knee, score1, score2, dist]
scaler_list = ['standard', 'minmax', 'maxabs', 'robust', 'norm']  # 5 scalers (no "none" option)
encoder_list = ['label', 'onehot', 'ordinal']
model_list = ['K-means', 'EM', 'CLARANS', 'DBSCAN', 'AffinityPropagation']  # 5 models

# Hyperparameter grid per model, written as the text of a Python dict
# expression (AutoML evaluates it). Built as a one-row frame whose columns
# follow model_list.
hyperparameter_specs = {
    'K-means': "{'n_clusters' : range(3, 7), 'algorithm' : ['lloyd', 'elkan']}",
    'EM': "{'n_component' : range(2, 4), 'covariance_type' : ['full', 'tied']}",
    'CLARANS': "{'number_clusters' : range(3, 4), 'maxneighbor' : [1, 0]}",
    'DBSCAN': "{'eps' : [0.5], 'min_samples' : [9, 10]}",
    'AffinityPropagation': "{'preference' : [-10], 'max_iter' : [10, 20]}",
}
hyperparmeter_df = pd.DataFrame([[hyperparameter_specs[name] for name in model_list]],
                                columns=model_list)

categorical_attr_list = ['ocean_proximity']  # e.g. ['a', 'b']; use [] or ['none'] when there are none

# Measures to compute per model, again as a one-row frame following model_list.
measure_specs = {
    'K-means': 'knee-method, purity, silhouette_score',
    'EM': 'purity, silhouette_score',
    'CLARANS': 'purity, silhouette_score',
    'DBSCAN': 'purity, silhouette_score',
    'AffinityPropagation': 'purity, silhouette_score',
}
measure_df = pd.DataFrame([[measure_specs[name] for name in model_list]],
                          columns=model_list)

df = AutoML(scaler_list, encoder_list, model_list, hyperparmeter_df, dataset_non_target, dataset_target, categorical_attr_list, measure_df)

result_sorted_1 = df.sort_values(by=['knee_method'], ascending=False)
print('TOP 5 knee_method INFO------------------------------------')
print(result_sorted_1.head(5))

result_sorted_2 = df.sort_values(by=['silhouette_score'], ascending=False)
print('TOP 5 silhouette_score INFO------------------------------------')
print(result_sorted_2.head(5))

# Higher purity is better, so the top rows are the largest values.
result_sorted_3 = df.sort_values(by=['purity'], ascending=False)
print('TOP 5 purity INFO------------------------------------')
print(result_sorted_3.head(5))

# A smaller distance between model centres and reference centres is better.
result_sorted_4 = df.sort_values(by=['dist'], ascending=True)
print('TOP 5 dist INFO------------------------------------')
print(result_sorted_4.head(5))