<a href="https://colab.research.google.com/github/gladcolor/Housing_RandomForest/blob/master/CSCE822_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install packages

In [2]:
pip install category_encoders 

Collecting category_encoders
  Downloading category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)
[?25l[K     |████                            | 10 kB 21.7 MB/s eta 0:00:01[K     |████████▏                       | 20 kB 28.6 MB/s eta 0:00:01[K     |████████████▏                   | 30 kB 25.2 MB/s eta 0:00:01[K     |████████████████▎               | 40 kB 20.0 MB/s eta 0:00:01[K     |████████████████████▎           | 51 kB 8.6 MB/s eta 0:00:01[K     |████████████████████████▍       | 61 kB 9.9 MB/s eta 0:00:01[K     |████████████████████████████▍   | 71 kB 9.1 MB/s eta 0:00:01[K     |████████████████████████████████| 80 kB 5.0 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


# Import packages

In [94]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing


# Define functions

In [156]:
def print_str_unique(df):
    for col in df.columns:
        if original_data.dtypes[col] == np.object:            
            unique_cnt = len(df[col].unique())
            print(f'Column {col.rjust(13)} has {unique_cnt:5} unique values.')

def count_column_nan(df):
    row_cnt = len(df)
    for col in df.columns:
        nan_cnt = df[col].isna().sum()
        percent_str = f'({(nan_cnt / row_cnt * 100):3.1f}%)'.rjust(7)
        print(f'Column {col.rjust(13)} has {nan_cnt:4} {percent_str} nan values.')       

def impute_df(df, strategy="most_frequent"):
    
    numeric_cols = ['BuildingArea', 'YearBuilt', 'Car']
    nominal_cols = ['CouncilArea']

    my_imputer = SimpleImputer(strategy="most_frequent")
    council_area_with_imputed_values = my_imputer.fit_transform(df[nominal_cols])
    imputed_df = df.copy()
    imputed_df.loc[:, nominal_cols] = council_area_with_imputed_values


    if strategy == "most_frequent":
        my_imputer = SimpleImputer(strategy="most_frequent")
        data_with_imputed_values = my_imputer.fit_transform(df)        
        imputed_df.loc[:, :] = data_with_imputed_values

    if strategy == "mean":
        my_imputer = SimpleImputer(strategy="mean")
        area_year_with_imputed_values = my_imputer.fit_transform(df[numeric_cols])
        imputed_df.loc[:, numeric_cols] = area_year_with_imputed_values

    if strategy == "median":
        my_imputer = SimpleImputer(strategy="median")
        area_year_with_imputed_values = my_imputer.fit_transform(df[numeric_cols])
        imputed_df.loc[:, numeric_cols] = area_year_with_imputed_values

    return imputed_df

def encode_dates(imputed_df):
    imputed_df['Date'] = pd.to_datetime(imputed_df['Date']) 
    imputed_df['Ori_Date'] = pd.to_datetime('1970-01-01', format='YY-m-d', errors='ignore')
    imputed_df['Ori_Date'] = pd.to_datetime(imputed_df['Ori_Date'])
    imputed_df['delta_days'] = imputed_df['Date'] - imputed_df['Ori_Date']
    imputed_df['delta_days'] = imputed_df['delta_days'].dt.days
    imputed_df = imputed_df.drop(columns=['Date', 'Ori_Date'])
     
    return imputed_df


ENCODING_METHODS_DICT = {'one_hot': ce.OneHotEncoder, 
                         'hasing':ce.HashingEncoder, 
                         'leave_one_out':ce.LeaveOneOutEncoder, 
                         'target':ce.TargetEncoder,
                         'baseN':ce.BaseNEncoder,
                         'binary':ce.BinaryEncoder,
                         }

def encoder_nominals(imputed_df, encode_method='one_hot'):
    # print(f'Encode methods: {ENCODING_METHODS_DICT.keys()} \n')
    
    ce_encoder = ENCODING_METHODS_DICT[encode_method](cols = ENCODING_COLUMNS)

    y = imputed_df['Price_class'].copy()
    
    for drop_column in DROPPED_COLUMNS:
        try:
            imputed_df = imputed_df.drop(columns=drop_column).copy()
        except:
            pass
            # print(f'Columns: {drop_column} have already dropped before.')

    encoded_df = ce_encoder.fit_transform(imputed_df, y=y) 

    return encoded_df

def assign_price_class(imputed_df):
    row_cnt = len(imputed_df)
    price_class_cnt = 5
    class_step = int(row_cnt / price_class_cnt)
    price_bins = list(range(class_step, row_cnt,  class_step))

    imputed_df.loc[0:price_bins[0], 'Price_class'] = '0' # 'bottom_value'
    imputed_df.loc[price_bins[0]:price_bins[1], 'Price_class'] = '1' # 'low_value'
    imputed_df.loc[price_bins[1]:price_bins[2], 'Price_class'] = '2' # 'medium_value'
    imputed_df.loc[price_bins[2]:price_bins[3], 'Price_class'] = '3' # 'high_value'
    imputed_df.loc[price_bins[3]:row_cnt, 'Price_class'] = '4'  #  'top_value'

    imputed_df['Price_class'] = imputed_df['Price_class'].astype(int)

    # gb = imputed_df.groupby('Price_class')['Price_class'].count().to_frame()
    # gb.columns = ['Count']
    # custom_dict = {'bottom_value': 0, 'low_value': 1, 'medium_value': 2, 'high_value': 3, 'top_value': 4}
    # gb.sort_index(key=lambda x: x.map(custom_dict))
    # print("Price class counts:")
    return imputed_df    


def split_data(encoded_df):
    X = encoded_df.drop(columns=['Price_class'])
    y = encoded_df['Price_class']

    train_ratio = 0.75
    validation_ratio = 0.10
    test_ratio = 0.15

    xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size= (1 - train_ratio), random_state = 0)

    xVal, xTest, yVal, yTest = train_test_split(xTest, yTest, test_size=test_ratio/(test_ratio + validation_ratio), random_state = 0) 

    
    return xTrain, yTrain, xVal, yVal, xTest, yTest   

def standardize_data(encoded_df, class_col='Price_class'):
    labels = encoded_df[class_col].copy()
    data_df = encoded_df.drop(columns=[class_col])
    scaler = preprocessing.StandardScaler()
    scaler.fit(data_df) 
    data_df.iloc[:, :] = scaler.transform(data_df)
    data_df.loc[:, class_col] = labels
    return data_df
           

# Load and known the data

## Load data

In [20]:
data_csv = r'https://github.com/gladcolor/Housing_RandomForest/raw/master/melb_data.csv'
original_data = pd.read_csv(data_csv)
print("Data samples:")
original_data


Data samples:


Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.79960,144.99840,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.80790,144.99340,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.80930,144.99440,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,1.0,94.0,,,Yarra,-37.79690,144.99690,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.80720,144.99410,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,12 Strada Cr,4,h,1245000.0,S,Barry,26/08/2017,16.7,3150.0,4.0,2.0,2.0,652.0,,1981.0,,-37.90562,145.16761,South-Eastern Metropolitan,7392.0
13576,Williamstown,77 Merrett Dr,3,h,1031000.0,SP,Williams,26/08/2017,6.8,3016.0,3.0,2.0,2.0,333.0,133.0,1995.0,,-37.85927,144.87904,Western Metropolitan,6380.0
13577,Williamstown,83 Power St,3,h,1170000.0,S,Raine,26/08/2017,6.8,3016.0,3.0,2.0,4.0,436.0,,1997.0,,-37.85274,144.88738,Western Metropolitan,6380.0
13578,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,26/08/2017,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,,-37.85908,144.89299,Western Metropolitan,6380.0


## Types of data columns

In [76]:
print("Column data types: \n")
original_data.dtypes

Column data types: 



Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

## Unique values of nominal columns

In [77]:


print_str_unique(original_data)

Column        Suburb has   314 unique values.
Column       Address has 13378 unique values.
Column          Type has     3 unique values.
Column        Method has     5 unique values.
Column       SellerG has   268 unique values.
Column          Date has    58 unique values.
Column   CouncilArea has    34 unique values.
Column    Regionname has     8 unique values.


## Counts of missing values

In [78]:


print("Before imputing:")
count_column_nan(original_data)

Before imputing:
Column        Suburb has    0  (0.0%) nan values.
Column       Address has    0  (0.0%) nan values.
Column         Rooms has    0  (0.0%) nan values.
Column          Type has    0  (0.0%) nan values.
Column         Price has    0  (0.0%) nan values.
Column        Method has    0  (0.0%) nan values.
Column       SellerG has    0  (0.0%) nan values.
Column          Date has    0  (0.0%) nan values.
Column      Distance has    0  (0.0%) nan values.
Column      Postcode has    0  (0.0%) nan values.
Column      Bedroom2 has    0  (0.0%) nan values.
Column      Bathroom has    0  (0.0%) nan values.
Column           Car has   62  (0.5%) nan values.
Column      Landsize has    0  (0.0%) nan values.
Column  BuildingArea has 6450 (47.5%) nan values.
Column     YearBuilt has 5375 (39.6%) nan values.
Column   CouncilArea has 1369 (10.1%) nan values.
Column     Lattitude has    0  (0.0%) nan values.
Column    Longtitude has    0  (0.0%) nan values.
Column    Regionname has    0  (0

# Imputate missing values

In [6]:
IMPUTE_STRETEGY = ['most_frequent', 'mean', 'median']




imputed_df = impute_df(df=original_data, strategy='most_frequent')
print("After imputing:")
count_column_nan(imputed_df)

After imputing:


NameError: ignored

# Encode nominal columns (i.e., features)

## encode dates

In [7]:
from datetime import datetime



imputed_df = encode_dates(imputed_df)
imputed_df

KeyError: ignored

Print the unique value counts for each column.

No need to encode addresses.

In [40]:
print("Unique value count for each nominal column: \n")
print_str_unique(original_data)

Unique value count for each nominal column: 

Column        Suburb has   314 unique values.
Column       Address has 13378 unique values.
Column          Type has     3 unique values.
Column        Method has     5 unique values.
Column       SellerG has   268 unique values.
Column          Date has    58 unique values.
Column   CouncilArea has    34 unique values.
Column    Regionname has     8 unique values.


## encode nominal values

In [78]:
import category_encoders as ce

ENCODING_COLUMNS = ['CouncilArea', 'Regionname', 'Type', 'Method']

DROPPED_COLUMNS = ['Suburb', 'SellerG', 'Address', 'Date']

# DROPPED_COLUMNS = ['Suburb', 'SellerG', 'Address', 'delta_days']


encoded_df = encoder_nominals(imputed_df, encode_method='target')

encoded_df

Encode methods: dict_keys(['one_hot', 'hasing', 'leave_one_out', 'target', 'baseN', 'binary']) 

Columns: Suburb have already dropped before.
Columns: SellerG have already dropped before.
Columns: Address have already dropped before.
Columns: Date have already dropped before.


  elif pd.api.types.is_categorical(cols):


Unnamed: 0,Rooms,Type_1,Type_2,Type_3,Price,Method_1,Method_2,Method_3,Method_4,Method_5,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,CouncilArea_1,CouncilArea_2,CouncilArea_3,CouncilArea_4,CouncilArea_5,CouncilArea_6,CouncilArea_7,CouncilArea_8,CouncilArea_9,CouncilArea_10,CouncilArea_11,CouncilArea_12,CouncilArea_13,CouncilArea_14,CouncilArea_15,CouncilArea_16,CouncilArea_17,CouncilArea_18,CouncilArea_19,CouncilArea_20,CouncilArea_21,CouncilArea_22,CouncilArea_23,CouncilArea_24,CouncilArea_25,CouncilArea_26,CouncilArea_27,CouncilArea_28,CouncilArea_29,CouncilArea_30,CouncilArea_31,CouncilArea_32,CouncilArea_33,Lattitude,Longtitude,Regionname_1,Regionname_2,Regionname_3,Regionname_4,Regionname_5,Regionname_6,Regionname_7,Regionname_8,Propertycount,Price_class
0,2,1,0,0,1480000.0000,1,0,0,0,0,2.5000,3067.0000,2.0000,1.0000,1.0000,202.0000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.7996,144.9984,1,0,0,0,0,0,0,0,4019.0000,0
1,2,1,0,0,1035000.0000,1,0,0,0,0,2.5000,3067.0000,2.0000,1.0000,0.0000,156.0000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.8079,144.9934,1,0,0,0,0,0,0,0,4019.0000,0
2,3,1,0,0,1465000.0000,0,1,0,0,0,2.5000,3067.0000,3.0000,2.0000,0.0000,134.0000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.8093,144.9944,1,0,0,0,0,0,0,0,4019.0000,0
3,3,1,0,0,850000.0000,0,0,1,0,0,2.5000,3067.0000,3.0000,2.0000,1.0000,94.0000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.7969,144.9969,1,0,0,0,0,0,0,0,4019.0000,0
4,4,1,0,0,1600000.0000,0,0,0,1,0,2.5000,3067.0000,3.0000,1.0000,2.0000,120.0000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.8072,144.9941,1,0,0,0,0,0,0,0,4019.0000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,4,1,0,0,1245000.0000,1,0,0,0,0,16.7000,3150.0000,4.0000,2.0000,2.0000,652.0000,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.9056,145.1676,0,0,0,0,1,0,0,0,7392.0000,4
13576,3,1,0,0,1031000.0000,0,1,0,0,0,6.8000,3016.0000,3.0000,2.0000,2.0000,333.0000,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.8593,144.8790,0,1,0,0,0,0,0,0,6380.0000,4
13577,3,1,0,0,1170000.0000,1,0,0,0,0,6.8000,3016.0000,3.0000,2.0000,4.0000,436.0000,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.8527,144.8874,0,1,0,0,0,0,0,0,6380.0000,4
13578,4,1,0,0,2500000.0000,0,0,1,0,0,6.8000,3016.0000,4.0000,1.0000,5.0000,866.0000,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.8591,144.8930,0,1,0,0,0,0,0,0,6380.0000,4


# Generate price classes

In [53]:
print("Price column description: \n")
encoded_df['Price'].describe()

Price column description: 



count    1.358000e+04
mean     1.075684e+06
std      6.393107e+05
min      8.500000e+04
25%      6.500000e+05
50%      9.030000e+05
75%      1.330000e+06
max      9.000000e+06
Name: Price, dtype: float64

In [54]:
encoded_df = assign_price_class(encoded_df)
gb = encoded_df.groupby('Price_class')['Price_class'].count().to_frame()
gb

Unnamed: 0_level_0,Price_class
Price_class,Unnamed: 1_level_1
0,2716
1,2716
2,2716
3,2716
4,2716


# Split train/test set.

In [123]:

encoded_df = standarlize_data(encoded_df, class_col='Price_class')

xTrain, yTrain, xVal, yVal, xTest, yTest = split_data(encoded_df)

print(f'Sample counts: xTrain: {len(xTrain)}, xVal: {len(xVal)}, x_test: {len(xTest)}')

Sample counts: xTrain: 10185, xVal: 1358, x_test: 2037


# K nearest neighbors

In [124]:
xTrain

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Propertycount
664,-0.4912,1.2188,-0.5411,-0.4385,-0.8375,2.6303,0.2629,-1.2658,0.0649,0.4056,-0.1598,-0.0144,0.0883,0.6734,0.4040,-0.0477,0.3104,0.9458,0.0810
3270,-0.4912,1.2188,1.5578,-0.4385,-0.8375,-0.3802,0.2629,-1.2658,-0.9815,-0.7597,0.0617,-0.2680,-0.9470,-0.7724,0.4040,0.0069,0.8290,0.5137,-1.0295
3873,-0.4912,1.2188,-0.5411,-0.4385,-0.8375,-0.3802,0.2629,0.5370,-0.9815,0.5386,0.1810,0.4378,-0.9470,-0.7724,-0.6368,-0.0527,-0.7318,0.4618,0.3075
13170,-0.4912,-0.3349,-0.5411,-0.4385,0.8659,-0.3802,0.2629,0.5370,0.0649,-0.8184,1.6124,-0.3232,0.0883,-0.7724,-0.6368,-0.0094,2.1533,0.5444,0.7929
1730,2.0357,1.2188,-0.5411,-0.4385,-0.8375,-0.3802,0.2629,-1.2658,1.1112,1.1799,0.2151,0.6363,0.0883,0.6734,0.4040,0.0322,-1.0585,0.5070,0.0840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,-0.4912,-0.3349,1.5578,-0.4385,0.8659,-0.3802,-1.3506,0.5370,0.0649,0.3196,-0.8414,-0.5437,0.0883,-0.7724,0.4040,-0.0868,0.4069,-0.3596,1.0195
3264,-0.4912,1.2188,1.5578,-0.4385,-0.8375,-0.3802,0.2629,-1.2658,0.0649,-0.2513,0.0617,-0.2680,0.0883,-0.7724,-0.6368,0.0475,0.8530,0.5089,-1.0295
9845,-0.4912,-0.3349,-0.5411,2.0657,0.8659,-0.3802,-1.3506,0.5370,1.1112,-0.0871,-0.5858,-0.5217,1.1236,0.6734,0.4040,-0.0294,0.9271,-0.2180,0.8564
10799,2.0357,-0.3349,-0.5411,-0.4385,-0.8375,-0.3802,0.2629,0.5370,0.0649,-0.4938,0.3173,-0.3562,0.0883,-0.7724,-0.6368,0.0119,1.1183,0.2977,3.2422


In [125]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

k_range = range(3, 11)
scores_dict = {}
scores_list = []



for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)

    knn.fit(xTrain, yTrain)
    y_pred = knn.predict(xTest)
    score = metrics.accuracy_score(yTest, y_pred)
    scores_dict[f'k={k}'] = round(score, 5)
    scores_list.append(score)



scores_dict

{'k=10': 0.49583,
 'k=3': 0.50368,
 'k=4': 0.49141,
 'k=5': 0.4919,
 'k=6': 0.49435,
 'k=7': 0.49337,
 'k=8': 0.49435,
 'k=9': 0.49337}

# Evaluation

In [159]:
DROPPED_COLUMNS = ['SellerG',   'Address', 'Date'] # , 'BuildingArea', 'YearBuilt'
ENCODING_COLUMNS = ['Suburb', 'Method', 'CouncilArea', 'Regionname', 'Type']

pd.options.display.float_format = '{:,.4f}'.format


IMPUTE_STRETEGY = ['mean', 'most_frequent',  'median']
k_range = list(range(3, 11)) + [20]

data_csv = r'https://github.com/gladcolor/Housing_RandomForest/raw/master/melb_data.csv'
original_data = pd.read_csv(data_csv).drop(columns=DROPPED_COLUMNS)



results_df = pd.DataFrame(columns=['Impute_strategy', 'Nominal_encoding', 
                                   'Radom_forest_50', 
                                   'Radom_forest_100', 
                                   'Radom_forest_200'] + 
                                   [f'KNN_{k}' for k in k_range]
                                   )

ENCODING_METHODS_DICT = {'one_hot': ce.OneHotEncoder,
                         'hasing':ce.HashingEncoder, 
                         'leave_one_out':ce.LeaveOneOutEncoder,  # Accuracy 1.0. Need to dig it.
                         'binary':ce.BinaryEncoder, 
                         'target':ce.TargetEncoder,
                        #  'baseN':ce.BaseNEncoder, # the same results as one-hot and binary when base = 1 or 2.
                         
                         }

for impute_strategy in IMPUTE_STRETEGY:
    imputed_df = impute_df(df=original_data, strategy=impute_strategy)
    imputed_df = assign_price_class(imputed_df)
    
    for ce_encoder_name in ENCODING_METHODS_DICT.keys():   

        encoded_df = encoder_nominals(imputed_df, encode_method=ce_encoder_name)

        encoded_df = standardize_data(encoded_df, class_col='Price_class')
 
        xTrain, yTrain, xVal, yVal, xTest, yTest = split_data(encoded_df)
 
        current_row = len(results_df)

        # conduct Random forest        
        RF_score_50 = get_RandomForest_accuracy(xTrain, yTrain, xTest, yTest, n_estimators=50) 
        RF_score_100 = get_RandomForest_accuracy(xTrain, yTrain, xTest, yTest, n_estimators=100) 
        RF_score_200 = get_RandomForest_accuracy(xTrain, yTrain, xTest, yTest, n_estimators=200) 
        print(f"Computed Random Forest, {impute_strategy}, {ce_encoder_name}, score: {RF_score:.5f}")

        # conduct KNN     
        for k in k_range:
            knn = KNeighborsClassifier(n_neighbors = k)
            knn.fit(xTrain, yTrain)
            y_pred = knn.predict(xTest)
            score = metrics.accuracy_score(yTest, y_pred)
            
            # record the accuracy
            column_name = f'KNN_{k}'
            print(f"Computed  {column_name}, {impute_strategy}, {ce_encoder_name}, score: {score:.5f}")
            

            results_df.loc[current_row, column_name] = round(score, 5)
            results_df.loc[current_row, 'Nominal_encoding'] = ce_encoder_name
            results_df.loc[current_row, 'Impute_strategy'] = impute_strategy
            results_df.loc[current_row, 'Radom_forest_50'] = round(RF_score_50, 5)
            results_df.loc[current_row, 'Radom_forest_100'] = round(RF_score_100, 5)
            results_df.loc[current_row, 'Radom_forest_200'] = round(RF_score_200, 5)
            # print(results_df.head(10))

results_df.to_csv("results.csv")
results_df

  elif pd.api.types.is_categorical(cols):


Computed Random Forest, mean, one_hot, score: 0.75700
Computed  KNN_3, mean, one_hot, score: 0.51203
Computed  KNN_4, mean, one_hot, score: 0.51645
Computed  KNN_5, mean, one_hot, score: 0.51497
Computed  KNN_6, mean, one_hot, score: 0.51350
Computed  KNN_7, mean, one_hot, score: 0.51203
Computed  KNN_8, mean, one_hot, score: 0.51448
Computed  KNN_9, mean, one_hot, score: 0.51154
Computed  KNN_10, mean, one_hot, score: 0.50761
Computed  KNN_20, mean, one_hot, score: 0.50712


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, mean, hasing, score: 0.75700
Computed  KNN_3, mean, hasing, score: 0.47914
Computed  KNN_4, mean, hasing, score: 0.47324
Computed  KNN_5, mean, hasing, score: 0.47472
Computed  KNN_6, mean, hasing, score: 0.48208
Computed  KNN_7, mean, hasing, score: 0.48110
Computed  KNN_8, mean, hasing, score: 0.48454
Computed  KNN_9, mean, hasing, score: 0.48601
Computed  KNN_10, mean, hasing, score: 0.48405
Computed  KNN_20, mean, hasing, score: 0.47324


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, mean, leave_one_out, score: 0.75700
Computed  KNN_3, mean, leave_one_out, score: 0.49632
Computed  KNN_4, mean, leave_one_out, score: 0.49877
Computed  KNN_5, mean, leave_one_out, score: 0.51006
Computed  KNN_6, mean, leave_one_out, score: 0.50565
Computed  KNN_7, mean, leave_one_out, score: 0.50270
Computed  KNN_8, mean, leave_one_out, score: 0.49485
Computed  KNN_9, mean, leave_one_out, score: 0.49435
Computed  KNN_10, mean, leave_one_out, score: 0.49092
Computed  KNN_20, mean, leave_one_out, score: 0.48454


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, mean, binary, score: 0.75700
Computed  KNN_3, mean, binary, score: 0.51006
Computed  KNN_4, mean, binary, score: 0.51154
Computed  KNN_5, mean, binary, score: 0.51252
Computed  KNN_6, mean, binary, score: 0.51203
Computed  KNN_7, mean, binary, score: 0.51055
Computed  KNN_8, mean, binary, score: 0.51105
Computed  KNN_9, mean, binary, score: 0.50957
Computed  KNN_10, mean, binary, score: 0.50221
Computed  KNN_20, mean, binary, score: 0.50319


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, mean, target, score: 0.75700
Computed  KNN_3, mean, target, score: 0.49485
Computed  KNN_4, mean, target, score: 0.49926
Computed  KNN_5, mean, target, score: 0.50957
Computed  KNN_6, mean, target, score: 0.50712
Computed  KNN_7, mean, target, score: 0.50221
Computed  KNN_8, mean, target, score: 0.49239
Computed  KNN_9, mean, target, score: 0.49386
Computed  KNN_10, mean, target, score: 0.49043
Computed  KNN_20, mean, target, score: 0.48454


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, most_frequent, one_hot, score: 0.75700
Computed  KNN_3, most_frequent, one_hot, score: 0.51105
Computed  KNN_4, most_frequent, one_hot, score: 0.51645
Computed  KNN_5, most_frequent, one_hot, score: 0.51546
Computed  KNN_6, most_frequent, one_hot, score: 0.51645
Computed  KNN_7, most_frequent, one_hot, score: 0.51154
Computed  KNN_8, most_frequent, one_hot, score: 0.51350
Computed  KNN_9, most_frequent, one_hot, score: 0.51006
Computed  KNN_10, most_frequent, one_hot, score: 0.51154
Computed  KNN_20, most_frequent, one_hot, score: 0.50565


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, most_frequent, hasing, score: 0.75700
Computed  KNN_3, most_frequent, hasing, score: 0.47717
Computed  KNN_4, most_frequent, hasing, score: 0.47570
Computed  KNN_5, most_frequent, hasing, score: 0.47521
Computed  KNN_6, most_frequent, hasing, score: 0.48306
Computed  KNN_7, most_frequent, hasing, score: 0.48355
Computed  KNN_8, most_frequent, hasing, score: 0.48945
Computed  KNN_9, most_frequent, hasing, score: 0.48797
Computed  KNN_10, most_frequent, hasing, score: 0.48405
Computed  KNN_20, most_frequent, hasing, score: 0.47177


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, most_frequent, leave_one_out, score: 0.75700
Computed  KNN_3, most_frequent, leave_one_out, score: 0.49779
Computed  KNN_4, most_frequent, leave_one_out, score: 0.49828
Computed  KNN_5, most_frequent, leave_one_out, score: 0.51252
Computed  KNN_6, most_frequent, leave_one_out, score: 0.50074
Computed  KNN_7, most_frequent, leave_one_out, score: 0.50221
Computed  KNN_8, most_frequent, leave_one_out, score: 0.49485
Computed  KNN_9, most_frequent, leave_one_out, score: 0.49288
Computed  KNN_10, most_frequent, leave_one_out, score: 0.49141
Computed  KNN_20, most_frequent, leave_one_out, score: 0.48601


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, most_frequent, binary, score: 0.75700
Computed  KNN_3, most_frequent, binary, score: 0.50908
Computed  KNN_4, most_frequent, binary, score: 0.51154
Computed  KNN_5, most_frequent, binary, score: 0.51350
Computed  KNN_6, most_frequent, binary, score: 0.51448
Computed  KNN_7, most_frequent, binary, score: 0.51055
Computed  KNN_8, most_frequent, binary, score: 0.51154
Computed  KNN_9, most_frequent, binary, score: 0.50712
Computed  KNN_10, most_frequent, binary, score: 0.50417
Computed  KNN_20, most_frequent, binary, score: 0.50074


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, most_frequent, target, score: 0.75700
Computed  KNN_3, most_frequent, target, score: 0.49632
Computed  KNN_4, most_frequent, target, score: 0.49828
Computed  KNN_5, most_frequent, target, score: 0.51154
Computed  KNN_6, most_frequent, target, score: 0.50123
Computed  KNN_7, most_frequent, target, score: 0.50074
Computed  KNN_8, most_frequent, target, score: 0.49730
Computed  KNN_9, most_frequent, target, score: 0.49337
Computed  KNN_10, most_frequent, target, score: 0.49190
Computed  KNN_20, most_frequent, target, score: 0.48552


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, median, one_hot, score: 0.75700
Computed  KNN_3, median, one_hot, score: 0.51154
Computed  KNN_4, median, one_hot, score: 0.51645
Computed  KNN_5, median, one_hot, score: 0.51595
Computed  KNN_6, median, one_hot, score: 0.51645
Computed  KNN_7, median, one_hot, score: 0.51105
Computed  KNN_8, median, one_hot, score: 0.51301
Computed  KNN_9, median, one_hot, score: 0.51006
Computed  KNN_10, median, one_hot, score: 0.51154
Computed  KNN_20, median, one_hot, score: 0.50614


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, median, hasing, score: 0.75700
Computed  KNN_3, median, hasing, score: 0.47766
Computed  KNN_4, median, hasing, score: 0.47521
Computed  KNN_5, median, hasing, score: 0.47570
Computed  KNN_6, median, hasing, score: 0.48306
Computed  KNN_7, median, hasing, score: 0.48306
Computed  KNN_8, median, hasing, score: 0.48895
Computed  KNN_9, median, hasing, score: 0.48797
Computed  KNN_10, median, hasing, score: 0.48405
Computed  KNN_20, median, hasing, score: 0.47275


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, median, leave_one_out, score: 0.75700
Computed  KNN_3, median, leave_one_out, score: 0.49828
Computed  KNN_4, median, leave_one_out, score: 0.49828
Computed  KNN_5, median, leave_one_out, score: 0.51301
Computed  KNN_6, median, leave_one_out, score: 0.50025
Computed  KNN_7, median, leave_one_out, score: 0.50221
Computed  KNN_8, median, leave_one_out, score: 0.49485
Computed  KNN_9, median, leave_one_out, score: 0.49337
Computed  KNN_10, median, leave_one_out, score: 0.49190
Computed  KNN_20, median, leave_one_out, score: 0.48601


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, median, binary, score: 0.75700
Computed  KNN_3, median, binary, score: 0.50908
Computed  KNN_4, median, binary, score: 0.51105
Computed  KNN_5, median, binary, score: 0.51399
Computed  KNN_6, median, binary, score: 0.51448
Computed  KNN_7, median, binary, score: 0.51006
Computed  KNN_8, median, binary, score: 0.51105
Computed  KNN_9, median, binary, score: 0.50663
Computed  KNN_10, median, binary, score: 0.50417
Computed  KNN_20, median, binary, score: 0.50123


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, median, target, score: 0.75700
Computed  KNN_3, median, target, score: 0.49681
Computed  KNN_4, median, target, score: 0.49779
Computed  KNN_5, median, target, score: 0.51203
Computed  KNN_6, median, target, score: 0.50123
Computed  KNN_7, median, target, score: 0.50123
Computed  KNN_8, median, target, score: 0.49681
Computed  KNN_9, median, target, score: 0.49435
Computed  KNN_10, median, target, score: 0.49141
Computed  KNN_20, median, target, score: 0.48650


Unnamed: 0,Impute_strategy,Nominal_encoding,Radom_forest_50,Radom_forest_100,Radom_forest_200,KNN_3,KNN_4,KNN_5,KNN_6,KNN_7,KNN_8,KNN_9,KNN_10,KNN_20
0,mean,one_hot,0.7217,0.7217,0.7256,0.512,0.5164,0.515,0.5135,0.512,0.5145,0.5115,0.5076,0.5071
1,mean,hasing,0.7275,0.7246,0.7374,0.4791,0.4732,0.4747,0.4821,0.4811,0.4845,0.486,0.484,0.4732
2,mean,leave_one_out,0.9995,1.0,1.0,0.4963,0.4988,0.5101,0.5057,0.5027,0.4949,0.4944,0.4909,0.4845
3,mean,binary,0.7452,0.7418,0.7501,0.5101,0.5115,0.5125,0.512,0.5105,0.5111,0.5096,0.5022,0.5032
4,mean,target,0.755,0.756,0.7496,0.4949,0.4993,0.5096,0.5071,0.5022,0.4924,0.4939,0.4904,0.4845
5,most_frequent,one_hot,0.7285,0.731,0.727,0.5111,0.5164,0.5155,0.5164,0.5115,0.5135,0.5101,0.5115,0.5057
6,most_frequent,hasing,0.7325,0.7379,0.7334,0.4772,0.4757,0.4752,0.4831,0.4835,0.4894,0.488,0.484,0.4718
7,most_frequent,leave_one_out,1.0,1.0,1.0,0.4978,0.4983,0.5125,0.5007,0.5022,0.4949,0.4929,0.4914,0.486
8,most_frequent,binary,0.7379,0.7432,0.7467,0.5091,0.5115,0.5135,0.5145,0.5105,0.5115,0.5071,0.5042,0.5007
9,most_frequent,target,0.754,0.757,0.7491,0.4963,0.4983,0.5115,0.5012,0.5007,0.4973,0.4934,0.4919,0.4855
