<a href="https://colab.research.google.com/github/gladcolor/Housing_RandomForest/blob/master/CSCE822_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install packages

In [1]:
pip install category_encoders 



# Import packages

In [2]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier

  import pandas.util.testing as tm


# Define functions

In [3]:
def print_str_unique(df):
    for col in df.columns:
        if original_data.dtypes[col] == np.object:            
            unique_cnt = len(df[col].unique())
            print(f'Column {col.rjust(13)} has {unique_cnt:5} unique values.')

def count_column_nan(df):
    row_cnt = len(df)
    for col in df.columns:
        nan_cnt = df[col].isna().sum()
        percent_str = f'({(nan_cnt / row_cnt * 100):3.1f}%)'.rjust(7)
        print(f'Column {col.rjust(13)} has {nan_cnt:4} {percent_str} nan values.')       

def impute_df(df, strategy="most_frequent"):
    
    numeric_cols = ['BuildingArea', 'YearBuilt', 'Car']
    nominal_cols = ['CouncilArea']

    my_imputer = SimpleImputer(strategy="most_frequent")
    council_area_with_imputed_values = my_imputer.fit_transform(df[nominal_cols])
    imputed_df = df.copy()
    imputed_df.loc[:, nominal_cols] = council_area_with_imputed_values


    if strategy == "most_frequent":
        my_imputer = SimpleImputer(strategy="most_frequent")
        data_with_imputed_values = my_imputer.fit_transform(df)        
        imputed_df.loc[:, :] = data_with_imputed_values

    if strategy == "mean":
        my_imputer = SimpleImputer(strategy="mean")
        area_year_with_imputed_values = my_imputer.fit_transform(df[numeric_cols])
        imputed_df.loc[:, numeric_cols] = area_year_with_imputed_values

    if strategy == "median":
        my_imputer = SimpleImputer(strategy="median")
        area_year_with_imputed_values = my_imputer.fit_transform(df[numeric_cols])
        imputed_df.loc[:, numeric_cols] = area_year_with_imputed_values

    return imputed_df

def encode_dates(imputed_df):
    imputed_df['Date'] = pd.to_datetime(imputed_df['Date']) 
    imputed_df['Ori_Date'] = pd.to_datetime('1970-01-01', format='YY-m-d', errors='ignore')
    imputed_df['Ori_Date'] = pd.to_datetime(imputed_df['Ori_Date'])
    imputed_df['delta_days'] = imputed_df['Date'] - imputed_df['Ori_Date']
    imputed_df['delta_days'] = imputed_df['delta_days'].dt.days
    imputed_df = imputed_df.drop(columns=['Date', 'Ori_Date'])
     
    return imputed_df


ENCODING_METHODS_DICT = {'one_hot': ce.OneHotEncoder, 
                         'hasing':ce.HashingEncoder, 
                         'leave_one_out':ce.LeaveOneOutEncoder, 
                         'target':ce.TargetEncoder,
                         'baseN':ce.BaseNEncoder,
                         'binary':ce.BinaryEncoder,
                         }

def encoder_nominals(imputed_df, encode_method='one_hot'):
    # print(f'Encode methods: {ENCODING_METHODS_DICT.keys()} \n')
    
    ce_encoder = ENCODING_METHODS_DICT[encode_method](cols = ENCODING_COLUMNS)

    y = imputed_df['Price_class'].copy()
    
    for drop_column in DROPPED_COLUMNS:
        try:
            imputed_df = imputed_df.drop(columns=drop_column).copy()
        except:
            pass
            # print(f'Columns: {drop_column} have already dropped before.')

    encoded_df = ce_encoder.fit_transform(imputed_df, y=y) 

    return encoded_df

def assign_price_class(imputed_df):
    row_cnt = len(imputed_df)
    price_class_cnt = 5
    class_step = int(row_cnt / price_class_cnt)
    price_bins = list(range(class_step, row_cnt,  class_step))

    imputed_df.loc[0:price_bins[0], 'Price_class'] = '0' # 'bottom_value'
    imputed_df.loc[price_bins[0]:price_bins[1], 'Price_class'] = '1' # 'low_value'
    imputed_df.loc[price_bins[1]:price_bins[2], 'Price_class'] = '2' # 'medium_value'
    imputed_df.loc[price_bins[2]:price_bins[3], 'Price_class'] = '3' # 'high_value'
    imputed_df.loc[price_bins[3]:row_cnt, 'Price_class'] = '4'  #  'top_value'

    imputed_df['Price_class'] = imputed_df['Price_class'].astype(int)

    # gb = imputed_df.groupby('Price_class')['Price_class'].count().to_frame()
    # gb.columns = ['Count']
    # custom_dict = {'bottom_value': 0, 'low_value': 1, 'medium_value': 2, 'high_value': 3, 'top_value': 4}
    # gb.sort_index(key=lambda x: x.map(custom_dict))
    # print("Price class counts:")
    return imputed_df    


def split_data(encoded_df):
    X = encoded_df.drop(columns=['Price_class'])
    y = encoded_df['Price_class']

    train_ratio = 0.75
    validation_ratio = 0.10
    test_ratio = 0.15

    xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size= (1 - train_ratio), random_state = 0)

    xVal, xTest, yVal, yTest = train_test_split(xTest, yTest, test_size=test_ratio/(test_ratio + validation_ratio), random_state = 0) 

    
    return xTrain, yTrain, xVal, yVal, xTest, yTest   

def standardize_data(encoded_df, class_col='Price_class'):
    labels = encoded_df[class_col].copy()
    data_df = encoded_df.drop(columns=[class_col])
    scaler = preprocessing.StandardScaler()
    scaler.fit(data_df) 
    data_df.iloc[:, :] = scaler.transform(data_df)
    data_df.loc[:, class_col] = labels
    return data_df
           

# Load and understand the data

## Load data

In [4]:
data_csv = r'https://github.com/gladcolor/Housing_RandomForest/raw/master/melb_data.csv'
original_data = pd.read_csv(data_csv)
print("Data samples:")
original_data


Data samples:


Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.79960,144.99840,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.80790,144.99340,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.80930,144.99440,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,1.0,94.0,,,Yarra,-37.79690,144.99690,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.80720,144.99410,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,12 Strada Cr,4,h,1245000.0,S,Barry,26/08/2017,16.7,3150.0,4.0,2.0,2.0,652.0,,1981.0,,-37.90562,145.16761,South-Eastern Metropolitan,7392.0
13576,Williamstown,77 Merrett Dr,3,h,1031000.0,SP,Williams,26/08/2017,6.8,3016.0,3.0,2.0,2.0,333.0,133.0,1995.0,,-37.85927,144.87904,Western Metropolitan,6380.0
13577,Williamstown,83 Power St,3,h,1170000.0,S,Raine,26/08/2017,6.8,3016.0,3.0,2.0,4.0,436.0,,1997.0,,-37.85274,144.88738,Western Metropolitan,6380.0
13578,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,26/08/2017,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,,-37.85908,144.89299,Western Metropolitan,6380.0


## Types of data columns

In [5]:
print("Column data types: \n")
original_data.dtypes

Column data types: 



Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

## Unique values of nominal columns

In [6]:
print_str_unique(original_data)

Column        Suburb has   314 unique values.
Column       Address has 13378 unique values.
Column          Type has     3 unique values.
Column        Method has     5 unique values.
Column       SellerG has   268 unique values.
Column          Date has    58 unique values.
Column   CouncilArea has    34 unique values.
Column    Regionname has     8 unique values.


## Counts of missing values

In [7]:
print("Before imputing:")
count_column_nan(original_data)

Before imputing:
Column        Suburb has    0  (0.0%) nan values.
Column       Address has    0  (0.0%) nan values.
Column         Rooms has    0  (0.0%) nan values.
Column          Type has    0  (0.0%) nan values.
Column         Price has    0  (0.0%) nan values.
Column        Method has    0  (0.0%) nan values.
Column       SellerG has    0  (0.0%) nan values.
Column          Date has    0  (0.0%) nan values.
Column      Distance has    0  (0.0%) nan values.
Column      Postcode has    0  (0.0%) nan values.
Column      Bedroom2 has    0  (0.0%) nan values.
Column      Bathroom has    0  (0.0%) nan values.
Column           Car has   62  (0.5%) nan values.
Column      Landsize has    0  (0.0%) nan values.
Column  BuildingArea has 6450 (47.5%) nan values.
Column     YearBuilt has 5375 (39.6%) nan values.
Column   CouncilArea has 1369 (10.1%) nan values.
Column     Lattitude has    0  (0.0%) nan values.
Column    Longtitude has    0  (0.0%) nan values.
Column    Regionname has    0  (0

# Imputate missing values

In [8]:
IMPUTE_STRETEGY = ['most_frequent', 'mean', 'median']


imputed_df = impute_df(df=original_data, strategy='most_frequent')
print("After imputing:")
count_column_nan(imputed_df)

After imputing:
Column        Suburb has    0  (0.0%) nan values.
Column       Address has    0  (0.0%) nan values.
Column         Rooms has    0  (0.0%) nan values.
Column          Type has    0  (0.0%) nan values.
Column         Price has    0  (0.0%) nan values.
Column        Method has    0  (0.0%) nan values.
Column       SellerG has    0  (0.0%) nan values.
Column          Date has    0  (0.0%) nan values.
Column      Distance has    0  (0.0%) nan values.
Column      Postcode has    0  (0.0%) nan values.
Column      Bedroom2 has    0  (0.0%) nan values.
Column      Bathroom has    0  (0.0%) nan values.
Column           Car has    0  (0.0%) nan values.
Column      Landsize has    0  (0.0%) nan values.
Column  BuildingArea has    0  (0.0%) nan values.
Column     YearBuilt has    0  (0.0%) nan values.
Column   CouncilArea has    0  (0.0%) nan values.
Column     Lattitude has    0  (0.0%) nan values.
Column    Longtitude has    0  (0.0%) nan values.
Column    Regionname has    0  (0.

# Generate price classes

In [9]:
print("Price column description: \n")
imputed_df['Price'].describe()

Price column description: 



count    1.358000e+04
mean     1.075684e+06
std      6.393107e+05
min      8.500000e+04
25%      6.500000e+05
50%      9.030000e+05
75%      1.330000e+06
max      9.000000e+06
Name: Price, dtype: float64

In [10]:
imputed_df = assign_price_class(imputed_df)
gb = imputed_df.groupby('Price_class')['Price_class'].count().to_frame()
gb

Unnamed: 0_level_0,Price_class
Price_class,Unnamed: 1_level_1
0,2716
1,2716
2,2716
3,2716
4,2716


# Encode nominal columns (i.e., features)

## encode dates

In [11]:
from datetime import datetime


imputed_df = encode_dates(imputed_df)
imputed_df

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,Price_class,delta_days
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,2.5,3067.0,2.0,1.0,1.0,202.0,120.0,1970.0,Yarra,-37.79960,144.99840,Northern Metropolitan,4019.0,0,16872
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.80790,144.99340,Northern Metropolitan,4019.0,0,16893
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.80930,144.99440,Northern Metropolitan,4019.0,0,17259
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,2.5,3067.0,3.0,2.0,1.0,94.0,120.0,1970.0,Yarra,-37.79690,144.99690,Northern Metropolitan,4019.0,0,17259
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.80720,144.99410,Northern Metropolitan,4019.0,0,16897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,12 Strada Cr,4,h,1245000.0,S,Barry,16.7,3150.0,4.0,2.0,2.0,652.0,120.0,1981.0,Moreland,-37.90562,145.16761,South-Eastern Metropolitan,7392.0,4,17404
13576,Williamstown,77 Merrett Dr,3,h,1031000.0,SP,Williams,6.8,3016.0,3.0,2.0,2.0,333.0,133.0,1995.0,Moreland,-37.85927,144.87904,Western Metropolitan,6380.0,4,17404
13577,Williamstown,83 Power St,3,h,1170000.0,S,Raine,6.8,3016.0,3.0,2.0,4.0,436.0,120.0,1997.0,Moreland,-37.85274,144.88738,Western Metropolitan,6380.0,4,17404
13578,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,Moreland,-37.85908,144.89299,Western Metropolitan,6380.0,4,17404


Print the unique value counts for each column.

No need to encode addresses.

In [12]:
print("Unique value count for each nominal column: \n")
print_str_unique(original_data)

Unique value count for each nominal column: 

Column        Suburb has   314 unique values.
Column       Address has 13378 unique values.
Column          Type has     3 unique values.
Column        Method has     5 unique values.
Column       SellerG has   268 unique values.
Column          Date has    58 unique values.
Column   CouncilArea has    34 unique values.
Column    Regionname has     8 unique values.


## encode nominal values

In [13]:
import category_encoders as ce

ENCODING_COLUMNS = ['CouncilArea', 'Regionname', 'Type', 'Method']

DROPPED_COLUMNS = ['Suburb', 'SellerG', 'Address', 'Date', 'Price']

# DROPPED_COLUMNS = ['Suburb', 'SellerG', 'Address', 'delta_days']


encoded_df = encoder_nominals(imputed_df, encode_method='target')#.drop(columns=['Price'])

encoded_df

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,Rooms,Type,Method,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,Price_class,delta_days
0,2,2.097682,1.993017,2.5,3067.0,2.0,1.0,1.0,202.0,120.0,1970.0,1.275116,-37.79960,144.99840,1.913882,4019.0,0,16872
1,2,2.097682,1.993017,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,1.275116,-37.80790,144.99340,1.913882,4019.0,0,16893
2,3,2.097682,2.089254,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,1.275116,-37.80930,144.99440,1.913882,4019.0,0,17259
3,3,2.097682,1.890026,2.5,3067.0,3.0,2.0,1.0,94.0,120.0,1970.0,1.275116,-37.79690,144.99690,1.913882,4019.0,0,17259
4,4,2.097682,2.010842,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,1.275116,-37.80720,144.99410,1.913882,4019.0,0,16897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,4,2.097682,1.993017,16.7,3150.0,4.0,2.0,2.0,652.0,120.0,1981.0,2.810427,-37.90562,145.16761,3.397778,7392.0,4,17404
13576,3,2.097682,2.089254,6.8,3016.0,3.0,2.0,2.0,333.0,133.0,1995.0,2.810427,-37.85927,144.87904,2.083786,6380.0,4,17404
13577,3,2.097682,1.993017,6.8,3016.0,3.0,2.0,4.0,436.0,120.0,1997.0,2.810427,-37.85274,144.88738,2.083786,6380.0,4,17404
13578,4,2.097682,1.890026,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,2.810427,-37.85908,144.89299,2.083786,6380.0,4,17404


# Split train/test set.

In [14]:

encoded_df = standardize_data(encoded_df, class_col='Price_class')

xTrain, yTrain, xVal, yVal, xTest, yTest = split_data(encoded_df)

print(f'Sample counts: xTrain: {len(xTrain)}, xVal: {len(xVal)}, x_test: {len(xTest)}')

Sample counts: xTrain: 10185, xVal: 1358, x_test: 2037


# K nearest neighbors

In [15]:
xTrain

Unnamed: 0,Rooms,Type,Method,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,delta_days
664,0.064876,0.659632,-0.087583,-0.159798,-0.014358,0.088284,0.673367,0.403998,-0.047717,0.102509,1.451197,-0.879128,0.310418,0.945839,-0.804755,0.080984,-1.344403
3270,-0.981463,0.659632,-0.087583,0.061723,-0.268015,-0.947035,-0.772376,0.403998,0.006912,-0.144742,-0.405266,0.148444,0.828984,0.513742,1.283410,-1.029462,-1.311000
3873,-0.981463,0.659632,-0.087583,0.181004,0.437813,-0.947035,-0.772376,-0.636847,-0.052729,-0.042783,0.110418,-0.295337,-0.731763,0.461775,-0.804755,0.307550,-0.832225
13170,0.064876,0.659632,-0.087583,1.612373,-0.323158,0.088284,-0.772376,-0.636847,-0.009376,-0.042783,0.110418,1.230403,2.153285,0.544441,-0.217939,0.792885,1.377935
1730,1.111216,0.659632,-0.087583,0.215084,0.636327,0.088284,0.673367,0.403998,0.032222,0.255448,0.557344,-1.360186,-1.058548,0.507005,-0.804755,0.083953,-0.565001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,0.064876,0.659632,1.119472,-0.841402,-0.543729,0.088284,-0.772376,0.403998,-0.086810,-0.042783,0.110418,1.230403,0.406939,-0.359595,-0.217939,1.019450,1.377935
3264,0.064876,0.659632,-0.087583,0.061723,-0.268015,0.088284,-0.772376,-0.636847,0.047509,-0.091213,-0.577161,0.148444,0.852957,0.508930,1.283410,-1.029462,0.208833
9845,1.111216,0.659632,-1.379354,-0.585801,-0.521672,1.123604,0.673367,0.403998,-0.029424,0.301330,1.210544,1.230403,0.927146,-0.218033,-0.217939,0.856378,0.871324
10799,0.064876,0.659632,-0.087583,0.317325,-0.356244,0.088284,-0.772376,-0.636847,0.011924,-0.042783,0.110418,-0.407697,1.118297,0.297693,-0.217939,3.242170,1.116279


In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

k_range = range(3, 11)
scores_dict = {}
scores_list = []



for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)

    knn.fit(xTrain, yTrain)
    y_pred = knn.predict(xTest)
    score = metrics.accuracy_score(yTest, y_pred)
    scores_dict[f'k={k}'] = round(score, 5)
    scores_list.append(score)

scores_dict

{'k=10': 0.59548,
 'k=3': 0.58714,
 'k=4': 0.59352,
 'k=5': 0.60677,
 'k=6': 0.60088,
 'k=7': 0.59647,
 'k=8': 0.58567,
 'k=9': 0.59352}

# Evaluation

In [25]:
DROPPED_COLUMNS = ['SellerG',   'Address', 'Date', 'Price'] # , 'BuildingArea', 'YearBuilt'
ENCODING_COLUMNS = ['Suburb', 'Method', 'CouncilArea', 'Regionname', 'Type']

pd.options.display.float_format = '{:,.4f}'.format


IMPUTE_STRETEGY = ['mean', 'most_frequent',  'median']
k_range = list(range(3, 11)) + [20]

data_csv = r'https://github.com/gladcolor/Housing_RandomForest/raw/master/melb_data.csv'
original_data = pd.read_csv(data_csv).drop(columns=DROPPED_COLUMNS)



results_df = pd.DataFrame(columns=['Impute_strategy', 'Nominal_encoding', 
                                   'Radom_forest_50', 
                                   'Radom_forest_100', 
                                   'Radom_forest_200'] + 
                                   [f'KNN_{k}' for k in k_range]
                                   )

ENCODING_METHODS_DICT = {'one_hot': ce.OneHotEncoder,
                         'hasing':ce.HashingEncoder, 
                         'leave_one_out':ce.LeaveOneOutEncoder,  # Accuracy 1.0. Need to dig it.
                         'binary':ce.BinaryEncoder, 
                         'target':ce.TargetEncoder,
                        #  'baseN':ce.BaseNEncoder, # the same results as one-hot and binary when base = 1 or 2.
                         
                         }

def get_RandomForest_accuracy(xTrain, yTrain, xTest, yTest, n_estimators=50):
    clf = RandomForestClassifier(n_estimators=n_estimators)
    clf.fit(xTrain, yTrain)
    yPred = clf.predict(xTest)
    accuracy = metrics.accuracy_score(yTest, yPred)
    return accuracy


for impute_strategy in IMPUTE_STRETEGY:
    imputed_df = impute_df(df=original_data, strategy=impute_strategy)
    imputed_df = assign_price_class(imputed_df)
    
    for ce_encoder_name in ENCODING_METHODS_DICT.keys():   

        encoded_df = encoder_nominals(imputed_df, encode_method=ce_encoder_name)

        encoded_df = standardize_data(encoded_df, class_col='Price_class')
 
        xTrain, yTrain, xVal, yVal, xTest, yTest = split_data(encoded_df)
 
        current_row = len(results_df)

        # conduct Random forest        
        RF_score_50 = get_RandomForest_accuracy(xTrain, yTrain, xTest, yTest, n_estimators=50) 
        RF_score_100 = get_RandomForest_accuracy(xTrain, yTrain, xTest, yTest, n_estimators=100) 
        RF_score_200 = get_RandomForest_accuracy(xTrain, yTrain, xTest, yTest, n_estimators=200) 
        print(f"Computed Random Forest, {impute_strategy}, {ce_encoder_name}, score: {RF_score_50:.5f}, {RF_score_100:.5f}, {RF_score_200:.5f}")

        # conduct KNN     
        for k in k_range:
            knn = KNeighborsClassifier(n_neighbors = k)
            knn.fit(xTrain, yTrain)
            y_pred = knn.predict(xTest)
            score = metrics.accuracy_score(yTest, y_pred)
            
            # record the accuracy
            column_name = f'KNN_{k}'
            print(f"Computed  {column_name}, {impute_strategy}, {ce_encoder_name}, score: {score:.5f}")
            

            results_df.loc[current_row, column_name] = round(score, 5)
            results_df.loc[current_row, 'Nominal_encoding'] = ce_encoder_name
            results_df.loc[current_row, 'Impute_strategy'] = impute_strategy
            results_df.loc[current_row, 'Radom_forest_50'] = round(RF_score_50, 5)
            results_df.loc[current_row, 'Radom_forest_100'] = round(RF_score_100, 5)
            results_df.loc[current_row, 'Radom_forest_200'] = round(RF_score_200, 5)
            # print(results_df.head(10))

results_df.to_csv("results.csv")
results_df

  elif pd.api.types.is_categorical(cols):


Computed Random Forest, mean, one_hot, score: 0.71870, 0.71625, 0.71674
Computed  KNN_3, mean, one_hot, score: 0.65734
Computed  KNN_4, mean, one_hot, score: 0.67108
Computed  KNN_5, mean, one_hot, score: 0.67108
Computed  KNN_6, mean, one_hot, score: 0.66814
Computed  KNN_7, mean, one_hot, score: 0.66618
Computed  KNN_8, mean, one_hot, score: 0.67305
Computed  KNN_9, mean, one_hot, score: 0.66961
Computed  KNN_10, mean, one_hot, score: 0.67207
Computed  KNN_20, mean, one_hot, score: 0.64948


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, mean, hasing, score: 0.73834, 0.73883, 0.73490
Computed  KNN_3, mean, hasing, score: 0.56063
Computed  KNN_4, mean, hasing, score: 0.55572
Computed  KNN_5, mean, hasing, score: 0.57339
Computed  KNN_6, mean, hasing, score: 0.57094
Computed  KNN_7, mean, hasing, score: 0.57732
Computed  KNN_8, mean, hasing, score: 0.57634
Computed  KNN_9, mean, hasing, score: 0.57388
Computed  KNN_10, mean, hasing, score: 0.57192
Computed  KNN_20, mean, hasing, score: 0.57192


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, mean, leave_one_out, score: 0.99951, 1.00000, 0.99951
Computed  KNN_3, mean, leave_one_out, score: 0.59254
Computed  KNN_4, mean, leave_one_out, score: 0.60285
Computed  KNN_5, mean, leave_one_out, score: 0.61463
Computed  KNN_6, mean, leave_one_out, score: 0.60432
Computed  KNN_7, mean, leave_one_out, score: 0.61267
Computed  KNN_8, mean, leave_one_out, score: 0.60874
Computed  KNN_9, mean, leave_one_out, score: 0.61217
Computed  KNN_10, mean, leave_one_out, score: 0.60776
Computed  KNN_20, mean, leave_one_out, score: 0.59254


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, mean, binary, score: 0.74521, 0.75012, 0.74865
Computed  KNN_3, mean, binary, score: 0.63770
Computed  KNN_4, mean, binary, score: 0.64359
Computed  KNN_5, mean, binary, score: 0.64507
Computed  KNN_6, mean, binary, score: 0.64899
Computed  KNN_7, mean, binary, score: 0.65390
Computed  KNN_8, mean, binary, score: 0.66323
Computed  KNN_9, mean, binary, score: 0.65636
Computed  KNN_10, mean, binary, score: 0.65930
Computed  KNN_20, mean, binary, score: 0.65488


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, mean, target, score: 0.75503, 0.75749, 0.75945
Computed  KNN_3, mean, target, score: 0.59352
Computed  KNN_4, mean, target, score: 0.60628
Computed  KNN_5, mean, target, score: 0.61561
Computed  KNN_6, mean, target, score: 0.60579
Computed  KNN_7, mean, target, score: 0.61954
Computed  KNN_8, mean, target, score: 0.61267
Computed  KNN_9, mean, target, score: 0.61512
Computed  KNN_10, mean, target, score: 0.61168
Computed  KNN_20, mean, target, score: 0.59352


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, most_frequent, one_hot, score: 0.71379, 0.71625, 0.71870
Computed  KNN_3, most_frequent, one_hot, score: 0.66176
Computed  KNN_4, most_frequent, one_hot, score: 0.67354
Computed  KNN_5, most_frequent, one_hot, score: 0.67501
Computed  KNN_6, most_frequent, one_hot, score: 0.66716
Computed  KNN_7, most_frequent, one_hot, score: 0.66618
Computed  KNN_8, most_frequent, one_hot, score: 0.67108
Computed  KNN_9, most_frequent, one_hot, score: 0.66618
Computed  KNN_10, most_frequent, one_hot, score: 0.67108
Computed  KNN_20, most_frequent, one_hot, score: 0.64850


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, most_frequent, hasing, score: 0.74325, 0.74521, 0.73981
Computed  KNN_3, most_frequent, hasing, score: 0.55817
Computed  KNN_4, most_frequent, hasing, score: 0.55817
Computed  KNN_5, most_frequent, hasing, score: 0.57732
Computed  KNN_6, most_frequent, hasing, score: 0.56848
Computed  KNN_7, most_frequent, hasing, score: 0.56996
Computed  KNN_8, most_frequent, hasing, score: 0.57192
Computed  KNN_9, most_frequent, hasing, score: 0.57339
Computed  KNN_10, most_frequent, hasing, score: 0.56652
Computed  KNN_20, most_frequent, hasing, score: 0.57094


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, most_frequent, leave_one_out, score: 0.99951, 0.99951, 0.99951
Computed  KNN_3, most_frequent, leave_one_out, score: 0.59499
Computed  KNN_4, most_frequent, leave_one_out, score: 0.60530
Computed  KNN_5, most_frequent, leave_one_out, score: 0.61954
Computed  KNN_6, most_frequent, leave_one_out, score: 0.61463
Computed  KNN_7, most_frequent, leave_one_out, score: 0.61267
Computed  KNN_8, most_frequent, leave_one_out, score: 0.60825
Computed  KNN_9, most_frequent, leave_one_out, score: 0.61119
Computed  KNN_10, most_frequent, leave_one_out, score: 0.60383
Computed  KNN_20, most_frequent, leave_one_out, score: 0.59107


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, most_frequent, binary, score: 0.74227, 0.74472, 0.75061
Computed  KNN_3, most_frequent, binary, score: 0.64016
Computed  KNN_4, most_frequent, binary, score: 0.64752
Computed  KNN_5, most_frequent, binary, score: 0.64850
Computed  KNN_6, most_frequent, binary, score: 0.64850
Computed  KNN_7, most_frequent, binary, score: 0.65439
Computed  KNN_8, most_frequent, binary, score: 0.66078
Computed  KNN_9, most_frequent, binary, score: 0.65390
Computed  KNN_10, most_frequent, binary, score: 0.66421
Computed  KNN_20, most_frequent, binary, score: 0.65439


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, most_frequent, target, score: 0.75503, 0.76092, 0.76043
Computed  KNN_3, most_frequent, target, score: 0.59745
Computed  KNN_4, most_frequent, target, score: 0.60481
Computed  KNN_5, most_frequent, target, score: 0.61905
Computed  KNN_6, most_frequent, target, score: 0.61561
Computed  KNN_7, most_frequent, target, score: 0.61561
Computed  KNN_8, most_frequent, target, score: 0.61365
Computed  KNN_9, most_frequent, target, score: 0.61807
Computed  KNN_10, most_frequent, target, score: 0.60923
Computed  KNN_20, most_frequent, target, score: 0.59450


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, median, one_hot, score: 0.71821, 0.71969, 0.72214
Computed  KNN_3, median, one_hot, score: 0.66127
Computed  KNN_4, median, one_hot, score: 0.67354
Computed  KNN_5, median, one_hot, score: 0.67403
Computed  KNN_6, median, one_hot, score: 0.66716
Computed  KNN_7, median, one_hot, score: 0.66568
Computed  KNN_8, median, one_hot, score: 0.67108
Computed  KNN_9, median, one_hot, score: 0.66618
Computed  KNN_10, median, one_hot, score: 0.67108
Computed  KNN_20, median, one_hot, score: 0.64850


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, median, hasing, score: 0.73490, 0.74718, 0.73883
Computed  KNN_3, median, hasing, score: 0.55817
Computed  KNN_4, median, hasing, score: 0.55817
Computed  KNN_5, median, hasing, score: 0.57634
Computed  KNN_6, median, hasing, score: 0.56750
Computed  KNN_7, median, hasing, score: 0.56996
Computed  KNN_8, median, hasing, score: 0.57192
Computed  KNN_9, median, hasing, score: 0.57290
Computed  KNN_10, median, hasing, score: 0.56652
Computed  KNN_20, median, hasing, score: 0.57094


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, median, leave_one_out, score: 1.00000, 1.00000, 0.99951
Computed  KNN_3, median, leave_one_out, score: 0.59597
Computed  KNN_4, median, leave_one_out, score: 0.60530
Computed  KNN_5, median, leave_one_out, score: 0.61905
Computed  KNN_6, median, leave_one_out, score: 0.61512
Computed  KNN_7, median, leave_one_out, score: 0.61119
Computed  KNN_8, median, leave_one_out, score: 0.60972
Computed  KNN_9, median, leave_one_out, score: 0.61168
Computed  KNN_10, median, leave_one_out, score: 0.60383
Computed  KNN_20, median, leave_one_out, score: 0.59107


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, median, binary, score: 0.74423, 0.75061, 0.74963
Computed  KNN_3, median, binary, score: 0.63967
Computed  KNN_4, median, binary, score: 0.64703
Computed  KNN_5, median, binary, score: 0.64752
Computed  KNN_6, median, binary, score: 0.64850
Computed  KNN_7, median, binary, score: 0.65390
Computed  KNN_8, median, binary, score: 0.66078
Computed  KNN_9, median, binary, score: 0.65390
Computed  KNN_10, median, binary, score: 0.66421
Computed  KNN_20, median, binary, score: 0.65439


  elif pd.api.types.is_categorical(cols):


Computed Random Forest, median, target, score: 0.75896, 0.75994, 0.75945
Computed  KNN_3, median, target, score: 0.59794
Computed  KNN_4, median, target, score: 0.60481
Computed  KNN_5, median, target, score: 0.61856
Computed  KNN_6, median, target, score: 0.61561
Computed  KNN_7, median, target, score: 0.61561
Computed  KNN_8, median, target, score: 0.61414
Computed  KNN_9, median, target, score: 0.61807
Computed  KNN_10, median, target, score: 0.60874
Computed  KNN_20, median, target, score: 0.59450


Unnamed: 0,Impute_strategy,Nominal_encoding,Radom_forest_50,Radom_forest_100,Radom_forest_200,KNN_3,KNN_4,KNN_5,KNN_6,KNN_7,KNN_8,KNN_9,KNN_10,KNN_20
0,mean,one_hot,0.7187,0.7163,0.7167,0.6573,0.6711,0.6711,0.6681,0.6662,0.6731,0.6696,0.6721,0.6495
1,mean,hasing,0.7383,0.7388,0.7349,0.5606,0.5557,0.5734,0.5709,0.5773,0.5763,0.5739,0.5719,0.5719
2,mean,leave_one_out,0.9995,1.0,0.9995,0.5925,0.6028,0.6146,0.6043,0.6127,0.6087,0.6122,0.6078,0.5925
3,mean,binary,0.7452,0.7501,0.7487,0.6377,0.6436,0.6451,0.649,0.6539,0.6632,0.6564,0.6593,0.6549
4,mean,target,0.755,0.7575,0.7594,0.5935,0.6063,0.6156,0.6058,0.6195,0.6127,0.6151,0.6117,0.5935
5,most_frequent,one_hot,0.7138,0.7163,0.7187,0.6618,0.6735,0.675,0.6672,0.6662,0.6711,0.6662,0.6711,0.6485
6,most_frequent,hasing,0.7432,0.7452,0.7398,0.5582,0.5582,0.5773,0.5685,0.57,0.5719,0.5734,0.5665,0.5709
7,most_frequent,leave_one_out,0.9995,0.9995,0.9995,0.595,0.6053,0.6195,0.6146,0.6127,0.6082,0.6112,0.6038,0.5911
8,most_frequent,binary,0.7423,0.7447,0.7506,0.6402,0.6475,0.6485,0.6485,0.6544,0.6608,0.6539,0.6642,0.6544
9,most_frequent,target,0.755,0.7609,0.7604,0.5975,0.6048,0.619,0.6156,0.6156,0.6137,0.6181,0.6092,0.5945


In [24]:
for c in xTrain.columns:
    print(c)

Suburb_1
Suburb_2
Suburb_3
Suburb_4
Suburb_5
Suburb_6
Suburb_7
Suburb_8
Suburb_9
Suburb_10
Suburb_11
Suburb_12
Suburb_13
Suburb_14
Suburb_15
Suburb_16
Suburb_17
Suburb_18
Suburb_19
Suburb_20
Suburb_21
Suburb_22
Suburb_23
Suburb_24
Suburb_25
Suburb_26
Suburb_27
Suburb_28
Suburb_29
Suburb_30
Suburb_31
Suburb_32
Suburb_33
Suburb_34
Suburb_35
Suburb_36
Suburb_37
Suburb_38
Suburb_39
Suburb_40
Suburb_41
Suburb_42
Suburb_43
Suburb_44
Suburb_45
Suburb_46
Suburb_47
Suburb_48
Suburb_49
Suburb_50
Suburb_51
Suburb_52
Suburb_53
Suburb_54
Suburb_55
Suburb_56
Suburb_57
Suburb_58
Suburb_59
Suburb_60
Suburb_61
Suburb_62
Suburb_63
Suburb_64
Suburb_65
Suburb_66
Suburb_67
Suburb_68
Suburb_69
Suburb_70
Suburb_71
Suburb_72
Suburb_73
Suburb_74
Suburb_75
Suburb_76
Suburb_77
Suburb_78
Suburb_79
Suburb_80
Suburb_81
Suburb_82
Suburb_83
Suburb_84
Suburb_85
Suburb_86
Suburb_87
Suburb_88
Suburb_89
Suburb_90
Suburb_91
Suburb_92
Suburb_93
Suburb_94
Suburb_95
Suburb_96
Suburb_97
Suburb_98
Suburb_99
Suburb_100
Suburb_1