<a href="https://colab.research.google.com/github/gladcolor/Housing_RandomForest/blob/master/CSCE822_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install packages

In [2]:
pip install category_encoders 

Collecting category_encoders
  Downloading category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)
[?25l[K     |████                            | 10 kB 21.7 MB/s eta 0:00:01[K     |████████▏                       | 20 kB 28.6 MB/s eta 0:00:01[K     |████████████▏                   | 30 kB 25.2 MB/s eta 0:00:01[K     |████████████████▎               | 40 kB 20.0 MB/s eta 0:00:01[K     |████████████████████▎           | 51 kB 8.6 MB/s eta 0:00:01[K     |████████████████████████▍       | 61 kB 9.9 MB/s eta 0:00:01[K     |████████████████████████████▍   | 71 kB 9.1 MB/s eta 0:00:01[K     |████████████████████████████████| 80 kB 5.0 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


# Import packages

In [3]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.model_selection import train_test_split

  import pandas.util.testing as tm


# Define functions

In [64]:
def print_str_unique(df):
    for col in df.columns:
        if original_data.dtypes[col] == np.object:            
            unique_cnt = len(df[col].unique())
            print(f'Column {col.rjust(13)} has {unique_cnt:5} unique values.')

def count_column_nan(df):
    row_cnt = len(df)
    for col in df.columns:
        nan_cnt = df[col].isna().sum()
        percent_str = f'({(nan_cnt / row_cnt * 100):3.1f}%)'.rjust(7)
        print(f'Column {col.rjust(13)} has {nan_cnt:4} {percent_str} nan values.')       

def impute_df(df, strategy="most_frequent"):
    my_imputer = SimpleImputer(strategy=strategy)

    data_with_imputed_values = my_imputer.fit_transform(df)
    imputed_df = df.copy()
    imputed_df.iloc[:, :] = data_with_imputed_values
    return imputed_df



def encode_dates(imputed_df):
    imputed_df['Date'] = pd.to_datetime(imputed_df['Date']) 
    imputed_df['Ori_Date'] = pd.to_datetime('1970-01-01', format='YY-m-d', errors='ignore')
    imputed_df['Ori_Date'] = pd.to_datetime(imputed_df['Ori_Date'])
    imputed_df['delta_days'] = imputed_df['Date'] - imputed_df['Ori_Date']
    imputed_df['delta_days'] = imputed_df['delta_days'].dt.days
    imputed_df = imputed_df.drop(columns=['Date', 'Ori_Date'])
     
    return imputed_df


ENCODING_METHODS_DICT = {'one_hot': ce.OneHotEncoder, 
                         'hasing':ce.HashingEncoder, 
                         'leave_one_out':ce.LeaveOneOutEncoder, 
                         'target':ce.TargetEncoder,
                         'baseN':ce.BaseNEncoder,
                         'binary':ce.BinaryEncoder,
                         }

def encoder_nominals(imputed_df, encode_method='one_hot'):
    print(f'Encode methods: {ENCODING_METHODS_DICT.keys()} \n')
    encode_method = 'one_hot'
    ce_encoder = ENCODING_METHODS_DICT[encode_method](cols = ENCODING_COLUMNS)

    
    for drop_column in DROPPED_COLUMNS:
        try:
            imputed_df = imputed_df.drop(columns=drop_column).copy()
        except:
            print(f'Columns: {drop_column} have already dropped before.')

    #imputed_df = impute_df(df=original_data, strategy="most_frequent")

    encoded_df = ce_encoder.fit_transform(imputed_df) 

    return encoded_df

def assign_price_class(imputed_df):
    row_cnt = len(imputed_df)
    price_class_cnt = 5
    class_step = int(row_cnt / price_class_cnt)
    price_bins = list(range(class_step, row_cnt,  class_step))

    imputed_df.loc[0:price_bins[0], 'Price_class'] = '0' # 'bottom_value'
    imputed_df.loc[price_bins[0]:price_bins[1], 'Price_class'] = '1' # 'low_value'
    imputed_df.loc[price_bins[1]:price_bins[2], 'Price_class'] = '2' # 'medium_value'
    imputed_df.loc[price_bins[2]:price_bins[3], 'Price_class'] = '3' # 'high_value'
    imputed_df.loc[price_bins[3]:row_cnt, 'Price_class'] = '4'  #  'top_value'

    imputed_df['Price_class'] = imputed_df['Price_class'].astype(int)

    # gb = imputed_df.groupby('Price_class')['Price_class'].count().to_frame()
    # gb.columns = ['Count']
    # custom_dict = {'bottom_value': 0, 'low_value': 1, 'medium_value': 2, 'high_value': 3, 'top_value': 4}
    # gb.sort_index(key=lambda x: x.map(custom_dict))
    # print("Price class counts:")
    return imputed_df    


def split_data(encoded_df):
    X = encoded_df.drop(columns=['Price_class'])
    y = encoded_df['Price_class']


    train_ratio = 0.75
    validation_ratio = 0.10
    test_ratio = 0.15

    xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size= (1 - train_ratio), random_state = 0)

    xVal, xTest, yVal, yTest = train_test_split(xTest, yTest, test_size=test_ratio/(test_ratio + validation_ratio), random_state = 0) 

    
    return xTrain, yTrain, xVal, yVal, xTest, yTest                

# Load and known the data

## Load data

In [20]:
data_csv = r'https://github.com/gladcolor/Housing_RandomForest/raw/master/melb_data.csv'
original_data = pd.read_csv(data_csv)
print("Data samples:")
original_data


Data samples:


Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.79960,144.99840,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.80790,144.99340,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.80930,144.99440,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,1.0,94.0,,,Yarra,-37.79690,144.99690,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.80720,144.99410,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,12 Strada Cr,4,h,1245000.0,S,Barry,26/08/2017,16.7,3150.0,4.0,2.0,2.0,652.0,,1981.0,,-37.90562,145.16761,South-Eastern Metropolitan,7392.0
13576,Williamstown,77 Merrett Dr,3,h,1031000.0,SP,Williams,26/08/2017,6.8,3016.0,3.0,2.0,2.0,333.0,133.0,1995.0,,-37.85927,144.87904,Western Metropolitan,6380.0
13577,Williamstown,83 Power St,3,h,1170000.0,S,Raine,26/08/2017,6.8,3016.0,3.0,2.0,4.0,436.0,,1997.0,,-37.85274,144.88738,Western Metropolitan,6380.0
13578,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,26/08/2017,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,,-37.85908,144.89299,Western Metropolitan,6380.0


## Types of data columns

In [76]:
print("Column data types: \n")
original_data.dtypes

Column data types: 



Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

## Unique values of nominal columns

In [77]:


print_str_unique(original_data)

Column        Suburb has   314 unique values.
Column       Address has 13378 unique values.
Column          Type has     3 unique values.
Column        Method has     5 unique values.
Column       SellerG has   268 unique values.
Column          Date has    58 unique values.
Column   CouncilArea has    34 unique values.
Column    Regionname has     8 unique values.


## Counts of missing values

In [78]:


print("Before imputing:")
count_column_nan(original_data)

Before imputing:
Column        Suburb has    0  (0.0%) nan values.
Column       Address has    0  (0.0%) nan values.
Column         Rooms has    0  (0.0%) nan values.
Column          Type has    0  (0.0%) nan values.
Column         Price has    0  (0.0%) nan values.
Column        Method has    0  (0.0%) nan values.
Column       SellerG has    0  (0.0%) nan values.
Column          Date has    0  (0.0%) nan values.
Column      Distance has    0  (0.0%) nan values.
Column      Postcode has    0  (0.0%) nan values.
Column      Bedroom2 has    0  (0.0%) nan values.
Column      Bathroom has    0  (0.0%) nan values.
Column           Car has   62  (0.5%) nan values.
Column      Landsize has    0  (0.0%) nan values.
Column  BuildingArea has 6450 (47.5%) nan values.
Column     YearBuilt has 5375 (39.6%) nan values.
Column   CouncilArea has 1369 (10.1%) nan values.
Column     Lattitude has    0  (0.0%) nan values.
Column    Longtitude has    0  (0.0%) nan values.
Column    Regionname has    0  (0

# Imputate missing values

In [6]:
IMPUTE_STRETEGY = ['most_frequent', 'mean', 'median']




imputed_df = impute_df(df=original_data, strategy='most_frequent')
print("After imputing:")
count_column_nan(imputed_df)

After imputing:


NameError: ignored

# Encode nominal columns (i.e., features)

## encode dates

In [7]:
from datetime import datetime



imputed_df = encode_dates(imputed_df)
imputed_df

KeyError: ignored

Print the unique value counts for each column.

No need to encode addresses.

In [40]:
print("Unique value count for each nominal column: \n")
print_str_unique(original_data)

Unique value count for each nominal column: 

Column        Suburb has   314 unique values.
Column       Address has 13378 unique values.
Column          Type has     3 unique values.
Column        Method has     5 unique values.
Column       SellerG has   268 unique values.
Column          Date has    58 unique values.
Column   CouncilArea has    34 unique values.
Column    Regionname has     8 unique values.


## encode nominal values

In [52]:
import category_encoders as ce

ENCODING_COLUMNS = ['CouncilArea', 'Regionname', 'Type', 'Method']

DROPPED_COLUMNS = ['Suburb', 'SellerG', 'Address', 'Date']

# DROPPED_COLUMNS = ['Suburb', 'SellerG', 'Address', 'delta_days']


encoded_df = encoder_nominals(imputed_df, encode_method='target')

encoded_df

Encode methods: dict_keys(['one_hot', 'hasing', 'leave_one_out', 'target']) 

Columns: Suburb have already dropped before.
Columns: SellerG have already dropped before.
Columns: Address have already dropped before.
Columns: Date have already dropped before.


  elif pd.api.types.is_categorical(cols):


Unnamed: 0,Rooms,Type_1,Type_2,Type_3,Price,Method_1,Method_2,Method_3,Method_4,Method_5,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea_1,CouncilArea_2,CouncilArea_3,CouncilArea_4,CouncilArea_5,CouncilArea_6,CouncilArea_7,CouncilArea_8,CouncilArea_9,CouncilArea_10,CouncilArea_11,CouncilArea_12,CouncilArea_13,CouncilArea_14,CouncilArea_15,CouncilArea_16,CouncilArea_17,CouncilArea_18,CouncilArea_19,CouncilArea_20,CouncilArea_21,CouncilArea_22,CouncilArea_23,CouncilArea_24,CouncilArea_25,CouncilArea_26,CouncilArea_27,CouncilArea_28,CouncilArea_29,CouncilArea_30,CouncilArea_31,CouncilArea_32,CouncilArea_33,Lattitude,Longtitude,Regionname_1,Regionname_2,Regionname_3,Regionname_4,Regionname_5,Regionname_6,Regionname_7,Regionname_8,Propertycount
0,2,1,0,0,1480000.0,1,0,0,0,0,2.5,3067.0,2.0,1.0,1.0,202.0,120.0,1970.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.79960,144.99840,1,0,0,0,0,0,0,0,4019.0
1,2,1,0,0,1035000.0,1,0,0,0,0,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.80790,144.99340,1,0,0,0,0,0,0,0,4019.0
2,3,1,0,0,1465000.0,0,1,0,0,0,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.80930,144.99440,1,0,0,0,0,0,0,0,4019.0
3,3,1,0,0,850000.0,0,0,1,0,0,2.5,3067.0,3.0,2.0,1.0,94.0,120.0,1970.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.79690,144.99690,1,0,0,0,0,0,0,0,4019.0
4,4,1,0,0,1600000.0,0,0,0,1,0,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.80720,144.99410,1,0,0,0,0,0,0,0,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,4,1,0,0,1245000.0,1,0,0,0,0,16.7,3150.0,4.0,2.0,2.0,652.0,120.0,1981.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.90562,145.16761,0,0,0,0,1,0,0,0,7392.0
13576,3,1,0,0,1031000.0,0,1,0,0,0,6.8,3016.0,3.0,2.0,2.0,333.0,133.0,1995.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.85927,144.87904,0,1,0,0,0,0,0,0,6380.0
13577,3,1,0,0,1170000.0,1,0,0,0,0,6.8,3016.0,3.0,2.0,4.0,436.0,120.0,1997.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.85274,144.88738,0,1,0,0,0,0,0,0,6380.0
13578,4,1,0,0,2500000.0,0,0,1,0,0,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.85908,144.89299,0,1,0,0,0,0,0,0,6380.0


# Generate price classes

In [53]:
print("Price column description: \n")
encoded_df['Price'].describe()

Price column description: 



count    1.358000e+04
mean     1.075684e+06
std      6.393107e+05
min      8.500000e+04
25%      6.500000e+05
50%      9.030000e+05
75%      1.330000e+06
max      9.000000e+06
Name: Price, dtype: float64

In [54]:
encoded_df = assign_price_class(encoded_df)
gb = encoded_df.groupby('Price_class')['Price_class'].count().to_frame()
gb

Unnamed: 0_level_0,Price_class
Price_class,Unnamed: 1_level_1
0,2716
1,2716
2,2716
3,2716
4,2716


# Split train/test set.

In [55]:

xTrain, yTrain, xVal, yVal, xTest, yTest = split_data(encoded_df)

print(f'Sample counts: xTrain: {len(xTrain)}, xVal: {len(xVal)}, x_test: {len(xTest)}')

Sample counts: xTrain: 10185, xVal: 1358, x_test: 2037


# K nearest neighbors

In [56]:
xTrain

Unnamed: 0,Rooms,Type_1,Type_2,Type_3,Price,Method_1,Method_2,Method_3,Method_4,Method_5,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea_1,CouncilArea_2,CouncilArea_3,CouncilArea_4,CouncilArea_5,CouncilArea_6,CouncilArea_7,CouncilArea_8,CouncilArea_9,CouncilArea_10,CouncilArea_11,CouncilArea_12,CouncilArea_13,CouncilArea_14,CouncilArea_15,CouncilArea_16,CouncilArea_17,CouncilArea_18,CouncilArea_19,CouncilArea_20,CouncilArea_21,CouncilArea_22,CouncilArea_23,CouncilArea_24,CouncilArea_25,CouncilArea_26,CouncilArea_27,CouncilArea_28,CouncilArea_29,CouncilArea_30,CouncilArea_31,CouncilArea_32,CouncilArea_33,Lattitude,Longtitude,Regionname_1,Regionname_2,Regionname_3,Regionname_4,Regionname_5,Regionname_6,Regionname_7,Regionname_8,Propertycount
664,3,1,0,0,1335000.0,1,0,0,0,0,9.2,3104.0,3.0,2.0,2.0,368.0,177.0,2009.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.78460,145.09350,0,0,1,0,0,0,0,0,7809.0
3270,2,1,0,0,590000.0,1,0,0,0,0,10.5,3081.0,2.0,1.0,2.0,586.0,80.0,1955.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.74350,145.04860,0,0,0,1,0,0,0,0,2947.0
3873,2,1,0,0,1420000.0,1,0,0,0,0,11.2,3145.0,2.0,1.0,1.0,348.0,120.0,1970.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.86720,145.04320,0,0,1,0,0,0,0,0,8801.0
13170,3,1,0,0,552500.0,1,0,0,0,0,19.6,3076.0,3.0,1.0,1.0,521.0,120.0,1970.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.63854,145.05179,1,0,0,0,0,0,0,0,10926.0
1730,4,1,0,0,1830000.0,1,0,0,0,0,11.4,3163.0,3.0,2.0,2.0,687.0,237.0,1983.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.89310,145.04790,0,0,1,0,0,0,0,0,7822.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,3,1,0,0,1280000.0,0,1,0,0,0,5.2,3056.0,3.0,1.0,2.0,212.0,120.0,1970.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.77695,144.95785,1,0,0,0,0,0,0,0,11918.0
3264,3,1,0,0,915000.0,1,0,0,0,0,10.5,3081.0,3.0,1.0,1.0,748.0,101.0,1950.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.74160,145.04810,0,0,0,1,0,0,0,0,2947.0
9845,4,1,0,0,1020000.0,0,0,1,0,0,6.7,3058.0,4.0,2.0,2.0,441.0,255.0,2002.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.73572,144.97256,1,0,0,0,0,0,0,0,11204.0
10799,3,1,0,0,760000.0,1,0,0,0,0,12.0,3073.0,3.0,1.0,1.0,606.0,120.0,1970.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-37.72057,145.02615,1,0,0,0,0,0,0,0,21650.0


In [57]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

k_range = range(3, 11)
scores_dict = {}
scores_list = []



for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)

    knn.fit(xTrain, yTrain)
    y_pred = knn.predict(xTest)
    score = metrics.accuracy_score(yTest, y_pred)
    scores_dict[f'k={k}'] = round(score, 5)
    scores_list.append(score)



scores_dict

{'k=10': 0.23515,
 'k=3': 0.24546,
 'k=4': 0.25675,
 'k=5': 0.25331,
 'k=6': 0.2597,
 'k=7': 0.26166,
 'k=8': 0.24742,
 'k=9': 0.24251}

# Evaluation

In [66]:
DROPPED_COLUMNS = [ 'SellerG', 'Address', 'Date', 'Method']
ENCODING_COLUMNS = ['Suburb', 'CouncilArea', 'Regionname', 'Type']



IMPUTE_STRETEGY = ['most_frequent']
k_range = list(range(6, 8)) + [20]

data_csv = r'https://github.com/gladcolor/Housing_RandomForest/raw/master/melb_data.csv'
original_data = pd.read_csv(data_csv).drop(columns=DROPPED_COLUMNS)

results_df = pd.DataFrame(columns=['Impute_strategy', 'Nominal_encoding', 'Radom_forest'] + 
                                   [f'KNN_{k}' for k in k_range]
                                   )

ENCODING_METHODS_DICT = {'one_hot': ce.OneHotEncoder, 
                         'hasing':ce.HashingEncoder, 
                         'leave_one_out':ce.LeaveOneOutEncoder, 
                         'target':ce.TargetEncoder,
                         'baseN':ce.BaseNEncoder,
                         'binary':ce.BinaryEncoder,
                         }

row_cnt = 0
for impute_strategy in IMPUTE_STRETEGY:
    imputed_df = impute_df(df=original_data, strategy=impute_strategy)
    imputed_df = assign_price_class(imputed_df)
    for ce_encoder_name in ENCODING_METHODS_DICT:
        ce_encoder = ENCODING_METHODS_DICT[ce_encoder_name](cols = ENCODING_COLUMNS)
        print(ce_encoder)

        encoded_df = encoder_nominals(imputed_df) 
        xTrain, yTrain, xVal, yVal, xTest, yTest = split_data(encoded_df)

        current_row = len(results_df)

        # conduct Random forest
        RF_score = get_RandomForest_accuracy(xTrain, yTrain, xTest, yTest, n_estimators=100) 

        # conduct KNN 
        for k in k_range:
            knn = KNeighborsClassifier(n_neighbors = k)
            knn.fit(xTrain, yTrain)
            y_pred = knn.predict(xTest)
            score = metrics.accuracy_score(yTest, y_pred)
            column_name = f'KNN_{k}'
            results_df.loc[current_row, column_name] = round(score, 5)
            results_df.loc[current_row, 'Nominal_encoding'] = ce_encoder_name
            results_df.loc[current_row, 'Impute_strategy'] = impute_strategy
            results_df.loc[current_row, 'Radom_forest'] = round(RF_score, 5)
            print(results_df.head(10))

results_df

OneHotEncoder(cols=['Suburb', 'CouncilArea', 'Regionname', 'Type'],
              drop_invariant=False, handle_missing='value',
              handle_unknown='value', return_df=True, use_cat_names=False,
              verbose=0)
Encode methods: dict_keys(['one_hot', 'hasing', 'leave_one_out', 'target', 'baseN', 'binary']) 

Columns: SellerG have already dropped before.
Columns: Address have already dropped before.
Columns: Date have already dropped before.
Columns: Method have already dropped before.


  elif pd.api.types.is_categorical(cols):


  Impute_strategy Nominal_encoding Radom_forest   KNN_6 KNN_7 KNN_20
0   most_frequent          one_hot      0.72361  0.2597   NaN    NaN
  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7 KNN_20
0   most_frequent          one_hot      0.72361  0.2597  0.26166    NaN
  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7  KNN_20
0   most_frequent          one_hot      0.72361  0.2597  0.26166  0.2489
HashingEncoder(cols=['Suburb', 'CouncilArea', 'Regionname', 'Type'],
               drop_invariant=False, hash_method='md5', max_process=1,
               max_sample=0, n_components=8, return_df=True, verbose=0)
Encode methods: dict_keys(['one_hot', 'hasing', 'leave_one_out', 'target', 'baseN', 'binary']) 

Columns: SellerG have already dropped before.
Columns: Address have already dropped before.
Columns: Date have already dropped before.
Columns: Method have already dropped before.


  elif pd.api.types.is_categorical(cols):


  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7  KNN_20
0   most_frequent          one_hot      0.72361  0.2597  0.26166  0.2489
1   most_frequent           hasing      0.72018  0.2597      NaN     NaN
  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7  KNN_20
0   most_frequent          one_hot      0.72361  0.2597  0.26166  0.2489
1   most_frequent           hasing      0.72018  0.2597  0.26166     NaN
  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7  KNN_20
0   most_frequent          one_hot      0.72361  0.2597  0.26166  0.2489
1   most_frequent           hasing      0.72018  0.2597  0.26166  0.2489
LeaveOneOutEncoder(cols=['Suburb', 'CouncilArea', 'Regionname', 'Type'],
                   drop_invariant=False, handle_missing='value',
                   handle_unknown='value', random_state=None, return_df=True,
                   sigma=None, verbose=0)
Encode methods: dict_keys(['one_hot', 'hasing', 'leave_one_out', 'target', 'baseN', '

  elif pd.api.types.is_categorical(cols):


  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7  KNN_20
0   most_frequent          one_hot      0.72361  0.2597  0.26166  0.2489
1   most_frequent           hasing      0.72018  0.2597  0.26166  0.2489
2   most_frequent    leave_one_out      0.72558  0.2597      NaN     NaN
  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7  KNN_20
0   most_frequent          one_hot      0.72361  0.2597  0.26166  0.2489
1   most_frequent           hasing      0.72018  0.2597  0.26166  0.2489
2   most_frequent    leave_one_out      0.72558  0.2597  0.26166     NaN
  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7  KNN_20
0   most_frequent          one_hot      0.72361  0.2597  0.26166  0.2489
1   most_frequent           hasing      0.72018  0.2597  0.26166  0.2489
2   most_frequent    leave_one_out      0.72558  0.2597  0.26166  0.2489
TargetEncoder(cols=['Suburb', 'CouncilArea', 'Regionname', 'Type'],
              drop_invariant=False, handle_missing='valu

  elif pd.api.types.is_categorical(cols):


  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7  KNN_20
0   most_frequent          one_hot      0.72361  0.2597  0.26166  0.2489
1   most_frequent           hasing      0.72018  0.2597  0.26166  0.2489
2   most_frequent    leave_one_out      0.72558  0.2597  0.26166  0.2489
3   most_frequent           target      0.72901  0.2597      NaN     NaN
  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7  KNN_20
0   most_frequent          one_hot      0.72361  0.2597  0.26166  0.2489
1   most_frequent           hasing      0.72018  0.2597  0.26166  0.2489
2   most_frequent    leave_one_out      0.72558  0.2597  0.26166  0.2489
3   most_frequent           target      0.72901  0.2597  0.26166     NaN
  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7  KNN_20
0   most_frequent          one_hot      0.72361  0.2597  0.26166  0.2489
1   most_frequent           hasing      0.72018  0.2597  0.26166  0.2489
2   most_frequent    leave_one_out      0.72558  0.

  elif pd.api.types.is_categorical(cols):


  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7  KNN_20
0   most_frequent          one_hot      0.72361  0.2597  0.26166  0.2489
1   most_frequent           hasing      0.72018  0.2597  0.26166  0.2489
2   most_frequent    leave_one_out      0.72558  0.2597  0.26166  0.2489
3   most_frequent           target      0.72901  0.2597  0.26166  0.2489
4   most_frequent            baseN      0.71969  0.2597      NaN     NaN
  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7  KNN_20
0   most_frequent          one_hot      0.72361  0.2597  0.26166  0.2489
1   most_frequent           hasing      0.72018  0.2597  0.26166  0.2489
2   most_frequent    leave_one_out      0.72558  0.2597  0.26166  0.2489
3   most_frequent           target      0.72901  0.2597  0.26166  0.2489
4   most_frequent            baseN      0.71969  0.2597  0.26166     NaN
  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7  KNN_20
0   most_frequent          one_hot      0.72361  0.

  elif pd.api.types.is_categorical(cols):


  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7  KNN_20
0   most_frequent          one_hot      0.72361  0.2597  0.26166  0.2489
1   most_frequent           hasing      0.72018  0.2597  0.26166  0.2489
2   most_frequent    leave_one_out      0.72558  0.2597  0.26166  0.2489
3   most_frequent           target      0.72901  0.2597  0.26166  0.2489
4   most_frequent            baseN      0.71969  0.2597  0.26166  0.2489
5   most_frequent           binary      0.72018  0.2597      NaN     NaN
  Impute_strategy Nominal_encoding Radom_forest   KNN_6    KNN_7  KNN_20
0   most_frequent          one_hot      0.72361  0.2597  0.26166  0.2489
1   most_frequent           hasing      0.72018  0.2597  0.26166  0.2489
2   most_frequent    leave_one_out      0.72558  0.2597  0.26166  0.2489
3   most_frequent           target      0.72901  0.2597  0.26166  0.2489
4   most_frequent            baseN      0.71969  0.2597  0.26166  0.2489
5   most_frequent           binary      0.72018  0.

Unnamed: 0,Impute_strategy,Nominal_encoding,Radom_forest,KNN_6,KNN_7,KNN_20
0,most_frequent,one_hot,0.72361,0.2597,0.26166,0.2489
1,most_frequent,hasing,0.72018,0.2597,0.26166,0.2489
2,most_frequent,leave_one_out,0.72558,0.2597,0.26166,0.2489
3,most_frequent,target,0.72901,0.2597,0.26166,0.2489
4,most_frequent,baseN,0.71969,0.2597,0.26166,0.2489
5,most_frequent,binary,0.72018,0.2597,0.26166,0.2489


ValueError: ignored

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(xTrain, yTrain)

y_pred=clf.predict(xVal)

score = metrics.accuracy_score(yVal, y_pred)
score 

0.7010309278350515

In [17]:
def get_RandomForest_accuracy(xTrain, yTrain, xTest, yTest, n_estimators=100):
    clf = RandomForestClassifier(n_estimators=n_estimators)
    clf.fit(xTrain, yTrain)
    y_pred = clf.predict(xTest)
    score = metrics.accuracy_score(yTest, y_pred)
    return score

get_RandomForest_accuracy(xTrain, yTrain, xTest, yTest,  n_estimators=100)    

NameError: ignored

In [389]:
preprocessing.StandardScaler().fit(X_train)

ValueError: ignored

In [390]:
X_train

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
11413,Knoxfield,22 Watersedge Cl,4,h,S,Ray,15/07/2017,23.3,3180.0,4.0,2.0,2.0,633.0,120.0,1970.0,Knox,-37.88214,145.25058,Eastern Metropolitan,2949.0
5118,Reservoir,3/14 Aberdeen St,2,u,S,Nelson,13/08/2016,11.2,3073.0,2.0,1.0,1.0,0.0,120.0,1970.0,Darebin,-37.70300,145.02010,Northern Metropolitan,21650.0
12161,South Morang,3 Robin Pl,4,h,S,Millership,29/07/2017,20.5,3752.0,4.0,2.0,2.0,617.0,120.0,1970.0,Whittlesea,-37.65621,145.09390,Northern Metropolitan,7969.0
7227,West Footscray,38 Wellington St,3,h,S,Sweeney,4/03/2017,8.2,3012.0,3.0,1.0,1.0,361.0,120.0,1970.0,Maribyrnong,-37.79160,144.86730,Western Metropolitan,5058.0
2140,Doncaster,36 Henry St,5,h,S,Jellis,8/10/2016,13.9,3108.0,5.0,3.0,2.0,656.0,320.0,1985.0,Manningham,-37.77520,145.13170,Eastern Metropolitan,9028.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4023,Middle Park,171 Ashworth St,3,h,S,Greg,26/07/2016,4.1,3206.0,3.0,2.0,2.0,181.0,212.0,2010.0,Port Phillip,-37.85360,144.96250,Southern Metropolitan,2019.0
7259,West Footscray,2a Hope St,2,h,S,Jas,22/05/2016,8.2,3012.0,2.0,1.0,1.0,308.0,120.0,1970.0,Maribyrnong,-37.79080,144.87170,Western Metropolitan,5058.0
5200,Reservoir,70 Barry St,3,h,S,Love,24/09/2016,11.2,3073.0,3.0,1.0,1.0,719.0,120.0,1970.0,Darebin,-37.70900,145.00150,Northern Metropolitan,21650.0
3775,Malvern,12 Hornsby St,4,h,S,Jellis,4/06/2016,7.4,3144.0,4.0,2.0,2.0,599.0,120.0,1977.0,Stonnington,-37.84930,145.03950,Southern Metropolitan,4675.0


In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(20, 10))
hist = imputed_df.hist('Price_class', bins=price_bins, ax=ax)
hist