# Part III: OneHotEncoding

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the CSV into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv('2021_housesigma_cleaned_2.csv')

In [3]:
# Inspect column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957 entries, 0 to 956
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Community       957 non-null    object 
 1   House_type      957 non-null    object 
 2   Listing_price   957 non-null    int64  
 3   Sigma_estimate  957 non-null    int64  
 4   Sold_price      957 non-null    float64
 5   Bedroom         957 non-null    float64
 6   Washroom        957 non-null    int64  
 7   Parking         957 non-null    int64  
 8   Tax             957 non-null    float64
 9   Maintenance     957 non-null    float64
 10  Size            957 non-null    float64
 11  Days_on_market  957 non-null    int64  
 12  Exposure        957 non-null    object 
 13  heat            957 non-null    int64  
 14  water           957 non-null    int64  
 15  hydro           957 non-null    int64  
 16  Size_cat        957 non-null    object 
dtypes: float64(5), int64(8), object(4)


In [4]:
# Remove the columns that are not carried forward into the encoded data set.
df = df.drop(['Sigma_estimate', 'Size_cat', 'Community'], axis=1)

In [5]:
# Re-check the schema now that the columns above have been dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957 entries, 0 to 956
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   House_type      957 non-null    object 
 1   Listing_price   957 non-null    int64  
 2   Sold_price      957 non-null    float64
 3   Bedroom         957 non-null    float64
 4   Washroom        957 non-null    int64  
 5   Parking         957 non-null    int64  
 6   Tax             957 non-null    float64
 7   Maintenance     957 non-null    float64
 8   Size            957 non-null    float64
 9   Days_on_market  957 non-null    int64  
 10  Exposure        957 non-null    object 
 11  heat            957 non-null    int64  
 12  water           957 non-null    int64  
 13  hydro           957 non-null    int64  
dtypes: float64(5), int64(7), object(2)
memory usage: 104.8+ KB


## One-hot encoding

In [6]:
# Collect the names of every object-dtype column; these are the categorical
# features that will be one-hot encoded.
object_frame = df.select_dtypes(include=['object'])
ohe_columns = object_frame.columns
ohe_columns

Index(['House_type', 'Exposure'], dtype='object')

In [7]:
# Names of the utility flag columns that should be encoded alongside the
# object-dtype columns.
utility_flags = ['heat', 'hydro', 'water']
utils_col = pd.Index(utility_flags)

In [8]:
# Add the utility flag columns to the set of columns to encode.
# Only names not already present are appended, so re-running this cell does
# not duplicate columns. sort=False keeps the heat/hydro/water order, which
# the positional feature names (x2/x3/x4) produced by the encoder depend on.
ohe_columns = ohe_columns.append(utils_col.difference(ohe_columns, sort=False))

In [9]:
# Slice out just the columns to be encoded and preview the first rows.
ohe_df = df.loc[:, ohe_columns]
ohe_df.head()

Unnamed: 0,House_type,Exposure,heat,hydro,water
0,Condo Apt,S,1,0,1
1,Condo Apt,N,1,1,1
2,Condo Apt,E,1,0,1
3,Condo Apt,NW,1,0,1
4,Condo Townhouse,E,0,0,1


In [10]:
# Count the distinct values in each column selected for encoding.
ohe_df.nunique()

House_type    4
Exposure      8
heat          2
hydro         2
water         2
dtype: int64

In [11]:
from sklearn.preprocessing import OneHotEncoder

# Build an encoder that returns a dense array and, for two-valued features,
# keeps a single column (drop='if_binary').
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the
# old keyword, so try the new spelling first and fall back for older releases.
try:
    ohe = OneHotEncoder(sparse_output=False, drop='if_binary')
except TypeError:
    ohe = OneHotEncoder(sparse=False, drop='if_binary')

In [12]:
# Fit the encoder on the raw values of the selected columns and encode them
# in a single pass, then report the shape of the encoded matrix.
encoder_input = ohe_df.to_numpy()
ohe_array = ohe.fit_transform(encoder_input)
n_rows, n_cols = ohe_array.shape
print('train data set has got {} rows and {} columns'.format(n_rows, n_cols))

train data set has got 957 rows and 15 columns


In [13]:
# Peek at the encoded values for the row at position 1.
ohe_array[1]

array([1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1.])

In [14]:
# Wrap the encoded matrix in a DataFrame labelled with the encoder's feature names.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` and fall back only on releases that lack it.
feature_names = (
    ohe.get_feature_names_out()
    if hasattr(ohe, 'get_feature_names_out')
    else ohe.get_feature_names()
)
# np.asarray strips any existing column labels, so re-running this cell after
# the columns have been renamed relabels the data instead of reindexing it
# (which would yield NaN columns).
ohe_array = pd.DataFrame(np.asarray(ohe_array), columns=feature_names)
ohe_array.head(3)

Unnamed: 0,x0_Condo Apt,x0_Condo Townhouse,x0_Detached,x0_Semi-Detached,x1_E,x1_N,x1_NE,x1_NW,x1_S,x1_SE,x1_SW,x1_W,x2_1,x3_1,x4_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [15]:
# The encoder's positional names (x2/x3/x4) are not self-describing, so tag
# the three utility columns with the feature each one came from.
utility_labels = {
    'x2_1': 'x2_1_heat',
    'x3_1': 'x3_1_hydro',
    'x4_1': 'x4_1_water',
}
ohe_array = ohe_array.rename(columns=utility_labels)

In [16]:
# Show the first four rows to confirm the renamed utility columns.
ohe_array.head(4)

Unnamed: 0,x0_Condo Apt,x0_Condo Townhouse,x0_Detached,x0_Semi-Detached,x1_E,x1_N,x1_NE,x1_NW,x1_S,x1_SE,x1_SW,x1_W,x2_1_heat,x3_1_hydro,x4_1_water
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [17]:
# Place the encoded columns next to the original ones (both frames use the
# default 0..n-1 index, so rows line up positionally).
# Encoded columns left over from an earlier execution are dropped first, so
# re-running this cell does not append a second copy of them.
df = pd.concat(
    [df.drop(columns=ohe_array.columns, errors='ignore'), ohe_array],
    axis=1,
)

In [18]:
df.shape

(957, 29)

## Pickling the encoded DataFrame

In [19]:
import pickle

In [20]:
# Pickle df (disabled: uncomment the lines below to write the encoded frame to disk).
# NOTE(review): this would write 'df_ohe_final_2', while the load cell further
# down reads 'df_ohe_final' -- confirm which file name is intended.
# filename='df_ohe_final_2'
# outfile = open(filename, 'wb')

# pickle.dump(df, outfile)
# outfile.close()

In [None]:
# Unpickle df

In [None]:
# Load the previously pickled frame back into `df`.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only load files that
# were produced by a trusted source.
# NOTE(review): this reads 'df_ohe_final', whereas the commented-out dump cell
# writes 'df_ohe_final_2' -- confirm which file name is intended.
with open('df_ohe_final', 'rb') as infile:
    df = pickle.load(infile)

In [None]:
# Report the (rows, columns) shape of the unpickled frame.
df.shape