# Notebook to prep sample data for test driving Manifold

Using the Singapore enbloc dataset to create a simple classifier with preds

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/sg_property_enbloc.csv')
df[['enbloc']].to_csv('../data/sg_property_enbloc_gt.csv', index=False)

In [3]:
df.head()

Unnamed: 0,name,id,type_code,total_units,floors,top_month,top_year,tenure,district_code,postcode,streetname,ura_key,transaction_psf,nearest_mrt_distance,enbloc,enbloc_date
0,DERBY COURT,1868,CONDO,22.0,0.0,12.0,1979.0,F,D11,309461,Derbyshire Road,DERBY COURT,788.0,0.49,1,2017-12-13
1,DUSUN GROVE,1885,CONDO,12.0,0.0,12.0,1986.0,F,D12,329371,Jalan Dusun,DUSUN GROVE,394.0,0.0,1,2010-01-19
2,BALESTIER COURT,1768,CONDO,16.0,0.0,1.0,1986.0,F,D12,329191,Jalan Raja Udang,BALESTIER COURT,390.0,0.0,1,2007-03-02
3,HJ HEIGHTS,2217,CONDO,76.0,5.0,12.0,1989.0,F,D21,597667,Merbok Crescent,HJ HEIGHTS,350.0,0.0,1,2006-08-14
4,ANGULLIA VIEW,1755,APT,24.0,0.0,1.0,1987.0,F,D09,239975,Augullia Park,ANGULLIA VIEW,800.0,0.0,1,2000-03-15


In [4]:
df.columns

Index(['name', 'id', 'type_code', 'total_units', 'floors', 'top_month',
       'top_year', 'tenure', 'district_code', 'postcode', 'streetname',
       'ura_key', 'transaction_psf', 'nearest_mrt_distance', 'enbloc',
       'enbloc_date'],
      dtype='object')

In [5]:
df.isnull().sum()

name                        0
id                          0
type_code                   0
total_units             10370
floors                     57
top_month               13785
top_year                13785
tenure                   1504
district_code               2
postcode                    0
streetname                  1
ura_key                  1555
transaction_psf          3946
nearest_mrt_distance        0
enbloc                      0
enbloc_date             16790
dtype: int64

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
import numpy as np


oe = OrdinalEncoder()
imp_mean = SimpleImputer(missing_values=np.nan)
clf = KNeighborsClassifier()


training_pipeline = Pipeline([('imp_mean', imp_mean),
                              ('oe', oe),
                              ('clf', clf)])

features = ['total_units', 'floors', 'top_month', 'top_year', 'transaction_psf', 'nearest_mrt_distance']

training_pipeline.fit(df[features],  df['enbloc'])

Pipeline(memory=None,
         steps=[('imp_mean',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('oe',
                 OrdinalEncoder(categories='auto',
                                dtype=<class 'numpy.float64'>)),
                ('clf',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=5, p=2,
                                      weights='uniform'))],
         verbose=False)

In [7]:
df['preds'] = training_pipeline.predict(df[features])

In [8]:
df[['preds']].to_csv('../data/sg_property_enbloc_preds.csv', index=False)