## Data pre-processing

In [96]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [62]:
sql = (
    '''
    WITH
      enbloc_list AS (
      SELECT
        project_name,
        EXTRACT(YEAR FROM contract_date) AS contract_year,
        1 AS enbloc
      FROM
        `drq-machine-learning.enbloc.enbloc`),
      property AS (
      SELECT
        UPPER(property_name) AS project_name,
        * EXCEPT(property_name)
      FROM
        `drq-machine-learning.enbloc.property_list` )
    SELECT
      property.*,
      enbloc_list.contract_year,
      enbloc_list.enbloc
    FROM
      property
    LEFT JOIN
      enbloc_list
    ON
      property.project_name = enbloc_list.project_name
    '''
)



In [63]:
data = pd.read_gbq(
    sql,
    project_id='413980110872',
    private_key='../../drq-machine-learning-0a5d30a93870.json',
    dialect='standard',
    verbose=True
)

In [64]:
data.to_csv('../data/enbloc_property.csv', index=False)

In [65]:
data = data[pd.notnull(data['TOP'])]

In [66]:
data['units'] = data['units'].fillna(data['units'].median())

In [67]:
data = data[pd.notnull(data['tenure'])]

In [68]:
data['enbloc'] = data['enbloc'].fillna(0)

In [69]:
data['contract_year'] = data['contract_year'].fillna(2018)

In [70]:
data['tenure'] = data['tenure'].apply(lambda x: x.split(' ')[0])

In [71]:
data['district'] = data['district'].apply(lambda x: x.split(' ')[0])

In [72]:
data['tenure']=data['tenure'].apply(lambda x: x.upper())

In [73]:
data['tenure']=data['tenure'].apply(lambda x: x.replace('99-YEAR', '99'))

In [74]:
data['age'] = data['contract_year'] - data['TOP']

In [75]:
data=pd.get_dummies(data, columns=['district','property_type','tenure'])

In [76]:
data=data.drop(columns ='address')

In [77]:
data=data.drop(columns ='developer')

In [78]:
data

Unnamed: 0,project_name,TOP,units,contract_year,enbloc,age,district_D02,district_D03,district_D04,district_D05,...,tenure_99,tenure_999,tenure_9999,tenure_F,tenure_FREEHOLD,tenure_L99,tenure_L999,tenure_LEASEHOLD/99,tenure_MIXED,tenure_N.A
10,PARKSUITES,1900,71.0,2018,0,118,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,TEACHERS' HOUSING ESTATE,1968,71.0,2018,0,50,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
12,LAKESIDE APARTMENTS,1970,71.0,2018,0,48,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
13,PEOPLE'S PARK COMPLEX,1972,71.0,2018,0,46,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
14,GOLDEN MILE COMPLEX,1974,71.0,2018,0,44,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
15,INTERNATIONAL PLAZA,1976,71.0,2018,0,42,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
16,FOOK HAI BUILDING,1976,71.0,2018,0,42,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
17,PEOPLE'S PARK CENTRE,1976,71.0,2018,0,42,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
18,TEXTILE CENTRE,1977,71.0,2018,0,41,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
19,PEACE CENTRE/MANSIONS,1977,42.0,2018,0,41,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [79]:
data.columns

Index(['project_name', 'TOP', 'units', 'contract_year', 'enbloc', 'age',
       'district_D02', 'district_D03', 'district_D04', 'district_D05',
       'district_D07', 'district_D08', 'district_D09', 'district_D1',
       'district_D10', 'district_D11', 'district_D12', 'district_D13',
       'district_D14', 'district_D15', 'district_D16', 'district_D17',
       'district_D18', 'district_D19', 'district_D2', 'district_D20',
       'district_D21', 'district_D22', 'district_D23', 'district_D25',
       'district_D26', 'district_D27', 'district_D28', 'district_D3',
       'district_D4', 'district_D5', 'district_D6', 'district_D7',
       'district_D8', 'district_D9', 'property_type_Apartment',
       'property_type_Condominium', 'property_type_Corner Terrace',
       'property_type_Detached House', 'property_type_HDB Apartment',
       'property_type_Terraced House', 'tenure_100', 'tenure_101',
       'tenure_103', 'tenure_110', 'tenure_60', 'tenure_929', 'tenure_946',
       'tenure_947', 

#### D05 & D5??????

In [84]:
y = data['enbloc'].as_matrix()
x = data.drop(columns=['project_name', 'enbloc']).as_matrix()

In [87]:
x_train, x_test, y_train, y_test=train_test_split(
    x,y,test_size=0.2)

In [88]:
x_train.shape

(1632, 63)

In [89]:
x.shape

(2040, 63)

In [91]:
x_test.shape

(408, 63)

In [105]:
x_test.shape

(408, 63)

## RandomForestClassifier

In [93]:
random_forest=RandomForestClassifier()

In [94]:
random_forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [97]:
y_predict=random_forest.predict(x_test)

In [99]:
accuracy=metrics.accuracy_score(y_test,y_predict)

In [100]:
accuracy

1.0

In [101]:
metrics.precision_score(y_test,y_predict)

1.0

In [102]:
metrics.recall_score(y_test,y_predict)

1.0