In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv('data/train-ml_v2.csv')

In [3]:
df.head()

Unnamed: 0,longitude,lotsizesquarefeet,landtaxvaluedollarcnt,latitude,structuretaxvaluedollarcnt,taxamount,calculatedfinishedsquarefeet,yearbuilt,roomcnt,buildingqualitytypeid_8.0,...,transaction_mth,propertycountylandusecode_122,transaction_day_of_wk,bedroomcnt,buildingqualitytypeid_6.0,heatingorsystemtypeid_7.0,propertycountylandusecode_010C,buildingqualitytypeid_4.0,transaction_yr,logerror
0,-118488536.0,7528.0,237416.0,34280992.0,122754.0,6735.879883,1684.0,1959.0,0.0,0,...,1.0,0,4.0,3.0,0,0,0,1,2016.0,0.0276
1,-117677552.0,3643.0,239071.0,33668120.0,346458.0,10153.019531,2263.0,2014.0,0.0,0,...,1.0,0,4.0,4.0,0,0,0,0,2016.0,-0.1684
2,-118175032.0,11423.0,57912.0,34136312.0,61994.0,11484.480469,2217.0,1940.0,0.0,0,...,1.0,0,4.0,2.0,0,0,0,1,2016.0,-0.004
3,-118309000.0,70859.0,73362.0,33755800.0,171518.0,3048.73999,839.0,1987.0,0.0,0,...,1.0,0,5.0,2.0,0,0,1,1,2016.0,0.0218
4,-117700232.0,6000.0,264977.0,33485644.0,169574.0,5488.959961,2283.0,1981.0,8.0,0,...,1.0,1,5.0,4.0,0,0,0,0,2016.0,-0.005


In [4]:
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",props[col].dtype)
            print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [5]:
props, NAlist = reduce_mem_usage(df)

Memory usage of properties dataframe is : 28.178512573242188  MB
******************************
Column:  longitude
dtype before:  float64
dtype after:  int32
******************************
******************************
Column:  lotsizesquarefeet
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  landtaxvaluedollarcnt
dtype before:  float64
dtype after:  uint32
******************************
******************************
Column:  latitude
dtype before:  float64
dtype after:  uint32
******************************
******************************
Column:  structuretaxvaluedollarcnt
dtype before:  float64
dtype after:  uint32
******************************
******************************
Column:  taxamount
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  calculatedfinishedsquarefeet
dtype before:  float64
dtype after:  uint16
******************************
****

In [41]:
df_sample1 = df.sample(frac=0.01)

In [42]:
len(df_sample1)

1679

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
train, test = train_test_split(df_sample1,test_size=0.2, random_state=4)

In [45]:
X_train = train.drop('logerror', axis=1).values
y_train = train['logerror'].values

In [46]:
len(X_train), len(y_train)

(1343, 1343)

In [47]:
from sklearn.metrics import mean_absolute_error

In [51]:
rf = RandomForestRegressor(criterion='mae',min_samples_leaf=10,n_estimators=20,n_jobs=-1)

In [52]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=10,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=20, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [53]:
X_test = test.drop('logerror',axis=1).values
y_test = test['logerror'].values

In [54]:
y_preds = rf.predict(X_test)

In [55]:
len(y_preds)

336

In [56]:
ys = rf.predict(X_train)

In [57]:
mean_absolute_error(y_test, y_preds)

0.076562669574224182

In [58]:
num_splits = 10
seed = 7

In [59]:
kfold = model_selection.KFold(n_splits = num_splits, random_state=seed)

In [60]:
model = RandomForestRegressor(criterion='mae',min_samples_leaf=10,n_estimators=20,n_jobs=-1)

In [61]:
results = model_selection.cross_val_score(model,X_train , y_train, cv=num_splits,
                                          n_jobs=-1,scoring='neg_mean_absolute_error')

In [62]:
results.mean()


-0.070521681309162532

In [63]:
from sklearn.model_selection import GridSearchCV

In [66]:
grid = [{'n_estimators':[3,10,30], 'max_features':[2,4,6,8]},
       {'bootstrap':[False],'n_estimators':[3,10],'max_features':[2,3,4]},]

In [67]:
randn_forest = RandomForestRegressor()

In [69]:
grid_search = GridSearchCV(randn_forest, grid, cv=5,scoring='neg_mean_absolute_error')

In [70]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_absolute_error', verbose=0)