In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv('data/train-ml.csv')

In [3]:
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",props[col].dtype)
            print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [4]:
props, NAlist = reduce_mem_usage(df)

Memory usage of properties dataframe is : 13.423698425292969  MB
******************************
Column:  taxvaluedollarcnt
dtype before:  float64
dtype after:  uint32
******************************
******************************
Column:  taxdelinquencyyear
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  latitude
dtype before:  float64
dtype after:  uint32
******************************
******************************
Column:  yearbuilt
dtype before:  float64
dtype after:  uint16
******************************
******************************
Column:  transaction_mth
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  unitcnt
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  taxamount
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  lo

dtype after:  uint8
******************************
******************************
Column:  propertycountylandusecode_0101
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  propertycountylandusecode_0102
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  propertycountylandusecode_0103
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  propertycountylandusecode_0104
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  propertycountylandusecode_0108
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  propertycountylandusecode_0109
dtype before:  float64
dtype after:  uint8
******************************
******************************
Column:  propertycountylandusecode_010C
dtype be

In [5]:
df_sample1 = df[['propertycountylandusecode_38', 'propertylandusetypeid_248.0', 'pooltypeid10_0.0', 'propertycountylandusecode_01HC', 'propertycountylandusecode_1', 'buildingqualitytypeid_9.0', 'typeconstructiontypeid_0.0', 'propertycountylandusecode_34', 'propertycountylandusecode_0200', 'taxdelinquencyflag_Y', 'buildingqualitytypeid_8.0', 'propertycountylandusecode_010E', 'propertycountylandusecode_0101', 'taxdelinquencyyear', 'propertylandusetypeid_266.0', 'pooltypeid2_0.0', 'transaction_yr', 'fips_6059.0', 'unitcnt', 'bathroomcnt', 'buildingqualitytypeid_6.0', 'airconditioningtypeid_1.0', 'heatingorsystemtypeid_7.0', 'transaction_day_of_wk', 'bedroomcnt', 'roomcnt', 'yearbuilt', 'calculatedfinishedsquarefeet', 'taxamount', 'taxvaluedollarcnt', 'landtaxvaluedollarcnt', 'structuretaxvaluedollarcnt', 'latitude', 'lotsizesquarefeet', 'longitude', 'pooltypeid10_1.0', 'typeconstructiontypeid_6.0', 'pooltypeid2_1.0', 'taxdelinquencyflag_no', 'logerror']]

In [6]:
len(df_sample1)

13431

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
train, test = train_test_split(df_sample1,test_size=0.2, random_state=4)

In [9]:
X_train = train.drop('logerror', axis=1).values
y_train = train['logerror'].values

In [10]:
len(X_train), len(y_train)

(10744, 10744)

In [11]:
from sklearn.metrics import mean_absolute_error

In [12]:
rf = RandomForestRegressor(criterion='mae',n_jobs=-1)

In [13]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [14]:
X_test = test.drop('logerror',axis=1).values
y_test = test['logerror'].values

In [15]:
y_preds = rf.predict(X_test)

In [16]:
len(y_preds)

2687

In [18]:
ys = rf.predict(X_train)

In [17]:
mean_absolute_error(y_test, y_preds)

0.084207073958967663

In [9]:
num_splits = 10
seed = 7

In [10]:
kfold = model_selection.KFold(n_splits = num_splits, random_state=seed)

In [11]:
model = RandomForestRegressor(criterion='mae')

In [15]:
results = model_selection.cross_val_score(model, data, target, cv=num_splits,
                                          n_jobs=-1,scoring='neg_mean_absolute_error')

In [16]:
results.mean()


-0.088577886290800587

In [17]:
results

array([-0.09519925, -0.09184664, -0.08757733, -0.0993521 , -0.07764035,
       -0.08395402, -0.08728465, -0.09514591, -0.08713768, -0.08064094])