In [1]:
import pandas as pd
import numpy as np

In [96]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

In [106]:
from sklearn import utils as forest

In [3]:
import re

from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

In [4]:
# Load the bulldozers table stored in feather format under tmp/.
# NOTE(review): presumably written by an earlier preprocessing notebook — TODO confirm.
df = pd.read_feather("tmp/bulldozers_1")

In [5]:
# Display the loaded frame for a quick visual inspection.
df

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,fiModelDesc,...,Steering_Controls,DateYear,DateMonth,DateWeek,DateDay,DateDayofweek,DateDayofyear,DateIs_month_end,DateIs_month_start,DateElapsed
0,1139246,11.097410,999089,3157,121,3.0,2004,68.0,3,521D,...,Conventional,2006,11,46,16,3,320,False,False,1163635200
1,1139248,10.950807,117657,77,121,3.0,1996,4640.0,3,950FII,...,Conventional,2004,3,13,26,4,86,False,False,1080259200
2,1139249,9.210340,434808,7009,121,3.0,2001,2838.0,1,226,...,,2004,2,9,26,3,57,False,False,1077753600
3,1139251,10.558414,1026470,332,121,3.0,2001,3486.0,1,PC120-6E,...,,2011,5,20,19,3,139,False,False,1305763200
4,1139253,9.305651,1057373,17311,121,3.0,2007,722.0,2,S175,...,,2009,7,30,23,3,204,False,False,1248307200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412693,6333344,9.210340,1919201,21435,149,2.0,2005,,0,30NX,...,,2012,3,10,7,2,67,False,False,1331078400
412694,6333345,9.259131,1882122,21436,149,2.0,2005,,0,30NX2,...,,2012,1,4,28,5,28,False,False,1327708800
412695,6333347,9.433484,1944213,21435,149,2.0,2005,,0,30NX,...,,2012,1,4,28,5,28,False,False,1327708800
412696,6333348,9.210340,1794518,21435,149,2.0,2006,,0,30NX,...,,2012,3,10,7,2,67,False,False,1331078400


In [6]:
def fix_missing(df, col, name, nan_dict, is_train):
    """Fill missing values of one numeric column of `df` in place.

    Non-numeric columns are left untouched.

    Training mode (`is_train` true): when `col` contains at least one missing
    value, a boolean indicator column `<name>_na` is added, the column median
    is stored in `nan_dict[name]`, and the missing values are replaced by it.
    Columns without missing values are not modified and not recorded.

    Inference mode (`is_train` false): when `name` is present in `nan_dict`,
    the indicator column is added and the stored value is used for filling, so
    the resulting columns match the training frame. Otherwise any missing
    values are filled with the median of `df[name]` and no indicator is added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified in place.
    col : pandas.Series
        The column being processed.
    name : str
        Name of `col` inside `df`.
    nan_dict : dict
        Mapping column name -> fill value; written in training mode, read in
        inference mode.
    is_train : bool
        Selects training or inference behaviour.
    """
    if not is_numeric_dtype(col):
        return

    missing = pd.isnull(col)

    if is_train:
        # Bug fix: the original tested `pd.isnull(col).sum` (the bound method,
        # which is always truthy) instead of calling it, so every numeric
        # column received an indicator column and a nan_dict entry.
        if missing.sum():
            df[name + "_na"] = missing
            nan_dict[name] = col.median()
            df[name] = col.fillna(nan_dict[name])
    elif name in nan_dict:
        df[name + "_na"] = missing
        df[name] = col.fillna(nan_dict[name])
    elif missing.any():
        # Column had no missing values during training: fall back to the
        # median of the current frame. Skipped when nothing is missing, since
        # the fill would leave the values unchanged anyway.
        df[name] = col.fillna(df[name].median())
                

In [7]:
def numericalize(df, col, name):
    """Replace a non-numeric column of `df` by its category codes shifted by one.

    Numeric columns are left as they are. For any other column the codes of
    its categorical accessor (`col.cat.codes`) plus one are written to
    `df[name]`, so a code of -1 becomes 0.
    """
    if pd.api.types.is_numeric_dtype(col):
        return
    shifted_codes = col.cat.codes + 1
    df[name] = shifted_codes

In [8]:
def proc(df, y_fld, nan_dict=None, is_train=True):
    """Split a frame into features and target and make the features numeric.

    Works on a copy of `df`. The target column `y_fld` is extracted as an
    array and removed from the features; every remaining column is passed
    through `fix_missing` and then `numericalize`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; not modified.
    y_fld : str
        Name of the target column.
    nan_dict : dict, optional
        Fill values handed to `fix_missing`; a new dict is created when omitted.
    is_train : bool
        Forwarded to `fix_missing`; also selects the shape of the return value.

    Returns
    -------
    (features, y, nan_dict) when `is_train` is true, otherwise (features, y).
    """
    features = df.copy()
    y = features[y_fld].values
    features = features.drop(columns=[y_fld])

    fill_values = {} if nan_dict is None else nan_dict

    for col_name, column in features.items():
        fix_missing(features, column, col_name, fill_values, is_train)
        numericalize(features, column, col_name)

    if not is_train:
        return features, y
    return features, y, fill_values

In [9]:
def split_train_val(df, n):
    """Split `df` at position `n` and return independent copies of both parts.

    The first element holds the first `n` rows, the second the remaining rows.
    """
    leading = df[:n].copy()
    trailing = df[n:].copy()
    return leading, trailing


In [10]:
n_valid = 12000 # size of the Kaggle test set
n_train = len(df) - n_valid
# The first n_train rows form the training split, the remaining n_valid rows the validation split.
raw_train, raw_valid = split_train_val(df,n_train)

In [11]:
# Preprocess the training split; `nas` is the fill-value dict returned by proc in training mode.
x_train, y_train, nas = proc(raw_train, 'SalePrice')

In [12]:
# Preprocess the validation split, passing in the fill values recorded on the training split.
x_valid, y_valid = proc(raw_valid,"SalePrice", nas, is_train=False )

# İlk Model

In [14]:
# Baseline: a single tree fitted with bootstrap sampling disabled.
m = RandomForestRegressor(n_estimators=1, bootstrap = False, n_jobs = -1)
m.fit(x_train,y_train)
# The last expression displays R^2 measured on the training data itself.
m.score(x_train,y_train)

1.0

In [19]:
import math

def rmse(x, y):
    """Return the root mean squared difference between `x` and `y` as a float."""
    residuals = x - y
    mean_squared = (residuals ** 2).mean()
    return math.sqrt(mean_squared)

In [25]:
def print_score(m):
    """Print RMSE and R^2 of model `m` on the training and validation sets.

    Reads the notebook-level variables x_train, y_train, x_valid and y_valid.
    """
    train_rmse = rmse(m.predict(x_train), y_train)
    print(f"RMSE Score of train set {train_rmse}")

    valid_rmse = rmse(m.predict(x_valid), y_valid)
    print(f"RMSE Score of validation set {valid_rmse}")

    train_r2 = m.score(x_train, y_train)
    print(f"R^2 Score of train set {train_r2}")

    valid_r2 = m.score(x_valid, y_valid)
    print(f"R^2 Score of validation set {valid_r2}")

In [26]:
# Report train/validation RMSE and R^2 for the baseline model.
print_score(m)

RMSE Score of train set 1.0424642451347363e-16
RMSE Score of validation set 0.3648072650717891
R^2 Score of train set 1.0
R^2 Score of validation set 0.756037966082763


# Subsample

In [28]:
# 10-tree forest with default settings; %time reports how long the fit takes.
m = RandomForestRegressor(n_estimators=10, n_jobs = -1)
%time m.fit(x_train,y_train)
print_score(m)

CPU times: user 58.6 s, sys: 586 ms, total: 59.2 s
Wall time: 12.6 s
RMSE Score of train set 0.09044644234442918
RMSE Score of validation set 0.2675273661068178
R^2 Score of train set 0.9829900590940973
R^2 Score of validation set 0.8688006402349607


In [29]:
# Same setup with 30 trees, to compare fit time and scores against the 10-tree run.
m = RandomForestRegressor(n_estimators=30, n_jobs = -1)
%time m.fit(x_train,y_train)
print_score(m)

CPU times: user 3min, sys: 1.5 s, total: 3min 2s
Wall time: 27.7 s
RMSE Score of train set 0.07956968995867437
RMSE Score of validation set 0.25084666630245345
R^2 Score of train set 0.9868351729242192
R^2 Score of validation set 0.8846514988598666


# Subsample Yaratma

In [33]:
def get_sample(df, n):
    """Pick up to `n` distinct random rows of `df`.

    Returns the chosen positional indices together with a copy of the
    corresponding rows, in the same order as the indices.
    """
    shuffled_positions = np.random.permutation(len(df))
    chosen = shuffled_positions[:n]
    subset = df.iloc[chosen].copy()
    return chosen, subset

In [34]:
# Draw 3000 random rows from the processed training features.
# NOTE(review): this overwrites x_train, so re-running the cell samples from the
# already reduced frame, and print_score then reports train metrics on the sample only.
idxs, x_train = get_sample(x_train,3000)

In [37]:
# Keep the targets aligned with the sampled feature rows.
# NOTE(review): y_train is overwritten; this cell is meant to run exactly once after the sampling cell.
y_train = y_train[idxs]

In [39]:
# 10-tree forest fitted on the 3000-row sample; %time reports how long the fit takes.
m = RandomForestRegressor(n_estimators=10, n_jobs = -1)
%time m.fit(x_train,y_train)
print_score(m)

CPU times: user 354 ms, sys: 10.1 ms, total: 364 ms
Wall time: 88.1 ms
RMSE Score of train set 0.14108941012370088
RMSE Score of validation set 0.37756639061406494
R^2 Score of train set 0.9570672818470456
R^2 Score of validation set 0.7386744069402327


In [40]:
# 30-tree forest fitted on the same 3000-row sample.
m = RandomForestRegressor(n_estimators=30, n_jobs = -1)
%time m.fit(x_train,y_train)
print_score(m)

CPU times: user 1.05 s, sys: 22.9 ms, total: 1.07 s
Wall time: 212 ms
RMSE Score of train set 0.1259579377118333
RMSE Score of validation set 0.3575097492544348
R^2 Score of train set 0.9657823143148119
R^2 Score of validation set 0.7657006590053841


# Subsampling 2

In [107]:
def set_rf_sampling(n):
    """Make each tree of subsequently fitted random forests draw `n` rows.

    Replaces scikit-learn's private `_generate_sample_indices` helper in the
    random-forest module, so every tree draws `n` row indices (with
    replacement) instead of the default bootstrap size.

    Bug fix: the original assigned the replacement onto `sklearn.utils`, which
    the forest implementation never reads, so the call had no effect.

    NOTE(review): this relies on a private scikit-learn function. Releases
    that support it offer `RandomForestRegressor(max_samples=n)` as the public
    way to get the same behaviour — prefer that where available.

    Parameters
    ----------
    n : int
        Number of row indices drawn for every tree.
    """
    try:
        from sklearn.ensemble import _forest as rf_module  # current module name
    except ImportError:
        from sklearn.ensemble import forest as rf_module  # older releases
    from sklearn.utils import check_random_state

    # `*_` absorbs the extra n_samples_bootstrap argument newer releases pass.
    rf_module._generate_sample_indices = (
        lambda rs, n_samples, *_: check_random_state(rs).randint(0, n_samples, n))

In [90]:
def reset_rf_sampling(n=None):
    """Restore full-size bootstrap sampling for subsequently fitted forests.

    Installs a sampler into scikit-learn's random-forest module that draws
    `n_samples` indices per tree (or `n_samples_bootstrap` when the caller
    provides it), undoing a previous `set_rf_sampling` call.

    Bug fix: the original assigned the replacement onto `sklearn.utils`, which
    the forest implementation never reads, so nothing was restored.

    Parameters
    ----------
    n : ignored
        Unused; kept (now optional) so existing calls that pass a value still work.
    """
    try:
        from sklearn.ensemble import _forest as rf_module  # current module name
    except ImportError:
        from sklearn.ensemble import forest as rf_module  # older releases
    from sklearn.utils import check_random_state

    def _default_sample_indices(rs, n_samples, n_samples_bootstrap=None):
        # Newer releases pass the bootstrap size explicitly; older ones do not.
        size = n_samples if n_samples_bootstrap is None else n_samples_bootstrap
        return check_random_state(rs).randint(0, n_samples, size)

    rf_module._generate_sample_indices = _default_sample_indices

In [110]:
# Request 3000 sampled rows per tree for the forests fitted below.
set_rf_sampling(3000)

In [111]:
# 10-tree forest fitted after the set_rf_sampling call; %time reports how long the fit takes.
m = RandomForestRegressor(n_estimators=10, n_jobs = -1)
%time m.fit(x_train,y_train)
print_score(m)

CPU times: user 411 ms, sys: 9.97 ms, total: 421 ms
Wall time: 111 ms
RMSE Score of train set 0.1452123877493973
RMSE Score of validation set 0.37656797522024366
R^2 Score of train set 0.9545214213455125
R^2 Score of validation set 0.7400546490990882


In [113]:
# Larger forest: 70 trees, at least 5 samples per leaf, half of the features
# considered at each split, and out-of-bag scoring enabled.
# NOTE(review): print_score does not print m.oob_score_, so the OOB estimate is computed but never shown.
m = RandomForestRegressor(n_estimators=70, n_jobs = -1, min_samples_leaf = 5, max_features = 0.5,oob_score=True)
%time m.fit(x_train,y_train)
print_score(m)

CPU times: user 896 ms, sys: 40.9 ms, total: 937 ms
Wall time: 250 ms
RMSE Score of train set 0.2177483234466407
RMSE Score of validation set 0.3590654526673838
R^2 Score of train set 0.8977391991471538
R^2 Score of validation set 0.7636571155967935
