# Kategorik Değişken Dönüşüm

In [1]:
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display

from sklearn import metrics

In [2]:
import pandas as pd
import numpy as np

In [3]:
import math

In [4]:
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

In [5]:
PATH = "data/bulldozers/"

In [6]:
df = pd.read_csv(f'{PATH}Train.csv', low_memory=False, parse_dates=["saledate"])

In [7]:
df = df.sort_values(by="saledate").reset_index(drop=True)

In [8]:
df.head()

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,saledate,...,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1646770,9500,1126363,8434,132,18.0,1974,,,1989-01-17,...,,,,,,None or Unspecified,Straight,None or Unspecified,,
1,1404019,24000,1169900,7110,132,99.0,1986,,,1989-01-31,...,,,,,,,,,,
2,1415646,35000,1262088,3357,132,99.0,1975,,,1989-01-31,...,,,,,,,,,,
3,1596358,19000,1433229,8247,132,99.0,1978,,,1989-01-31,...,,,,,,,,,Standard,Conventional
4,1821514,14000,1194089,10150,132,99.0,1980,,,1989-01-31,...,,,,,,,,,Standard,Conventional


## RMSLE

In [9]:
#df["SalePrice"] = np.log(df.SalePrice)

In [10]:
#m = RandomForestRegressor(n_jobs=-1)
#m.fit(df.drop('SalePrice', axis=1), df.SalePrice)

## Kategorik Değişkenleri Sayısal Hale Getirmek

In [11]:
df["UsageBand"]

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
401120    NaN
401121    NaN
401122    NaN
401123    NaN
401124    NaN
Name: UsageBand, Length: 401125, dtype: object

In [12]:
for n, c in df.items():
    print(n)
    print("-------")
    print(c)
    break

SalesID
-------
0         1646770
1         1404019
2         1415646
3         1596358
4         1821514
           ...   
401120    6260878
401121    6288376
401122    6258093
401123    6315563
401124    6312170
Name: SalesID, Length: 401125, dtype: int64


In [13]:
# string dtype'ı olan columnları pandas category tipine çevirir
def train_cats(df):
    for n, c in df.items():
        if is_string_dtype(c):
            df[n] = c.astype("category").cat.as_ordered()

In [14]:
# train set'e train_cats uygulandıktan sonra aynı category değişimlerim olsun diye validation train'e bu uygulanır
def apply_cats(df, train):
    for n, c in df.items():
        if train[n].dtype == "category":
            df[n] = pd.Categorical(c, categories = train[n].cat.categories, ordered = True)

In [15]:
train_cats(df)

In [16]:
df["UsageBand"]

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
         ... 
401120    NaN
401121    NaN
401122    NaN
401123    NaN
401124    NaN
Name: UsageBand, Length: 401125, dtype: category
Categories (3, object): ['High' < 'Low' < 'Medium']

In [17]:
df["UsageBand"].cat.categories

Index(['High', 'Low', 'Medium'], dtype='object')

In [18]:
df["UsageBand"].cat.set_categories(["High", "Medium", "Low"], ordered = True, inplace = True)

In [19]:
df["UsageBand"].cat.categories

Index(['High', 'Medium', 'Low'], dtype='object')

In [20]:
df["UsageBand"].cat.codes

0        -1
1        -1
2        -1
3        -1
4        -1
         ..
401120   -1
401121   -1
401122   -1
401123   -1
401124   -1
Length: 401125, dtype: int8

In [21]:
def numericalize(df, col, name):
    if not is_numeric_dtype(col):
        df[name] = col.cat.codes + 1

In [22]:
numericalize(df, df["UsageBand"], "UsageBand")

In [23]:
df["UsageBand"]

0         0
1         0
2         0
3         0
4         0
         ..
401120    0
401121    0
401122    0
401123    0
401124    0
Name: UsageBand, Length: 401125, dtype: int8

## Datetime Column

In [24]:
df["saledate"]

0        1989-01-17
1        1989-01-31
2        1989-01-31
3        1989-01-31
4        1989-01-31
            ...    
401120   2011-12-30
401121   2011-12-30
401122   2011-12-30
401123   2011-12-30
401124   2011-12-30
Name: saledate, Length: 401125, dtype: datetime64[ns]

In [25]:
df["saledate"].dt.year

0         1989
1         1989
2         1989
3         1989
4         1989
          ... 
401120    2011
401121    2011
401122    2011
401123    2011
401124    2011
Name: saledate, Length: 401125, dtype: int64

In [26]:
def add_datepart(df, dt_name, drop=True):
    
    dt_column = df[dt_name]
    column_dtype = dt_column.dtype
    
    attr = ['year', 'month', 'week', 'day', 'dayofweek', 'dayofyear', 'is_month_end', 'is_month_start',
            'is_quarter_end', 'is_quarter_start', 'is_year_end', 'is_year_start']
    
    for a in attr:
        df["Date" + a.capitalize()] = getattr(dt_column.dt, a)
        
    df["Date" + 'Elapsed'] = dt_column.astype(np.int64) // 10 ** 9
    
    if drop:
        df.drop(dt_name, axis=1, inplace=True)

In [27]:
add_datepart(df, "saledate")

  df["Date" + a.capitalize()] = getattr(dt_column.dt, a)


In [28]:
df

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,fiModelDesc,...,DateDay,DateDayofweek,DateDayofyear,DateIs_month_end,DateIs_month_start,DateIs_quarter_end,DateIs_quarter_start,DateIs_year_end,DateIs_year_start,DateElapsed
0,1646770,9500,1126363,8434,132,18.0,1974,,0,TD20,...,17,1,17,False,False,False,False,False,False,600998400
1,1404019,24000,1169900,7110,132,99.0,1986,,0,416,...,31,1,31,True,False,False,False,False,False,602208000
2,1415646,35000,1262088,3357,132,99.0,1975,,0,12G,...,31,1,31,True,False,False,False,False,False,602208000
3,1596358,19000,1433229,8247,132,99.0,1978,,0,644,...,31,1,31,True,False,False,False,False,False,602208000
4,1821514,14000,1194089,10150,132,99.0,1980,,0,A66,...,31,1,31,True,False,False,False,False,False,602208000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401120,6260878,13500,1799594,4102,149,2.0,1000,,0,D4C,...,30,4,364,False,False,False,False,False,False,1325203200
401121,6288376,9750,1872596,4875,149,2.0,1000,,0,520C,...,30,4,364,False,False,False,False,False,False,1325203200
401122,6258093,14500,1877553,3170,149,2.0,1988,,0,580K,...,30,4,364,False,False,False,False,False,False,1325203200
401123,6315563,12500,1869637,26456,149,2.0,2010,,0,L160,...,30,4,364,False,False,False,False,False,False,1325203200


## Feather Format

In [29]:
import os

In [30]:
os.makedirs('tmp', exist_ok=True) # Eğer tmp diye bir klasör varsa hata vermemesini sağlar
df.to_feather('tmp/bulldozers_1')

# Missing Value

In [31]:
import pandas as pd
import numpy as np

In [32]:
df = pd.read_feather("tmp/bulldozers_1")

In [33]:
d = {"a": [1, 2, 3, 4, 4, np.nan], "b":[1, 2, 3, 3, 4, 5]}
toy = pd.DataFrame(d)

In [34]:
toy

Unnamed: 0,a,b
0,1.0,1
1,2.0,2
2,3.0,3
3,4.0,3
4,4.0,4
5,,5


In [35]:
toy.isnull()

Unnamed: 0,a,b
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
5,True,False


In [36]:
toy["a"].isnull()

0    False
1    False
2    False
3    False
4    False
5     True
Name: a, dtype: bool

In [37]:
toy["a"].isnull().sum()

1

In [38]:
def fix_missing(df, col, name):
    if is_numeric_dtype(col):
        if pd.isnull(col).sum:
            df[name+"_na"] = pd.isnull(col)
        df[name] = col.fillna(col.median())

In [39]:
for n, c in df.items():
    if is_numeric_dtype(c):
        if df[n].isnull().sum():
            print(n)

auctioneerID
MachineHoursCurrentMeter


In [40]:
df["MachineHoursCurrentMeter"].isnull().sum()

258360

In [41]:
fix_missing(df, df["MachineHoursCurrentMeter"], "MachineHoursCurrentMeter")

In [42]:
df["MachineHoursCurrentMeter"].isnull().sum()

0

In [43]:
df["MachineHoursCurrentMeter_na"]

0         True
1         True
2         True
3         True
4         True
          ... 
401120    True
401121    True
401122    True
401123    True
401124    True
Name: MachineHoursCurrentMeter_na, Length: 401125, dtype: bool

# Hepsi Bir Arada

In [44]:
def fix_missing(df, col, name):
    if is_numeric_dtype(col):
        if pd.isnull(col).sum:
            df[name+"_na"] = pd.isnull(col)
        df[name] = col.fillna(col.median())

In [45]:
def numericalize(df, col, name):
    if not is_numeric_dtype(col):
        df[name] = col.cat.codes + 1

In [46]:
def proc_df(df, y_fld):
    
    y = df[y_fld].values
    df.drop([y_fld], axis=1, inplace=True)
    
    for n, c in df.items():
        fix_missing(df, c, n)
        numericalize(df, c, n)
    
    return df, y

In [47]:
def fix_missing(df, col, name, nan_dict, is_train):
    
    if is_train:
        if is_numeric_dtype(col):
            if pd.isnull(col).sum():
                df[name+"_na"] = pd.isnull(col)
                nan_dict[name] = col.median()
                df[name] = col.fillna(nan_dict[name])

    else:
        if is_numeric_dtype(col):
            if pd.isnull(col).sum:
                df[name+"_na"] = pd.isnull(col)
                df[name] = col.fillna(col.median())
            else:
                df[name] = col.fillna(df[name].median())

In [48]:
# We will have codes starting from 0 (for missing)
def numericalize(df, col, name):
    if not is_numeric_dtype(col):
        df[name] = col.cat.codes + 1

In [49]:
def proc_df(df, y_fld, nan_dict=None, is_train=True):
    
    df = df.copy()
    y = df[y_fld].values
    
    df.drop([y_fld], axis=1, inplace=True)
    
    if nan_dict is None:
        nan_dict = {}
    
    for n, c in df.items():
        fix_missing(df, c, n, nan_dict, is_train)
        numericalize(df, c, n)
        
    if is_train:
        return df, y, nan_dict
    
    return df, y

In [50]:
def split_train_val(df, n):
    
    return df[:n].copy(), df[n:].copy()

In [51]:
n_valid = 12000 # same as Kaggle's test size
n_train = len(df)-n_valid
raw_train, raw_valid = split_train_val(df, n_train)

In [52]:
x_train, y_train, nas = proc_df(raw_train, 'SalePrice')

In [53]:
x_valid, y_valid = proc_df(raw_valid, 'SalePrice', nan_dict=nas, is_train=False)
x_valid.drop([col for col in x_valid.columns if col not in x_train.columns], axis=1, inplace=True)

# İlk Model

- Default olarak R^2 alır

In [54]:
m = RandomForestRegressor(n_estimators=1, bootstrap=False, n_jobs=-1)
m.fit(x_train, y_train)
m.score(x_train, y_train)

1.0

In [55]:
def rmse(x, y):
    return math.sqrt(((x-y)**2).mean())

In [56]:
def print_score(m):
    
    print(f"RMSE of train set {rmse(m.predict(x_train), y_train)}")
    print(f"RMSE of validation set {rmse(m.predict(x_valid), y_valid)}")
    print(f"R^2 of train set {m.score(x_train, y_train)}")
    print(f"R^2 of validation set {m.score(x_valid, y_valid)}")

In [57]:
print_score(m)

RMSE of train set 0.0
RMSE of validation set 13170.869434333103
R^2 of train set 1.0
R^2 of validation set 0.7296747921151657


## Tüm Veri İle

In [58]:
x_train, y_train, nas = proc_df(raw_train, 'SalePrice')

In [59]:
x_valid, y_valid = proc_df(raw_valid, 'SalePrice', nan_dict=nas, is_train=False)
x_valid.drop([col for col in x_valid.columns if col not in x_train.columns], axis=1, inplace=True)

In [60]:
m = RandomForestRegressor(n_estimators=10, n_jobs=-1)
%time m.fit(x_train, y_train)
print_score(m)

Wall time: 10.5 s
RMSE of train set 3038.7929421986346
RMSE of validation set 9507.078502073995
R^2 of train set 0.9824768059420598
R^2 of validation set 0.859151632851154


In [61]:
m = RandomForestRegressor(n_estimators=30, n_jobs=-1)
%time m.fit(x_train, y_train)
print_score(m)

Wall time: 30 s
RMSE of train set 2675.349526416471
RMSE of validation set 9122.847054079593
R^2 of train set 0.9864177377689093
R^2 of validation set 0.8703064301682794


# Subsample

## Farklı Şeyleri Hızlı Şekilde Denemek

- Elimizdeki problem için uygun modeli bulmak için çok şey denememiz gerekebilir.
- Bu iteratif süreci hızlı bir hale getirmek için model seçme kısmını subsample alarak yapabiliriz.

In [62]:
arr = np.array([10, 21, 7, 13, 5, 12, 56, 2, 3, 40])

In [63]:
a = np.random.permutation(10)

In [64]:
idxs = a[:3]

In [65]:
arr[idxs]

array([ 2, 13,  7])

## Subset Yaratma

In [66]:
def get_sample(df, n):
    
    idxs = np.random.permutation(len(df))[:n]
    return idxs, df.iloc[idxs].copy()

- Validation seti değiştirmek istemiyorum, sadece train içinde subset alacağım

In [67]:
idxs, x_train = get_sample(x_train, 3000)
y_train = y_train[idxs]

In [68]:
m = RandomForestRegressor(n_estimators=10, n_jobs=-1)
%time m.fit(x_train, y_train)
print_score(m)

Wall time: 97.7 ms
RMSE of train set 4982.879617291257
RMSE of validation set 13681.546424749775
R^2 of train set 0.9547820121802503
R^2 of validation set 0.7083056389803544


In [69]:
m = RandomForestRegressor(n_estimators=30, n_jobs=-1)
%time m.fit(x_train, y_train)
print_score(m)

Wall time: 145 ms
RMSE of train set 4458.369027009765
RMSE of validation set 13315.297086756482
R^2 of train set 0.9638005086396194
R^2 of validation set 0.723713682417142


# Hyperparameters

## Using Bootstraping and More Trees than Default

In [70]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1)
%time m.fit(x_train, y_train)
print_score(m)

Wall time: 190 ms
RMSE of train set 4379.950943021627
RMSE of validation set 12944.851473420844
R^2 of train set 0.9650627325109566
R^2 of validation set 0.7388729874446343


## Using min_sample_leaf

**min_sample_leaf**: The minimum number of samples required to be at a leaf node.

We can grow our trees less deeply to reduce overfitting. We do this by setting **min_sample_leaf**.
- There are less decision rules for each leaf node. Our model will not memorize the data, it will be so called simpler, and not specialized to our data, that kind of models should generalize better.
- The predictions are made by averaging more rows in the leaf node, it will also help our model generalize better.

In [71]:
# It will train more quickly because it will be less deep
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, n_jobs=-1)
%time m.fit(x_train, y_train)
print_score(m)

Wall time: 155 ms
RMSE of train set 6058.25009474688
RMSE of validation set 12908.880301848649
R^2 of train set 0.9331587076906988
R^2 of validation set 0.7403222113136385


It increases our R^2 of validation set! It generalizes better as we thought it would!

If you are using big dataset, you can set min_sample_leaf to 10-10000

The only way to know which one is better is to try and experiment!

- Generally try these values first: 1, 3, 5, 10, 25, 100

In [72]:
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=5, n_jobs=-1)
%time m.fit(x_train, y_train)
print_score(m)

Wall time: 131 ms
RMSE of train set 7462.833593333829
RMSE of validation set 12989.120770509482
R^2 of train set 0.8985719775680946
R^2 of validation set 0.7370839092416209


## Using max_features to Add Randomness to Split

It will make our trees to learn specific things, patterns in our model better.

It will also increase the amount of variation amongst the trees. It will also using a sample of _columns_ for each _split_. We do this by specifying **max_features**.

We said that less correlated our trees are the better.

Suppose that some features are too much important at deciding than others, so regardless of our different subset, all trees will use those features in first splits, and that makes our trees similar, more correlated.

But we want our trees to learn different patterns in our data. We don't want them to learn same things! The reason we are averaging all the trees we have is to gain knowledge from them, but if all of them learn same things, it will be much less informative. **So we want our trees to learn different things and for that reason randomizing the features that they can use will create more unique trees that can learn different things**.

So every individual splits will be based on different subset of features. At every decision point, we will use different subset of features to decide splitting.

- None: Use all of them
- 0.5: Use half of them
- 'sqrt': Use sqrt of the amount

In [73]:
m = RandomForestRegressor(n_estimators=70, min_samples_leaf=5, max_features=0.5, n_jobs=-1, oob_score=True)
%time m.fit(x_train, y_train)
print_score(m)

Wall time: 174 ms
RMSE of train set 7985.778211557599
RMSE of validation set 13296.697608898154
R^2 of train set 0.8838591684476145
R^2 of validation set 0.7244850046277846
