In [8]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error 

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

- sample data 불러오기

In [17]:
# Directory that holds the CSV files. Set the PUBG_DATA_DIR environment
# variable to point somewhere else; the fallback is the original local folder.
# os.path.join(..., '') guarantees a trailing separator, because file names are
# appended to base_path with plain string concatenation.
base_path = os.path.join(
    os.environ.get('PUBG_DATA_DIR', 'C:/Users/hojun/Documents/dev/Kaggle-PUBG/'),
    '',
)
sample_train = pd.read_csv(base_path + 'sample_train.csv')

In [18]:
# Target column.
y = sample_train['winPlacePerc']

# Feature frame: the sample without the target, the match/group identifiers
# and the 'Unnamed: 0' column.
x = sample_train.drop(
    columns=['winPlacePerc', 'matchId', 'groupId', 'Unnamed: 0'],
).copy()

In [6]:
# Cast the maptype column to float. Bracket indexing is used instead of
# attribute assignment (x.maptype = ...): attribute assignment only updates a
# column that already exists and would otherwise just set a plain attribute.
x['maptype'] = x['maptype'].astype(float)

- test data 불러오기

In [20]:
# Load the test CSV from the same data directory.
test_data = pd.read_csv(f'{base_path}test_V2.csv')

- test data 정리하기

In [21]:
# add match_type_numerical
def divide_match_type(x):
    """Map a match-type string to a numeric code.

    Returns 0 if the string contains 'solo', 1 if it contains 'duo'
    (and not 'solo'), and 2 for any other string.
    """
    # Keywords are checked in order, so 'solo' wins over 'duo'.
    for code, keyword in enumerate(('solo', 'duo')):
        if x.find(keyword) >= 0:
            return code
    return 2

# NOTE: this cell reassigns test_data after dropping columns it reads
# ('matchType', 'matchDuration', 'groupId', 'Id'), so re-running it without
# reloading test_data first raises a KeyError.

# Numeric match-type code produced by divide_match_type.
test_data['match_type_numerical'] = test_data['matchType'].apply(divide_match_type)

# maptype flag: 0 when matchDuration is below 1600, otherwise 1.
test_data['maptype'] = test_data['matchDuration'].apply(
    lambda duration: 0 if duration < 1600 else 1
)

# Team size: number of Id rows sharing the same groupId.
test_data['team_members'] = test_data.groupby('groupId')['Id'].transform('count')

# Drop the columns listed below.
drop_columns = [
    'killStreaks', 'headshotKills', 'assists', 'matchDuration', 'matchType',
    'matchId', 'groupId', 'Id', 'swimDistance', 'vehicleDestroys',
    'roadKills', 'DBNOs', 'revives', 'teamKills', 'killPoints',
    'winPoints', 'rankPoints', 'numGroups',
]
test_data = test_data.drop(columns=drop_columns).copy()

- split

In [24]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## DecisionTreeRegressor

In [25]:
# Fit a decision tree on the training split and report its held-out score as a percentage.
dt_model = DecisionTreeRegressor(random_state=0)
dt_model.fit(X_train, y_train)
df_score = 100 * dt_model.score(X_test, y_test)
np.round(df_score, decimals=2)

87.43

- k-fold cross validation score

In [26]:
# 10-fold cross-validation scores for the decision tree: mean and variance.
# NOTE(review): the folds are drawn from the held-out split (X_test, y_test),
# not from the training split; confirm this is intended.
scores = cross_val_score(estimator=dt_model, X=X_test, y=y_test, cv=10)
np.mean(scores), np.var(scores)

(0.8664710123483198, 2.319317999897107e-05)

- MAE

In [27]:
# Mean absolute error of the decision tree on the held-out split.
# NOTE(review): despite the `xgb` suffix these are dt_model predictions; the
# name is left unchanged in case other cells read it.
y_pred_xgb = dt_model.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred_xgb)

0.07586144782544446

## RandomForestRegressor

- RandomForestRegressor(
    - n_estimators=100, *, criterion="squared_error", 

        max_depth=None, min_samples_split=2, 
    
        min_samples_leaf=1, min_weight_fraction_leaf=0, 
    
        max_features="auto", max_leaf_nodes=None, 
    
        min_impurity_decrease=0, bootstrap=True, 
    
        oob_score=False, n_jobs=None, 
    
        random_state=None, verbose=0, 
        
        warm_start=False, ccp_alpha=0, 
        
        max_samples=None   )

In [28]:
# Fit a random forest on the training split and report its held-out score as a percentage.
rf_model = RandomForestRegressor(random_state=0)
rf_model.fit(X_train, y_train)
rf_score = 100 * rf_model.score(X_test, y_test)
np.round(rf_score, decimals=2)

93.79

- k-fold cross validation score
    - 분산이 크면 데이터에 따라서 모델의 정확도 차이가 크므로 과적합의 위험이 크다.

In [None]:
# 10-fold cross-validation scores for the random forest: mean and variance.
# NOTE(review): the folds are drawn from the held-out split (X_test, y_test),
# not from the training split; confirm this is intended.
cv_scores = cross_val_score(estimator=rf_model, X=X_test, y=y_test, cv=10)
np.mean(cv_scores), np.var(cv_scores)

- MAE

In [None]:
# Mean absolute error of rf_model on the held-out split.
y_pred_rf = rf_model.predict(X_test)
print(
    'MAE :',
    mean_absolute_error(y_true=y_test, y_pred=y_pred_rf),
)

0.005498396425221854

- max_features : sqrt

In [None]:
# Forest of 100 trees with max_features='sqrt'; fit() returns the estimator itself.
rf = RandomForestRegressor(
    max_features='sqrt',
    n_estimators=100,
    random_state=0,
).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Scores on the training and held-out splits, then MAE of the held-out predictions.
print(
    'tarin score : ', rf.score(X_train, y_train),
    '\t', 'test score :', rf.score(X_test, y_test),
)
print('MAE :', mean_absolute_error(y_true=y_test, y_pred=y_pred_rf))

- max_leaf_nodes (리프 노드의 최대 개수) : 100

In [None]:
# Forest of 100 trees with max_features='sqrt' and at most 100 leaf nodes per tree;
# fit() returns the estimator itself.
rf = RandomForestRegressor(
    max_leaf_nodes=100,
    max_features='sqrt',
    n_estimators=100,
    random_state=0,
).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Scores on the training and held-out splits, then MAE of the held-out predictions.
print(
    'tarin score : ', rf.score(X_train, y_train),
    '\t', 'test score :', rf.score(X_test, y_test),
)
print('MAE :', mean_absolute_error(y_true=y_test, y_pred=y_pred_rf))

- oob score : True

In [None]:
# Random forest with the out-of-bag (OOB) estimate enabled.
rf = RandomForestRegressor(n_estimators=100, random_state=0,
                           max_features='sqrt', max_leaf_nodes=100,
                           oob_score=True)

rf.fit(X_train, y_train)

# Predict with the model fitted in this cell, so the MAE below does not depend
# on a y_pred_rf left over from a previously executed cell.
y_pred_rf = rf.predict(X_test)

print('tarin score : ', rf.score(X_train, y_train), '\t','test score :', rf.score(X_test, y_test))
print('MAE :', mean_absolute_error(y_test, y_pred_rf))

# oob_score_ is the R^2 computed on out-of-bag samples, so higher is better.
print(rf.oob_score_)

In [None]:
# Parameter tuning: sweep max_leaf_nodes and record four scores per setting.
tree_size = np.arange(2, 50, 2)

# One column of `s` per entry, in this order.
score_labels = ['train', 'oob', 'cv (5-fold on train)', 'test']
s = np.zeros((len(tree_size), len(score_labels)))

for i, n_leaves in enumerate(tree_size):
    rf = RandomForestRegressor(n_estimators=100, random_state=0,
                               max_features='sqrt', max_leaf_nodes=n_leaves,
                               oob_score=True)
    rf.fit(X_train, y_train)
    s[i, 0] = rf.score(X_train, y_train)
    s[i, 1] = rf.oob_score_
    # cross_val_score refits clones of rf, so the fitted rf above is untouched.
    s[i, 2] = cross_val_score(rf, X_train, y_train, cv=5).mean()
    s[i, 3] = rf.score(X_test, y_test)

# One curve per column of `s`; the legend entries follow the column order.
fig, ax = plt.subplots()
ax.plot(tree_size, s)
ax.set(xlabel='max_leaf_nodes', ylabel='R^2 score',
       title='Random forest score vs. max_leaf_nodes')
ax.legend(score_labels);