https://www.kaggle.com/competitions/porto-seguro-safe-driver-prediction

# 宣言部

In [None]:
from datetime import datetime

import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from lightgbm import LGBMRegressor
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from numba import jit
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import SCORERS
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# 関数定義

In [None]:
# Elapsed-time helper
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since it was started.

    Args:
        start_time: ``None`` (or any falsy value) to start timing; otherwise
            the ``datetime`` previously returned by this function.

    Returns:
        ``datetime.now()`` when starting.  When given a start mark, prints
        ``Time:<elapsed seconds>`` and returns ``None``.
    """
    if start_time:
        elapsed = datetime.now() - start_time
        print('Time:' + str(elapsed.total_seconds()))
        return None
    return datetime.now()

# Gini score of a ranking
@jit
def gini(y_true, y_prob):
    """Return the Gini score of the scores ``y_prob`` against labels ``y_true``.

    The labels are reordered by ascending score and scanned from the
    highest-scored sample downwards.  For every sample the running sum adds
    ``label * (accumulated 1 - label of the samples scanned so far)``; the
    result is ``1 - 2 * sum / (P * (n - P))`` where ``P`` is the label total
    and ``n`` the sample count.

    NOTE(review): the formula presumes 0/1 labels with both classes present;
    otherwise the denominator is zero -- confirm with callers.
    """
    labels = np.asarray(y_true)
    labels = labels[np.argsort(y_prob)]
    total = len(labels)
    positives = 0
    negatives_above = 0
    discordant = 0
    # Walk from the highest-scored sample to the lowest.
    for pos in range(total - 1, -1, -1):
        label = labels[pos]
        positives += label
        discordant += label * negatives_above
        negatives_above += 1 - label
    return 1 - 2 * discordant / (positives * (total - positives))


def evalerror(preds, dtrain):
    """Custom evaluation function reporting the Gini score.

    Args:
        preds: predicted scores for the rows of ``dtrain``.
        dtrain: dataset object exposing ``get_label()``.

    Returns:
        The tuple ``('gini', score, True)``.  Presumably (name, value,
        higher-is-better) as expected by LightGBM's ``feval`` -- confirm.
    """
    score = gini(dtrain.get_label(), preds)
    return 'gini', score, True

# Drop the columns that are not used.
# 'ps_car_03_cat' and 'ps_car_05_cat' have too many missing values;
# 'ps_car_11_cat' has too many levels for a categorical feature.
def dropmissingcol(pdData):
    """Remove the three unused columns from ``pdData`` in place.

    Returns the same (mutated) DataFrame object so the call can be used in
    an assignment.
    """
    pdData.drop(
        columns=['ps_car_03_cat', 'ps_car_05_cat', 'ps_car_11_cat'],
        inplace=True,
    )
    return pdData

# Impute missing values
def missingvalues(pdData):
    """Return a copy of ``pdData`` with every ``-1`` entry imputed.

    Each column is imputed on its own: the categorical columns listed below
    get their most frequent value, every other column gets its mean.  The
    imputer is refit on each column of the frame that is passed in.
    """
    # Categorical columns are filled with the mode.
    mode_columns = {
        'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
        'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_07_cat', 'ps_car_09_cat',
    }
    mean_filler = SimpleImputer(missing_values=-1, strategy='mean')
    mode_filler = SimpleImputer(missing_values=-1, strategy='most_frequent')

    filled = pdData.copy()
    for name in pdData.columns:
        filler = mode_filler if name in mode_columns else mean_filler
        filled[name] = filler.fit_transform(pdData[[name]]).ravel()
    return filled

# One-hot encode the categorical columns
def encodecat(train, test):
    """One-hot encode every column of ``train`` whose name contains '_cat'.

    The same columns are encoded in ``test`` using the category levels found
    in ``train``, so both results carry identical dummy columns in the same
    order.  A level that occurs only in ``test`` produces an all-zero row for
    that feature; a level that occurs only in ``train`` produces an all-zero
    column in ``test``.

    Args:
        train: training DataFrame; defines which columns and levels are used.
        test: DataFrame holding the same categorical columns.

    Returns:
        ``(train_encoded, test_encoded)``: the non-categorical columns in
        their original order followed by the dummy columns, grouped per
        categorical feature.

    Raises:
        KeyError: if ``test`` lacks one of the categorical columns.
    """
    cat_features = [col for col in train.columns if '_cat' in col]
    train_parts = [train.drop(columns=cat_features)]
    test_parts = [test.drop(columns=cat_features)]
    for column in cat_features:
        # Levels exactly as get_dummies derives them from the training data.
        levels = pd.Categorical(train[column]).categories
        train_parts.append(pd.get_dummies(train[column], prefix=column))
        # Pin the test column to the training levels so the dummy columns match.
        test_column = pd.Series(
            pd.Categorical(test[column], categories=levels),
            index=test.index,
        )
        test_parts.append(pd.get_dummies(test_column, prefix=column))
    # Concatenate once instead of growing the frames inside the loop.
    return pd.concat(train_parts, axis=1), pd.concat(test_parts, axis=1)

# Standardization
def _restore_frame(values, like):
    """Wrap scaled ``values`` in a DataFrame shaped like ``like`` when it is one."""
    if isinstance(like, pd.DataFrame):
        return pd.DataFrame(values, index=like.index, columns=like.columns)
    return values


def RescaleData(train, test):
    """Standardize ``train`` and ``test`` with statistics learned from ``train``.

    The scaler is fitted on ``train`` only and then applied to both inputs,
    so the two results share one scale.  The scaled values are returned
    (the previous version discarded them and returned the inputs unchanged).

    Args:
        train: training features; used to fit the scaler.
        test: features with the same columns as ``train``.

    Returns:
        ``(train_scaled, test_scaled)``; DataFrame inputs come back as
        DataFrames with their original index and column labels.
    """
    scaler = StandardScaler()
    train_scaled = _restore_frame(scaler.fit_transform(train), train)
    test_scaled = _restore_frame(scaler.transform(test), test)
    return train_scaled, test_scaled

# Drop the ps_calc_* columns
def DropCalcCol(train, test):
    """Return ``train`` and ``test`` without the 'ps_calc_' columns.

    The columns to remove are those of ``train`` whose name starts with
    'ps_calc_'; the same names are dropped from ``test``.  The inputs are
    not modified.
    """
    is_calc = train.columns.str.startswith('ps_calc_')
    calc_names = train.columns[is_calc]
    return train.drop(columns=calc_names), test.drop(columns=calc_names)

# データ読込・確認

In [None]:
# Load the training and test sets from the working directory
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

In [None]:
# Inspect the data: row count (x), column count (y) and column names
x, y = np.shape(train)
train_columns = train.columns
print('x = ' + str(x))
print('y = ' + str(y))
print(train_columns)

# データ分割

In [None]:
# Split the features by kind: categorical / binary / numeric
def _split_by_kind(columns):
    """Return (cat, bin, num) name lists; '_cat' is tested before '_bin'."""
    cat_cols, bin_cols, num_cols = [], [], []
    for name in columns:
        if '_cat' in name:
            bucket = cat_cols
        elif '_bin' in name:
            bucket = bin_cols
        else:
            bucket = num_cols
        bucket.append(name)
    return cat_cols, bin_cols, num_cols


# Keep the label and the ids before they are removed from the feature frames.
train_target = train['target'].values
train_id = train['id'].values
test_id = test['id'].values
train = dropmissingcol(train).drop(['id', 'target'], axis=1)
test = dropmissingcol(test).drop(['id'], axis=1)

train_cat_cols, train_bin_cols, train_num_cols = _split_by_kind(train.columns)
train_cat = train[train_cat_cols]
train_bin = train[train_bin_cols]
train_num = train[train_num_cols]

test_cat_cols, test_bin_cols, test_num_cols = _split_by_kind(test.columns)
test_cat = test[test_cat_cols]
test_bin = test[test_bin_cols]
test_num = test[test_num_cols]

In [None]:
# Show which columns fell into each kind.  The label is printed once in
# front of the list; `'cat:' + Index` would prepend it to every column name.
print('cat:', list(train_cat.columns))
print('bin:', list(train_bin.columns))
print('num:', list(train_num.columns))

In [None]:
# Split the features by family: ind / reg / car / everything else (calc)
def _split_by_family(columns):
    """Return (ind, reg, car, calc) name lists; the first matching marker wins."""
    ind_cols, reg_cols, car_cols, calc_cols = [], [], [], []
    families = (
        ('ps_ind_', ind_cols),
        ('ps_reg_', reg_cols),
        ('ps_car_', car_cols),
    )
    for name in columns:
        for marker, bucket in families:
            if marker in name:
                bucket.append(name)
                break
        else:
            # No family marker matched: treated as a calc column.
            calc_cols.append(name)
    return ind_cols, reg_cols, car_cols, calc_cols


(train_ind_cols, train_reg_cols,
 train_car_cols, train_calc_cols) = _split_by_family(train.columns)
train_ind = train[train_ind_cols]
train_reg = train[train_reg_cols]
train_car = train[train_car_cols]
train_calc = train[train_calc_cols]

(test_ind_cols, test_reg_cols,
 test_car_cols, test_calc_cols) = _split_by_family(test.columns)
test_ind = test[test_ind_cols]
test_reg = test[test_reg_cols]
test_car = test[test_car_cols]
test_calc = test[test_calc_cols]

In [None]:
# Show which columns fell into each family.  The label is printed once in
# front of the list; `'ind:' + Index` would prepend it to every column name.
print('ind:', list(train_ind.columns))
print('reg:', list(train_reg.columns))
print('car:', list(train_car.columns))
print('calc:', list(train_calc.columns))

In [None]:
# Choose the feature groups used for training.
# Currently categorical + numeric; assign a single group (e.g. train_cat /
# test_cat) instead to experiment with a subset.
train_temp = pd.concat([train_cat, train_num], axis='columns')
test_temp = pd.concat([test_cat, test_num], axis='columns')
print(train_temp.columns)

# 学習データ準備

In [None]:
# Labels
y_train = train_target

# Impute missing values
X = pd.DataFrame(missingvalues(train_temp))
X_test = pd.DataFrame(missingvalues(test_temp))
# One-hot encode the categorical columns, then rescale.
# (The ps_calc_* removal is not applied in this cell.)
X, X_test = RescaleData(*encodecat(X, X_test))

In [None]:
# Reduce the amount of training data to keep the experiments fast.
# A dedicated name is used for the row count so the `train_num` DataFrame
# (numeric features) is not overwritten, and the cut is positional so it
# does not depend on the index labels.
n_train_rows = 50000
X = X.iloc[:n_train_rows]
y_train = y_train[:n_train_rows].copy()

In [None]:
# Show the feature columns of the prepared training matrix
print(X.columns)

# 複数手法で検証

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation
X_train, X_val, Y_train, Y_val = train_test_split(X, y_train, test_size=0.10)

# Candidate models to compare
Models = {
    "LR": LinearRegression(),                                         # linear regression
    "KNNR": KNeighborsRegressor(),                                    # k-nearest-neighbours regression
    "SVR": SVR(),                                                     # support vector regression
    "DT": DecisionTreeRegressor(),                                    # decision tree regression
    "RF": RandomForestRegressor(n_estimators=625),                    # random forest regression
    "GBR": GradientBoostingRegressor(n_estimators=4000, alpha=0.01),  # gradient boosting regression
    "XGBR": XGBRegressor(n_estimators=550),                           # XGBoost regression
    "LGBM": LGBMRegressor(n_estimators=1200, alpha=0.02),             # LightGBM regression
}

# Fit each model, time the fit, then report its scores and validation Gini
for name, model in Models.items():
    print(f"Using Model: {name}")
    start_time = timer(None)
    model.fit(X_train, Y_train)
    timer(start_time)
    train_score = model.score(X_train, Y_train)
    print(f'Training Score: {train_score}')
    val_score = model.score(X_val, Y_val)
    print(f'Test Score: {val_score}')
    Validate_Predictions = model.predict(X_val)
    val_gini = gini(Y_val, Validate_Predictions)
    print(f'gini: {val_gini}')
    print('-' * 45)

# ラッパー法で特徴量選択

In [None]:
# Hold out 10% of the rows for validation
X_train, X_val, Y_train, Y_val = train_test_split(X, y_train, test_size=0.10)

feature = X_train.shape[1]
print(feature)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Estimator wrapped by the selector: LightGBM here; use LinearRegression()
# instead to run the linear variant.
sfs_estimator = LGBMRegressor(n_estimators=1200, alpha=0.02)
sfs = SFS(
    sfs_estimator,
    k_features=feature,
    forward=True,
    floating=False,
    scoring='neg_mean_squared_error',
    cv=skf,
)
sfs.fit(X_train, Y_train)

# One row per subset size, with the average CV score as a float column
df_SFS_results = pd.DataFrame(sfs.subsets_).T
df_SFS_results['avg_score'] = df_SFS_results['avg_score'].astype(float)
df_SFS_results
# To keep the results, write them out with
# df_SFS_results.to_csv('df_SFS_results_LGBM.csv')

# Randomized Searchでパラメータ最適化　LGBM

In [None]:
OPTIMIZE_ROUNDS = False
LEARNING_RATE = 0.07
EARLY_STOPPING_ROUNDS = 50

# Hyper-parameter values to sample from
params = {
    'min_child_weight': [5, 10, 12, 15, 30, 50, 100, 150],
    'num_leaves': [4, 5, 8, 10, 15, 20, 30],
    'subsample': [0.2, 0.4, 0.6, 0.8],
    'drop_rate': [0.1, 0.3, 0.5, 0.7, 0.15, 0.2],
    'max_depth': [3, 4, 5, 7, 10, 12, 15, 20]
}
# Classifier whose hyper-parameters are tuned
model = lgbm.LGBMClassifier(learning_rate=LEARNING_RATE, n_estimators=600, objective='binary')

# Number of folds in the stratified k-fold
folds = 3
# Number of parameter combinations to try
param_comb = 10
# The search fits folds x param_comb models

SKfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1)
# Pass the splitter itself rather than SKfold.split(...): a generator is
# exhausted after one use, which breaks re-running fit() on this object.
random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=param_comb, scoring='roc_auc',
                                   n_jobs=4, cv=SKfold, verbose=3, random_state=1)

# Run the search and report how long it took
start_time = timer(None)
random_search.fit(X, y_train)
timer(start_time)

print('All results:')
print(random_search.cv_results_)
print('Best estimator:')
print(random_search.best_estimator_)
print('Best Normalised gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
# best_score_ is a ROC AUC (scoring='roc_auc'); normalised Gini = 2 * AUC - 1.
print(2 * random_search.best_score_ - 1)
print('Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('lightgbm-randomgridsearch-results-03.csv')

# ベイズでパラメータ最適化　LGBM

In [None]:
def evaluate_model(num_leaves, min_child_weight, feature_fraction, subsample, drop_rate, max_depth):
    """Return the mean validation Gini of a LightGBM model over 5 stratified folds.

    The arguments are the hyper-parameters being tuned; ``num_leaves`` and
    ``max_depth`` are truncated to ``int`` before use.  The model is trained
    on the module-level ``X`` / ``y_train`` with early stopping, and each
    fold contributes the 'gini' entry of its best validation score.
    """
    lgb_params = {
        "objective": "binary",
        "boosting_type": "gbdt",
        "learning_rate": 0.07,
        "verbosity": -1,
        "num_leaves": int(num_leaves),
        "min_child_weight": min_child_weight,
        "feature_fraction": feature_fraction,
        "subsample": subsample,
        'drop_rate': drop_rate,
        'max_depth': int(max_depth)
    }
    max_rounds = 10000

    # Stratified 5-fold split with a fixed seed, so every call sees the same folds
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

    fold_ginis = []
    for train_idx, val_idx in splitter.split(X, y_train):
        fit_set = lgbm.Dataset(X.iloc[train_idx], y_train[train_idx])
        val_set = lgbm.Dataset(X.iloc[val_idx], y_train[val_idx])

        # Train on the fold; evalerror supplies the 'gini' metric read below
        booster = lgbm.train(lgb_params, fit_set, max_rounds, valid_sets=val_set, feval=evalerror,
                             verbose_eval=False, early_stopping_rounds=100)
        fold_ginis.append(booster.best_score['valid_0']['gini'])

    # Average of the per-fold scores
    return np.mean(fold_ginis)

# Search bounds (low, high) for each hyper-parameter to optimise
hyperparameters = {
    "num_leaves": (4, 50),
    "min_child_weight": (0.001, 150),
    "feature_fraction": (0.1, 0.9),
    "subsample": (0.1, 1),
    'drop_rate': (0.1, 0.8),
    'max_depth': (3, 20),
}

# Run the Bayesian optimisation: evaluate_model is maximised over the bounds
optimizer = BayesianOptimization(f=evaluate_model, pbounds=hyperparameters)
optimizer.maximize(n_iter=10)

# Report the best point found
print("Optimal hyperparameters:")
print(optimizer.max)


# LightGBM modelで学習、提出データ作成

In [None]:
# Parameter settings
# NOTE(review): min_data_in_leaf is defined but not passed to the model in
# this cell -- confirm whether it was meant to be part of `params`.
min_data_in_leaf = 2000
num_boost_round = 10000
params = {'n_estimators': 1200,
          'alpha': 0.02
          }

# Cross-validation splitter
folds = 5
SKfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1)

# Containers for the per-fold results
best_trees = []
fold_scores = []

cv_train = np.zeros(len(y_train))
cv_pred = np.zeros(len(X_test))

start_time = timer(None)
# Average the predictions over several seeds to reduce overfitting
iterations = 3
for seed in range(iterations):
    timer(start_time)
    params['seed'] = seed
    # Cross-validation loop
    for id_train, id_val in SKfold.split(X, y_train):
        # The splitter yields positions, so select rows positionally
        # (label-based .loc is only correct when the index is 0..n-1).
        xtr, xvl = X.iloc[id_train], X.iloc[id_val]
        ytr, yvl = y_train[id_train], y_train[id_val]
        # LightGBM datasets for this fold
        dtrain = lgbm.Dataset(data=xtr, label=ytr)
        dval = lgbm.Dataset(data=xvl, label=yvl, reference=dtrain)
        # Train with early stopping on the validation fold
        bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dval, feval=evalerror, verbose_eval=100,
                         early_stopping_rounds=100)
        # Keep the best iteration and score of the fold
        best_trees.append(bst.best_iteration)
        fold_scores.append(bst.best_score)
        # Accumulate the test predictions made at the best iteration
        cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)

# The submission is the mean prediction over all seeds and folds
pd.DataFrame({'id': test_id, 'target': cv_pred / (iterations * folds)}).to_csv('submission_LGBM.csv', index=False)

# Linear modelで学習、提出データ作成

In [None]:
# Model settings
Lr = LinearRegression()
params = ()

# Cross-validation splitter
folds = 5
SKfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1)

# Containers for the per-fold results
best_trees = []
fold_scores = []

cv_train = np.zeros(len(y_train))
cv_pred = np.zeros(len(X_test))

fold_count = 0
# Cross-validation loop
for id_train, id_val in SKfold.split(X, y_train):
    fold_count += 1
    # The splitter yields positions, so select rows positionally
    # (label-based .loc is only correct when the index is 0..n-1).
    xtr, xvl = X.iloc[id_train], X.iloc[id_val]
    ytr, yvl = y_train[id_train], y_train[id_val]
    # Fit on the training part of the fold
    bst = Lr.fit(xtr, ytr)
    # Negative predictions are clipped to 0 before scoring
    bst_val = np.clip(bst.predict(xvl), 0, None)
    fold_gini = gini(yvl, bst_val)
    print('fold' + str(fold_count) + ':' + str(fold_gini))
    # Accumulate the clipped test predictions of this fold
    test_pred = np.clip(bst.predict(X_test), 0, None)
    cv_pred += test_pred

# The submission is the mean prediction over the folds
pd.DataFrame({'id': test_id, 'target': cv_pred / (folds)}).to_csv('submission_linear.csv', index=False)