In [64]:
from __future__ import print_function
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib
#import pandas_profiling
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
#import xgboost as xgb
import lightgbm as lgb


%matplotlib inline
plt.rcParams["figure.figsize"] = (15, 8)
pd.options.display.float_format = '{:.2f}'.format

import warnings
warnings.filterwarnings('ignore')

Подгрузим тренировочный датасет

In [65]:
fn = 'train.csv'
df = pd.read_csv(fn)

Сделаем преобразования для признаков

In [66]:
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, MinMaxScaler, Imputer, LabelBinarizer, OneHotEncoder,StandardScaler
from sklearn.feature_extraction import DictVectorizer

def get_sex_col(df):
    lb= LabelBinarizer()
    return lb.fit_transform(df[['Sex']])

def get_num_cols(df):
    bins = [0, 10, 15, 25, 40, 55, 100]
    labels = [10, 15, 25, 40, 55, 100 ]
    mn = df['Age'].mean()
    df['AgeGrp'] = df['Age'].fillna(mn)
    df['AgeGrp'] = pd.cut(df['AgeGrp'], bins, labels=labels)

    bins = [0, 10, 30, 100, 600]
    labels = [10, 30, 100, 600 ]
    mn = df['Fare'].mean()
    df['FareGrp'] = df['Fare'].fillna(mn)
    df['FareGrp'] = pd.cut(df['FareGrp'], bins, labels=labels)    
    return df[['AgeGrp', 'FareGrp']]

def get_pclass_col(df):
    return df[['Pclass']]

def get_port_col(df):
    le = LabelEncoder()
    return le.fit_transform(df['Embarked'].fillna('S').T).reshape(-1, 1).astype('int')


def get_cabin_col(df):
    le = LabelEncoder()
    return le.fit_transform(df['Cabin'].fillna('NaN').T).reshape(-1, 1).astype('float')

# наличие родственников можно объединить
def get_rel_col(df):
    return np.sum(df[['SibSp','Parch']] , axis=1).values.reshape(-1, 1).astype('float')  


построим пайплайн

In [67]:

pipeline = make_union(*[
    make_pipeline(FunctionTransformer(get_num_cols, validate=False), Imputer(strategy='mean'), MinMaxScaler()),
    make_pipeline(FunctionTransformer(get_pclass_col, validate=False), OneHotEncoder(sparse=False)),
    make_pipeline(FunctionTransformer(get_sex_col, validate=False)),
    make_pipeline(FunctionTransformer(get_port_col, validate=False), OneHotEncoder(sparse=False)),
    make_pipeline(FunctionTransformer(get_cabin_col, validate=False), MinMaxScaler()),
    make_pipeline(FunctionTransformer(get_rel_col, validate=False), StandardScaler())
])


In [68]:
df_train = df.copy()

In [69]:

y_train = df_train['Survived']
y_train.shape

(891,)

In [70]:
x_train = pipeline.fit_transform(df_train)
x_train.shape

(891, 11)

Посмотрим на список параметров которые можно выставить для lightgdb

In [71]:
lgb.LGBMClassifier().get_params()

{'boosting_type': 'gbdt',
 'colsample_bytree': 1.0,
 'learning_rate': 0.1,
 'max_bin': 255,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 10,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 1}

In [72]:
import time
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
def randomized_cv(model, param_grid, x_train, y_train):
    grid_search = RandomizedSearchCV(model, param_grid, cv=10, scoring='accuracy')
    t_start = time.time()
    grid_search.fit(x_train, y_train)
    t_end = time.time()
    print('model {} best accuracy score is {}'.format(model.__class__.__name__, grid_search.best_score_))
    print('time for training is {} seconds'.format(t_end - t_start))
    return grid_search.best_estimator_

Произведем подбор параметров рандомной сеткой

In [79]:
import lightgbm as lgb

param_grid = {
    'max_depth': [2, 3, 5, 7],
    'n_estimators': [10, 50, 100, 150], #Number of boosted trees to fit
    'learning_rate': [0.01, 0.02, 0.05, 0.1],
    'boosting_type': ['gbdt','dart']
}
model = randomized_cv(lgb.LGBMClassifier(), param_grid, x_train, y_train)

model LGBMClassifier best accuracy score is 0.8271604938271605
time for training is 4.996448755264282 seconds


In [80]:
model.get_params()

{'boosting_type': 'gbdt',
 'colsample_bytree': 1.0,
 'learning_rate': 0.05,
 'max_bin': 255,
 'max_depth': 3,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 150,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 1}

Оценим accuracy выбранной модели

In [81]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train, model.predict(x_train))

0.85185185185185186

In [82]:
df_test = pd.read_csv('test.csv')
x_test = pipeline.fit_transform(df_test) 
x_test.shape

(418, 11)

Сделаем предсказание полученной моделью и сабмитим его на каггл

In [83]:
y_test =  model.predict(x_test)
df_predicted = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_test})
df_predicted.to_csv('sample_submission.csv', sep=',', index=False)

результат  с каггла - 0.75119, удручает

Попробуем подобрать парметры полным перебором по сетке

In [78]:
# переопределим функцию преобразования
#def get_num_cols(df):
#    return df[['Age', 'Fare']]
# проведем тренировку
#x_train = pipeline.fit_transform(df_train)

def search_cv(model, param_grid, x_train, y_train):
    grid_search = GridSearchCV(model, param_grid, cv=10, scoring='accuracy')
    t_start = time.time()
    grid_search.fit(x_train, y_train)
    t_end = time.time()
    print('model {} best accuracy score is {}'.format(model.__class__.__name__, grid_search.best_score_))
    print('time for training is {} seconds'.format(t_end - t_start))
    return grid_search.best_estimator_

model = search_cv(lgb.LGBMClassifier(), param_grid, x_train, y_train)
print(accuracy_score(y_train, model.predict(x_train)))
# сделаем предсказание
x_test = pipeline.fit_transform(df_test) 
y_test =  model.predict(x_test)
df_predicted = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_test})
df_predicted.to_csv('sample_submission.csv', sep=',', index=False)


model LGBMClassifier best accuracy score is 0.8316498316498316
time for training is 63.90380620956421 seconds
0.847362514029


результат тот же - 0.75119
но скорость подбора гораздо медленне, будем использовать рандом серч