In [13]:
from __future__ import print_function
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib
#import pandas_profiling
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
#import xgboost as xgb
import lightgbm as lgb


%matplotlib inline
plt.rcParams["figure.figsize"] = (15, 8)
pd.options.display.float_format = '{:.2f}'.format



Подгрузим тренировочный датасет

In [14]:
fn = 'train.csv'
df = pd.read_csv(fn)

Сделаем преобразования для признаков

In [38]:
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, MinMaxScaler, Imputer, LabelBinarizer, OneHotEncoder,StandardScaler
from sklearn.feature_extraction import DictVectorizer

def get_sex_col(df):
    lb= LabelBinarizer()
    return lb.fit_transform(df[['Sex']])

def get_num_cols(df):
    bins = [0, 10, 15, 25, 40, 55, 100]
    labels = [10, 15, 25, 40, 55, 100 ]
    mn = df['Age'].mean()
    df['AgeGrp'] = df['Age'].fillna(mn)
    df['AgeGrp'] = pd.cut(df['AgeGrp'], bins, labels=labels)

    bins = [0, 10, 30, 100, 600]
    labels = [10, 30, 100, 600 ]
    mn = df['Fare'].mean()
    df['FareGrp'] = df['Fare'].fillna(mn)
    df['FareGrp'] = pd.cut(df['FareGrp'], bins, labels=labels)    
    return df[['AgeGrp', 'FareGrp']]

def get_pclass_col(df):
    return df[['Pclass']]

def get_port_col(df):
    le = LabelEncoder()
    return le.fit_transform(df['Embarked'].fillna('S').T).reshape(-1, 1).astype('int')


def get_cabin_col(df):
    le = LabelEncoder()
    return le.fit_transform(df['Cabin'].fillna('NaN').T).reshape(-1, 1).astype('float')

# наличие родственников можно объединить
def get_rel_col(df):
    return np.sum(df[['SibSp','Parch']] , axis=1).values.reshape(-1, 1).astype('float')  


построим пайплайн

In [39]:

pipeline = make_union(*[
    make_pipeline(FunctionTransformer(get_num_cols, validate=False), Imputer(strategy='mean'), MinMaxScaler()),
    make_pipeline(FunctionTransformer(get_pclass_col, validate=False), OneHotEncoder(sparse=False)),
    make_pipeline(FunctionTransformer(get_sex_col, validate=False)),
    make_pipeline(FunctionTransformer(get_port_col, validate=False), OneHotEncoder(sparse=False)),
    make_pipeline(FunctionTransformer(get_cabin_col, validate=False), MinMaxScaler()),
    make_pipeline(FunctionTransformer(get_rel_col, validate=False), StandardScaler())
])


In [40]:
df_train = df.copy()

In [41]:

y_train = df_train['Survived']
y_train.shape

(891,)

In [42]:
x_train = pipeline.fit_transform(df_train)
x_train.shape

(891, 11)

In [43]:
import time
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
def randomized_cv(model, param_grid, x_train, y_train):
    grid_search = RandomizedSearchCV(model, param_grid, cv=5, scoring='accuracy')
    t_start = time.time()
    grid_search.fit(x_train, y_train)
    t_end = time.time()
    print('model {} best accuracy score is {}'.format(model.__class__.__name__, grid_search.best_score_))
    print('time for training is {} seconds'.format(t_end - t_start))
    return grid_search.best_estimator_

In [44]:
import lightgbm as lgb

param_grid = {
    'max_depth': [2, 3, 4, 5],
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.02, 0.05]
}
model = randomized_cv(lgb.LGBMClassifier(), param_grid, x_train, y_train)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.f

model LGBMClassifier best accuracy score is 0.8282828282828283
time for training is 3.555612564086914 seconds


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


In [45]:
df_test = pd.read_csv('test.csv')
x_test = pipeline.fit_transform(df_test) 
x_test.shape

(418, 11)

Сделаем предсказание полученной моделью и сабмитим его на каггл

In [46]:
y_test =  model.predict(x_test)
df_predicted = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_test})
df_predicted.to_csv('sample_submission.csv', sep=',', index=False)

результат  с каггла - 0.76076, удручает

    уберем категориальное преобразование, оставим оригинальные цифры

In [47]:
# переопределим функцию преобразования
def get_num_cols(df):
    return df[['Age', 'Fare']]
# проведем тренировку
x_train = pipeline.fit_transform(df_train)
model = randomized_cv(lgb.LGBMClassifier(), param_grid, x_train, y_train)
# сделаем предсказание
x_test = pipeline.fit_transform(df_test) 
y_test =  model.predict(x_test)
df_predicted = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_test})
df_predicted.to_csv('sample_submission.csv', sep=',', index=False)


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.f

model LGBMClassifier best accuracy score is 0.8305274971941639
time for training is 3.7878942489624023 seconds


  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


результат еще хуже - 0.75119