In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold

%matplotlib inline

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
train_test_data = pd.concat([train_data, test_data])

In [4]:
train_test_data['Cabin'] = train_test_data['Cabin'].str[:1]

In [5]:
object_columns = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

for column in object_columns:
    train_test_data[column] = pd.factorize(train_test_data[column])[0]

In [6]:
train_test_data['Cabin'] = train_test_data['Cabin'].replace(-1, np.nan)

In [7]:
train_test_data['Cabin'].fillna(train_test_data.groupby('Pclass')['Cabin'].transform('median'), inplace=True)

In [8]:
train_test_data['Cabin'].value_counts(dropna=False)

4.0    715
6.0    275
0.0     94
2.0     72
5.0     65
3.0     46
1.0     41
7.0      1
Name: Cabin, dtype: int64

In [10]:
train_data = train_test_data[~train_test_data['Survived'].isnull()]
test_data = train_test_data[train_test_data['Survived'].isnull()]

In [21]:
from lightgbm import LGBMClassifier

def train_oof(train_data, test_data, nfolds=5):
    ftr_train = train_data.drop(['PassengerId', 'Survived', 'Name', 'Ticket'], axis=1)
    target = train_data['Survived']
    
    folds = KFold(n_splits=nfolds, shuffle=True, random_state=2022)
    
    oof_preds = np.zeros(ftr_train.shape[0])
    test_preds = np.zeros(test_data.shape[0])
    
    clf = LGBMClassifier(
        n_jobs=-1,
        n_estimators=1000,
        learning_rate=0.01,
        max_bin=19,
        min_child_samples=43,
        min_child_weight=8,
        num_leaves=4,
        subsample=0.982,
        max_depth=5,
        reg_alpha=23.764,
        reg_lambda=5.647,
        silent=-1,
        verbose=-1
    )
    
    for fold_idx, (train_idx, valid_idx) in enumerate(folds.split(ftr_train)):
        print('##### iteration ', fold_idx, ' 시작')
        train_x  = ftr_train.iloc[train_idx, :]
        train_y = target.iloc[train_idx]
        valid_x = ftr_train.iloc[valid_idx, :]
        valid_y = target.iloc[valid_idx]
        
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'auc', verbose= 200, 
                early_stopping_rounds= 200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]       

        test_preds += clf.predict_proba(test_data.drop(['PassengerId', 'Survived', 'Name', 'Ticket'], axis=1),
                                        num_iteration=clf.best_iteration_)[:, 1]/folds.n_splits
        
        
    return clf, test_preds

In [22]:
import datetime
 
print(datetime.datetime.now())

clf, test_preds = train_oof(train_data, test_data, nfolds=5)

print(datetime.datetime.now())

2022-04-03 16:04:12.350436
##### iteration  0  시작




[200]	training's auc: 0.836429	training's binary_logloss: 0.49854	valid_1's auc: 0.836167	valid_1's binary_logloss: 0.510162
##### iteration  1  시작




[200]	training's auc: 0.836731	training's binary_logloss: 0.493309	valid_1's auc: 0.821723	valid_1's binary_logloss: 0.524351
##### iteration  2  시작




[200]	training's auc: 0.843527	training's binary_logloss: 0.491113	valid_1's auc: 0.811785	valid_1's binary_logloss: 0.532784
##### iteration  3  시작




[200]	training's auc: 0.830977	training's binary_logloss: 0.515626	valid_1's auc: 0.867393	valid_1's binary_logloss: 0.467863
##### iteration  4  시작




[200]	training's auc: 0.839284	training's binary_logloss: 0.502481	valid_1's auc: 0.831815	valid_1's binary_logloss: 0.50352
2022-04-03 16:04:14.385334


In [27]:
test_preds = [1 if x > 0.5 else 0 for x in test_preds]

In [28]:
test_data['Survived'] = test_preds
test_data[['PassengerId', 'Survived']].to_csv('titanic_final_result.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Survived'] = test_preds
