# An example of LightGBM
### A little exercise using LightGBM on the Titanic dataset
* References:
  * https://towardsdatascience.com/understanding-gradient-boosting-machines-9be756fe76ab
  * https://lightgbm.readthedocs.io/en/latest/Python-Intro.html
  


In [661]:
import pandas as pd
import numpy as np
import random as rnd

from sklearn import metrics, model_selection
import lightgbm as lgb
import os
import math

In [662]:
# get the data
train_df = pd.read_csv('data/titanic/train.csv')
test_df = pd.read_csv('data/titanic/test.csv')

In [663]:
train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [664]:
# DataFrame.dtypes for data must be int, float or bool
# Did not expect the data types in the following fields: Name, Sex, Ticket, Cabin, Embarked

In [665]:
# show the attribute types
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [666]:
## The plan:
#
# PassenterId - do not use
# Survived (the class) - already INT
# Pclass - already INT
# Name - will not use
# Sex -----> convert to bool
# Age -----> convert to nominal (INT)
# SibSp - already INT
# Parch - already INT
# Ticket - will not use
# Fare - already a float
# Cabin - 147 unique values - could probably use this but will ignore for now
# Embarked ----> convert to INT

In [667]:
def sex_to_bracket(cols):
    sex = cols[0]

    if sex == 'male': return 0
    if sex == 'female': return 1
    return -1

In [668]:
train_df['SexBracket'] = train_df[['Sex']].apply(sex_to_bracket, axis = 1)

In [669]:
#def age_to_bracket(cols):
#    age = cols[0]
#
#    if math.isnan(age): return -1
#    if age < 10 : return 0
#    if age < 20 : return 1
#    if age < 40 : return 2
#    if age < 60 : return 3
#    return 4

In [670]:
#train_df['AgeBracket'] = train_df[['Age']].apply(age_to_bracket, axis = 1)

In [671]:
train_df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [672]:
def embarked_to_bracket(cols):
    x = cols[0]

    if x == 'S': return 0
    if x == 'C': return 1
    if x == 'Q': return 2
    return -1

In [673]:
train_df['EmbarkedBracket'] = train_df[['Embarked']].apply(embarked_to_bracket, axis = 1)

In [674]:
def survived_to_bracket(cols):
    x = cols[0]

    if x == 0: return False
    return True

In [675]:
train_df['SurvivedBracket'] = train_df[['Survived']].apply(survived_to_bracket, axis = 1)

In [676]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SexBracket,EmbarkedBracket,SurvivedBracket
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0,0,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,1,True
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1,0,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1,0,True
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0,0,False
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,1,0,True
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,1,0,False
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,0,1,True


In [677]:
train_df.isnull().sum()

PassengerId          0
Survived             0
Pclass               0
Name                 0
Sex                  0
Age                177
SibSp                0
Parch                0
Ticket               0
Fare                 0
Cabin              687
Embarked             2
SexBracket           0
EmbarkedBracket      0
SurvivedBracket      0
dtype: int64

In [678]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,SexBracket,EmbarkedBracket
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.352413,0.359147
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.47799,0.638707
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0,-1.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104,0.0,0.0
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542,0.0,0.0
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0,1.0,1.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,1.0,2.0


In [679]:
train_df.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Lobb, Mr. William Arthur",male,1601,G6,S
freq,1,577,7,4,644


In [680]:
#train_df[["AgeBracket", "Survived"]].groupby(['AgeBracket'], as_index=False).count()

In [681]:
#train_df[["AgeBracket", "Survived"]].groupby(['AgeBracket'], as_index=False).mean()

In [682]:
train_df[["SexBracket", "Survived"]].groupby(['SexBracket'], as_index=False).count().sort_values(by='SexBracket', ascending=True)

Unnamed: 0,SexBracket,Survived
0,0,577
1,1,314


In [683]:
train_df[["SexBracket", "Survived"]].groupby(['SexBracket'], as_index=False).mean().sort_values(by='SexBracket', ascending=True)

Unnamed: 0,SexBracket,Survived
0,0,0.188908
1,1,0.742038


In [684]:
train_df[["EmbarkedBracket", "Survived"]].groupby(['EmbarkedBracket'], as_index=False).count().sort_values(by='EmbarkedBracket', ascending=True)

Unnamed: 0,EmbarkedBracket,Survived
0,-1,2
1,0,644
2,1,168
3,2,77


In [685]:
train_df[["EmbarkedBracket", "Survived"]].groupby(['EmbarkedBracket'], as_index=False).mean().sort_values(by='EmbarkedBracket', ascending=True)

Unnamed: 0,EmbarkedBracket,Survived
0,-1,1.0
1,0,0.336957
2,1,0.553571
3,2,0.38961


In [686]:
# show the attribute types
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
PassengerId        891 non-null int64
Survived           891 non-null int64
Pclass             891 non-null int64
Name               891 non-null object
Sex                891 non-null object
Age                714 non-null float64
SibSp              891 non-null int64
Parch              891 non-null int64
Ticket             891 non-null object
Fare               891 non-null float64
Cabin              204 non-null object
Embarked           889 non-null object
SexBracket         891 non-null int64
EmbarkedBracket    891 non-null int64
SurvivedBracket    891 non-null bool
dtypes: bool(1), float64(2), int64(7), object(5)
memory usage: 98.4+ KB


# create the lgb "Dataset"
here I am following https://lightgbm.readthedocs.io/en/latest/Python-Intro.html

In [687]:
# the class
label = train_df['SurvivedBracket']

In [688]:
# the attributes
# PassenterId - do not use
# Survived (the class) - already INT
# Pclass - already INT
# Name - will not use
# Sex -----> convert to bool
# Age -----> convert to nominal (INT)
# SibSp - already INT
# Parch - already INT
# Ticket - will not use
# Fare - already a float
# Cabin - 147 unique values - could probably use this but will ignore for now
# Embarked ----> convert to INT

data = train_df[['Pclass', 'SexBracket', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkedBracket']]

In [689]:
# Specific feature names and categorical features:
# LightGBM can use categorical features as input directly.
# It doesn’t need to convert to one-hot coding, and is much faster than one-hot coding (about 8x speed-up).

train_data = lgb.Dataset(data.values,
                         label=label.values, 
                         feature_name=list(data.columns.values),
                         categorical_feature='auto')

In [690]:
# Saving Dataset into a LightGBM binary file will make loading faster
train_data.save_binary('train_data.bin')

<lightgbm.basic.Dataset at 0x23d72c72508>

In [691]:
#Create validation data:
validation_data = train_data.create_valid(data.values, label=label.values);

In [692]:
# setting parameters
#param = {'num_leaves': 31, 'objective': 'binary'}
#param['metric'] = 'auc'

#params = {}
#params['learning_rate']= 0.003
#params['boosting_type']='gbdt'
#params['objective']='binary'
#params['metric']='binary_logloss'
#params['sub_feature']=0.5
#params['num_leaves']= 10
#params['min_data']=50
#params['max_depth']=10

params = {}
params['learning_rate']= 0.003
params['boosting_type']='gbdt'
params['objective']='binary'
params['metric']='binary_logloss'
params['sub_feature']=0.5
params['num_leaves']= 30
params['min_data']=50
params['max_depth']=10


In [693]:
# training
#num_round = 10
#bst = lgb.train(param, train_data, num_round, valid_sets=[validation_data])

In [694]:
# After training, the model can be saved
#bst.save_model('model.txt')
#A saved model can be loaded:
#bst = lgb.Booster(model_file='model.txt')  # init model

In [695]:
# Training with 5-fold CV:
#lgb.cv(param, train_data, num_round, nfold=5)

In [696]:
# Early Stopping
# Early stopping requires at least one set in valid_sets.
# Validation score needs to improve at least every early_stopping_rounds to continue training.
bst = lgb.train(param, train_data, num_round, valid_sets=validation_data, early_stopping_rounds=5)
bst.save_model('model.txt', num_iteration=bst.best_iteration)

[1]	valid_0's auc: 0.90276
Training until validation scores don't improve for 5 rounds
[2]	valid_0's auc: 0.910763
[3]	valid_0's auc: 0.911032
[4]	valid_0's auc: 0.915333
[5]	valid_0's auc: 0.919074
[6]	valid_0's auc: 0.920081
[7]	valid_0's auc: 0.921058
[8]	valid_0's auc: 0.922967
[9]	valid_0's auc: 0.922938
[10]	valid_0's auc: 0.9262
Did not meet early stopping. Best iteration is:
[10]	valid_0's auc: 0.9262


<lightgbm.basic.Booster at 0x23d72b7fdc8>

In [697]:
# Prediction

test_df['SexBracket'] = test_df[['Sex']].apply(sex_to_bracket, axis = 1)
#test_df['AgeBracket'] = test_df[['Age']].apply(age_to_bracket, axis = 1)
test_df['EmbarkedBracket'] = test_df[['Embarked']].apply(embarked_to_bracket, axis = 1)


In [698]:
data = test_df[['Pclass', 'SexBracket', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkedBracket']]

In [699]:
ypred = bst.predict(data, num_iteration=bst.best_iteration)

In [700]:
Submission = pd.DataFrame({
    "PassengerId": test_df['PassengerId'],
    "Survived": ypred
})

Submission.to_csv('Submission.csv', index = False)

In [701]:
Submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0.140653
1,893,0.413334
2,894,0.252793
3,895,0.271939
4,896,0.444383


In [702]:
Submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
PassengerId    418 non-null int64
Survived       418 non-null float64
dtypes: float64(1), int64(1)
memory usage: 6.7 KB


In [703]:
Submission[Submission["Survived"]<=0.5].count()

PassengerId    298
Survived       298
dtype: int64

In [704]:
def survived_to_bracket(cols):
    survived = cols[0]

    if math.isnan(survived): return -1
    if survived < 0.5 : return 0
    return 1


In [705]:
Submission['SurvivedBracket'] = Submission[['Survived']].apply(survived_to_bracket, axis = 1)

In [706]:
Submission.head()

Unnamed: 0,PassengerId,Survived,SurvivedBracket
0,892,0.140653,0
1,893,0.413334,0
2,894,0.252793,0
3,895,0.271939,0
4,896,0.444383,0


In [707]:
# drop the "Survived" column
Submission.drop(['Survived'], axis=1, inplace=True)
# rename from "SurvivedBracket" to "Survived"
Submission.rename(columns={"SurvivedBracket": "Survived"}, inplace=True)

In [708]:
Submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [709]:
Submission.to_csv('Submission.csv', index = False)

# Submit file to Kaggle



Go to the submission section of the Titanic competition. Drag your file from the directory which contains your code and make your submission.



# Basic
* the training gets 90% and I'm only at 77% on the leaderboard...
* tried:
  * params['num_leaves']= 30  (formerly 10)
  * no change
* thinking I'm over-training I tried lim
  * early_stopping_rounds=1 (formerly 5)
  * no change



Congratulations:

You advanced 2,476 places on the leaderboard!

Your submission scored 0.77033, which is an improvement of your previous score of 0.75598. Great job!
    

# Leave Age as a float (implemented above)

* age = float
* params['num_leaves']= 10
* early_stopping_rounds=5


You advanced 758 places on the leaderboard!

Your submission scored 0.77511, which is an improvement of your previous score of 0.77033.