In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn import metrics

In [2]:
data = pd.read_csv('train.csv', index_col=0)
data.head(3)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [4]:
data.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [5]:
data.shape

(891, 11)

In [6]:
data = data.drop(['Age', 'Cabin', 'Name', 'Ticket'], axis=1)
data.head(3)

Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,1,0,7.25,S
2,1,1,female,1,0,71.2833,C
3,1,3,female,0,0,7.925,S


In [7]:
data.isnull().sum()

Survived    0
Pclass      0
Sex         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

In [8]:
data.dropna(inplace=True)
data.isnull().sum()

Survived    0
Pclass      0
Sex         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [9]:
y = data.Survived
X = data.drop(['Survived'], axis=1)

In [10]:
X.head()

Unnamed: 0_level_0,Pclass,Sex,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,male,1,0,7.25,S
2,1,female,1,0,71.2833,C
3,3,female,0,0,7.925,S
4,1,female,1,0,53.1,S
5,3,male,0,0,8.05,S


In [11]:
cat_features = ['Sex', 'Embarked']

In [12]:
encoder_1 = LabelEncoder()

In [13]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=11)

In [14]:
enc_train = X_train.copy()
enc_valid = X_valid.copy()

for col in cat_features:
    enc_train[col] = encoder_1.fit_transform(X_train[col])
    enc_valid[col] = encoder_1.transform(X_valid[col])            
    
enc_train.head()

Unnamed: 0_level_0,Pclass,Sex,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
486,3,0,3,1,25.4667,2
335,1,0,1,0,133.65,2
774,3,1,0,0,7.225,0
384,1,0,1,0,52.0,2
333,1,1,0,1,153.4625,2


In [15]:
model_RF = RandomForestClassifier(random_state=0)
model_RF.fit(enc_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [16]:
preds_RF = model_RF.predict(enc_valid)
score = metrics.roc_auc_score(preds_RF, y_valid)
score

0.7831498951781971

In [17]:
data_2 = data.copy()
data_2.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,1,0,7.25,S
2,1,1,female,1,0,71.2833,C
3,1,3,female,0,0,7.925,S
4,1,1,female,1,0,53.1,S
5,0,3,male,0,0,8.05,S


In [18]:
valid_fraction = 0.1
valid_size = int(len(data_2) * valid_fraction)

train = data_2[ : -2 * valid_size]
valid = data_2[-2 * valid_size : -valid_size]
test = data_2[-valid_size:]

train.shape

(713, 7)

In [19]:
train.head(3)

Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,1,0,7.25,S
2,1,1,female,1,0,71.2833,C
3,1,3,female,0,0,7.925,S


In [20]:
test.shape

(88, 7)

In [21]:
encoder_2 = ce.CountEncoder(cols=cat_features)

In [22]:
encoder_2.fit(train[cat_features], train['Survived'])

CountEncoder(cols=['Sex', 'Embarked'], combine_min_nan_groups=True,
             drop_invariant=False, handle_missing='count', handle_unknown=None,
             min_group_name=None, min_group_size=None, normalize=False,
             return_df=True, verbose=0)

In [23]:
encoded_train = train.join(encoder_2.transform(train[cat_features], train['Survived']).add_suffix('_Count'))
encoded_valid = valid.join(encoder_2.transform(valid[cat_features], valid['Survived']).add_suffix('_Count'))
encoded_test = test.join(encoder_2.transform(test[cat_features], test['Survived']).add_suffix('_Count'))

encoded_train.head(3)

Unnamed: 0_level_0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked,Sex_Count,Embarked_Count
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,male,1,0,7.25,S,458,511
2,1,1,female,1,0,71.2833,C,255,138
3,1,3,female,0,0,7.925,S,255,511


In [24]:
features = encoded_train.columns.drop(['Survived', 'Sex', 'Embarked'])
features

Index(['Pclass', 'SibSp', 'Parch', 'Fare', 'Sex_Count', 'Embarked_Count'], dtype='object')

In [25]:
encoded_train[features].head(3)

Unnamed: 0_level_0,Pclass,SibSp,Parch,Fare,Sex_Count,Embarked_Count
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3,1,0,7.25,458,511
2,1,1,0,71.2833,255,138
3,3,0,0,7.925,255,511


In [26]:
param = {'num_leaves': 20, 'objective': 'binary', 'metric': 'auc'}
num_rounds = 1000

In [27]:
dtrain = lgb.Dataset(encoded_train[features], encoded_train['Survived'])
dvalid = lgb.Dataset(encoded_valid[features], encoded_valid['Survived'])

In [28]:
model_2 = lgb.train(param, dtrain, num_rounds, valid_sets=[dvalid], early_stopping_rounds=10)

[LightGBM] [Info] Number of positive: 278, number of negative: 435
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 142
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.389902 -> initscore=-0.447725
[LightGBM] [Info] Start training from score -0.447725
[1]	valid_0's auc: 0.908046
Training until validation scores don't improve for 10 rounds
[2]	valid_0's auc: 0.910632
[3]	valid_0's auc: 0.91523
[4]	valid_0's auc: 0.915805
[5]	valid_0's auc: 0.921552
[6]	valid_0's auc: 0.920402
[7]	valid_0's auc: 0.923851
[8]	valid_0's auc: 0.924425
[9]	valid_0's auc: 0.922701
[10]	valid_0's auc: 0.922701
[11]	valid_0's auc: 0.922701
[12]	valid_0's auc: 0.923851
[13]	valid_0's auc: 0.924713
[14]	valid_0's auc: 0.927011
[15]	valid_0's auc: 0.924713
[16]	valid_0's auc: 0.925
[17]	valid_0's auc: 0.925575
[18]	valid_0'

In [29]:
test_preds = model_2.predict(encoded_test[features])
ROC_Score = metrics.roc_auc_score(encoded_test['Survived'], test_preds)
print('The Test ROC Score: ', ROC_Score)

The Test ROC Score:  0.8772321428571429
