In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [42]:
train = pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [43]:
train.describe(include='O')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [44]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [45]:
train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean() # 여성이 더 생존율이 높음

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [46]:
train.groupby('Embarked').Survived.value_counts() # c가 생존율이 높음

Embarked  Survived
C         1            93
          0            75
Q         0            47
          1            30
S         0           427
          1           217
Name: Survived, dtype: int64

In [47]:
train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.336957


In [48]:
train.groupby('Parch').Survived.value_counts() 
# parch가 1이면 생존율이 높음
# 3이거나 2인 경우도 생존율 자체는 높지만 2는 어느정도 숫자를 생각했을 때 인지할 수 있지만
# 3인 경우는 해당하는 인원의 수가 너무 적으므로 표본 부족으로 신뢰성이 떨어짐

Parch  Survived
0      0           445
       1           233
1      1            65
       0            53
2      0            40
       1            40
3      1             3
       0             2
4      0             4
5      0             4
       1             1
6      0             1
Name: Survived, dtype: int64

In [49]:
train.groupby('SibSp').Survived.value_counts()
# 형제가 1인 경우는 생존율이 더 높음

SibSp  Survived
0      0           398
       1           210
1      1           112
       0            97
2      0            15
       1            13
3      0            12
       1             4
4      0            15
       1             3
5      0             5
8      0             7
Name: Survived, dtype: int64

In [50]:
train_test_data = [train, test]
for dataset in train_test_data:
    dataset['Title'] = dataset.Name.str.extract('([A-Za-z]+)\.')

In [51]:
pd.crosstab(train['Title'], train['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [52]:
for dataset in train_test_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col', \
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Other,0.347826


In [53]:
# 위 출력을 보면 Miss, Mrs의 경우 생존율이 굉장히 높았음
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5}
for dataset in train_test_data:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

In [54]:
for dataset in train_test_data: # 성별은 숫자로 표기 변경
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

In [55]:
train.Embarked.value_counts(dropna=False) # nan이 2개 있으며 S가 제일 많으므로 S로 채우기

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

In [56]:
for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

In [57]:
for dataset in train_test_data: # embarked를 정수형 데이터로 변환
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

In [58]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,3
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0,3
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0,1


In [59]:
for dataset in train_test_data:
    age_avg = dataset.Age.mean()
    age_std = dataset.Age.std()
    age_null_count = dataset.Age.isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list


In [60]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.25,,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,71.2833,C85,1,3
2,3,1,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.925,,0,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1,C123,0,3
4,5,0,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.05,,0,1


In [61]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Title            0
dtype: int64

In [62]:
train['AgeBand'] = pd.cut(train['Age'], 5)
print(train[['AgeBand', 'Survived']].groupby(['AgeBand']).mean()) 
# 현재 동일하게 5등분을 낸 것인데 이것은 동일하게 잘랐을 경우에 생존율이지 생존율을 따로 올리는게 더 좋아 보임

               Survived
AgeBand                
(-0.08, 16.0]  0.513514
(16.0, 32.0]   0.357968
(32.0, 48.0]   0.370787
(48.0, 64.0]   0.434783
(64.0, 80.0]   0.090909


In [63]:
train['AgeBand'] = pd.cut(train['Age'], 9)
print(train[['AgeBand', 'Survived']].groupby(['AgeBand']).mean().mean()) 
print(train[['AgeBand', 'Survived']].groupby(['AgeBand']).mean()) 
# 9등분 냈을 때 평균생존율이 높아지므로 실험으로 이 상태를 기준으로 적용

Survived    0.396929
dtype: float64
                  Survived
AgeBand                   
(-0.08, 8.889]    0.666667
(8.889, 17.778]   0.373333
(17.778, 26.667]  0.324219
(26.667, 35.556]  0.408511
(35.556, 44.444]  0.358974
(44.444, 53.333]  0.415385
(53.333, 62.222]  0.371429
(62.222, 71.111]  0.153846
(71.111, 80.0]    0.500000


In [64]:
for dataset in train_test_data:
    dataset.loc[ dataset['Age'] <= 9, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 9) & (dataset['Age'] <= 18), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 18) & (dataset['Age'] <= 27), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 27) & (dataset['Age'] <= 36), 'Age'] = 3
    dataset.loc[(dataset['Age'] > 36) & (dataset['Age'] <= 45), 'Age'] = 4
    dataset.loc[(dataset['Age'] > 45) & (dataset['Age'] <= 54), 'Age'] = 5
    dataset.loc[(dataset['Age'] > 54) & (dataset['Age'] <= 63), 'Age'] = 6
    dataset.loc[(dataset['Age'] > 63) & (dataset['Age'] <= 72), 'Age'] = 7
    dataset.loc[ dataset['Age'] > 72, 'Age'] = 8

In [65]:
train.Fare.describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [66]:
train['FareBand'] = pd.qcut(train['Fare'], 6)
print(train[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean())
print(train[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().mean())
# 기존에 4등분 했었으나 조금더 늘려서 6등분으로 설정

            FareBand  Survived
0    (-0.001, 7.775]  0.205128
1     (7.775, 8.662]  0.190789
2    (8.662, 14.454]  0.366906
3     (14.454, 26.0]  0.436242
4     (26.0, 52.369]  0.417808
5  (52.369, 512.329]  0.697987
Survived    0.38581
dtype: float64


  print(train[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().mean())


In [67]:
for dataset in train_test_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())

In [68]:
for dataset in train_test_data:
    dataset.loc[ dataset['Fare'] <= 7.775, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.775) & (dataset['Fare'] <= 8.662), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 8.662) & (dataset['Fare'] <= 14.454), 'Fare']   = 2
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 26.0), 'Fare']   = 3
    dataset.loc[(dataset['Fare'] > 26.0) & (dataset['Fare'] <= 52.369), 'Fare']   = 4
    dataset.loc[ dataset['Fare'] > 52.369, 'Fare'] = 5
    dataset['Fare'] = dataset['Fare'].astype(int)

In [69]:
for dataset in train_test_data:
    dataset['FamilySize'] = dataset['SibSp'] +  dataset['Parch'] + 1

print(train[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean())

   FamilySize  Survived
0           1  0.303538
1           2  0.552795
2           3  0.578431
3           4  0.724138
4           5  0.200000
5           6  0.136364
6           7  0.333333
7           8  0.000000
8          11  0.000000


In [70]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,AgeBand,FareBand,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",0,2,1,0,A/5 21171,0,,0,1,"(17.778, 26.667]","(-0.001, 7.775]",2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,4,1,0,PC 17599,5,C85,1,3,"(35.556, 44.444]","(52.369, 512.329]",2
2,3,1,3,"Heikkinen, Miss. Laina",1,2,0,0,STON/O2. 3101282,1,,0,2,"(17.778, 26.667]","(7.775, 8.662]",1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,3,1,0,113803,5,C123,0,3,"(26.667, 35.556]","(52.369, 512.329]",2
4,5,0,3,"Allen, Mr. William Henry",0,3,0,0,373450,1,,0,1,"(26.667, 35.556]","(7.775, 8.662]",1


In [71]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilySize
0,892,3,"Kelly, Mr. James",0,3,0,0,330911,1,,2,1,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,5,1,0,363272,0,,0,3,2
2,894,2,"Myles, Mr. Thomas Francis",0,6,0,0,240276,2,,2,1,1
3,895,3,"Wirz, Mr. Albert",0,2,0,0,315154,2,,0,1,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,2,1,1,3101298,2,,0,3,3


In [72]:
features_drop = ['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin']
train = train.drop(features_drop, axis=1)
test = test.drop(features_drop, axis=1)

In [73]:
train = train.drop(['PassengerId', 'AgeBand', 'FareBand'], axis=1)
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize
0,0,3,0,2,0,0,1,2
1,1,1,1,4,5,1,3,2
2,1,3,1,2,1,0,2,1
3,1,1,1,3,5,0,3,2
4,0,3,0,3,1,0,1,1


In [74]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,FamilySize
0,892,3,0,3,1,2,1,1
1,893,3,1,5,0,0,3,2
2,894,2,0,6,2,2,1,1
3,895,3,0,2,2,0,1,1
4,896,3,1,2,2,0,3,3


In [75]:
X_train = train.drop('Survived', axis=1)
y_train = train['Survived']
X_test = test.drop("PassengerId", axis=1).copy()

In [76]:
X_train.shape, y_train.shape, X_test.shape

((891, 7), (891,), (418, 7))

In [81]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_pred_random_forest = clf.predict(X_test)
acc_random_forest = round(clf.score(X_train, y_train) * 100, 2)
print (acc_random_forest)

90.01


In [None]:
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": y_pred_random_forest
    })
submission.to_csv('submission_random_forest.csv', index=False)  # 0.76555

In [77]:
# 모델 사용
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC, LinearSVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

In [78]:
train_data = lgb.Dataset(X_train, label=y_train)
# 하이퍼 파라미터
params = { 
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 20,
    'learning_rate': 0.01,
    'bagging_fraction': 0.8,
}
epoch = 100
lgbm_model = lgb.train(params, train_data, epoch) # 학습

[LightGBM] [Info] Number of positive: 342, number of negative: 549
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288


In [79]:
y_pred_lgb = lgbm_model.predict(X_test)
y_pred_lgb = [1 if x>=0.5 else 0 for x in y_pred_lgb]

In [80]:
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": y_pred_lgb
    })
submission.to_csv('submission_lgb8.csv', index=False)  
# 0.78947 학습율을 0.01로 낮췄을 때 제일 좋았음
# params = { 
#     'objective': 'binary',
#     'metric': 'binary_logloss',
#     'num_leaves': 20,
#     'learning_rate': 0.01,
#     'bagging_fraction': 0.8,
# }
# epoch = 100
# 위의 하이퍼 파라미터가 점수가 잘 나왔음