# Connect to google account

In [4]:
from google.colab import drive
drive.mount('/content/data')

Mounted at /content/data


In [5]:
# 코렙 한글깨짐 방지
!apt -qq -y install fonts-nanum > /dev/null

# 데이터 시각화에 사용할 라이브러리
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns

fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font_name = fm.FontProperties(fname=fontpath).get_name()
# fm._rebuild()


%config InlineBackend.figure_format = 'retina'

plt.rc('font', family=font_name)
plt.rcParams['axes.unicode_minus'] = False





# Global Variables

In [6]:
import easydict
import json
args = easydict.EasyDict()

# path 정보
args.default_path = '/content/data/MyDrive/AI_study/1. Machine Learning/3. Supervised Learning/original data/'
args.train_csv = args.default_path+'train.csv'
args.test_csv = args.default_path+'test.csv'
args.default_submission_csv = args.default_path+'submission.csv'

args.submission_csv = args.default_path+'result/submission_0220_eda_model_test.csv'
args.save_results = args.default_path+"result/model_results.json"

# 데이터 분석을 위한 변수들
args.random_state = 42
args.results = []
with open(args.save_results, 'r') as file:
    args.results = json.load(file)

args.results

[{'model': 'modelV4',
  'score_tr': 0.9547581903,
  'score_te': 0.8327272727,
  'auc_te': 0.916020018,
  'len_features': 10,
  'feaute_importances': ['fare',
   'age',
   'sibsp',
   'pclass',
   'embarked_S',
   'gender_female',
   'parch',
   'embarked_C',
   'embarked_Q',
   'gender_male'],
  'create_dt': '0217'}]

# Load Tritanic

- Surived:0=사망, 1=생존
- Pclass: 1=1등석, 2=2등석, 3=3등석
- Sex:male=남성, female=여성
- Age: 나이
- SibSp: 타이타닉 호에 동승한 자매/배우자의 수
- Parch: 타이타닉 호에 동승한 부모/자식의 수
- Ticket: 티켓 번호
- Fare: 승객 요금
- Cabin: 방 호수
- Embarked: 탑승지; C=셰르부르, Q=퀴즈타운, S=사우샘프턴

In [7]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')
plt.ion()

import warnings
warnings.filterwarnings('ignore')

In [8]:
ori_train = pd.read_csv(args.train_csv)
ori_test = pd.read_csv(args.test_csv)

ori_train.shape, ori_test.shape

((916, 12), (393, 11))

In [9]:
ori_train['passengerid'].nunique(), ori_train.shape[0]

(916, 916)

In [10]:
ori_train.drop('passengerid', axis=1, inplace=True)
ori_train.shape

(916, 11)

In [11]:
ori_test.set_index(['passengerid'], inplace=True)
ori_test.shape

(393, 10)

# ModelV3

## train_test_split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
ori_te = ori_test.copy()

y = ori_train['survived']
X = ori_train.drop(['survived'], axis=1)

In [14]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, stratify=ori_train['survived'], random_state=args.random_state)

X_tr.shape, X_te.shape, y_tr.shape, y_te.shape

((641, 10), (275, 10), (641,), (275,))

## Feature Extraction

Initial_name

In [15]:
X_tr['Initial_name'] = X_tr['name'].str.extract('([A-Za-z]+)\.') #lets extract the Salutations
X_te['Initial_name'] = X_te['name'].str.extract('([A-Za-z]+)\.') #lets extract the Salutations
ori_te['Initial_name'] = ori_te['name'].str.extract('([A-Za-z]+)\.') #lets extract the Salutations

X_tr.drop(['name'], axis=1, inplace=True)
X_te.drop(['name'], axis=1, inplace=True)
ori_te.drop(['name'], axis=1, inplace=True)

print(f'Initial_name: {X_tr["Initial_name"].isnull().sum()} / {X_te["Initial_name"].isnull().sum()} / {ori_te["Initial_name"].isnull().sum()}')

Initial_name: 0 / 0 / 0


Initial_name_type

In [16]:
X_tr['Initial_name'].unique()

array(['Mr', 'Miss', 'Mrs', 'Master', 'Jonkheer', 'Dr', 'Countess', 'Rev',
       'Col', 'Major', 'Sir', 'Capt'], dtype=object)

In [17]:
X_tr['Initial_name'].value_counts()

Initial_name
Mr          371
Miss        127
Mrs          93
Master       32
Rev           5
Dr            4
Col           3
Major         2
Jonkheer      1
Countess      1
Sir           1
Capt          1
Name: count, dtype: int64

성별 기준으로 최빈값이 Mr, Miss이며,   
Mr, Miss는 모두 Others 이다.

In [18]:
dict_initial_name = {
    'Mr': 'Others',
    'Miss': 'Others',
    'Mrs': 'Others',

    'Master': 'Officer',
    'Dr': 'Officer',
    'Rev': 'Officer',
    'Col': 'Officer',
    'Major': 'Officer',
    'Capt': 'Officer',

    'Jonkheer': 'Royalty',
    'Countess': 'Royalty',
    'Sir': 'Royalty'
}

def add_initial_name_type(initial_name):
  result = 'Others'
  try:
    result = dict_initial_name[initial_name]
  except:
    pass
  return result

X_tr['Initial_name_type'] = X_tr['Initial_name'].map(lambda x: add_initial_name_type(x))
X_te['Initial_name_type'] = X_te['Initial_name'].map(lambda x: add_initial_name_type(x))
ori_te['Initial_name_type'] = ori_te['Initial_name'].map(lambda x: add_initial_name_type(x))

X_tr.drop(['Initial_name'], axis=1, inplace=True)
X_te.drop(['Initial_name'], axis=1, inplace=True)
ori_te.drop(['Initial_name'], axis=1, inplace=True)

print(f'Initial_name_type: {X_tr["Initial_name_type"].isnull().sum()} / {X_te["Initial_name_type"].isnull().sum()} / {ori_te["Initial_name_type"].isnull().sum()}')

Initial_name_type: 0 / 0 / 0


initial_cabin

In [19]:
def add_initial_cabin(cabin, initial_cabin_unique=None):
  result = 'unknown'
  try:
    cabin = cabin.strip()
    if cabin[:1] in initial_cabin_unique:
      result = cabin[:1]
  except:
    pass

  return result

X_tr['initial_cabin'] = X_tr['cabin'].map(lambda cabin: add_initial_cabin(cabin))
initial_cabin_unique = list(X_tr['initial_cabin'].unique())

X_te['initial_cabin'] = X_te['cabin'].map(lambda cabin: add_initial_cabin(cabin, initial_cabin_unique))
ori_te['initial_cabin'] = ori_te['cabin'].map(lambda cabin: add_initial_cabin(cabin, initial_cabin_unique))

X_tr.drop(['cabin'], axis=1, inplace=True)
X_te.drop(['cabin'], axis=1, inplace=True)
ori_te.drop(['cabin'], axis=1, inplace=True)

print(f'initial_cabin: {X_tr["initial_cabin"].isnull().sum()} / {X_te["initial_cabin"].isnull().sum()} / {ori_te["initial_cabin"].isnull().sum()}')

initial_cabin: 0 / 0 / 0


len_ticket

In [20]:
X_tr['len_ticket'] = X_tr['ticket'].map(lambda x: len(x.strip().split(' ')))
X_te['len_ticket'] = X_te['ticket'].map(lambda x: len(x.strip().split(' ')))
ori_te['len_ticket'] = ori_te['ticket'].map(lambda x: len(x.strip().split(' ')))

X_tr.drop(['ticket'], axis=1, inplace=True)
X_te.drop(['ticket'], axis=1, inplace=True)
ori_te.drop(['ticket'], axis=1, inplace=True)

print(f'len_ticket: {X_tr["len_ticket"].isnull().sum()} / {X_te["len_ticket"].isnull().sum()} / {ori_te["len_ticket"].isnull().sum()}')

len_ticket: 0 / 0 / 0


## Data Cleaning

In [21]:
print(f'train: {X_tr.isnull().sum().sum()} / test: {X_te.isnull().sum().sum()} / ori_te: {ori_te.isnull().sum().sum()}')

train: 118 / test: 63 / ori_te: 85


In [22]:
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

def get_cols_cleaning(p_pd_list:list) -> list:
  cols_cleaning = []
  for df in p_pd_list:
    # 결측치 컬럼 조회
    tmp = df.isnull().sum()
    cols_cleaning += list(tmp[df.isnull().sum() > 0].index)

  cols_cleaning = list(set(cols_cleaning))
  print(f'cols_cleaning: {cols_cleaning}')
  return cols_cleaning

def add_cols_cleaning(p_train:pd.DataFrame, p_test:pd.DataFrame, p_ori_te:pd.DataFrame) -> None:
  pd_list = [p_train, p_test, p_ori_te]
  for col in get_cols_cleaning(pd_list):
    for i, df in enumerate(pd_list, start=1):

      if is_numeric_dtype(df[col]): # 숫자형 결측치 처리
        new_col = col+'_cleaning'
        df[new_col] = df[col]
        # random sampling
        random_sampling = (p_train[col].dropna().sample(df[col].isnull().sum()))
        # index 부여
        random_sampling.index = df[lambda x: x[col].isnull()].index
        # NA imputation
        df.loc[df[col].isnull(), new_col] = random_sampling

        # 기존 결측치 컬럼 제거
        if i == len(pd_list):
          for _df in pd_list:
            _df.drop([col], axis=1, inplace=True)

      else: # 범주형 결측치 처리
        df[col].fillna(p_train[col].mode().values[0], inplace=True)

  print(f'p_train: {p_train.isnull().sum().sum()} / p_test: {p_test.isnull().sum().sum()} / p_ori_te: {p_ori_te.isnull().sum().sum()}')


In [23]:
add_cols_cleaning(X_tr, X_te, ori_te)

cols_cleaning: ['fare', 'age', 'embarked']
p_train: 0 / p_test: 0 / p_ori_te: 0


In [24]:
print(f'train: {X_tr.isnull().sum().sum()} / test: {X_te.isnull().sum().sum()} / ori_te: {ori_te.isnull().sum().sum()}')

train: 0 / test: 0 / ori_te: 0


## Encoding

In [25]:
from sklearn.preprocessing import OneHotEncoder

In [26]:
def add_cols_encoding(p_train:pd.DataFrame, p_test:pd.DataFrame, p_ori_te:pd.DataFrame) -> None:
  print(f'before: {p_train.shape} / {p_test.shape} / {p_ori_te.shape}')
  results = []

  enc_cols = ['pclass', 'gender', 'embarked', 'Initial_name_type', 'initial_cabin', 'len_ticket']
  normal_cols = list(set(p_train.columns) - set(enc_cols))

  enc = OneHotEncoder()
  enc.fit(p_train[enc_cols])

  pd_list = [p_train, p_test, p_ori_te]
  for i, df in enumerate(pd_list, start=1):
    _df = pd.DataFrame(
      enc.transform(df[enc_cols]).toarray(),
      columns = enc.get_feature_names_out()
    )
    results.append(
      pd.concat(
        [df[normal_cols].reset_index(drop=True), _df.reset_index(drop=True)]
        , axis=1
      ).reset_index(drop=True)
    )

  print(f'after: {results[0].shape} / {results[1].shape} / {results[2].shape}')
  return results[0], results[1], results[2]

In [27]:
X_tr, X_te, ori_te = add_cols_encoding(X_tr, X_te, ori_te)

before: (641, 10) / (275, 10) / (393, 10)
after: (641, 19) / (275, 19) / (393, 19)


In [28]:
X_tr.head()

Unnamed: 0,parch,age_cleaning,sibsp,fare_cleaning,pclass_1,pclass_2,pclass_3,gender_female,gender_male,embarked_C,embarked_Q,embarked_S,Initial_name_type_Officer,Initial_name_type_Others,Initial_name_type_Royalty,initial_cabin_unknown,len_ticket_1,len_ticket_2,len_ticket_3
0,0,2.0,0,26.55,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
1,0,36.0,0,135.6333,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0,22.0,0,8.05,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0,26.0,2,8.6625,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
4,0,20.0,0,7.8792,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0


In [55]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier

import seaborn as sns

SEED = 42

In [57]:
model = LGBMClassifier(random_state=SEED)
cv = KFold(n_splits=5,shuffle=True,random_state=SEED)

scores = cross_val_score(model, X_tr, y_tr, cv = cv , scoring="roc_auc", n_jobs=-1)
base_score = scores.mean()
base_score

0.8863955205730605

In [59]:
sparse_features.shape[1]

29

In [61]:
from sklearn.decomposition import PCA

pca = PCA(n_components=sparse_features.shape[1], random_state=SEED)
pca.fit(sparse_features)
sum(pca.explained_variance_ratio_)

1.0000000000000002

In [63]:
model = LGBMClassifier(random_state=SEED)
scores = cross_val_score(model, X_tr, y_tr,cv = cv , scoring="roc_auc",n_jobs=-1)
print(f'score: {scores.mean()} / base_score: {base_score}')

score: 0.8863955205730605 / base_score: 0.8863955205730605


In [66]:
pca = PCA(n_components=2, random_state=SEED)
pca.fit(sparse_features)
print(sum(pca.explained_variance_ratio_))

model = LGBMClassifier(random_state=SEED)
scores = cross_val_score(model, X_tr, y_tr,cv = cv , scoring="roc_auc",n_jobs=-1)
print(f'score: {scores.mean()} / base_score: {base_score}')

0.44629071762679684
score: 0.8863955205730605 / base_score: 0.8863955205730605


In [69]:
best_score = 0
best_comp = 0

for comp in range(1, sparse_features.shape[1]):
  print(f'comp: {comp}')
  pca = PCA(n_components=comp, random_state=SEED)
  pca.fit(sparse_features)
  print(sum(pca.explained_variance_ratio_))

  model = LGBMClassifier(random_state=SEED)
  scores = cross_val_score(model, X_tr, y_tr,cv = cv , scoring="roc_auc",n_jobs=-1)
  print(f'score: {scores.mean()} / base_score: {base_score}')
  if best_score < scores.mean():
    best_score = scores.mean()
    best_comp = comp
print(f'best_comp: {best_comp}, best_score: {best_score}')

comp: 1
0.2585183084434899
score: 0.8863955205730605 / base_score: 0.8863955205730605
comp: 2
0.44629071762679684
score: 0.8863955205730605 / base_score: 0.8863955205730605
comp: 3
0.6154779583934853
score: 0.8863955205730605 / base_score: 0.8863955205730605
comp: 4
0.7796910996065354
score: 0.8863955205730605 / base_score: 0.8863955205730605
comp: 5
0.883256463349259
score: 0.8863955205730605 / base_score: 0.8863955205730605
comp: 6
0.9386813789556426
score: 0.8863955205730605 / base_score: 0.8863955205730605
comp: 7
0.9926118784489649
score: 0.8863955205730605 / base_score: 0.8863955205730605
comp: 8
1.0000000000000002
score: 0.8863955205730605 / base_score: 0.8863955205730605
comp: 9
1.0000000000000002
score: 0.8863955205730605 / base_score: 0.8863955205730605
comp: 10
1.0000000000000002
score: 0.8863955205730605 / base_score: 0.8863955205730605
comp: 11
1.0000000000000002
score: 0.8863955205730605 / base_score: 0.8863955205730605
comp: 12
1.0000000000000002
score: 0.886395520573060

In [None]:
from sklearn.decomposition import NMF
SEED = args.random_state

nmf = NMF(n_components=12, random_state=SEED, max_iter=500)
nmf.fit(sparse_features)

(nmf.components_ < 0).sum(), nmf.components_.shape

tmp = pd.DataFrame(nmf.transform(sparse_features)).add_prefix("nmf_")
x_train = pd.concat([features,tmp],axis=1)
print(f'after: {x_train.shape}')

model = LGBMClassifier(random_state=SEED)
base_score = cross_val_score(model,x_train,y_train,cv = cv , scoring="roc_auc",n_jobs=-1).mean()
print(f'base_score: {base_score}')

## Modeling

In [29]:
X_tr.isnull().sum().sum(), X_te.isnull().sum().sum(), ori_te.isnull().sum().sum()

(0, 0, 0)

In [30]:
X_tr.shape, X_te.shape, ori_te.shape

((641, 19), (275, 19), (393, 19))

In [31]:
from sklearn.ensemble import StackingClassifier

from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [32]:
SEED = args.random_state
estimators = [
    ( "mlp" , MLPClassifier(max_iter=1000,random_state=SEED) ),
    ( "dtc" , DecisionTreeClassifier(random_state=SEED) ),
    ( "knn" , KNeighborsClassifier(n_neighbors= 5) )
]
hp = {
    "estimators" : estimators,
    "final_estimator" : LogisticRegression()
}
# from sklearn.model_selection import cross_val_predict # 교차검증 q
modelV3 = StackingClassifier(**hp,n_jobs=-1).fit(X_tr, y_tr)
# y_tr = cross_val_predict(modelV0, X_tr, y_tr, cv=5)
print(f'{X_tr.shape} / {y_tr.shape}')
# modelV0.fit(X_tr, y_tr)

(641, 19) / (641,)


## Evaluation

In [33]:
score_tr = modelV3.score(X_tr, y_tr)
score_te = modelV3.score(X_te, y_te)

score_tr, score_te

(0.8829953198127926, 0.8472727272727273)

In [34]:
from sklearn.metrics import roc_curve, auc

y_pred = modelV3.predict_proba(X_te)[:,1]
fpr, tpr, thresholds = roc_curve(y_te,y_pred)
auc_te = auc(fpr, tpr)
print(f'model: {auc_te}')

model: 0.9009502923976608


In [35]:
ori_te_pred = modelV3.predict_proba(ori_te)[:,1]
ori_te_pred.shape

(393,)

In [36]:
# df_feature_importances = pd.DataFrame(modelV3.feature_importances_, X_tr.columns).sort_values(by=[0], ascending=False).reset_index()

# print(f'{df_feature_importances.shape}')
# df_feature_importances

In [37]:
args.results.append(
    {
        'model': 'modelV3',
        'score_tr': score_tr,
        'score_te': score_te,
        'auc_te': auc_te,
        'ori_te_pred': ori_te_pred,
        'len_features': X_tr.shape[1],
        # 'feaute_importances': list(df_feature_importances['index'].values[:X_tr.shape[1]]),
        'create_dt': '0220'
    }
)

In [38]:
pd.DataFrame(args.results).sort_values(by=['auc_te'], ascending=False)

Unnamed: 0,model,score_tr,score_te,auc_te,len_features,feaute_importances,create_dt,ori_te_pred
0,modelV4,0.954758,0.832727,0.91602,10,"[fare, age, sibsp, pclass, embarked_S, gender_...",217,
1,modelV3,0.882995,0.847273,0.90095,19,,220,"[0.8514475777997533, 0.8531134728151848, 0.853..."


# Submission

In [39]:
df_results = pd.DataFrame(args.results).sort_values(by=['auc_te'], ascending=False)
df_results

Unnamed: 0,model,score_tr,score_te,auc_te,len_features,feaute_importances,create_dt,ori_te_pred
0,modelV4,0.954758,0.832727,0.91602,10,"[fare, age, sibsp, pclass, embarked_S, gender_...",217,
1,modelV3,0.882995,0.847273,0.90095,19,,220,"[0.8514475777997533, 0.8531134728151848, 0.853..."


In [40]:
submission = pd.read_csv(args.default_submission_csv)
submission.head()

Unnamed: 0,passengerid,survived
0,916,0.5
1,917,0.5
2,918,0.5
3,919,0.5
4,920,0.5


In [41]:
df_results.head()

Unnamed: 0,model,score_tr,score_te,auc_te,len_features,feaute_importances,create_dt,ori_te_pred
0,modelV4,0.954758,0.832727,0.91602,10,"[fare, age, sibsp, pclass, embarked_S, gender_...",217,
1,modelV3,0.882995,0.847273,0.90095,19,,220,"[0.8514475777997533, 0.8531134728151848, 0.853..."


In [42]:
submission['survived'] = df_results.loc[1, ['ori_te_pred']].values[0]
print(f'{submission.isnull().sum().sum()}')
submission.head(10)

0


Unnamed: 0,passengerid,survived
0,916,0.851448
1,917,0.853113
2,918,0.853777
3,919,0.249296
4,920,0.874004
5,921,0.82398
6,922,0.100337
7,923,0.109839
8,924,0.750774
9,925,0.097636


In [43]:
submission.to_csv(args.default_path+'result/submission_v3.csv', header=True, index=False)

In [45]:
# submission['survived'] = df_results.loc[4, ['ori_te_pred']].values[0]
# submission.to_csv(args.default_path+'result/submission_v4.csv', header=True, index=False)

# Save Results

In [None]:
df_results.drop(['ori_te_pred'], axis=1, inplace=True)
df_results

In [None]:
df_results.to_json(args.save_results, orient="records")

In [None]:
import json

load_results = None
with open(args.save_results, 'r') as file:
    load_results = json.load(file)

len(load_results)