#### Import Libraries

In [93]:
import os
import re
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, LabelEncoder

#### Setting

In [55]:
SEED = 1234

#### Fetch Data

In [56]:
DATA_PATH = os.path.join(os.getcwd(), 'data')

##### Gender Submission

In [57]:
gs_df = pd.read_csv(os.path.join(DATA_PATH, 'gender_submission.csv'))
gs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB


In [58]:
gs_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


##### Train And Test Data

In [59]:
data_df = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
data_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [60]:
label_df = data_df.pop('Survived')

In [61]:
data_df.columns = [
    'id', '티켓등급', '이름', '성별', '나이', '형제자매배우자수', 
    '부모자녀수', '티켓번호', '요금', '객실번호', '승선항'
]

#### Preprocessing Data

##### Null 값 체크

In [62]:
data_df.isna().sum()

id            0
티켓등급          0
이름            0
성별            0
나이          177
형제자매배우자수      0
부모자녀수         0
티켓번호          0
요금            0
객실번호        687
승선항           2
dtype: int64

##### `나이` 결측값 처리

In [63]:
data_df[data_df.나이.isna()]

Unnamed: 0,id,티켓등급,이름,성별,나이,형제자매배우자수,부모자녀수,티켓번호,요금,객실번호,승선항
5,6,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...
859,860,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [64]:
# 이름에서 호칭 분류
data_df['호칭'] = data_df['이름'].str.extract(r'([A-Za-z]+\.)')

In [65]:
data_df.head()

Unnamed: 0,id,티켓등급,이름,성별,나이,형제자매배우자수,부모자녀수,티켓번호,요금,객실번호,승선항,호칭
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.


In [66]:
data_df.isna().sum()

id            0
티켓등급          0
이름            0
성별            0
나이          177
형제자매배우자수      0
부모자녀수         0
티켓번호          0
요금            0
객실번호        687
승선항           2
호칭            0
dtype: int64

In [67]:
data_df.호칭.value_counts()

호칭
Mr.          517
Miss.        182
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Mlle.          2
Major.         2
Col.           2
Countess.      1
Capt.          1
Ms.            1
Sir.           1
Lady.          1
Mme.           1
Don.           1
Jonkheer.      1
Name: count, dtype: int64

In [68]:
result = data_df.groupby('호칭')['나이'].mean()
result

호칭
Capt.        70.000000
Col.         58.000000
Countess.    33.000000
Don.         40.000000
Dr.          42.000000
Jonkheer.    38.000000
Lady.        48.000000
Major.       48.500000
Master.       4.574167
Miss.        21.773973
Mlle.        24.000000
Mme.         24.000000
Mr.          32.368090
Mrs.         35.898148
Ms.          28.000000
Rev.         43.166667
Sir.         49.000000
Name: 나이, dtype: float64

In [69]:
# 나이값이 NaN인 컬럼의 나이값을 호칭컬럼의 평균값으로 채우기
data_df['새나이'] = data_df.apply(
    lambda row: result[row['호칭']] if pd.isna(row['나이']) else row['나이'], axis=1
)
data_df.head(20)

Unnamed: 0,id,티켓등급,이름,성별,나이,형제자매배우자수,부모자녀수,티켓번호,요금,객실번호,승선항,호칭,새나이
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.,22.0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,38.0
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,26.0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.,35.0
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.,35.0
5,6,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Mr.,32.36809
6,7,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Mr.,54.0
7,8,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Master.,2.0
8,9,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Mrs.,27.0
9,10,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Mrs.,14.0


##### 나이값 등급화: 0-18, 19-55, 55-99

In [70]:
# 나이값 구간 및 레이블 설정
# 미성년자와 노약자가 살 확률과 일반 성인이 살 확률 확인 필요
age_bins = [0, 18, 55, 999]
age_class = [0, 1, 2]
data_df['나이등급'] = pd.cut(data_df['새나이'], bins=age_bins, labels=age_class, right=False)
data_df.head(20)

Unnamed: 0,id,티켓등급,이름,성별,나이,형제자매배우자수,부모자녀수,티켓번호,요금,객실번호,승선항,호칭,새나이,나이등급
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr.,22.0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs.,38.0,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss.,26.0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs.,35.0,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr.,35.0,1
5,6,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,Mr.,32.36809,1
6,7,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,Mr.,54.0,1
7,8,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,Master.,2.0,0
8,9,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,Mrs.,27.0,1
9,10,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,Mrs.,14.0,0


##### 승선항 NaN값 처리

In [71]:
data_df.승선항.value_counts()

승선항
S    644
C    168
Q     77
Name: count, dtype: int64

In [72]:
# NaN값 2개를 'N'으로 처리
data_df.승선항.fillna('N', inplace=True)

##### 객실번호 NaN값 처리
* 객실번호는 중요한 항목이지만 NaN값이 너무 많다.

In [73]:
data_df[data_df.객실번호.isna()]['티켓등급'].value_counts()
# 객실번호가 없는 사람의 경우 티켓 등급이 3등급이 대부분임

티켓등급
3    479
2    168
1     40
Name: count, dtype: int64

In [74]:
data_df['티켓등급'].value_counts()

티켓등급
3    491
1    216
2    184
Name: count, dtype: int64

In [75]:
data_df[(data_df.티켓등급 == 3) & (data_df.객실번호.notna())]

Unnamed: 0,id,티켓등급,이름,성별,나이,형제자매배우자수,부모자녀수,티켓번호,요금,객실번호,승선항,호칭,새나이,나이등급
10,11,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S,Miss.,4.0,0
75,76,3,"Moen, Mr. Sigurd Hansen",male,25.0,0,0,348123,7.65,F G73,S,Mr.,25.0,1
128,129,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C,Miss.,21.773973,1
205,206,3,"Strom, Miss. Telma Matilda",female,2.0,0,1,347054,10.4625,G6,S,Miss.,2.0,0
251,252,3,"Strom, Mrs. Wilhelm (Elna Matilda Persson)",female,29.0,1,1,347054,10.4625,G6,S,Mrs.,29.0,1
394,395,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengt...",female,24.0,0,2,PP 9549,16.7,G6,S,Mrs.,24.0,1
429,430,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32.0,0,0,SOTON/O.Q. 392078,8.05,E10,S,Mr.,32.0,1
699,700,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42.0,0,0,348121,7.65,F G63,S,Mr.,42.0,1
715,716,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19.0,0,0,348124,7.65,F G73,S,Mr.,19.0,1
751,752,3,"Moor, Master. Meier",male,6.0,0,1,392096,12.475,E121,S,Master.,6.0,0


In [76]:
data_df.객실번호.value_counts()

객실번호
B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: count, Length: 147, dtype: int64

In [77]:
# 객실번호로 새로운 컬럼을 만듬
# 알파벳을 추출해 각 알파벳에 해당하는 컬럼을 만들고 값을 대입하고
# 가지고 있는 최대 숫자만큼 컬럼을 만들고 각각 할당해주고
# NaN값은 알파벳 X로 처리함

data_df['객실구분'] = data_df['객실번호'].apply(lambda x: re.findall(r'[A-Za-z]', str(x)) if pd.notna(x) else ['X'])
unique_letters = list(set([item for sublist in data_df.객실구분 if isinstance(sublist, list) for item in sublist]))
print(unique_letters)

['T', 'D', 'F', 'X', 'B', 'C', 'A', 'E', 'G']


In [78]:
for letter in unique_letters:
    data_df[f'객실구분{letter}'] = data_df.객실구분.apply(lambda x: 1 if letter in x else 0)

In [79]:
data_df[data_df.객실구분F==1]

Unnamed: 0,id,티켓등급,이름,성별,나이,형제자매배우자수,부모자녀수,티켓번호,요금,객실번호,...,객실구분,객실구분T,객실구분D,객실구분F,객실구분X,객실구분B,객실구분C,객실구분A,객실구분E,객실구분G
66,67,2,"Nye, Mrs. (Elizabeth Ramell)",female,29.0,0,0,C.A. 29395,10.5,F33,...,[F],0,0,1,0,0,0,0,0,0
75,76,3,"Moen, Mr. Sigurd Hansen",male,25.0,0,0,348123,7.65,F G73,...,"[F, G]",0,0,1,0,0,0,0,0,1
128,129,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,...,"[F, E]",0,0,1,0,0,0,0,1,0
148,149,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26.0,F2,...,[F],0,0,1,0,0,0,0,0,0
183,184,2,"Becker, Master. Richard F",male,1.0,2,1,230136,39.0,F4,...,[F],0,0,1,0,0,0,0,0,0
193,194,2,"Navratil, Master. Michel M",male,3.0,1,1,230080,26.0,F2,...,[F],0,0,1,0,0,0,0,0,0
340,341,2,"Navratil, Master. Edmond Roger",male,2.0,1,1,230080,26.0,F2,...,[F],0,0,1,0,0,0,0,0,0
345,346,2,"Brown, Miss. Amelia ""Mildred""",female,24.0,0,0,248733,13.0,F33,...,[F],0,0,1,0,0,0,0,0,0
516,517,2,"Lemore, Mrs. (Amelia Milley)",female,34.0,0,0,C.A. 34260,10.5,F33,...,[F],0,0,1,0,0,0,0,0,0
618,619,2,"Becker, Miss. Marion Louise",female,4.0,2,1,230136,39.0,F4,...,[F],0,0,1,0,0,0,0,0,0


In [80]:
# 객실번호 숫자부분 처리 
data_df['객실숫자번호'] = data_df['객실번호'].apply(lambda x: re.findall(r'\d+', str(x)) if pd.notna(x) else ['0'])

In [81]:
max_length = data_df['객실숫자번호'].apply(lambda x: len(x) if isinstance(x, list) else 0).max()

In [82]:
# 4개의 숫자컬럼을 만들고 그곳에 하났기 입력
for i in range(max_length):
    data_df[f'객실숫자번호{i+1}'] = data_df['객실숫자번호'].apply(lambda x: int(x[i]) if isinstance(x, list) and len(x) > i else 0)

In [83]:
data_df[data_df.객실숫자번호.apply(lambda x: len(x)==4)]

Unnamed: 0,id,티켓등급,이름,성별,나이,형제자매배우자수,부모자녀수,티켓번호,요금,객실번호,...,객실구분B,객실구분C,객실구분A,객실구분E,객실구분G,객실숫자번호,객실숫자번호1,객실숫자번호2,객실숫자번호3,객실숫자번호4
311,312,1,"Ryerson, Miss. Emily Borie",female,18.0,2,2,PC 17608,262.375,B57 B59 B63 B66,...,1,0,0,0,0,"[57, 59, 63, 66]",57,59,63,66
742,743,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21.0,2,2,PC 17608,262.375,B57 B59 B63 B66,...,1,0,0,0,0,"[57, 59, 63, 66]",57,59,63,66


In [84]:
data_df.isna().sum()

id            0
티켓등급          0
이름            0
성별            0
나이          177
형제자매배우자수      0
부모자녀수         0
티켓번호          0
요금            0
객실번호        687
승선항           0
호칭            0
새나이           0
나이등급          0
객실구분          0
객실구분T         0
객실구분D         0
객실구분F         0
객실구분X         0
객실구분B         0
객실구분C         0
객실구분A         0
객실구분E         0
객실구분G         0
객실숫자번호        0
객실숫자번호1       0
객실숫자번호2       0
객실숫자번호3       0
객실숫자번호4       0
dtype: int64

##### One-hot Encoding

In [85]:
# 티켓등급을 One-hot Encoding 함
oh_ticket = pd.get_dummies(data_df['티켓등급'], prefix='티켓등급')
oh_ticket

Unnamed: 0,티켓등급_1,티켓등급_2,티켓등급_3
0,False,False,True
1,True,False,False
2,False,False,True
3,True,False,False
4,False,False,True
...,...,...,...
886,False,True,False
887,True,False,False
888,False,False,True
889,True,False,False


In [86]:
# 같은방법으로 성별, 승선항, 호칭, 나이등급을 One-hot Encoding
oh_sex = pd.get_dummies(data_df['성별'], prefix='성별')
oh_embarked = pd.get_dummies(data_df['승선항'], prefix='승선항')
oh_title = pd.get_dummies(data_df['호칭'], prefix='호칭')
oh_age_level = pd.get_dummies(data_df['나이등급'], prefix='나이등급')

In [87]:
# data_df 추가
data_df = pd.concat([data_df, oh_ticket, oh_sex, oh_embarked, oh_title, oh_age_level], axis=1)
data_df.shape

(891, 58)

In [88]:
# one-hot encoding으로 생성된 true, false값을 숫자로 변환
data_df = data_df.replace({True: 1, False: 0})

##### 티켓번호 처리
* 티켓번호를 '문숫자'와 '숫자'로 분류하여, 문숫자는 '티켓번호구분', '숫자'는 '티켓숫자' 컬럼에 저장

In [89]:
data_df['티켓번호구분'] = ''
data_df['티켓숫자'] = ''

for idx, value in data_df['티켓번호'].items():
    parts = value.split()
    alpha_numerics = [re.sub(r'[^A-Za-z0-9]', '', part) for part in parts]
    
    for an_value in alpha_numerics:
        if any(char.isalpha() for char in an_value):
            data_df.at[idx, '티켓번호구분'] = an_value
        elif an_value.isnumeric():
            data_df.at[idx, '티켓숫자'] = an_value

In [90]:
data_df.head()

Unnamed: 0,id,티켓등급,이름,성별,나이,형제자매배우자수,부모자녀수,티켓번호,요금,객실번호,...,호칭_Mr.,호칭_Mrs.,호칭_Ms.,호칭_Rev.,호칭_Sir.,나이등급_0,나이등급_1,나이등급_2,티켓번호구분,티켓숫자
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,...,1,0,0,0,0,0,1,0,A5,21171
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,...,0,1,0,0,0,0,1,0,PC,17599
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,...,0,0,0,0,0,0,1,0,STONO2,3101282
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,...,0,1,0,0,0,0,1,0,,113803
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,...,1,0,0,0,0,0,1,0,,373450


In [91]:
data_df.티켓번호구분.value_counts()

티켓번호구분
           661
PC          60
CA          41
A5          21
SOTONOQ     15
STONO       12
WC          10
A4           7
SCPARIS      7
STONO2       6
SOC          6
FCC          5
C            5
SCParis      4
LINE         4
WEP          3
PP           3
SOPP         3
SCAH         2
SOTONO2      2
SWPP         2
PPP          2
FC           1
Basle        1
AS           1
SP           1
SC           1
SCOW         1
Fa           1
SOP          1
SCA4         1
CASOTON      1
Name: count, dtype: int64

In [92]:
# 티켓번호구분을 숫자로 변경
label_encoder = LabelEncoder()
data_df['티켓번호구분숫자'] = label_encoder.fit_transform(data_df.티켓번호구분)

##### 불필요 컬럼 제거

In [42]:
data_df.isna().sum()

id            0
티켓등급          0
이름            0
성별            0
나이          177
           ... 
나이등급_1        0
나이등급_2        0
티켓번호구분        0
티켓숫자          0
티켓번호구분숫자      0
Length: 61, dtype: int64

In [94]:
# 위에서 One-Hot Encoding한 컬럼을 포함해 아래 컬럼 삭제
drop_columns = [
    'id', '티켓등급', '이름', '성별', '나이', '티켓번호', '객실번호', 
    '호칭', '승선항', '나이등급', '객실구분', '객실숫자번호', '티켓번호구분' 
]

data_df = data_df.drop(columns=drop_columns)

##### object컬럼 변환

In [95]:
object_columns = data_df.select_dtypes(include=['object'])
object_columns

Unnamed: 0,티켓숫자
0,21171
1,17599
2,3101282
3,113803
4,373450
...,...
886,211536
887,112053
888,6607
889,111369


In [96]:
data_df.티켓숫자 = data_df.티켓숫자.replace('', 0)
data_df.티켓숫자 = data_df.티켓숫자.astype(int)

##### 숫자데이터 정규화

In [97]:
scaler = MinMaxScaler()

scale_columns = ['요금', '새나이']
data_df[scale_columns] = scaler.fit_transform(data_df[scale_columns])

#### 전처리결과 데이터 저장

In [98]:
data_df.to_csv(
    os.path.join(DATA_PATH, 'preprocessing_data.csv'),
    index=False
)
label_df.to_csv(
    os.path.join(DATA_PATH, 'preprocessing_label.csv'),
    index=False
)