In [1]:
# Python 3.10.12
# Numpy : 1.23.5
# Pandas : 1.5.3
# Matplotlib : 3.7.1
# Seaborn : 0.12.2
# Scikit-learn : 1.2.2
# Created: OCT. 23. 2023
# Author: D.W. SHIN

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

# Mount Google Drive at /content/drive (requires the Google Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# CSV file locations. The shared Drive directory is defined once so that
# relocating the data only requires editing DATA_DIR.
DATA_DIR = '/content/drive/MyDrive/구글클라우드인공지능개발자과정/scikit-learn/data/titanic'
trainCSV = f'{DATA_DIR}/train.csv'
testCSV  = f'{DATA_DIR}/test.csv'

# Load the training CSV and print its column / dtype / null-count summary
train_df = pd.read_csv(trainCSV)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### 결측치 처리 부분

In [3]:
# Fill missing Age values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form is
# deprecated in pandas 2.x and leaves the frame unchanged under Copy-on-Write.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())
# Display the remaining number of missing ages (expected: 0)
train_df['Age'].isnull().sum()

0

In [4]:
# Fill missing Embarked values with the most frequent value (first mode).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form is
# deprecated in pandas 2.x and leaves the frame unchanged under Copy-on-Write.
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
# Display the remaining number of missing values (expected: 0)
train_df['Embarked'].isnull().sum()

0

### 추가 컬럼

In [5]:
# Bucket Age into six named categories. include_lowest=True makes the first
# bin also contain its left edge (age 0).
age_cat_edges = [0, 3, 7, 15, 30, 60, 100]
age_cat_names = ['Baby', 'Children', 'Teenage', 'Young', 'Adult', 'Old']
train_df['Age_Cat'] = pd.cut(
    train_df['Age'],
    bins=age_cat_edges,
    labels=age_cat_names,
    include_lowest=True,
)

In [6]:
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger)
train_df['FamilySize'] = train_df['SibSp'].add(train_df['Parch']).add(1)

In [7]:
# Split Age into 5 equal-width intervals, then display the mean of Survived
# for each interval, ordered by interval.
train_df['AgeBand'] = pd.cut(train_df['Age'], bins=5)
survival_by_band = (
    train_df[['AgeBand', 'Survived']]
    .groupby(['AgeBand'], as_index=False)
    .mean()
)
survival_by_band.sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(0.34, 16.336]",0.55
1,"(16.336, 32.252]",0.344168
2,"(32.252, 48.168]",0.404255
3,"(48.168, 64.084]",0.434783
4,"(64.084, 80.0]",0.090909


In [8]:
# Replace Age with an ordinal code:
#   <= 16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, > 64 -> 4
# labels=False makes pd.cut return the bin index; the cast keeps the column
# as float64.
# NOTE: Age is overwritten, so running this cell a second time would only see
# codes 0-4 (all <= 16) and collapse every row to 0.
age_code_edges = [-np.inf, 16, 32, 48, 64, np.inf]
train_df['Age'] = pd.cut(train_df['Age'], bins=age_code_edges, labels=False).astype('float64')

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_Cat,FamilySize,AgeBand
0,1,0,3,"Braund, Mr. Owen Harris",male,1.0,1,0,A/5 21171,7.25,,S,Young,2,"(16.336, 32.252]"
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,2.0,1,0,PC 17599,71.2833,C85,C,Adult,2,"(32.252, 48.168]"
2,3,1,3,"Heikkinen, Miss. Laina",female,1.0,0,0,STON/O2. 3101282,7.925,,S,Young,1,"(16.336, 32.252]"
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,2.0,1,0,113803,53.1,C123,S,Adult,2,"(32.252, 48.168]"
4,5,0,3,"Allen, Mr. William Henry",male,2.0,0,0,373450,8.05,,S,Adult,1,"(32.252, 48.168]"


### 레이블링 작업

In [9]:
# Integer-encode the categorical columns.
# Each column gets its own fitted LabelEncoder, stored in `label_encoders`,
# so the same mapping can later be re-applied (transform) or reversed
# (inverse_transform). Refitting a single shared encoder would keep only the
# last column's mapping.
# LabelEncoder numbers classes in sorted order, so the Age_Cat codes follow
# the alphabetical order of the labels, not the age order.
label_encoders = {}
for column in ['Sex', 'Embarked', 'Age_Cat']:
    le = LabelEncoder()
    train_df[column] = le.fit_transform(train_df[column])
    label_encoders[column] = le
# After the loop `le` is the encoder fitted on Age_Cat, as before.

### 컬럼 삭제

In [10]:
# Drop the listed columns in place, then preview the last rows.
dropped_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'AgeBand']
train_df.drop(columns=dropped_columns, inplace=True)
train_df.tail()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_Cat,FamilySize
886,0,2,1,1.0,0,0,13.0,2,5,1
887,1,1,0,1.0,0,0,30.0,2,5,1
888,0,3,0,1.0,1,2,23.45,2,5,4
889,1,1,1,1.0,0,0,30.0,0,5,1
890,0,3,1,1.0,0,0,7.75,1,0,1


### 데이터셋 나누기(231128 수정 : test_size 0.2에서 0.1로 수정)

In [11]:
test_size = 0.1

# Hold-out split stratified on Survived, with a fixed random_state so the
# partition is repeatable.
features = train_df.drop(columns=['Survived'])
target = train_df['Survived']
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=test_size,
    stratify=target,
    random_state=11,
)

In [13]:
# Save the preprocessed splits to CSV as intermediate files to copy to GCP
# (added 231128).
# The label is attached with assign(), which returns a new frame: x_train and
# x_test keep only feature columns (so the target cannot leak into a later
# fit on them) and no SettingWithCopyWarning is raised on the split results.
train_export = x_train.assign(Survived=y_train)
# The training rows are written twice (hence `dble`).
# NOTE(review): confirm that duplicating the training set is intended.
x_train_dble = pd.concat([train_export, train_export])
x_train_dble.to_csv('titanic_train.csv', index=False)

test_export = x_test.assign(Survived=y_test)
test_export.to_csv('titanic_test.csv', index=False)