# 타이타닉 생존자 예측

In [2]:
import pandas as pd

# Read the raw Titanic passenger records and preview the first five rows.
titanic_df = pd.read_csv(filepath_or_buffer='titanic3.csv')
titanic_df.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
# List the column labels of the raw table to see which fields are available.
titanic_df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

#### 전처리 종류
- 결측값 제거
- 불필요한(중복) 컬럼 제거
- 파생변수 생성
- encoding

#### 내가 전처리하지 않은 파일

In [4]:
# Load the table stored in 'tdf.pkl' and display it.
# NOTE(review): unpickling can execute arbitrary code, so 'tdf.pkl' should
# only be loaded if it comes from a trusted source -- confirm its origin.
tit_df = pd.read_pickle('tdf.pkl')
tit_df

Unnamed: 0,survived,fare_cat,age_cat,family,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,1,1,0,0,1,0,0,0,1
1,1,1,4,3,0,1,0,0,1
2,0,1,4,3,1,0,0,0,1
3,0,1,0,3,0,1,0,0,1
4,0,1,0,3,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1304,0,3,3,1,1,0,1,0,0
1305,0,3,0,1,1,0,1,0,0
1306,0,4,0,0,0,1,1,0,0
1307,0,4,0,0,0,1,1,0,0


In [5]:
# List the column labels of the loaded table (the label plus the model features).
tit_df.columns

Index(['survived', 'fare_cat', 'age_cat', 'family', 'sex_female', 'sex_male',
       'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [6]:
# Split the table into features/target and then into train/test partitions.
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label is 'survived'.
x_tdf = tit_df.drop(columns='survived')
y_tdf = tit_df.loc[:, 'survived']

# Hold out 20% of the rows for evaluation (train:test = 8:2); the fixed seed
# keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_tdf,
    y_tdf,
    test_size=0.2,
    random_state=11,
)
print(X_train.shape, X_test.shape)

(1047, 8) (262, 8)


In [7]:
# Train two classifiers on the training split and compare their accuracy on
# the held-out test split.
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings. No random_state is passed, so its
# score may differ slightly from run to run.
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)        # learn from the training rows
rf_pred = rf_model.predict(X_test)    # predict labels for the unseen rows
# Use the builtin round() rather than the .round() method: accuracy_score is
# documented to return a float, and a plain Python float has no .round()
# method. The builtin works for both Python floats and NumPy scalars.
accuracy_rf = round(accuracy_score(y_test, rf_pred), 2)


# Logistic regression with default settings, evaluated the same way.
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
accuracy_lr = round(accuracy_score(y_test, lr_pred), 2)

print(f'rf 정확도: {accuracy_rf}, lr 정확도: {accuracy_lr}')

rf 정확도: 0.77, lr 정확도: 0.81


In [9]:
import pickle
import joblib

# Persist the fitted logistic-regression model to disk so it can be reloaded
# later without retraining.
filename = 'tcl_model.pkl'
joblib.dump(value=lr_model, filename=filename)

['tcl_model.pkl']

In [10]:
# Load the saved file back; the result is the stored model object.

mdl = joblib.load(filename='tcl_model.pkl')
mdl

LogisticRegression()

In [11]:
# Try the reloaded model on one hand-written input row.
# The input is passed as a DataFrame whose column names and order match the
# feature columns of tit_df (everything except 'survived').

data = [3, 0, 0, 0, 1, 0, 0, 1]
# Build the one-row frame directly from the values. Starting from an empty
# frame and enlarging it with .loc begins with object-dtype columns, whereas
# constructing from a list of rows lets pandas infer the integer dtype.
db_df = pd.DataFrame(
    [data],
    columns=['fare_cat', 'age_cat', 'family', 'sex_female', 'sex_male',
             'embarked_C', 'embarked_Q', 'embarked_S'],
)
y_pred = mdl.predict(db_df)
y_pred

array([0], dtype=int64)