In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import warnings

## 데이터 시각화 관련
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid') # matplotlib의 스타일에 관련한 함
warnings.filterwarnings(action='ignore')

## Scikit-Learn의 다양한 머신러닝 모듈을 불러옵니다.
## 분류 알고리즘 중에서 로지스틱 회귀, 서포트벡터머신, 랜덤포레스트, K-최근접이웃 알고리즘을 사용해보려고 합니다.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the training and test sets from the local CSV files.
# NOTE(review): these are absolute, machine-specific Windows paths; the
# notebook will not run elsewhere without editing them.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("C:\\kaggle\\title\\train.csv", "C:\\kaggle\\title\\test.csv")
)

In [3]:
# Preview the first five rows of the training set.
train_df.head(n=5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Print the column summary (non-null counts and dtypes) of the training set,
# then of the test set, with a dashed separator line between the two reports.
for position, frame in enumerate((train_df, test_df)):
    if position > 0:
        print('-'*20)
    frame.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    

# 각 데이터는 빈 부분이 있는가?
# 빈 부분이 있다면, drop할 것인가 아니면 default값으로 채워넣을 것인가
# cabin, Age, Embarked 세 항목에 주의
# 데이터는 float64로 변환할 수 있는가
# 아니라면 범주형 데이터로 만들 수 있는가

In [5]:
# Discard the columns that are not used as features.
# PassengerId is dropped from the training set only; the test set keeps it
# because the submission cell reads test_df["PassengerId"] later on.
train_df = train_df.drop(columns=['PassengerId', 'Name', 'Ticket'])
test_df = test_df.drop(columns=['Name', 'Ticket'])


In [6]:
# Pclass takes the values 1, 2 and 3; show how many passengers fall in each.
train_df.loc[:, 'Pclass'].value_counts()


3    491
1    216
2    184
Name: Pclass, dtype: int64

In [7]:
# One-hot encode Pclass.
# The indicator columns are given a string prefix ("Pclass_1", "Pclass_2",
# "Pclass_3") rather than the bare integers 1/2/3. Integer column labels mixed
# with the string labels of the other columns are rejected by recent
# scikit-learn releases when the frame is passed to fit()/predict().
pclass_train_dummies = pd.get_dummies(train_df['Pclass'], prefix='Pclass')
pclass_test_dummies = pd.get_dummies(test_df['Pclass'], prefix='Pclass')

# Train and test are encoded independently, so force the test indicators onto
# the training columns (same set, same order); a class absent from the test
# data becomes an all-zero column instead of silently changing the feature set.
pclass_test_dummies = pclass_test_dummies.reindex(
    columns=pclass_train_dummies.columns, fill_value=0
)

# Replace the original column with its indicators. Rebinding the names (instead
# of drop(..., inplace=True)) avoids mutating a frame in place.
train_df = train_df.drop(columns=['Pclass']).join(pclass_train_dummies)
test_df = test_df.drop(columns=['Pclass']).join(pclass_test_dummies)

In [8]:
# One-hot encode Sex and swap the original column for its indicators.
# DataFrame.pop removes the column from the frame and returns it, which covers
# both the lookup and the in-place drop in a single step.
sex_train_dummies = pd.get_dummies(train_df.pop('Sex'))
sex_test_dummies = pd.get_dummies(test_df.pop('Sex'))

# Positional rename of the two indicator columns.
# NOTE(review): assumes get_dummies emits the categories as (female, male),
# in that order, for both frames; confirm if the data changes.
sex_train_dummies.columns = ['Female', 'Male']
sex_test_dummies.columns = ['Female', 'Male']

train_df = train_df.join(sex_train_dummies)
test_df = test_df.join(sex_test_dummies)

In [9]:
# Age
# Options for missing values: 1. random  2. mean  3. median  4. drop rows.
# Here the mean of the training ages is used for both frames, so the test set
# is imputed with a statistic learned from the training data only.
age_mean = train_df["Age"].mean()

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the selected column: that chained in-place form is deprecated and does not
# update the parent frame when pandas Copy-on-Write is active (and the warning
# is hidden by this notebook's global warnings filter).
train_df["Age"] = train_df["Age"].fillna(age_mean)
test_df["Age"] = test_df["Age"].fillna(age_mean)

# Fare (ticket price): only the test frame is filled here.
# NOTE(review): 0 is kept from the original notebook; a training-set median
# would likely be a more plausible replacement value.
test_df["Fare"] = test_df["Fare"].fillna(0)

# Cabin: dropped entirely (only 204 of 891 training rows have a value).
train_df = train_df.drop(columns=['Cabin'])
test_df = test_df.drop(columns=['Cabin'])

In [10]:
# Embarked (port of embarkation)
print(train_df['Embarked'].value_counts())
print(test_df['Embarked'].value_counts())

# Fill missing ports with 'S', the most frequent value in the counts above.
# Assign back rather than fillna(..., inplace=True) on the selected column,
# which does not update the frame when pandas Copy-on-Write is active.
train_df["Embarked"] = train_df["Embarked"].fillna('S')
test_df["Embarked"] = test_df["Embarked"].fillna('S')

# One-hot encode. get_dummies already labels each indicator with its category,
# in sorted order (C, Q, S). The previous positional rename to ['S', 'C', 'Q']
# therefore attached the wrong port name to every column. Selecting by label
# keeps the intended S, C, Q order with the correct data under each name, and
# guarantees train and test expose the same columns.
embarked_columns = ['S', 'C', 'Q']
embarked_train_dummies = pd.get_dummies(train_df['Embarked']).reindex(
    columns=embarked_columns, fill_value=0
)
embarked_test_dummies = pd.get_dummies(test_df['Embarked']).reindex(
    columns=embarked_columns, fill_value=0
)

# Replace the original column with its indicators.
train_df = train_df.drop(columns=['Embarked']).join(embarked_train_dummies)
test_df = test_df.drop(columns=['Embarked']).join(embarked_test_dummies)

S    644
C    168
Q     77
Name: Embarked, dtype: int64
S    270
C    102
Q     46
Name: Embarked, dtype: int64


In [11]:
# Split the data: Survived is the target, every other training column is a
# feature. The test features are a copy of test_df without PassengerId.
Y_train = train_df["Survived"]
X_train = train_df.drop(columns="Survived")
X_test = test_df.drop(columns="PassengerId").copy()


In [12]:
# Apply the machine-learning algorithms.
# Logistic Regression: fit() returns the estimator, so construction and
# training are chained.
logreg = LogisticRegression().fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

# Accuracy on the training data itself (not a held-out estimate).
logreg.score(X_train, Y_train)

0.8058361391694725

In [13]:
# Support Vector Machines: default SVC trained on the same features.
svc = SVC().fit(X_train, Y_train)

Y_pred = svc.predict(X_test)

# Accuracy on the training data itself (not a held-out estimate).
svc.score(X_train, Y_train)

0.6868686868686869

In [14]:
# Random Forests
# random_state pins the forest's internal randomness so that the score below
# and the predictions are the same on every Restart & Run All. The value 42 is
# arbitrary; any fixed integer works.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

random_forest.fit(X_train, Y_train)

Y_pred = random_forest.predict(X_test)

# Accuracy on the training data itself (not a held-out estimate).
random_forest.score(X_train, Y_train)

0.9820426487093153

In [15]:
# K-Nearest Neighbours with k = 3, trained on the same features.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)

Y_pred = knn.predict(X_test)

# Accuracy on the training data itself (not a held-out estimate).
knn.score(X_train, Y_train)

0.835016835016835

#제출용 파일 만들기

In [17]:
# Random Forests: retrain here because Y_pred was overwritten by the models
# fitted in the cells above.
# random_state pins the forest's internal randomness so the submission file is
# identical on every run. The value 42 is arbitrary; any fixed integer works.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
# The former mid-cell random_forest.score(...) call was removed: it was not the
# last expression of the cell, so its result was computed and then discarded.

# Pair each test passenger id with its predicted label and write the CSV
# (without the DataFrame index) to the working directory.
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
submission.to_csv('titanic.csv', index=False)