In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns

In [47]:
# Source CSV for the Titanic passenger data (fetched over HTTP on every run).
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Preview the first five rows.
print(df.head(n=5))

# Count missing values per column.
print(df.isna().sum())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
Pa

In [48]:
# Handle missing values.
# Rows without an 'Embarked' value are dropped. The result is rebound to `df`
# instead of using inplace=True, so the step is an explicit, chainable
# transformation rather than a hidden mutation.
df = df.dropna(subset=["Embarked"])

# Fill missing ages with the mean age of the remaining rows.
# NOTE(review): the mean is computed over the whole frame, i.e. before the
# train/test split performed later, so test rows contribute to the imputed value.
mean_age = df["Age"].mean()
# assign() returns a new frame, which avoids a chained-assignment warning when
# writing a column into the filtered result of dropna().
df = df.assign(Age=df["Age"].fillna(mean_age))

# Verify the remaining missing-value counts per column.
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64


In [42]:
# Inspect the distinct values (and their frequencies) of every column.
for column_name in df.columns:
    column_counts = df[column_name].value_counts()
    print("\n", column_counts)


 PassengerId
1      1
599    1
588    1
589    1
590    1
      ..
302    1
303    1
304    1
305    1
891    1
Name: count, Length: 889, dtype: int64

 Survived
0    549
1    340
Name: count, dtype: int64

 Pclass
3    491
1    214
2    184
Name: count, dtype: int64

 Name
Braund, Mr. Owen Harris              1
Boulos, Mr. Hanna                    1
Frolicher-Stehli, Mr. Maxmillian     1
Gilinski, Mr. Eliezer                1
Murdlin, Mr. Joseph                  1
                                    ..
McCoy, Mr. Bernard                   1
Johnson, Mr. William Cahoone Jr      1
Keane, Miss. Nora A                  1
Williams, Mr. Howard Hugh "Harry"    1
Dooley, Mr. Patrick                  1
Name: count, Length: 889, dtype: int64

 Sex
male      577
female    312
Name: count, dtype: int64

 Age
29.642093    177
24.000000     30
22.000000     27
18.000000     26
28.000000     25
            ... 
36.500000      1
55.500000      1
0.920000       1
23.500000      1
74.000000      1
Nam

In [49]:
# Remove the columns that are not used as model inputs.
columns_to_delete = ['Name', 'PassengerId', 'Ticket', 'Cabin', 'Embarked']
df = df.drop(columns_to_delete, axis="columns")

In [51]:
# Encode categorical columns as integers.
label_encoder = LabelEncoder()
columns_to_encode = ['Sex']
# Drive the encoding from `columns_to_encode` so the list is the single place
# to edit when more categorical columns are added (it was previously declared
# but ignored in favour of a hard-coded column name).
# The encoder is re-fitted for each column, so after the loop `label_encoder`
# holds the classes of the last column encoded.
for column in columns_to_encode:
    df[column] = label_encoder.fit_transform(df[column])

# Re-check the value distribution of every column after encoding.
for label in df.columns:
    print("\n", df[label].value_counts())


 Survived
0    549
1    340
Name: count, dtype: int64

 Pclass
3    491
1    214
2    184
Name: count, dtype: int64

 Sex
1    577
0    312
Name: count, dtype: int64

 Age
29.642093    177
24.000000     30
22.000000     27
18.000000     26
28.000000     25
            ... 
36.500000      1
55.500000      1
0.920000       1
23.500000      1
74.000000      1
Name: count, Length: 89, dtype: int64

 SibSp
0    606
1    209
2     28
4     18
3     16
8      7
5      5
Name: count, dtype: int64

 Parch
0    676
1    118
2     80
5      5
3      5
4      4
6      1
Name: count, dtype: int64

 Fare
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
35.0000     1
28.5000     1
6.2375      1
14.0000     1
10.5167     1
Name: count, Length: 247, dtype: int64


In [54]:
# Separate the target column from the feature columns.
y = df['Survived']
x = df.drop(columns='Survived')

# Hold out 30% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [62]:
# 1. Decision tree, trained and evaluated on the unscaled features.
from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator itself, so construction and training can be chained.
dt_model = DecisionTreeClassifier(random_state=0).fit(x_train, y_train)
y_pred_dt = dt_model.predict(x_test)

# Report test-set accuracy followed by the confusion matrix.
print("DT Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_dt))
print(confusion_matrix(y_true=y_test, y_pred=y_pred_dt))

DT Accuracy: 0.8202247191011236
[[138  27]
 [ 21  81]]


In [68]:
# 2. Random forest, trained and evaluated on the unscaled features.
from sklearn.ensemble import RandomForestClassifier

# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(random_state=0).fit(x_train, y_train)
y_pred_rf = rf_model.predict(x_test)

# Report test-set accuracy followed by the confusion matrix.
print("RF Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_rf))
print(confusion_matrix(y_true=y_test, y_pred=y_pred_rf))

RF Accuracy: 0.8352059925093633
[[145  20]
 [ 24  78]]


In [67]:
# 3. Support vector classifier, trained and evaluated on the standardized features.
from sklearn.svm import SVC

# fit() returns the estimator itself, so construction and training can be chained.
svm_model = SVC(random_state=0).fit(x_train_scaled, y_train)
y_pred_svm = svm_model.predict(x_test_scaled)

# Report test-set accuracy followed by the confusion matrix.
print("SVM Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_svm))
print(confusion_matrix(y_true=y_test, y_pred=y_pred_svm))

SVM Accuracy: 0.8614232209737828
[[151  14]
 [ 23  79]]


In [66]:
# 4. Logistic regression, trained and evaluated on the standardized features.
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be chained.
lr_model = LogisticRegression(random_state=0).fit(x_train_scaled, y_train)
y_pred_lr = lr_model.predict(x_test_scaled)

# Report test-set accuracy followed by the confusion matrix.
print("LR Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_lr))
print(confusion_matrix(y_true=y_test, y_pred=y_pred_lr))

LR Accuracy: 0.8277153558052435
[[147  18]
 [ 28  74]]


In [72]:
# 5. K-nearest neighbours, trained and evaluated on the standardized features.
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbour counts: 1 through 100 inclusive.
k_range = range(1, 101)
scores = []

# Record the test-set accuracy obtained with each candidate k.
# NOTE(review): k is chosen by its accuracy on the test split, so the accuracy
# reported below is optimistically biased; consider selecting k with
# cross-validation on the training split instead.
for k in k_range:
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(x_train_scaled, y_train)
    y_pred_knn = knn_model.predict(x_test_scaled)
    scores.append(accuracy_score(y_test, y_pred_knn))

best_score = max(scores)
max_index = best_score  # legacy name (it holds a score, not an index); kept for compatibility
# scores[i] belongs to k_range[i], and k_range starts at 1, so the list position
# must be mapped back through k_range. Using the raw position as n_neighbors
# would select k-1 (and 0 when k=1 is best). On ties, index() returns the first
# match, i.e. the smallest k with the best score.
optimal_k = k_range[scores.index(best_score)]

# Refit with the selected k and report its test-set performance.
knn_model = KNeighborsClassifier(n_neighbors=optimal_k)
knn_model.fit(x_train_scaled, y_train)
y_pred_knn = knn_model.predict(x_test_scaled)

print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print(confusion_matrix(y_test, y_pred_knn))

KNN Accuracy: 0.8539325842696629
[[147  18]
 [ 21  81]]
