In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Load the Titanic passenger data from the GitHub-hosted CSV file.
TITANIC_CSV_URL = 'https://github.com/MyungKyuYi/AI-class/raw/refs/heads/main/titanic.csv'
df = pd.read_csv(TITANIC_CSV_URL)

# Count the missing values in each column (displayed as the cell output).
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [30]:
# Handle missing values: only Age is imputed here. Cabin and Embarked also
# contain missing values, but those columns are dropped in a later cell,
# so they are left untouched.
# NOTE(review): the mean is taken over every row, i.e. before the
# train/test split done further down, so rows that end up in the test set
# contribute to the imputed value.
mean_age = df['Age'].mean(skipna=True)  # average of the non-missing ages
df['Age'] = df['Age'].fillna(value=mean_age)  # replace missing ages with that average

# Confirm that Age no longer has missing values.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [31]:
# Inspect the label distribution (the two classes are imbalanced).
survived_counts = df['Survived'].value_counts()
survived_counts

Survived
0    549
1    342
Name: count, dtype: int64

In [32]:
# Drop the columns that are treated as unnecessary for the prediction.
# errors='ignore' keeps this cell re-runnable: `df` is rebound to the
# reduced frame, so on a second execution the columns are already gone and
# a plain drop would raise KeyError.
df = df.drop(
    columns=['PassengerId', 'Name', 'Cabin', 'Ticket', 'Embarked'],
    errors='ignore',
)
print(df)

     Survived  Pclass     Sex        Age  SibSp  Parch     Fare
0           0       3    male  22.000000      1      0   7.2500
1           1       1  female  38.000000      1      0  71.2833
2           1       3  female  26.000000      0      0   7.9250
3           1       1  female  35.000000      1      0  53.1000
4           0       3    male  35.000000      0      0   8.0500
..        ...     ...     ...        ...    ...    ...      ...
886         0       2    male  27.000000      0      0  13.0000
887         1       1  female  19.000000      0      0  30.0000
888         0       3  female  29.699118      1      2  23.4500
889         1       1    male  26.000000      0      0  30.0000
890         0       3    male  32.000000      0      0   7.7500

[891 rows x 7 columns]


In [33]:
# Encode the categorical 'Sex' column as integers and show the value
# counts before and after the conversion.
columns_to_encode = ['Sex']
label_encoder = LabelEncoder()

# Snapshot of the counts while the column still holds the original strings.
beforeEncodeingSex = df['Sex'].value_counts()

for column in columns_to_encode:
    # The encoder is refit for each column, so after the loop it only holds
    # the classes of the last column processed.
    encoded_values = label_encoder.fit_transform(df[column])
    df[column] = encoded_values

afterEncodingSex = df['Sex'].value_counts()
print("전: ", beforeEncodeingSex, "\n", "후: ", afterEncodingSex)
print(df)

전:  Sex
male      577
female    314
Name: count, dtype: int64 
 후:  Sex
1    577
0    314
Name: count, dtype: int64
     Survived  Pclass  Sex        Age  SibSp  Parch     Fare
0           0       3    1  22.000000      1      0   7.2500
1           1       1    0  38.000000      1      0  71.2833
2           1       3    0  26.000000      0      0   7.9250
3           1       1    0  35.000000      1      0  53.1000
4           0       3    1  35.000000      0      0   8.0500
..        ...     ...  ...        ...    ...    ...      ...
886         0       2    1  27.000000      0      0  13.0000
887         1       1    0  19.000000      0      0  30.0000
888         0       3    0  29.699118      1      2  23.4500
889         1       1    1  26.000000      0      0  30.0000
890         0       3    1  32.000000      0      0   7.7500

[891 rows x 7 columns]


In [34]:
# Separate the features from the label.
target_column = 'Survived'
X = df.drop(columns=target_column)
y = df[target_column]

# Hold out 20% of the rows as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("학습 데이터 개수:", X_train.shape[0])
print("테스트 데이터 개수:", X_test.shape[0])

# Standardize the features: the scaling statistics are learned from the
# training split only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

학습 데이터 개수: 712
테스트 데이터 개수: 179


In [39]:
# Build and train the Decision Tree model (fit returns the estimator itself).
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict on the test set, then report the accuracy and the confusion
# matrix (rows: true class, columns: predicted class).
dt_pred = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)
dt_confusion = confusion_matrix(y_test, dt_pred)
print(f'Decision Tree Accuracy: {dt_accuracy}')
print(dt_confusion)

Decision Tree Accuracy: 0.7597765363128491
[[83 22]
 [21 53]]


In [40]:
# Build and train the Random Forest model (fit returns the estimator itself).
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict on the test set, then report the accuracy and the confusion
# matrix (rows: true class, columns: predicted class).
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_confusion = confusion_matrix(y_test, rf_pred)
print(f'Random Forest Accuracy: {rf_accuracy}')
print(rf_confusion)

Random Forest Accuracy: 0.8212290502793296
[[92 13]
 [19 55]]


In [41]:
# Build and train the SVM model (fit returns the estimator itself).
svm_model = SVC(random_state=42).fit(X_train, y_train)

# Predict on the test set, then report the accuracy and the confusion
# matrix (rows: true class, columns: predicted class).
svm_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)
svm_confusion = confusion_matrix(y_test, svm_pred)
print(f'SVM Accuracy: {svm_accuracy}')
print(svm_confusion)

SVM Accuracy: 0.8100558659217877
[[92 13]
 [21 53]]


In [42]:
# Build and train the Logistic Regression model, allowing up to 200 solver
# iterations (fit returns the estimator itself).
lr_model = LogisticRegression(max_iter=200, random_state=42).fit(X_train, y_train)

# Predict on the test set, then report the accuracy and the confusion
# matrix (rows: true class, columns: predicted class).
lr_pred = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
lr_confusion = confusion_matrix(y_test, lr_pred)
print(f'Logistic Regression: {lr_accuracy}')
print(lr_confusion)

Logistic Regression: 0.7988826815642458
[[90 15]
 [21 53]]
