In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import random
import warnings
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier


warnings.filterwarnings(action='ignore')

In [None]:
train_data = pd.read_csv('/content/removed_train1.csv')
test_data = pd.read_csv('/content/removed_test1.csv')

In [None]:
data = pd.concat([test_data, train_data], ignore_index = True)
data = pd.concat([test_data, train_data], axis = 1)
data = pd.concat([test_data, train_data], axis = 0)

In [None]:
data.info()

In [None]:
X = data.drop(columns = 'Diagnosis')
y = data['Diagnosis']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
xgb_clf = xgb.XGBClassifier(n_estimators = 100,
                            learning_rate = 0.1,
                            max_depth = 5,
                            use_label_encoder = False,
                            gamma = 0)

xgb_clf.fit(X_train, y_train)
predictions = xgb_clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'정확도 : {accuracy}')

정확도 : 0.9855769230769231


In [None]:
k_fold = KFold(n_splits = 5, shuffle = True, random_state = 42)
scores = cross_val_score(xgb_clf, X_train, y_train, cv = k_fold)

for i, score in enumerate(scores, 1):
    print(f"Fold {i} Score: {score}")

print(f"Mean Score: {scores.mean()}")

Fold 1 Score: 0.9894815927873779
Fold 2 Score: 0.9849737039819685
Fold 3 Score: 0.9849737039819685
Fold 4 Score: 0.9887302779864763
Fold 5 Score: 0.98796992481203
Mean Score: 0.9872258407099641


In [None]:
lgb_clf = lgb.LGBMClassifier(n_estimators = 500, learning_rate = 0.1, max_depth = 7, random_state = 42)
lgb_clf.fit(X_train, y_train)

predictions = lgb_clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'정확도 : {accuracy}')

[LightGBM] [Info] Number of positive: 3291, number of negative: 3363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000962 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2940
[LightGBM] [Info] Number of data points in the train set: 6654, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494590 -> initscore=-0.021642
[LightGBM] [Info] Start training from score -0.021642
정확도 : 0.9861778846153846


In [None]:
log_Reg = LogisticRegression(max_iter = 1000)
log_Reg.fit(X_train, y_train)

predictions = log_Reg.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'정확도 : {accuracy}')

정확도 : 0.9819711538461539


In [None]:
rf_clf = RandomForestClassifier(n_estimators = 500, random_state = 42)

rf_clf.fit(X_train, y_train)

predictions = rf_clf.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print(f"정확도: {accuracy}%")

정확도: 0.9825721153846154%


In [None]:
from sklearn.neighbors import KNeighborsClassifier

svm = SVC(kernel='poly', C=1.5, random_state=42)
svm.fit(X_train, y_train)

predictions = svm.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print(f"정확도: {accuracy}%")

정확도: 0.9867788461538461%


In [None]:
k_fold = KFold(n_splits = 5, shuffle = True, random_state = 42)
scores = cross_val_score(svm, X_train, y_train, cv = k_fold)

for i, score in enumerate(scores, 1):
    print(f"Fold {i} Score: {score}")

print(f"Mean Score: {scores.mean()}")

Fold 1 Score: 0.9864763335837716
Fold 2 Score: 0.9864763335837716
Fold 3 Score: 0.9864763335837716
Fold 4 Score: 0.9864763335837716
Fold 5 Score: 0.9909774436090225
Mean Score: 0.9873765555888218


In [None]:
ada_clf = AdaBoostClassifier(n_estimators=200, learning_rate = 0.1, algorithm = 'SAMME', random_state=42)
ada_clf.fit(X_train, y_train)

predictions = ada_clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"정확도: {accuracy}%")

정확도: 0.9807692307692307%


In [None]:
gb_clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1, max_depth=7, random_state=42)
gb_clf.fit(X_train, y_train)

predictions = gb_clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f'정확도: {accuracy}')

정확도: 0.984375


In [None]:
model1 = lgb.LGBMClassifier(n_estimators = 500, learning_rate = 0.1, max_depth = 7, random_state = 42)
model2 = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1, max_depth=7, random_state=42)

model1.fit(X_train, y_train)
model2.fit(X_train, y_train)

y_pred1 = model1.predict(X_train)
y_pred2 = model2.predict(X_train)

X_meta = np.column_stack((y_pred1, y_pred2))

meta_model = SVC(kernel='poly', C=1.0, random_state=42)

meta_model.fit(X_meta, y_train)

pred1_test = model1.predict(X_test)
pred2_test = model2.predict(X_test)

X_meta_test = np.column_stack((pred1_test, pred2_test))

final_predictions = meta_model.predict(X_meta_test)

print("최종 정확도 :", accuracy_score(y_test, final_predictions))

[LightGBM] [Info] Number of positive: 3291, number of negative: 3363
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000336 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2940
[LightGBM] [Info] Number of data points in the train set: 6654, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.494590 -> initscore=-0.021642
[LightGBM] [Info] Start training from score -0.021642
최종 정확도 : 0.9855769230769231


In [None]:
k_fold = KFold(n_splits = 5, shuffle = True, random_state = 42)
scores = cross_val_score(meta_model, X_train, y_train, cv = k_fold)

for i, score in enumerate(scores, 1):
    print(f"Fold {i} Score: {score}")

print(f"Mean Score: {scores.mean()}")

Fold 1 Score: 0.9849737039819685
Fold 2 Score: 0.98572501878287
Fold 3 Score: 0.9864763335837716
Fold 4 Score: 0.98572501878287
Fold 5 Score: 0.9902255639097745
Mean Score: 0.9866251278082508


In [None]:
!pip install pytorch_tabnet



In [None]:
import torch
import torch.nn as nn
from pytorch_tabnet.tab_model import TabNetClassifier

X1 = X.values
y1 = y.values

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

clf = TabNetClassifier()
clf.fit(X_train1, y_train1, max_epochs = 500, patience = 10)
predictions = clf.predict(X_test1)
accuracy = accuracy_score(y_test1, predictions)

print(f'정확도 : {accuracy}')

epoch 0  | loss: 0.57125 |  0:00:00s
epoch 1  | loss: 0.31302 |  0:00:00s
epoch 2  | loss: 0.18234 |  0:00:00s
epoch 3  | loss: 0.1149  |  0:00:00s
epoch 4  | loss: 0.08313 |  0:00:00s
epoch 5  | loss: 0.06957 |  0:00:01s
epoch 6  | loss: 0.06012 |  0:00:01s
epoch 7  | loss: 0.05117 |  0:00:01s
epoch 8  | loss: 0.05435 |  0:00:01s
epoch 9  | loss: 0.04428 |  0:00:01s
epoch 10 | loss: 0.04431 |  0:00:01s
epoch 11 | loss: 0.03733 |  0:00:02s
epoch 12 | loss: 0.04275 |  0:00:02s
epoch 13 | loss: 0.0379  |  0:00:02s
epoch 14 | loss: 0.03384 |  0:00:02s
epoch 15 | loss: 0.03796 |  0:00:02s
epoch 16 | loss: 0.03798 |  0:00:02s
epoch 17 | loss: 0.03427 |  0:00:03s
epoch 18 | loss: 0.03743 |  0:00:03s
epoch 19 | loss: 0.02921 |  0:00:03s
epoch 20 | loss: 0.03156 |  0:00:03s
epoch 21 | loss: 0.03275 |  0:00:03s
epoch 22 | loss: 0.03323 |  0:00:03s
epoch 23 | loss: 0.02771 |  0:00:04s
epoch 24 | loss: 0.0296  |  0:00:04s
epoch 25 | loss: 0.03657 |  0:00:04s
epoch 26 | loss: 0.02801 |  0:00:04s
e