# LightGBM

진영님 LightGBM 코드에 기반하여, 정규화 작업을 추가한 후 모델 성능을 평가합니다.

In [5]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

### train.csv / test.csv

##### Dataset Version 1

In [12]:
# Read the pre-split train/test CSV files for dataset version 1
train_path = "/content/removed_train1.csv"
test_path = "/content/removed_test1.csv"

train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

# Separate the 'Diagnosis' target column from the remaining feature columns
X_train, y_train = train_data.drop(columns='Diagnosis'), train_data['Diagnosis']
X_test, y_test = test_data.drop(columns='Diagnosis'), test_data['Diagnosis']

# A single classifier object is used; fit() is called once per feature variant
lgb_clf = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=7,
    random_state=42,
)

# The scaler is fitted on the training features only, then applied to the test features
mms = MinMaxScaler()
X_train_scaled = mms.fit_transform(X_train)
X_test_scaled = mms.transform(X_test)

# Train and evaluate on the raw features first, then on the MinMax-scaled features
for banner, fit_features, eval_features in (
    ("---No Scaling---", X_train, X_test),
    ("---MinMax Scaling---", X_train_scaled, X_test_scaled),
):
    print(banner)

    # Fit on the training split and predict labels for the test split
    lgb_clf.fit(fit_features, y_train)
    y_pred = lgb_clf.predict(eval_features)

    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_matrix)

    # Row 0 of the confusion matrix supplies true negatives (TN) and false positives (FP)
    TN, FP = conf_matrix[0, 0], conf_matrix[0, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    # Specificity is derived by hand from the confusion-matrix counts: TN / (TN + FP)
    specificity = TN / (TN + FP)
    f1 = f1_score(y_test, y_pred)

    # Report every metric with four decimal places
    for template, score in (
        ("Accuracy: {:.4f}", accuracy),
        ("Precision: {:.4f}", precision),
        ("Recall: {:.4f}", recall),
        ("Specificity: {:.4f}", specificity),
        ("F1 score: {:.4f}", f1),
    ):
        print(template.format(score))

---No Scaling---
[LightGBM] [Info] Number of positive: 3715, number of negative: 3920
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000696 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2941
[LightGBM] [Info] Number of data points in the train set: 7635, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.486575 -> initscore=-0.053713
[LightGBM] [Info] Start training from score -0.053713
Confusion Matrix:
 [[319   3]
 [  4 357]]
Accuracy: 0.9898
Precision: 0.9917
Recall: 0.9889
Specificity: 0.9907
F1 score: 0.9903
---MinMax Scaling---
[LightGBM] [Info] Number of positive: 3715, number of negative: 3920
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002230 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2942
[LightGBM] [I

##### Dataset Version 2

In [7]:
# Read the pre-split train/test CSV files for dataset version 2
train_path = "/content/removed_train2.csv"
test_path = "/content/removed_test2.csv"

train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

# Separate the 'Diagnosis' target column from the remaining feature columns
X_train, y_train = train_data.drop(columns='Diagnosis'), train_data['Diagnosis']
X_test, y_test = test_data.drop(columns='Diagnosis'), test_data['Diagnosis']

# A single classifier object is used; fit() is called once per feature variant
lgb_clf = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=7,
    random_state=42,
)

# The scaler is fitted on the training features only, then applied to the test features
mms = MinMaxScaler()
X_train_scaled = mms.fit_transform(X_train)
X_test_scaled = mms.transform(X_test)

# Train and evaluate on the raw features first, then on the MinMax-scaled features
for banner, fit_features, eval_features in (
    ("---No Scaling---", X_train, X_test),
    ("---MinMax Scaling---", X_train_scaled, X_test_scaled),
):
    print(banner)

    # Fit on the training split and predict labels for the test split
    lgb_clf.fit(fit_features, y_train)
    y_pred = lgb_clf.predict(eval_features)

    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_matrix)

    # Row 0 of the confusion matrix supplies true negatives (TN) and false positives (FP)
    TN, FP = conf_matrix[0, 0], conf_matrix[0, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    # Specificity is derived by hand from the confusion-matrix counts: TN / (TN + FP)
    specificity = TN / (TN + FP)
    f1 = f1_score(y_test, y_pred)

    # Report every metric with four decimal places
    for template, score in (
        ("Accuracy: {:.4f}", accuracy),
        ("Precision: {:.4f}", precision),
        ("Recall: {:.4f}", recall),
        ("Specificity: {:.4f}", specificity),
        ("F1 score: {:.4f}", f1),
    ):
        print(template.format(score))

---No Scaling---
[LightGBM] [Info] Number of positive: 3633, number of negative: 3937
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001910 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2947
[LightGBM] [Info] Number of data points in the train set: 7570, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.479921 -> initscore=-0.080360
[LightGBM] [Info] Start training from score -0.080360
Confusion Matrix:
 [[312   7]
 [  4 367]]
Accuracy: 0.9841
Precision: 0.9813
Recall: 0.9892
Specificity: 0.9781
F1 score: 0.9852
---MinMax Scaling---
[LightGBM] [Info] Number of positive: 3633, number of negative: 3937
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002096 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2948
[LightGBM] [Info] Number of data points in the train set: 7570, number of use

##### Dataset Version 3

In [9]:
# Read the pre-split train/test CSV files for dataset version 3
train_path = "/content/removed_train3.csv"
test_path = "/content/removed_test3.csv"

train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

# Separate the 'Diagnosis' target column from the remaining feature columns
X_train, y_train = train_data.drop(columns='Diagnosis'), train_data['Diagnosis']
X_test, y_test = test_data.drop(columns='Diagnosis'), test_data['Diagnosis']

# A single classifier object is used; fit() is called once per feature variant
lgb_clf = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=7,
    random_state=42,
)

# The scaler is fitted on the training features only, then applied to the test features
mms = MinMaxScaler()
X_train_scaled = mms.fit_transform(X_train)
X_test_scaled = mms.transform(X_test)

# Train and evaluate on the raw features first, then on the MinMax-scaled features
for banner, fit_features, eval_features in (
    ("---No Scaling---", X_train, X_test),
    ("---MinMax Scaling---", X_train_scaled, X_test_scaled),
):
    print(banner)

    # Fit on the training split and predict labels for the test split
    lgb_clf.fit(fit_features, y_train)
    y_pred = lgb_clf.predict(eval_features)

    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_matrix)

    # Row 0 of the confusion matrix supplies true negatives (TN) and false positives (FP)
    TN, FP = conf_matrix[0, 0], conf_matrix[0, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    # Specificity is derived by hand from the confusion-matrix counts: TN / (TN + FP)
    specificity = TN / (TN + FP)
    f1 = f1_score(y_test, y_pred)

    # Report every metric with four decimal places
    for template, score in (
        ("Accuracy: {:.4f}", accuracy),
        ("Precision: {:.4f}", precision),
        ("Recall: {:.4f}", recall),
        ("Specificity: {:.4f}", specificity),
        ("F1 score: {:.4f}", f1),
    ):
        print(template.format(score))

---No Scaling---
[LightGBM] [Info] Number of positive: 4474, number of negative: 4453
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2947
[LightGBM] [Info] Number of data points in the train set: 8927, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501176 -> initscore=0.004705
[LightGBM] [Info] Start training from score 0.004705
Confusion Matrix:
 [[361   9]
 [  5 520]]
Accuracy: 0.9844
Precision: 0.9830
Recall: 0.9905
Specificity: 0.9757
F1 score: 0.9867
---MinMax Scaling---
[LightGBM] [Info] Number of positive: 4474, number of negative: 4453
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002640 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2947
[LightGBM] [Info] Number of data points in the train set: 8927, number of used 

##### Dataset Version 4

In [10]:
# Read the pre-split train/test CSV files for dataset version 4
train_path = "/content/removed_train4.csv"
test_path = "/content/removed_test4.csv"

train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

# Separate the 'Diagnosis' target column from the remaining feature columns
X_train, y_train = train_data.drop(columns='Diagnosis'), train_data['Diagnosis']
X_test, y_test = test_data.drop(columns='Diagnosis'), test_data['Diagnosis']

# A single classifier object is used; fit() is called once per feature variant
lgb_clf = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=7,
    random_state=42,
)

# The scaler is fitted on the training features only, then applied to the test features
mms = MinMaxScaler()
X_train_scaled = mms.fit_transform(X_train)
X_test_scaled = mms.transform(X_test)

# Train and evaluate on the raw features first, then on the MinMax-scaled features
for banner, fit_features, eval_features in (
    ("---No Scaling---", X_train, X_test),
    ("---MinMax Scaling---", X_train_scaled, X_test_scaled),
):
    print(banner)

    # Fit on the training split and predict labels for the test split
    lgb_clf.fit(fit_features, y_train)
    y_pred = lgb_clf.predict(eval_features)

    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_matrix)

    # Row 0 of the confusion matrix supplies true negatives (TN) and false positives (FP)
    TN, FP = conf_matrix[0, 0], conf_matrix[0, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    # Specificity is derived by hand from the confusion-matrix counts: TN / (TN + FP)
    specificity = TN / (TN + FP)
    f1 = f1_score(y_test, y_pred)

    # Report every metric with four decimal places
    for template, score in (
        ("Accuracy: {:.4f}", accuracy),
        ("Precision: {:.4f}", precision),
        ("Recall: {:.4f}", recall),
        ("Specificity: {:.4f}", specificity),
        ("F1 score: {:.4f}", f1),
    ):
        print(template.format(score))

---No Scaling---
[LightGBM] [Info] Number of positive: 4551, number of negative: 4471
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001196 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2951
[LightGBM] [Info] Number of data points in the train set: 9022, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504434 -> initscore=0.017735
[LightGBM] [Info] Start training from score 0.017735
Confusion Matrix:
 [[350  10]
 [  8 536]]
Accuracy: 0.9801
Precision: 0.9817
Recall: 0.9853
Specificity: 0.9722
F1 score: 0.9835
---MinMax Scaling---
[LightGBM] [Info] Number of positive: 4551, number of negative: 4471
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001790 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2952
[LightGBM] [Inf

### 전처리 전 dataset

In [11]:
# Read the train/test CSV files that have not gone through the preprocessing step
train_path = "/content/train_modified.csv"
test_path = "/content/test_edit.csv"

train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

# Integer codes for each categorical column; the same table is applied to both frames.
# Series.map turns any value that is not listed here into NaN.
category_codes = {
    'Sex': {'Male' : 0, 'Female' : 1},
    'Race': {'Asian' : 0, 'Hispanic' : 1, 'Black' : 2, 'White' : 3},
    'Housing': {'Unstable' : 0, 'Stable' : 1},
    'Delay': {'No' : 0, 'Yes' : 1},
}
for frame in (train_data, test_data):
    for column, codes in category_codes.items():
        frame[column] = frame[column].map(codes)

# Remove training rows that contain any missing value (the test frame is not filtered)
train_data = train_data.dropna()

# Remove the unused 'Unnamed: 0' column from the test frame
test_data = test_data.drop(columns='Unnamed: 0')

# Separate the 'Diagnosis' target column from the remaining feature columns
X_train, y_train = train_data.drop(columns='Diagnosis'), train_data['Diagnosis']
X_test, y_test = test_data.drop(columns='Diagnosis'), test_data['Diagnosis']

# A single classifier object is used; fit() is called once per feature variant
lgb_clf = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=7,
    random_state=42,
)

# The scaler is fitted on the training features only, then applied to the test features
mms = MinMaxScaler()
X_train_scaled = mms.fit_transform(X_train)
X_test_scaled = mms.transform(X_test)

# Train and evaluate on the raw features first, then on the MinMax-scaled features
for banner, fit_features, eval_features in (
    ("---No Scaling---", X_train, X_test),
    ("---MinMax Scaling---", X_train_scaled, X_test_scaled),
):
    print(banner)

    # Fit on the training split and predict labels for the test split
    lgb_clf.fit(fit_features, y_train)
    y_pred = lgb_clf.predict(eval_features)

    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_matrix)

    # Row 0 of the confusion matrix supplies true negatives (TN) and false positives (FP)
    TN, FP = conf_matrix[0, 0], conf_matrix[0, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    # Specificity is derived by hand from the confusion-matrix counts: TN / (TN + FP)
    specificity = TN / (TN + FP)
    f1 = f1_score(y_test, y_pred)

    # Report every metric with four decimal places
    for template, score in (
        ("Accuracy: {:.4f}", accuracy),
        ("Precision: {:.4f}", precision),
        ("Recall: {:.4f}", recall),
        ("Specificity: {:.4f}", specificity),
        ("F1 score: {:.4f}", f1),
    ):
        print(template.format(score))

---No Scaling---
[LightGBM] [Info] Number of positive: 4451, number of negative: 4377
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000569 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3835
[LightGBM] [Info] Number of data points in the train set: 8828, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504191 -> initscore=0.016765
[LightGBM] [Info] Start training from score 0.016765
Confusion Matrix:
 [[402  10]
 [  9 579]]
Accuracy: 0.9810
Precision: 0.9830
Recall: 0.9847
Specificity: 0.9757
F1 score: 0.9839
---MinMax Scaling---
[LightGBM] [Info] Number of positive: 4451, number of negative: 4377
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_c