## Catboost / LightGBM 분류 모델링

라이브러리 준비

In [5]:
# 경고 무시
import warnings
warnings.filterwarnings('ignore')

# 데이터 처리 및 분석
from scipy.stats import chi2_contingency, pearsonr, spearmanr
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', None)

# 시각화
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 머신러닝
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# AWS 관련
import awswrangler as wr
import sagemaker
import boto3

# 기타
from kaggle.api.kaggle_api_extended import KaggleApi
import os
import json
from dotenv import load_dotenv

# 환경 변수 로드
load_dotenv()

True

SageMaker 세션 및 역할 설정

In [6]:
boto3_session = boto3.Session(profile_name='awstutor')
sagemaker_session = sagemaker.Session(boto_session=boto3_session)
role = os.environ.get('SAGEMAKER_EXECUTION_ROLE_ARN')

데이터셋 다운로드

In [7]:
# Kaggle API 인증
api = KaggleApi()
api.authenticate()

# 데이터셋 다운로드 경로 설정
data_repository_name = 'zzettrkalpakbal/full-filled-brain-stroke-dataset'
os.makedirs('dataset/brain-stroke', exist_ok=True)

# 데이터셋 다운로드
api.dataset_download_files(data_repository_name, path='dataset/brain-stroke', unzip=True)

print(f"'{data_repository_name}' 데이터셋이 'brain-stroke' 폴더에 다운로드되었습니다.")


Dataset URL: https://www.kaggle.com/datasets/zzettrkalpakbal/full-filled-brain-stroke-dataset
'zzettrkalpakbal/full-filled-brain-stroke-dataset' 데이터셋이 'brain-stroke' 폴더에 다운로드되었습니다.


데이터 확인

In [8]:
data = pd.read_csv('dataset/brain-stroke/full_data.csv')
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB


In [10]:
data.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0
mean,43.419859,0.096165,0.05521,105.943562,28.498173,0.049789
std,22.662755,0.294848,0.228412,45.075373,6.790464,0.217531
min,0.08,0.0,0.0,55.12,14.0,0.0
25%,25.0,0.0,0.0,77.23,23.7,0.0
50%,45.0,0.0,0.0,91.85,28.1,0.0
75%,61.0,0.0,0.0,113.86,32.6,0.0
max,82.0,1.0,1.0,271.74,48.9,1.0


데이터 전처리

In [11]:
cat_ = data.select_dtypes(include='O').keys()
for c in cat_:
    print(f'{c}:  {data[c].unique()}')

gender:  ['Male' 'Female']
ever_married:  ['Yes' 'No']
work_type:  ['Private' 'Self-employed' 'Govt_job' 'children']
Residence_type:  ['Urban' 'Rural']
smoking_status:  ['formerly smoked' 'never smoked' 'smokes' 'Unknown']


In [12]:
# 라벨 인코딩
le = LabelEncoder()

data['work_type'] = le.fit_transform(data['work_type'])
data['smoking_status'] = le.fit_transform(data['smoking_status'])
data['gender'] = le.fit_transform(data['gender'])
data['ever_married'] = le.fit_transform(data['ever_married'])
data['Residence_type'] = le.fit_transform(data['Residence_type'])

In [13]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,1,1,228.69,36.6,1,1
1,1,80.0,0,1,1,1,0,105.92,32.5,2,1
2,0,49.0,0,0,1,1,1,171.23,34.4,3,1
3,0,79.0,1,0,1,2,0,174.12,24.0,2,1
4,1,81.0,0,0,1,1,1,186.21,29.0,1,1


EDA

1. 뇌졸중 발생 분포

In [14]:
stroke_counts = data['stroke'].value_counts().reset_index()
stroke_counts.columns = ['stroke', 'count']
px.bar(stroke_counts, x='stroke', y='count', title='뇌졸중 발생 빈도')

2. 뇌졸중 위험 요인 분석

In [15]:
print("뇌졸중 고위험군:")

# 나이와 뇌졸중 상관관계
age_corr = data['age'].corr(data['stroke'])
print(f"- 고령자 (나이와의 상관계수: {age_corr:.4f})")

# 심장병과 뇌졸중 관계
heart_disease_ratio = data[data['heart_disease']==1]['stroke'].mean() / data['stroke'].mean()
print(f"- 심장병 환자 (뇌졸중 발생 비율: {heart_disease_ratio:.2f}배)")

# 혈당과 뇌졸중 상관관계
glucose_corr = data['avg_glucose_level'].corr(data['stroke'])
print(f"- 고혈당 환자 (혈당과의 상관계수: {glucose_corr:.4f})")

# 고혈압과 뇌졸중 관계
hypertension_ratio = data[data['hypertension']==1]['stroke'].mean() / data['stroke'].mean()
print(f"- 고혈압 환자 (뇌졸중 발생 비율: {hypertension_ratio:.2f}배)")

# 성별과 뇌졸중 관계
gender_ratio = data[data['gender']==0]['stroke'].mean() / data[data['gender']==1]['stroke'].mean()
print(f"- 여성 (남성 대비 뇌졸중 발생 비율: {gender_ratio:.2f}배)")

# BMI와 뇌졸중 상관관계
bmi_corr = data['bmi'].corr(data['stroke'])
print(f"- 과체중 (BMI와의 상관계수: {bmi_corr:.4f})")

# 흡연과 뇌졸중 관계
smoking_ratios = {status: data[data['smoking_status'] == status]['stroke'].mean() / data['stroke'].mean() 
                  for status in data['smoking_status'].unique()}
max_smoking_ratio = max(smoking_ratios.values())
print(f"- 흡연자 및 과거 흡연자 (뇌졸중 발생 비율: {max_smoking_ratio:.2f}배)")

# 결혼 여부와 뇌졸중 관계
married_ratio = data[data['ever_married']==1]['stroke'].mean() / data[data['ever_married']==0]['stroke'].mean()
print(f"- 기혼자 (미혼자 대비 뇌졸중 발생 비율: {married_ratio:.2f}배)")


뇌졸중 고위험군:
- 고령자 (나이와의 상관계수: 0.2465)
- 심장병 환자 (뇌졸중 발생 비율: 3.43배)
- 고혈당 환자 (혈당과의 상관계수: 0.1332)
- 고혈압 환자 (뇌졸중 발생 비율: 2.77배)
- 여성 (남성 대비 뇌졸중 발생 비율: 0.92배)
- 과체중 (BMI와의 상관계수: 0.0569)
- 흡연자 및 과거 흡연자 (뇌졸중 발생 비율: 1.62배)
- 기혼자 (미혼자 대비 뇌졸중 발생 비율: 3.92배)


3. 상관분석

| 상관계수 | 자료척도 | 형태 | 결과범위 | 특징 |
|---------|--------|------|---------|------|
| 파이 계수 | 명목척도 | 2x2 분할표 | 0 ~ 1 | 이진 변수 간의 관계 강도를 측정 |
| 크래머 V 계수 | 명목척도 | RxC 분할표 | 0 ~ 1 | 범주형 변수 간의 관계 강도를 측정, 변수의 범주 수에 영향을 받지 않음 |
| 피어슨 상관계수 | 등간/비율척도 | 연속형 변수 | -1 ~ 1 | 선형 관계의 강도와 방향을 측정 |
| 스피어만 상관계수 | 서열척도 | 순위 데이터 | -1 ~ 1 | 단조 관계의 강도와 방향을 측정, 이상치에 덜 민감 |


In [16]:
# 범주형 변수와 뇌졸중 간의 상관관계 분석


# 범주형 변수 목록
categorical_vars = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# 크래머의 V 계수 계산 함수
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

# 파이 계수 계산 함수
def phi_coefficient(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    return np.sqrt(chi2 / n)

# 결과 저장을 위한 딕셔너리
correlation_results = {}

# 각 변수에 대해 적절한 상관계수 계산
for var in categorical_vars:
    if var in ['work_type', 'smoking_status']:
        correlation = cramers_v(data[var], data['stroke'])
        method = "Cramer's V"
    elif var in ['gender', 'ever_married', 'Residence_type', 'hypertension', 'heart_disease']:
        correlation = phi_coefficient(data[var], data['stroke'])
        method = "Phi"
    correlation_results[var] = {'correlation': correlation, 'method': method}

# 연속형 변수에 대한 상관계수 계산
continuous_vars = ['age', 'avg_glucose_level', 'bmi']
for var in continuous_vars:
    correlation, _ = pearsonr(data[var], data['stroke'])
    correlation_results[var] = {'correlation': correlation, 'method': "Pearson"}

# 결과 출력
print("변수와 뇌졸중 간의 상관관계:")
for var, result in sorted(correlation_results.items(), key=lambda x: abs(x[1]['correlation']), reverse=True):
    print(f"{var}: {result['correlation']:.4f} ({result['method']})")

변수와 뇌졸중 간의 상관관계:
age: 0.2465 (Pearson)
avg_glucose_level: 0.1332 (Pearson)
heart_disease: 0.1326 (Phi)
hypertension: 0.1304 (Phi)
ever_married: 0.1074 (Phi)
work_type: 0.0949 (Cramer's V)
smoking_status: 0.0719 (Cramer's V)
bmi: 0.0569 (Pearson)
Residence_type: 0.0156 (Phi)
gender: 0.0079 (Phi)


In [17]:
# 시각화
fig = go.Figure(data=[go.Bar(
    x=list(correlation_results.keys()),
    y=[abs(result['correlation']) for result in correlation_results.values()],
    text=[f"{abs(result['correlation']):.4f}" for result in correlation_results.values()],
    textposition='auto',
    marker_color='royalblue'
)])

fig.update_layout(
    title='변수와 뇌졸중 간의 상관관계',
    xaxis_title='변수',
    yaxis_title='상관계수 (절대값)',
    xaxis_tickangle=-45,
    yaxis_range=[0, 1],
    width=1000,
    height=600
)

fig.show()

print("\n이 분석은 각 변수가 뇌졸중 발생과 얼마나 강한 연관성을 가지는지 보여줍니다.")
print("상관계수의 절대값이 높을수록 해당 변수와 뇌졸중 간의 연관성이 더 강합니다.")
print("각 변수의 특성에 따라 적절한 상관계수 방법을 사용했습니다.")



이 분석은 각 변수가 뇌졸중 발생과 얼마나 강한 연관성을 가지는지 보여줍니다.
상관계수의 절대값이 높을수록 해당 변수와 뇌졸중 간의 연관성이 더 강합니다.
각 변수의 특성에 따라 적절한 상관계수 방법을 사용했습니다.


데이터 분리 (훈련 / 검증 / 테스트)

In [18]:
# 특성과 레이블 분리
X = data.drop(['stroke'], axis=1)
y = data['stroke']

In [19]:
# 불균형 데이터 처리
print('---- SMOTE 적용 전 ----')
print(y.value_counts())
smt = SMOTE(random_state=42)
X_smt, y_smt = smt.fit_resample(X, y)
print('\n\n---- SMOTE 적용 후 ----')
print(y_smt.value_counts())

---- SMOTE 적용 전 ----
stroke
0    4733
1     248
Name: count, dtype: int64


---- SMOTE 적용 후 ----
stroke
1    4733
0    4733
Name: count, dtype: int64


In [20]:
X_tmp, X_test, y_tmp, y_test = train_test_split(X_smt, y_smt,test_size=0.1, random_state=2024)
X_train, X_val,  y_train, y_val =  train_test_split(X_tmp, y_tmp, test_size=0.2, random_state=2024)
print(X_train.shape, X_val.shape, X_test.shape)
print(y_train.shape, y_val.shape, y_test.shape)

(6815, 10) (1704, 10) (947, 10)
(6815,) (1704,) (947,)


S3 데이터 입출력 경로 설정 및 데이터 업로드

In [21]:
bucket_name = 'dante-sagemaker'
project_name = 'stroke-prediction'

In [22]:
# S3 입력 경로 설정
input_path = f's3://{bucket_name}/{project_name}/input'

# S3 출력 경로 설정
output_path = f's3://{bucket_name}/{project_name}/output'

# 모델 저장 경로 설정
model_path = f's3://{bucket_name}/{project_name}/model'

# 스크립트 저장 경로 설정
script_path = f's3://{bucket_name}/{project_name}/script'

# 훈련 데이터 경로 설정
train_path = f'{input_path}/train/train.csv'

# 검증 데이터 경로 설정
val_path = f'{input_path}/val/val.csv'

# 테스트 데이터 경로 설정
test_path = f'{input_path}/test/test.csv'

In [23]:
train_df = pd.concat([y_train, X_train], axis=1)
val_df = pd.concat([y_val, X_val], axis=1)
test_df = pd.concat([y_test, X_test], axis=1)

In [24]:
wr.s3.to_csv(train_df, train_path, index=False, header=False, boto3_session=boto3_session)
wr.s3.to_csv(val_df, val_path, index=False, header=False, boto3_session=boto3_session)
wr.s3.to_csv(test_df, test_path, index=False, header=False, boto3_session=boto3_session)

{'paths': ['s3://dante-sagemaker/stroke-prediction/input/test/test.csv'],
 'partitions_values': {}}

### SageMaker 빌트인 알고리즘 준비

모델 학습 및 배포 함수 정의

In [25]:
from sagemaker.inputs import TrainingInput
from sagemaker.utils import name_from_base
from sagemaker.estimator import Estimator
from sagemaker import hyperparameters

def train_and_deploy_model(project_name : str, train_path : str, val_path : str, output_path : str, model_type : str = 'lightgbm' ):
    if model_type not in ['lightgbm', 'catboost']:
        raise ValueError("잘못된 모델 유형입니다. 'lightgbm' 또는 'catboost'를 선택하세요.")
    
    # 모델 타입에 따라 모델 ID와 버전 설정
    train_model_id = "lightgbm-classification-model" if model_type == 'lightgbm' else "catboost-classification-model"
    model_version = "*"
    train_scope = "training"
    training_instance_type = "ml.m5.xlarge" 
    inference_instance_type = "ml.m5.large"
    
    # 이미지 URI 검색
    train_image_uri = sagemaker.image_uris.retrieve(
        region=None,
        framework=None,
        model_id=train_model_id,
        model_version=model_version,
        image_scope=train_scope,
        instance_type=training_instance_type,
    )

    # 스크립트 URI 검색
    train_source_uri = sagemaker.script_uris.retrieve(
        model_id=train_model_id, model_version=model_version, script_scope=train_scope
    )

    # 모델 URI 검색
    train_model_uri = sagemaker.model_uris.retrieve(
        model_id=train_model_id, model_version=model_version, model_scope=train_scope
    )
    # 추론 이미지 URI 검색
    deploy_image_uri = sagemaker.image_uris.retrieve(
        region=None,
        framework=None,
        image_scope="inference",
        model_id=train_model_id,
        model_version=model_version,
        instance_type=inference_instance_type,
    )
    
     # 추론 스크립트 URI 검색
    deploy_source_uri = sagemaker.script_uris.retrieve(
        model_id=train_model_id, model_version=model_version, script_scope="inference"
    )
    
    # 하이퍼파라미터 설정
    _hyperparameters = hyperparameters.retrieve_default(
        model_id=train_model_id, model_version=model_version
    )
    
    if model_type == 'lightgbm':
        # https://docs.aws.amazon.com/ko_kr/sagemaker/latest/dg/lightgbm-hyperparameters.html
        _hyperparameters.update({
            'num_boost_round': '200',
            'metric': 'auc',
            'learning_rate': '0.01',
            'num_leaves': '100',
            'max_depth': '10'
        })
    else:
        # https://docs.aws.amazon.com/ko_kr/sagemaker/latest/dg/catboost-hyperparameters.html
        _hyperparameters.update({
            'iterations': '1000',
            'depth': '10',
        })
      
    # 알고리즘 컨테이너 설정
    estimator = Estimator(
        role=role,
        sagemaker_session=sagemaker_session,
        image_uri=train_image_uri,
        source_dir=train_source_uri,
        model_uri=train_model_uri,
        entry_point="transfer_learning.py",
        instance_count=1,
        instance_type=training_instance_type,
        max_run=360000,
        max_wait=360000,
        use_spot_instances=True,
        hyperparameters=_hyperparameters,
        output_path=output_path,
    )
    
    training_job_name = name_from_base(f"{project_name}-{model_type}-training")
    
    # 모델 학습
    try:
        estimator.fit({
            "training": TrainingInput(train_path, content_type='text/csv'),
            "validation": TrainingInput(val_path, content_type='text/csv')
        }, logs=True, job_name=training_job_name)
    except Exception as e:
        print(f"모델 학습 중 오류 발생: {str(e)}")
        return None

    endpoint_name = name_from_base(f"{project_name}-{model_type}-inference")

    # 모델 배포
    try:
        estimator.deploy(
            initial_instance_count=1,
            instance_type=inference_instance_type,
            entry_point="inference.py",
            image_uri=deploy_image_uri,
            source_dir=deploy_source_uri,
            endpoint_name=endpoint_name,
        )
    except Exception as e:
        print(f"모델 배포 중 오류 발생: {str(e)}")
        return None
    
    return endpoint_name

모델 예측 함수 정의

In [26]:
def model_prediction(endpoint_name, features, batch_size=1500):
    # 테스트 데이터의 크기가 크기 때문에 엔드포인트에 쿼리하기 위해 더 작은 크기의 배치로 분할합니다.
    num_examples = features.shape[0]
    predict_prob = []
    
    client = boto3_session.client("runtime.sagemaker")
    
    for i in np.arange(0, num_examples, step=batch_size):
        features_batch = features.iloc[i : (i + batch_size), :]
        
        encoded_tabular_data = features_batch.to_csv(header=False, index=False).encode("utf-8")
        query_response_batch = client.invoke_endpoint(
            EndpointName=endpoint_name, ContentType="text/csv", Body=encoded_tabular_data
        )
        
        model_predictions = json.loads(query_response_batch["Body"].read())
        predicted_probabilities = model_predictions["probabilities"]
        predict_prob_batch =  np.array(predicted_probabilities)
            
        predict_prob.append(predict_prob_batch)
        
    predict_prob = np.concatenate(predict_prob, axis=0)
    
    predict_label = np.argmax(
        predict_prob, axis=1
    )
    
    return predict_label

모델 평가 함수 정의

In [27]:
def plot_confusion_matrix(ground_truth_label, predict_label):
    conf_matrix = confusion_matrix(y_true=ground_truth_label.values, y_pred=predict_label)

    fig = go.Figure(data=go.Heatmap(
        z=conf_matrix,
        x=['예측: 0', '예측: 1'],
        y=['실제: 0', '실제: 1'],
        colorscale='Blues',
        showscale=False
    ))

    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            fig.add_annotation(
                x=j,
                y=i,
                text=str(conf_matrix[i, j]),
                showarrow=False,
                font=dict(color='white' if conf_matrix[i, j] > conf_matrix.max() / 2 else 'black', size=20)
            )

    fig.update_layout(
        title='혼동 행렬',
        xaxis_title='예측',
        yaxis_title='실제',
        width=600,
        height=600
    )

    fig.show()
    
    eval_accuracy = accuracy_score(ground_truth_label.values, predict_label)
    eval_auc = roc_auc_score(ground_truth_label.values, predict_label)
    eval_recall = recall_score(ground_truth_label.values, predict_label)
    eval_precision = precision_score(ground_truth_label.values, predict_label)
    eval_f1 = f1_score(ground_truth_label.values, predict_label)

    print(f"Accuracy: {eval_accuracy}")
    print(f"AUC: {eval_auc}")
    print(f"Recall: {eval_recall}")
    print(f"Precision: {eval_precision}")
    print(f"F1 Score: {eval_f1}")

테스트 데이터 로드

In [28]:
test_data = wr.s3.read_csv(test_path, boto3_session=boto3_session, header=None)
test_data.columns = ["Target"] + [f"Feature_{i}" for i in range(1, test_data.shape[1])]
test_data.head(5)

Unnamed: 0,Target,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10
0,0,0,9.0,0,0,0,3,0,57.27,28.0,0
1,0,0,19.0,0,0,0,1,0,72.84,22.7,2
2,1,0,78.0,0,0,1,1,0,57.030793,25.013402,0
3,1,0,79.0,0,0,1,1,0,169.69097,27.98115,0
4,0,0,78.0,0,0,1,2,1,56.95,26.0,0


In [29]:
ground_truth_label, features = test_data.iloc[:, :1], test_data.iloc[:, 1:]

LightGBM 모델 학습 및 배포

In [150]:
# 모델 학습 및 배포
lightgbm_endpoint_name = train_and_deploy_model(project_name, train_path, val_path, output_path, model_type='lightgbm')
# 모델 예측
lightgbm_predict_label = model_prediction(lightgbm_endpoint_name, features)
# 모델 평가
plot_confusion_matrix(ground_truth_label, lightgbm_predict_label)


INFO:sagemaker:Creating training-job with name: stroke-prediction-lightgbm-training-2024-08-10-02-26-44-834


2024-08-10 02:26:45 Starting - Starting the training job...
2024-08-10 02:27:01 Starting - Preparing the instances for training...
2024-08-10 02:27:37 Downloading - Downloading the training image......
2024-08-10 02:28:43 Training - Training image download completed. Training in progress...bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-08-10 02:28:46,553 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-08-10 02:28:46,554 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-08-10 02:28:46,563 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2024-08-10 02:28:46,565 sagemaker_pytorch_container.training INFO     Invoking user training script.
2024-08-10 02:28:47,245 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
/opt/conda/bin/python3.8 -m pip install -r requirement

INFO:sagemaker:Repacking model artifact (s3://dante-sagemaker/stroke-prediction/output/stroke-prediction-lightgbm-training-2024-08-10-02-26-44-834/output/model.tar.gz), script artifact (s3://jumpstart-cache-prod-ap-northeast-2/source-directory-tarballs/lightgbm/inference/classification/v1.2.2/sourcedir.tar.gz), and dependencies ([]) into single tar.gz file located at s3://sagemaker-ap-northeast-2-905418381372/sagemaker-jumpstart-2024-08-10-02-29-28-015/model.tar.gz. This may take some time depending on model size...


Training seconds: 114
Billable seconds: 22
Managed Spot Training savings: 80.7%


INFO:sagemaker:Creating model with name: sagemaker-jumpstart-2024-08-10-02-29-28-015
INFO:sagemaker:Creating endpoint-config with name stroke-prediction-lightgbm-inference-2024-08-10-02-29-28-015
INFO:sagemaker:Creating endpoint with name stroke-prediction-lightgbm-inference-2024-08-10-02-29-28-015


-------!

Accuracy: 0.9429778247096093
AUC: 0.9430730388702037
Recall: 0.9530916844349681
Precision: 0.9331941544885177
F1 Score: 0.9430379746835443


Catboost 모델 학습 및 배포

In [30]:
# 모델 학습 및 배포
catboost_endpoint_name = train_and_deploy_model(project_name, train_path, val_path, output_path, model_type='catboost')
# 모델 예측
catboost_predict_label = model_prediction(catboost_endpoint_name, test_data)
# 모델 평가
plot_confusion_matrix(ground_truth_label, catboost_predict_label)

Using model 'catboost-classification-model' with wildcard version identifier '*'. You can pin to version '2.1.0' for more stable results. Note that models may have different input/output signatures after a major version upgrade.
INFO:sagemaker:Creating training-job with name: stroke-prediction-catboost-training-2024-08-10-03-24-33-423


2024-08-10 03:24:33 Starting - Starting the training job...
2024-08-10 03:24:52 Starting - Preparing the instances for training...
2024-08-10 03:25:23 Downloading - Downloading the training image......
2024-08-10 03:26:29 Training - Training image download completed. Training in progress..bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-08-10 03:26:35,458 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-08-10 03:26:35,460 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-08-10 03:26:35,469 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2024-08-10 03:26:35,471 sagemaker_pytorch_container.training INFO     Invoking user training script.
2024-08-10 03:26:37,932 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:
/opt/conda/bin/python3.8 -m pip install -r requirements

INFO:sagemaker:Repacking model artifact (s3://dante-sagemaker/stroke-prediction/output/stroke-prediction-catboost-training-2024-08-10-03-24-33-423/output/model.tar.gz), script artifact (s3://jumpstart-cache-prod-ap-northeast-2/source-directory-tarballs/catboost/inference/classification/v1.1.2/sourcedir.tar.gz), and dependencies ([]) into single tar.gz file located at s3://sagemaker-ap-northeast-2-905418381372/sagemaker-jumpstart-2024-08-10-03-27-47-156/model.tar.gz. This may take some time depending on model size...
INFO:sagemaker:Creating model with name: sagemaker-jumpstart-2024-08-10-03-27-47-156
INFO:sagemaker:Creating endpoint-config with name stroke-prediction-catboost-inference-2024-08-10-03-27-47-156
INFO:sagemaker:Creating endpoint with name stroke-prediction-catboost-inference-2024-08-10-03-27-47-156


------!

Accuracy: 0.5047518479408659
AUC: 0.5
Recall: 0.0
Precision: 0.0
F1 Score: 0.0


엔드포인트 삭제

In [None]:
# 실행 중인 모든 엔드포인트 삭제
sagemaker_client = boto3_session.client('sagemaker')

# 모든 엔드포인트 목록 가져오기
endpoints = sagemaker_client.list_endpoints()['Endpoints']

# 각 엔드포인트 삭제
for endpoint in endpoints:
    endpoint_name = endpoint['EndpointName']
    try:
        print(f"{endpoint_name} 엔드포인트를 삭제 중입니다...")
        sagemaker_client.delete_endpoint(EndpointName=endpoint_name)
        print(f"{endpoint_name} 엔드포인트가 성공적으로 삭제되었습니다.")
    except Exception as e:
        print(f"{endpoint_name} 엔드포인트 삭제 중 오류 발생: {str(e)}")

print("모든 엔드포인트 삭제 작업이 완료되었습니다.")