# drive 마운트, seed 고정

In [None]:
from google.colab import drive
drive.mount('/content/drive')

ValueError: mount failed

In [None]:
import os
import pandas as pd
import numpy as np
import random

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import xgboost as xgb
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.cluster import MiniBatchKMeans


In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

# 데이터 로드 및 전처리

In [None]:
train_data  = pd.read_csv('/content/drive/MyDrive/날씨 콘테스트/fog_train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/날씨 콘테스트/fog_test.csv')

# 열 이름 변경
train_data.columns = train_data.columns.str.replace('fog_train.', '')
test_data.columns = test_data.columns.str.replace('fog_test.', '')

# 'Unnamed: 0' 열 제거
train_data.drop(columns=['Unnamed: 0'], inplace=True)
test_data.drop(columns=['Unnamed: 0'], inplace=True)

# 테스트 데이터에서 class 열 제거, 목표변수가 test데이터 열에 포함되면 안되기 때문
test_data = test_data.drop(columns=['class'], errors='ignore')

# 결측값 처리 (-99, -99.9를 NaN으로 대체)
train_data.replace([-99, -99.9], np.nan, inplace=True)
test_data.replace([-99, -99.9], np.nan, inplace=True)

In [None]:
# hm 컬럼에서 값이 0인 경우를 1로 대체
train_data['hm'] = train_data['hm'].replace(0, 1)

# 본문

In [None]:
# train데이터는 총 3개년의 데이터가 있는데 한 년도에 29일이 포함되어잇다.
# test데이터에는 한 년도의 데이터가 있는데 29일이 포함되어잇지 않다.


In [None]:
train_data['year'] = train_data['year'].replace({'I': 2020, 'J': 2021, 'K': 2022})
test_data['year'] = test_data['year'].replace({'L': 2024})

In [None]:
# 'time' 열을 'hour' 열로 변경
train_data.rename(columns={'time': 'hour'}, inplace=True)
test_data.rename(columns={'time': 'hour'}, inplace=True)

In [None]:
# 'datetime' 열 추가
train_data['datetime'] = pd.to_datetime(train_data[['year', 'month', 'day', 'hour', 'minute']])

In [None]:
# 'datetime' 열 추가
test_data['datetime'] = pd.to_datetime(test_data[['year', 'month', 'day', 'hour', 'minute']])

# 삼각

In [None]:
# 삼각함수를 사용한 시계열 특성 생성 함수
def create_sin_cos_features(df):
    # 시간 단위로 변환
    df['seconds_in_day'] = df['hour'] * 3600 + df['minute'] * 60
    df['seconds_in_year'] = (df['datetime'] - pd.to_datetime(df['datetime'].dt.year.astype(str) + '-01-01')).dt.total_seconds()

    # 하루를 기준으로 시간(hour, minute) 변환
    df['time_sin'] = np.sin(2 * np.pi * df['seconds_in_day'] / 86400)
    df['time_cos'] = np.cos(2 * np.pi * df['seconds_in_day'] / 86400)

    # 1년을 기준으로 시간(year, month, day) 변환
    df['yearly_time_sin'] = np.sin(2 * np.pi * df['seconds_in_year'] / (365 * 86400))
    df['yearly_time_cos'] = np.cos(2 * np.pi * df['seconds_in_year'] / (365 * 86400))

    return df


In [None]:

# 시계열 특성 생성
interpolated_train_data_t = create_sin_cos_features(interpolated_train_data)

print("\nFinal DataFrame with Sin and Cos Features:")
print(interpolated_train_data_t[['datetime', 'time_sin', 'time_cos', 'yearly_time_sin', 'yearly_time_cos']].head())


Final DataFrame with Sin and Cos Features:
             datetime  time_sin  time_cos  yearly_time_sin  yearly_time_cos
0 2020-01-01 00:10:00  0.043619  0.999048         0.000120              1.0
1 2020-01-01 00:20:00  0.087156  0.996195         0.000239              1.0
2 2020-01-01 00:30:00  0.130526  0.991445         0.000359              1.0
3 2020-01-01 00:40:00  0.173648  0.984808         0.000478              1.0
4 2020-01-01 00:50:00  0.216440  0.976296         0.000598              1.0


In [None]:

# 시계열 특성 생성
interpolated_test_data_t = create_sin_cos_features(interpolated_test_data)

print("\nFinal DataFrame with Sin and Cos Features:")
print(interpolated_test_data_t[['datetime', 'time_sin', 'time_cos', 'yearly_time_sin', 'yearly_time_cos']].head())


Final DataFrame with Sin and Cos Features:
             datetime  time_sin  time_cos  yearly_time_sin  yearly_time_cos
0 2024-01-01 00:00:00  0.000000  1.000000         0.000000              1.0
1 2024-01-01 00:10:00  0.043619  0.999048         0.000120              1.0
2 2024-01-01 00:20:00  0.087156  0.996195         0.000239              1.0
3 2024-01-01 00:30:00  0.130526  0.991445         0.000359              1.0
4 2024-01-01 00:40:00  0.173648  0.984808         0.000478              1.0


In [None]:
interpolated_train_data_t.isnull().sum()

datetime                0
year                    0
month                   0
day                     0
hour                    0
minute                  0
stn_id                  0
ws10_deg                0
ws10_ms                 0
ta                      0
re                      0
hm                      0
sun10                   0
ts                      0
vis1                    0
class                   0
seconds_in_day          0
seconds_in_year         0
time_sin                0
time_cos                0
yearly_time_sin         0
yearly_time_cos         0
Temp_Diff               0
dew_point               0
dew_point_minus_ta      0
sqrt_rounded_diff       0
Fog_Likelihood_Index    0
ws10_deg_sin            0
ws10_deg_cos            0
dtype: int64

In [None]:
interpolated_test_data_t.isnull().sum()

datetime                 0
year                     0
month                    0
day                      0
hour                     0
minute                   0
stn_id                   0
ws10_deg                 0
ws10_ms                  0
ta                       0
re                       0
hm                       0
sun10                    0
ts                      54
seconds_in_day           0
seconds_in_year          0
time_sin                 0
time_cos                 0
yearly_time_sin          0
yearly_time_cos          0
Temp_Diff               54
dew_point                0
dew_point_minus_ta       0
sqrt_rounded_diff        0
Fog_Likelihood_Index    54
ws10_deg_sin             0
ws10_deg_cos             0
dtype: int64

# 파생변수

In [None]:
interpolated_train_data_t['Temp_Diff'] = interpolated_train_data_t['ta'] - interpolated_train_data_t['ts']
interpolated_test_data_t['Temp_Diff'] = interpolated_test_data_t['ta'] - interpolated_test_data_t['ts']

In [None]:
#interpolated_train_data_t['Fog_Likelihood_Index'] = (interpolated_train_data_t['Temp_Diff'] * interpolated_train_data_t['hm']) / (interpolated_train_data_t['ws10_ms'] + 1)
#interpolated_test_data_t['Fog_Likelihood_Index'] = (interpolated_test_data_t['Temp_Diff'] * interpolated_test_data_t['hm']) / (interpolated_test_data_t['ws10_ms'] + 1)

In [None]:
import pandas as pd
import numpy as np

# 매그너스 공식의 상수
a = 17.62
b = 243.12

# 이슬점 계산 함수
def calculate_dew_point(temperature, humidity):
    gamma = (a * temperature) / (b + temperature) + np.log(humidity / 100.0)
    dew_point = (b * gamma) / (a - gamma)
    return dew_point


In [None]:
# 이슬점 온도 계산 및 컬럼 추가
interpolated_train_data_t['dew_point'] = interpolated_train_data_t.apply(lambda row: calculate_dew_point(row['ta'], row['hm']), axis=1)

In [None]:
interpolated_test_data_t['dew_point'] = interpolated_test_data_t.apply(lambda row: calculate_dew_point(row['ta'], row['hm']), axis=1)

In [None]:
interpolated_train_data_t['dew_point_minus_ta'] = -(interpolated_train_data_t['dew_point'] - interpolated_train_data_t['ta']).round(1)

In [None]:
interpolated_test_data_t['dew_point_minus_ta'] = -(interpolated_test_data_t['dew_point'] - interpolated_test_data_t['ta']).round(1)

In [None]:
interpolated_train_data_t['sqrt_rounded_diff'] = np.sqrt(interpolated_train_data_t['dew_point_minus_ta'])

In [None]:
interpolated_test_data_t['sqrt_rounded_diff'] = np.sqrt(interpolated_test_data_t['dew_point_minus_ta'])

In [None]:
interpolated_train_data_t['Fog_Likelihood_Index'] = (interpolated_train_data_t['Temp_Diff'] * interpolated_train_data_t['hm']) / (interpolated_train_data_t['ws10_ms'] + 1)
interpolated_test_data_t['Fog_Likelihood_Index'] = (interpolated_test_data_t['Temp_Diff'] * interpolated_test_data_t['hm']) / (interpolated_test_data_t['ws10_ms'] + 1)

In [None]:
# 풍향을 주기적인 특성으로 변환하는 함수
def transform_wind_direction(df):
    df['ws10_deg_sin'] = np.sin(np.deg2rad(df['ws10_deg']))
    df['ws10_deg_cos'] = np.cos(np.deg2rad(df['ws10_deg']))
    return df

# 데이터프레임에 변환 적용
interpolated_train_data_t = transform_wind_direction(interpolated_train_data_t)

# 결과 출력
print(interpolated_train_data_t[['ws10_deg', 'ws10_deg_sin', 'ws10_deg_cos']])


        ws10_deg  ws10_deg_sin  ws10_deg_cos
0            0.0      0.000000      1.000000
1            0.0      0.000000      1.000000
2            0.0      0.000000      1.000000
3           27.8      0.466387      0.884581
4           59.7      0.863396      0.504528
...          ...           ...           ...
157817     305.3     -0.816138      0.577858
157818     293.8     -0.914960      0.403545
157819     274.2     -0.997314      0.073238
157820     270.3     -0.999986      0.005236
157821     254.8     -0.965016     -0.262189

[3156459 rows x 3 columns]


In [None]:

# 풍향을 주기적인 특성으로 변환하는 함수
def transform_wind_direction(df):
    df['ws10_deg_sin'] = np.sin(np.deg2rad(df['ws10_deg']))
    df['ws10_deg_cos'] = np.cos(np.deg2rad(df['ws10_deg']))
    return df

# 데이터프레임에 변환 적용
interpolated_test_data_t = transform_wind_direction(interpolated_test_data_t)

# 결과 출력
print(interpolated_test_data_t[['ws10_deg', 'ws10_deg_sin', 'ws10_deg_cos']])


       ws10_deg  ws10_deg_sin  ws10_deg_cos
0         329.5     -0.507538      0.861629
1         321.8     -0.618408      0.785857
2           0.4      0.006981      0.999976
3         323.6     -0.593419      0.804894
4         208.4     -0.475624     -0.879649
...         ...           ...           ...
52555     338.8     -0.361625      0.932324
52556     335.2     -0.419452      0.907777
52557     337.1     -0.389124      0.921185
52558     322.0     -0.615661      0.788011
52559      33.6      0.553392      0.832921

[262800 rows x 3 columns]


# 축약

In [None]:
# 필요 없는 열 제거
columns_to_drop = ['datetime', 'year', 'month', 'day', 'hour', 'minute', 'seconds_in_day', 'seconds_in_year','dew_point_minus_ta', 'dew_point','re','ws10_deg']

train_data_final_drop = interpolated_train_data_t.drop(columns=columns_to_drop)
# 필요 없는 열 제거
test_data_final_drop = interpolated_test_data_t.drop(columns=columns_to_drop)

In [None]:
train_data_final_drop

Unnamed: 0,stn_id,ws10_ms,ta,hm,sun10,ts,vis1,class,time_sin,time_cos,yearly_time_sin,yearly_time_cos,Temp_Diff,sqrt_rounded_diff,Fog_Likelihood_Index,ws10_deg_sin,ws10_deg_cos
0,AA,0.0,-6.4,38.9,0.0,-2.8,20000.0,4.0,0.043619,0.999048,0.000120,1.0,-3.6,3.420526,-140.040000,0.000000,1.000000
1,AA,0.0,-6.3,37.9,0.0,-2.7,20000.0,4.0,0.087156,0.996195,0.000239,1.0,-3.6,3.478505,-136.440000,0.000000,1.000000
2,AA,0.0,-6.3,40.0,0.0,-2.6,20000.0,4.0,0.130526,0.991445,0.000359,1.0,-3.7,3.376389,-148.000000,0.000000,1.000000
3,AA,0.4,-6.2,39.5,0.0,-2.6,20000.0,4.0,0.173648,0.984808,0.000478,1.0,-3.6,3.405877,-101.571429,0.466387,0.884581
4,AA,0.5,-6.1,39.8,0.0,-2.5,20000.0,4.0,0.216440,0.976296,0.000598,1.0,-3.6,3.391165,-95.520000,0.863396,0.504528
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157817,EC,7.7,2.5,50.2,0.0,-0.9,20000.0,4.0,-0.216440,0.976296,-0.000598,1.0,3.4,3.049590,19.618391,-0.816138,0.577858
157818,EC,5.7,2.3,50.1,0.0,-1.3,20000.0,4.0,-0.173648,0.984808,-0.000478,1.0,3.6,3.049590,26.919403,-0.914960,0.403545
157819,EC,4.9,2.2,51.0,0.0,-1.4,20000.0,4.0,-0.130526,0.991445,-0.000359,1.0,3.6,3.016621,31.118644,-0.997314,0.073238
157820,EC,4.6,2.1,51.7,0.0,-1.6,20000.0,4.0,-0.087156,0.996195,-0.000239,1.0,3.7,2.983287,34.158929,-0.999986,0.005236


In [None]:
import pandas as pd



# 검증용 데이터로 사용할 지역 목록
val_regions = ['AA', 'EC', 'DA', 'BB']

# train_df_A3_val과 train_df_A3_train 데이터프레임 생성
train_df_A3_val = train_data_final_drop[train_data_final_drop['stn_id'].isin(val_regions)]
train_df_A3_train = train_data_final_drop[~train_data_final_drop['stn_id'].isin(val_regions)]



In [None]:
train_df_A3_train

Unnamed: 0,stn_id,ws10_deg,ws10_ms,ta,hm,sun10,ts,time_sin,time_cos,yearly_time_sin,yearly_time_cos,Temp_Diff,sqrt_rounded_diff,Fog_Likelihood_Index,ws10_deg_sin,ws10_deg_cos
0,AI,329.5,0.6,-2.5,87.5,0.0,-1.6,0.000000,1.000000,0.000000,1.000000,-0.9,1.341641,-49.218750,-0.507538,0.861629
1,AI,321.8,1.2,-2.5,88.2,0.0,-1.6,0.043619,0.999048,0.000120,1.000000,-0.9,1.303840,-36.081818,-0.618408,0.785857
2,AI,0.4,0.4,-2.5,88.6,0.0,-1.7,0.087156,0.996195,0.000239,1.000000,-0.8,1.264911,-50.628571,0.006981,0.999976
3,AI,323.6,0.7,-2.6,88.7,0.0,-1.6,0.130526,0.991445,0.000359,1.000000,-1.0,1.264911,-52.176471,-0.593419,0.804894
4,AI,208.4,0.2,-2.6,89.0,0.0,-1.6,0.173648,0.984808,0.000478,1.000000,-1.0,1.264911,-74.166667,-0.475624,-0.879649
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52555,ED,338.8,6.1,4.5,71.3,0.0,1.9,-0.216440,0.976296,0.016616,0.999862,2.6,2.167948,26.109859,-0.361625,0.932324
52556,ED,335.2,5.8,4.6,71.4,0.0,1.9,-0.173648,0.984808,0.016735,0.999860,2.7,2.167948,28.350000,-0.419452,0.907777
52557,ED,337.1,5.3,4.5,72.1,0.0,1.8,-0.130526,0.991445,0.016855,0.999858,2.7,2.144761,30.900000,-0.389124,0.921185
52558,ED,322.0,3.7,4.3,72.5,0.0,1.4,-0.087156,0.996195,0.016974,0.999856,2.9,2.121320,44.734043,-0.615661,0.788011


#XGBOOST

In [None]:
train_df_A3_train = train_df_A3_train.drop('stn_id', axis=1)
train_df_A3_val = train_df_A3_val.drop('stn_id', axis=1)

In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix

# 특징과 라벨 분리 (훈련 데이터)
X_train_final = train_df_A3_train.drop(['class', 'vis1'], axis=1)
y_train_final = train_df_A3_train['class']

X_val = train_df_A3_val.drop(['class', 'vis1'], axis=1)
y_val = train_df_A3_val['class']

# 오버샘플링 적용 전 클래스 비율 확인
print("Original class distribution in training set:")
print(y_train_final.value_counts(normalize=True))

# 각 클래스의 샘플 개수 확인
class_counts = y_train_final.value_counts()

# 샘플링 전략 설정
sampling_strategy = {
    1: int(class_counts[1] * 23.0),  # 클래스 1의 목표 샘플 수
    2: int(class_counts[2] * 17.0),  # 클래스 2의 목표 샘플 수
    3: int(class_counts[3] * 17.0),  # 클래스 3의 목표 샘플 수
    4: class_counts[4]  # 클래스 4는 원래 개수 유지
}

# SMOTE 적용
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_final, y_train_final)

# 오버샘플링 적용 후 클래스 비율 확인
print("Class distribution after SMOTE in training set:")
print(pd.Series(y_train_smote).value_counts(normalize=True))

# 데이터셋 크기 확인
print('훈련 데이터셋 크기 (SMOTE 적용 후):', X_train_smote.shape, y_train_smote.shape)
print('검증 데이터셋 크기:', X_val.shape, y_val.shape)


NameError: name 'train_df_A3_train' is not defined

In [None]:
# 클래스 라벨을 0부터 시작하도록 변환 (라벨 값이 1부터 시작하는 경우)
y_train_smote = y_train_smote - y_train_smote.min()
y_val = y_val - y_val.min()

# 라벨 값이 0부터 num_class-1 사이에 있는지 확인
assert y_train_smote.min() >= 0 and y_train_smote.max() < 4, "Train labels must be in the range [0, num_class)."
assert y_val.min() >= 0 and y_val.max() < 4, "Validation labels must be in the range [0, num_class)."

In [None]:

# XGBoost 모델 학습
dtrain = xgb.DMatrix(X_train_smote, label=y_train_smote)
dval = xgb.DMatrix(X_val, label=y_val)

params = {
    'objective': 'multi:softprob',
    'num_class': 4,  # 클래스의 수
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# 모델 훈련
evals = [(dtrain, 'train'), (dval, 'eval')]
bst = xgb.train(params, dtrain, num_boost_round=100, evals=evals, early_stopping_rounds=10)

# 검증 데이터에 대한 예측
y_pred = bst.predict(dval)
y_pred_classes = np.argmax(y_pred, axis=1)

# 혼동 행렬 출력
conf_matrix = confusion_matrix(y_val, y_pred_classes)
print("Confusion Matrix:")
print(conf_matrix)


In [None]:


# 클래스 가중치 계산
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_smote), y=y_train_smote)
class_weights = {cls: weight for cls, weight in zip(np.unique(y_train_smote), class_weights)}

# 가중치 부여
weights = np.array([class_weights[label] for label in y_train_smote])

# DMatrix 생성
dtrain = xgb.DMatrix(X_train_smote, label=y_train_smote, weight=weights)
dval = xgb.DMatrix(X_val, label=y_val)


In [None]:

# XGBoost 모델 학습
param = {
    'objective': 'multi:softmax',
    'num_class': 4,
    'eval_metric': 'mlogloss',
    'eta': 0.3,
    'max_depth': 6,
    'seed': 42
}

num_round = 1000
evallist = [(dval, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=10)

# 검증 데이터에 대한 예측
y_val_pred = bst.predict(dval)



[0]	eval-mlogloss:1.00736	train-mlogloss:1.23716
[1]	eval-mlogloss:0.80213	train-mlogloss:1.14669
[2]	eval-mlogloss:0.66366	train-mlogloss:1.08411
[3]	eval-mlogloss:0.56705	train-mlogloss:1.03559
[4]	eval-mlogloss:0.50006	train-mlogloss:1.00000
[5]	eval-mlogloss:0.44986	train-mlogloss:0.97186
[6]	eval-mlogloss:0.41315	train-mlogloss:0.94905
[7]	eval-mlogloss:0.38507	train-mlogloss:0.93028
[8]	eval-mlogloss:0.36451	train-mlogloss:0.91628
[9]	eval-mlogloss:0.34616	train-mlogloss:0.90085
[10]	eval-mlogloss:0.32837	train-mlogloss:0.88741
[11]	eval-mlogloss:0.31599	train-mlogloss:0.87647
[12]	eval-mlogloss:0.30454	train-mlogloss:0.86693
[13]	eval-mlogloss:0.29469	train-mlogloss:0.85840
[14]	eval-mlogloss:0.28562	train-mlogloss:0.85006
[15]	eval-mlogloss:0.27881	train-mlogloss:0.84192
[16]	eval-mlogloss:0.27041	train-mlogloss:0.83396
[17]	eval-mlogloss:0.26511	train-mlogloss:0.82745
[18]	eval-mlogloss:0.25633	train-mlogloss:0.82102
[19]	eval-mlogloss:0.25181	train-mlogloss:0.81543
[20]	eval-

In [None]:
# 최종 평가
y_val = y_val + 1
y_val_pred = y_val_pred + 1


In [None]:
# 혼동 행렬 계산
cm_val = confusion_matrix(y_val, y_val_pred, labels=[1, 2, 3, 4])


# 혼동 행렬 출력
print("Confusion Matrix for Validation Data:")
print(cm_val)


# 검증 데이터의 다중 CSI 계산
H_val = cm_val[0, 0] + cm_val[1, 1] + cm_val[2, 2]  # H11, H22, H33
F_val = (cm_val[0, 1] + cm_val[0, 2] + cm_val[1, 0] +
         cm_val[1, 2] + cm_val[2, 0] + cm_val[2, 1] +
         cm_val[3, 0] + cm_val[3, 1] + cm_val[3, 2])  # F12, F13, F21, F23, F31, F32, F41, F42, F43
M_val = cm_val[0, 3] + cm_val[1, 3] + cm_val[2, 3]  # M14, M24, M34

CSI_val = H_val / (H_val + F_val + M_val)
print(f"CSI for Validation Data: {CSI_val:.2f}")


In [None]:
# Feature importance 구하기
importance = bst.get_score(importance_type='weight')
importance_df = pd.DataFrame(importance.items(), columns=['Feature', 'Importance'])

# 중요도 내림차순으로 정렬
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# feature importance 출력
print("Feature Importance:")
print(importance_df)

# 시각화
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 8))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.gca().invert_yaxis()
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.show()

In [None]:
# 테스트 데이터 예측
dtest = xgb.DMatrix(test_data_final_drop)
y_test_pred = bst.predict(dtest)



In [None]:
# y_test_pred 값별 비율 계산
unique, counts = np.unique(y_test_pred, return_counts=True)
ratios = counts / counts.sum()

# 값과 비율 출력
for value, ratio in zip(unique, ratios):
    print(f"Value: {value}, Ratio: {ratio:.9f}")

In [None]:

# 예측 결과를 원래 클래스 라벨로 변환
y_test_pred = y_test_pred.astype(int) + 1

# 제출하기 위해 test데이터 다시 불러옴
sub_test_data = pd.read_csv('/content/drive/MyDrive/날씨 콘테스트/fog_test.csv')
sub_test_data = sub_test_data.drop(columns=['fog_test.class'], errors='ignore')

# 예측 결과를 데이터프레임으로 저장
sub_test_data['fog_test.class'] = y_test_pred

# 예측 결과 저장
sub_test_data.to_csv('240184.csv', index=False)
sub_test_data["fog_test.class"].value_counts()

In [None]:
# 파일 다운로드
from google.colab import files
files.download('240184.csv')