## OS
- OS : Google Colab Pro (Ubuntu 22.04.2 LTS)
- RAM : 51.0 GB
- CPU : Intel(R) Xeon(R) CPU @ 2.20GHz
- Python 3.10.12

# 시작


In [3]:
import warnings
import bisect
import pandas as pd
import numpy as np
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from supervised.automl import AutoML
from sklearn.model_selection import KFold
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Silence every warning for the rest of the session (this also hides
# pandas deprecation / FutureWarning messages).
warnings.filterwarnings("ignore")

# Show all columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load the train/test CSVs and drop the SAMPLE_ID column from both.
train = pd.read_csv('/train.csv').drop(columns=['SAMPLE_ID'])
test = pd.read_csv('/test.csv').drop(columns=['SAMPLE_ID'])

In [None]:
# Parse ATA as a datetime in both frames, then expand it into one column per
# calendar component.
for frame in (train, test):
    frame['ATA'] = pd.to_datetime(frame['ATA'])
    ata = frame['ATA'].dt
    for part in ('year', 'month', 'day', 'hour', 'minute', 'weekday'):
        frame[part] = getattr(ata, part)

In [None]:
# test_ is a second name for the same DataFrame object as test (no copy is
# made), so columns added to test_ below also appear on test.
test_ = test
test_

In [None]:
# Interpret CI_HOUR as a duration in hours and add it to ATA to obtain
# Bert_time (presumably the berthing time -- confirm against the data spec).
train['CI_HOUR'] = pd.to_timedelta(train['CI_HOUR'], unit="h")
train['Bert_time'] = train['ATA'] + train['CI_HOUR']

# Lookup table indexed by Bert_time that keeps the ARI_CO/ARI_PO keys, the
# weather columns (wind, BN, air temperature), CI_HOUR and ATA of every
# training row.
bert_time_mapping = train.set_index('Bert_time')[['ARI_CO', 'ARI_PO', 'U_WIND', 'V_WIND', 'BN', 'AIR_TEMPERATURE', 'CI_HOUR', 'ATA']]

def get_avg_bert_time(row):
    """Return the mean Bert_time of the training rows that match ``row``.

    A training row matches when its ARI_CO, ARI_PO, U_WIND, V_WIND, BN and
    AIR_TEMPERATURE values all equal those of ``row``. The lookup reads the
    module-level ``bert_time_mapping`` table, whose index is Bert_time.

    Returns the mean of the matching index values, or ``None`` when no
    training row matches.
    """
    match_cols = ('ARI_CO', 'ARI_PO', 'U_WIND', 'V_WIND', 'BN', 'AIR_TEMPERATURE')

    # Build the conjunction of per-column equality tests one column at a time.
    is_match = bert_time_mapping[match_cols[0]] == row[match_cols[0]]
    for col in match_cols[1:]:
        is_match = is_match & (bert_time_mapping[col] == row[col])

    hits = bert_time_mapping[is_match]
    if hits.empty:
        return None
    return hits.index.mean()

# Select the test_ rows whose U_WIND, V_WIND, BN and AIR_TEMPERATURE are all
# present and non-zero.
valid_conditions = (
    (test_['U_WIND'] != 0) & (pd.notna(test_['U_WIND'])) &
    (test_['V_WIND'] != 0) & (pd.notna(test_['V_WIND'])) &
    (test_['BN'] != 0) & (pd.notna(test_['BN'])) &
    (test_['AIR_TEMPERATURE'] != 0) & (pd.notna(test_['AIR_TEMPERATURE']))
)

# Compute the mean matching Bert_time for each selected row and store it in a
# new column; rows that were not selected get a missing value through index
# alignment on assignment.
test_['Avg_Bert_Time'] = test_[valid_conditions].apply(get_avg_bert_time, axis=1)

In [None]:
# Difference between the averaged Bert_time and this row's ATA.
test_['target'] = test_['Avg_Bert_Time'] - test_['ATA']

In [None]:
# Convert the time difference to hours (seconds / 3600).
test_['target'] = test_['target'].apply(lambda x: x.total_seconds() / 3600)
test_['target']

In [None]:
# Negative values are replaced with 0.
test_.loc[test_['target'] < 0, 'target'] = 0

In [None]:
# The export below is left disabled; test_ is instead re-read from a CSV
# (presumably a file produced by an earlier run of that export -- confirm).
# test_.to_csv('/content/drive/MyDrive/Colab Notebooks/2023_dacon_HDAI/test_berttime.csv', index=False)
test_ = pd.read_csv('/test_berttime.csv')

In [None]:
# Reload the training data from disk (discarding the columns added above) and
# rebuild the calendar features derived from ATA.
train = pd.read_csv('/train.csv').drop(columns=['SAMPLE_ID'])
train['ATA'] = pd.to_datetime(train['ATA'])

ata = train['ATA'].dt
for part in ('year', 'month', 'day', 'hour', 'minute', 'weekday'):
    train[part] = getattr(ata, part)

train

In [None]:
# Remove the ATA column from both frames in place.
train.drop(columns=['ATA'], inplace=True)
test_.drop(columns=['ATA'], inplace=True)

In [None]:
# Keep only the training rows with a non-zero DIST.
train_ = train[(train['DIST'] != 0)].reset_index(drop=True)

# Side-by-side histograms of CI_HOUR: raw values (left) and log1p-transformed
# values (right).
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
train_['CI_HOUR'].hist(bins=100)

plt.subplot(1, 2, 2)
np.log1p(train_['CI_HOUR']).hist(bins=100)

# Store the log1p-transformed CI_HOUR as CI_trans.
train_['CI_trans'] = np.log1p(train_['CI_HOUR'])

# 파생변수 생성

In [None]:
# Handle overlapping port names: ARI_PO is replaced by the concatenation
# ARI_CO + ARI_PO in both frames.
train_['ARI_PO'] = train_['ARI_CO'] + train_['ARI_PO']
test_['ARI_PO'] = test_['ARI_CO'] + test_['ARI_PO']

# Number of distinct ARI_PO and PORT_SIZE values in the training data.
len(train_['ARI_PO'].unique()), len(train_['PORT_SIZE'].unique())

In [None]:
def compute_smoothed_means(df, group_cols, target_col, m, global_mean=None):
    """Return per-group means of ``target_col`` shrunk towards a global mean.

    For every group defined by ``group_cols`` the result is
    ``(group_mean * n + global_mean * m) / (n + m)``, where ``n`` is the
    number of non-null ``target_col`` values in the group.

    Args:
        df: DataFrame containing ``group_cols`` and ``target_col``.
        group_cols: column name(s) passed to ``DataFrame.groupby``.
        target_col: column whose mean is smoothed.
        m: weight given to the global mean.
        global_mean: value to shrink towards; defaults to the mean of
            ``df[target_col]``.

    Returns:
        Series indexed by the group keys.
    """
    prior = df[target_col].mean() if global_mean is None else global_mean

    per_group = df.groupby(group_cols)[target_col]
    sizes = per_group.count()
    weighted_sum = per_group.mean() * sizes + prior * m
    return weighted_sum / (sizes + m)

def compute_smoothed_median(df, group_cols, target_col, m, global_median=None):
    """Return per-group medians of ``target_col`` shrunk towards a global median.

    For every group defined by ``group_cols`` the result is
    ``(group_median * n + global_median * m) / (n + m)``, where ``n`` is the
    number of non-null ``target_col`` values in the group.

    Args:
        df: DataFrame containing ``group_cols`` and ``target_col``.
        group_cols: column name(s) passed to ``DataFrame.groupby``.
        target_col: column whose median is smoothed.
        m: weight given to the global median.
        global_median: value to shrink towards; defaults to the median of
            ``df[target_col]``.

    Returns:
        Series indexed by the group keys.
    """
    prior = df[target_col].median() if global_median is None else global_median

    per_group = df.groupby(group_cols)[target_col]
    sizes = per_group.count()
    weighted_sum = per_group.median() * sizes + prior * m
    return weighted_sum / (sizes + m)


# Global statistics of the log-transformed target, used as fallbacks below.
global_mean = train_['CI_trans'].mean()
global_median = train_['CI_trans'].median()

# Smoothed target mean per (ARI_CO, ARI_PO).
smoothed_means = compute_smoothed_means(train_, ['ARI_CO', 'ARI_PO'], 'CI_trans', m=100)
train_['CO_PO_mean'] = train_.set_index(['ARI_CO', 'ARI_PO']).index.map(smoothed_means)
test_['CO_PO_mean'] = test_.set_index(['ARI_CO', 'ARI_PO']).index.map(smoothed_means)


# Smoothed target mean per ship type; fallback for key pairs missing below.
smoothed_mean_ship_type = compute_smoothed_means(train_, ['SHIP_TYPE_CATEGORY'], 'CI_trans', m=100)

# Smoothed target mean per (ARI_CO, SHIP_TYPE_CATEGORY).
# The filled column is assigned back instead of calling
# ``df[col].fillna(..., inplace=True)``: that chained in-place form operates
# on a temporary object under pandas Copy-on-Write and leaves the frame
# unchanged.
grouped_mean_co = compute_smoothed_means(train_, ['ARI_CO', 'SHIP_TYPE_CATEGORY'], 'CI_trans', m=100)
train_['CO_SHIP_mean'] = train_.set_index(['ARI_CO', 'SHIP_TYPE_CATEGORY']).index.map(grouped_mean_co)
test_['CO_SHIP_mean'] = test_.set_index(['ARI_CO', 'SHIP_TYPE_CATEGORY']).index.map(grouped_mean_co)
train_['CO_SHIP_mean'] = train_['CO_SHIP_mean'].fillna(train_['SHIP_TYPE_CATEGORY'].map(smoothed_mean_ship_type))
test_['CO_SHIP_mean'] = test_['CO_SHIP_mean'].fillna(test_['SHIP_TYPE_CATEGORY'].map(smoothed_mean_ship_type))

# Smoothed target mean per (ARI_PO, SHIP_TYPE_CATEGORY), same fallback.
grouped_mean_po = compute_smoothed_means(train_, ['ARI_PO', 'SHIP_TYPE_CATEGORY'], 'CI_trans', m=100)
train_['PO_SHIP_mean'] = train_.set_index(['ARI_PO', 'SHIP_TYPE_CATEGORY']).index.map(grouped_mean_po)
test_['PO_SHIP_mean'] = test_.set_index(['ARI_PO', 'SHIP_TYPE_CATEGORY']).index.map(grouped_mean_po)
train_['PO_SHIP_mean'] = train_['PO_SHIP_mean'].fillna(train_['SHIP_TYPE_CATEGORY'].map(smoothed_mean_ship_type))
test_['PO_SHIP_mean'] = test_['PO_SHIP_mean'].fillna(test_['SHIP_TYPE_CATEGORY'].map(smoothed_mean_ship_type))


# Per-port ratio of smoothed mean CI_HOUR to smoothed mean DIST.
# Both series come from the same groupby on ARI_PO, so their rows line up.
smoothed_mean_dist = compute_smoothed_means(train_, ['ARI_PO'], 'DIST', m=100)
smoothed_mean_ci_hour = compute_smoothed_means(train_, ['ARI_PO'], 'CI_HOUR', m=100)

grouped = pd.DataFrame({'ARI_PO': smoothed_mean_dist.index,
                        'smoothed_mean_dist': smoothed_mean_dist.values,
                        'smoothed_mean_ci_hour': smoothed_mean_ci_hour.values})
grouped['CI_per_dist'] = grouped['smoothed_mean_ci_hour'] / grouped['smoothed_mean_dist']

train_ = train_.merge(grouped[['ARI_PO', 'CI_per_dist']], on='ARI_PO', how='left')
test_ = test_.merge(grouped[['ARI_PO', 'CI_per_dist']], on='ARI_PO', how='left')


# Smoothed target mean per (SHIP_TYPE_CATEGORY, DEPTH); test rows with an
# unseen key pair fall back to the global mean.
grouped_mean_po = compute_smoothed_means(train_, ['SHIP_TYPE_CATEGORY', 'DEPTH'], 'CI_trans', m=100)
train_['SHIP_DEPTH_mean'] = train_.set_index(['SHIP_TYPE_CATEGORY', 'DEPTH']).index.map(grouped_mean_po)
test_['SHIP_DEPTH_mean'] = test_.set_index(['SHIP_TYPE_CATEGORY', 'DEPTH']).index.map(grouped_mean_po)
test_['SHIP_DEPTH_mean'] = test_['SHIP_DEPTH_mean'].fillna(global_mean)

# Smoothed target median per (SHIP_TYPE_CATEGORY, DEPTH); test rows with an
# unseen key pair fall back to the global median.
grouped_median_po = compute_smoothed_median(train_, ['SHIP_TYPE_CATEGORY', 'DEPTH'], 'CI_trans', m=100)
train_['SHIP_DEPTH_median'] = train_.set_index(['SHIP_TYPE_CATEGORY', 'DEPTH']).index.map(grouped_median_po)
test_['SHIP_DEPTH_median'] = test_.set_index(['SHIP_TYPE_CATEGORY', 'DEPTH']).index.map(grouped_median_po)
test_['SHIP_DEPTH_median'] = test_['SHIP_DEPTH_median'].fillna(global_median)

In [None]:
def map_smoothed_means_to_data(train_df, test_df, group_cols, target_col, m):
    """Write the smoothed group mean of ``target_col`` into ``target_encoding1``.

    The smoothed means are computed on ``train_df`` only and then looked up
    for every row of both frames by its ``group_cols`` key. Both frames are
    modified in place (any existing ``target_encoding1`` column is
    overwritten) and returned as ``(train_df, test_df)``.
    """
    encoding = compute_smoothed_means(train_df, group_cols, target_col, m)
    for frame in (train_df, test_df):
        keys = frame.set_index(group_cols).index
        frame['target_encoding1'] = keys.map(encoding).values
    return train_df, test_df

m = 100  # Example value; choose an appropriate value for the actual setting.
target_col = 'CI_trans'

# First grouping: the most specific key.
group_cols = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'DEPTH']
train_, test_ = map_smoothed_means_to_data(train_, test_, group_cols, target_col, m)

# If any test row is still NaN, redo the encoding with a coarser key.
# NOTE(review): each fallback call recomputes and overwrites the whole
# target_encoding1 column in both frames, not only the NaN rows -- confirm
# this is intended.
if test_['target_encoding1'].isna().any():
    group_cols = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY']
    train_, test_ = map_smoothed_means_to_data(train_, test_, group_cols, target_col, m)

    # If NaN values remain, fall back to the third (coarsest) key.
    if test_['target_encoding1'].isna().any():
        group_cols = ['ARI_CO', 'ARI_PO']
        train_, test_ = map_smoothed_means_to_data(train_, test_, group_cols, target_col, m)


In [None]:
def map_smoothed_medians_to_data(train_df, test_df, group_cols, target_col, m):
    """Write the smoothed group median of ``target_col`` into ``target_encoding2``.

    The smoothed medians are computed on ``train_df`` only and then looked up
    for every row of both frames by its ``group_cols`` key. Both frames are
    modified in place (any existing ``target_encoding2`` column is
    overwritten) and returned as ``(train_df, test_df)``.
    """
    encoding = compute_smoothed_median(train_df, group_cols, target_col, m)
    for frame in (train_df, test_df):
        keys = frame.set_index(group_cols).index
        frame['target_encoding2'] = keys.map(encoding).values
    return train_df, test_df

m = 100  # Example value; choose an appropriate value for the actual setting.
target_col = 'CI_trans'

# First grouping: the most specific key.
group_cols = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'DEPTH']
train_, test_ = map_smoothed_medians_to_data(train_, test_, group_cols, target_col, m)

# If any test row is still NaN, redo the encoding with a coarser key.
# NOTE(review): each fallback call recomputes and overwrites the whole
# target_encoding2 column in both frames, not only the NaN rows -- confirm
# this is intended.
if test_['target_encoding2'].isna().any():
    group_cols = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY']
    train_, test_ = map_smoothed_medians_to_data(train_, test_, group_cols, target_col, m)

    # If NaN values remain, fall back to the third (coarsest) key.
    if test_['target_encoding2'].isna().any():
        group_cols = ['ARI_CO', 'ARI_PO']
        train_, test_ = map_smoothed_medians_to_data(train_, test_, group_cols, target_col, m)


In [None]:
# https://www.cello-square.com/kr-ko/blog/view-29.do

def classify_dwt(value):
    """Map a deadweight value to a size class from 0 to 6.

    The class is the position of the first exclusive upper bound that
    ``value`` is below: <40000 -> 0, <55000 -> 1, <60000 -> 2, <100000 -> 3,
    <180000 -> 4, <200000 -> 5. Anything else returns 6, including values for
    which every comparison is false (such as NaN).
    """
    upper_bounds = (40000, 55000, 60000, 100000, 180000, 200000)
    for dwt_class, bound in enumerate(upper_bounds):
        if value < bound:
            return dwt_class
    return len(upper_bounds)

# Derive the DWT_class column from DEADWEIGHT in both frames.
train_['DWT_class'] = train_['DEADWEIGHT'].apply(classify_dwt)
test_['DWT_class'] = test_['DEADWEIGHT'].apply(classify_dwt)

train_

In [None]:
# Cast the integer-coded DWT_class and weekday columns to object dtype in both
# frames (they are listed among the categorical features encoded later).
columns_to_convert = ['DWT_class', 'weekday']
for col in columns_to_convert:
    train_[col] = train_[col].astype('object')
    test_[col] = test_[col].astype('object')

print(train_.dtypes)

In [None]:
# Flag rows whose DIST lies outside [Q1 - 3*IQR, Q3 + 3*IQR], where the
# quartiles are computed per ARI_PO on the training data only.
grouped = train_.groupby('ARI_PO')['DIST']
Q1 = grouped.quantile(0.25)
Q3 = grouped.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 3 * IQR
upper_bound = Q3 + 3 * IQR


def _flag_dist_outliers(frame):
    """Return a boolean Series marking rows of ``frame`` outside their port's bounds.

    Bounds are looked up per row with ``Series.map``. A port that does not
    occur in the training data maps to NaN, both comparisons are then False,
    and the row is reported as a non-outlier instead of raising ``KeyError``.
    """
    low = frame['ARI_PO'].map(lower_bound)
    high = frame['ARI_PO'].map(upper_bound)
    return (frame['DIST'] < low) | (frame['DIST'] > high)


is_outlier_train = _flag_dist_outliers(train_)
is_outlier_test = _flag_dist_outliers(test_)

# The column keeps its historical name although the flag is based on DIST.
train_['CI_trans_outlier'] = is_outlier_train.astype(int)

test_['CI_trans_outlier'] = is_outlier_test.astype(int)

In [None]:
# 여러 변수 정의
def feature_engineering(data):
    """Add five derived ship/port columns to ``data`` in place and return it.

    Added columns:
        Ship_Age_Impact: BUILT squared.
        Dist_to_Weight_Ratio: DIST / PORT_SIZE.
        Size_Ship: LENGTH * BREADTH * DEPTH.
        Port_Entry_Efficiency: PORT_SIZE / (LENGTH * BREADTH).
        Draught_to_Depth_Ratio: DRAUGHT / DEPTH.
    """
    deck_area = data['LENGTH'] * data['BREADTH']

    derived = {
        'Ship_Age_Impact': data['BUILT'] ** 2,
        'Dist_to_Weight_Ratio': data['DIST'] / data['PORT_SIZE'],
        'Size_Ship': deck_area * data['DEPTH'],
        'Port_Entry_Efficiency': data['PORT_SIZE'] / deck_area,
        'Draught_to_Depth_Ratio': data['DRAUGHT'] / data['DEPTH'],
    }
    for column, values in derived.items():
        data[column] = values

    return data

# Add the derived columns to both frames.
train_ = feature_engineering(train_)
test_ = feature_engineering(test_)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the categorical features.
#
# Each encoder is fitted once on the training categories plus a '-1'
# placeholder for unseen values, and that single code table is applied to both
# frames. Two points matter here:
#   * Fitting on train alone and inserting '-1' into ``classes_`` afterwards
#     shifts every class that sorts after '-1', so train and test would end up
#     with different codes for the same category.
#   * The unseen-category check must compare string values, because the
#     classes are strings; comparing raw integers or NaN against them would
#     mark every such test value as unseen.
categorical_features = ['ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'DWT_class', 'weekday', 'ID', 'SHIPMANAGER', 'FLAG']
encoders = {}

for feature in tqdm(categorical_features, desc="Encoding features"):
    train_values = train_[feature].astype(str)
    test_values = test_[feature].astype(str)

    le = LabelEncoder()
    le.fit(list(train_values.unique()) + ['-1'])

    # Categories that never occur in training share the '-1' placeholder code.
    test_values = test_values.where(test_values.isin(le.classes_), '-1')

    train_[feature] = le.transform(train_values)
    test_[feature] = le.transform(test_values)
    encoders[feature] = le

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale with MinMaxScaler.
numeric_cols = ['DEADWEIGHT', 'GT', 'LENGTH', 'Ship_Age_Impact', 'Dist_to_Weight_Ratio', 'Size_Ship', 'Port_Entry_Efficiency', 'Draught_to_Depth_Ratio']

scaler = MinMaxScaler()

# Fit the scaler on the training data only, then apply the same scaling to test.
train_[numeric_cols] = scaler.fit_transform(train_[numeric_cols])
test_[numeric_cols] = scaler.transform(test_[numeric_cols])

train_

In [None]:
# numeric_cols is rebound here: it now holds the DataFrame of all numeric
# columns of train_ rather than a list of column names.
numeric_cols = train_.select_dtypes(include=[np.number])

# Annotated correlation heatmap of the numeric columns.
plt.figure(figsize=(30, 30))
sns.heatmap(numeric_cols.corr(), annot=True, fmt='.2f')
plt.show()

# mljar

In [None]:
!pip install mljar-supervised

In [None]:
from supervised.automl import AutoML

In [None]:
train_

In [None]:
# Features: everything except the raw and transformed target and the weather
# columns. Target: the log1p-transformed CI_HOUR.
train_x = train_.drop(["CI_HOUR", "CI_trans", 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BN'], axis=1)
train_y = train_['CI_trans']

In [None]:
# Validation settings passed to AutoML as ``validation_strategy``:
# shuffled 10-fold cross-validation with a fixed seed.
Cross_validation = {
    "validation_type": "kfold",
    "k_folds": 10,
    "shuffle": True,
    "random_seed": 112
}

In [None]:
# Configure an mljar AutoML regression run restricted to four tree-based
# algorithms, evaluated with MAE under the cross-validation settings above.
# total_time_limit=3600 is presumably in seconds -- confirm in the mljar docs.
# Results are written under results_path.
automl = AutoML(mode="Compete", algorithms = ['Decision Tree', 'LightGBM', 'Xgboost', 'CatBoost'],
                n_jobs = -1, eval_metric='mae', validation_strategy=Cross_validation, ml_task = "regression",
                total_time_limit=3600,
                results_path="/model_weight")
automl.fit(train_x, train_y)

In [None]:
# Create an AutoML object pointing at the same results_path used for training.
loaded_automl = AutoML(results_path="/model_weight")

# Predict only for test rows with a non-zero DIST (the same filter that was
# applied to the training data).
test_nozerodist = test_[test_['DIST'] != 0].reset_index(drop=True)

test_data = test_nozerodist.drop(columns=['U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BN', 'Avg_Bert_Time', 'target']) # Adjust to match the model and the columns it uses.

pred = loaded_automl.predict_all(test_data)

In [None]:
# Undo the log1p transform that was applied to the training target.
prediction = np.expm1(pred)

In [None]:
prediction

In [None]:
submit = pd.read_csv('/sample_submission.csv')

new_test = pd.read_csv('/test.csv')

# Rows with DIST == 0 were excluded from modelling and get a prediction of 0.
nonzero_dist = new_test['DIST'] != 0
test_filtered = new_test[nonzero_dist]

# The model predictions are in the row order of the non-zero-DIST test rows
# but carry a fresh 0..k-1 index, so they are written positionally through
# ``.to_numpy()``. Assigning the Series directly would align on index labels
# and misplace the values. A length mismatch raises instead of passing
# silently.
new_test['predictions'] = 0.0
new_test.loc[nonzero_dist, 'predictions'] = prediction['prediction'].to_numpy()

# Kept as a plain list for any later cell that still reads this name.
final_predictions = new_test['predictions'].tolist()

In [None]:
# Wherever test_ has a non-missing 'target', use it in place of the model
# prediction.
# NOTE(review): the mask and values are taken from test_ and applied to
# new_test by index label, which assumes both frames share the same row
# order and index -- confirm.
mask = ~test_['target'].isna()
new_test.loc[mask, 'predictions'] = test_.loc[mask, 'target']

In [None]:
# Negative predictions are replaced with 0.
new_test.loc[new_test['predictions'] < 0, 'predictions'] = 0

In [None]:
# Reload the submission template, put the predictions into its second column
# and write the result to CSV without the index.
submit = pd.read_csv('/sample_submission.csv')

submit.iloc[:, 1] = new_test['predictions']

submit.to_csv('/dacon_HDAI_mljar_final_test_load.csv', index=False)

-------------------------------