In [1]:
import random
import glob
import os
import re
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
CFG = {
    'WINDOW_SIZE':500, # 500 Step
    'EPOCHS':30,
    'LEARNING_RATE':1e-3,
    'BATCH_SIZE':128,
    'SEED':41
}

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

In [5]:
train_paths = glob.glob('/content/drive/MyDrive/Colab Notebooks/2023_dacon_HDAI_본선/train/*.csv')
test_paths = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/2023_dacon_HDAI_본선/test.csv')['data_path'].values

In [61]:
import pandas as pd
from tqdm import tqdm
import numpy as np

def calculate_fft_features(data, n_coefficients=20):
    features = []
    # 각 열에 대해 FFT 수행
    for column in data.columns:
        # FFT 계산
        fft_values = np.fft.rfft(data[column].values)
        # 절대값을 취해 크기를 구함
        fft_magnitudes = np.abs(fft_values)
        # 가장 중요한 n 개의 계수만 유지
        features.extend(fft_magnitudes[:n_coefficients])
    return features

def make_train_data(train_paths, window_size=CFG['WINDOW_SIZE'], stride=10, extremes=None):
    feature_matrix = []
    labels = []

    for path in tqdm(train_paths):
        driver = str(path.split('/')[-1].split('.')[0].split('_')[1][0])
        data = pd.read_csv(path)
        data.drop(columns='Time[s]', inplace=True)
        label_value = float(path.split('/')[-1].split('.')[0].split('_')[0][:-2])

        for i in range(0, len(data) - window_size + 1, stride):
            window_data = data[i:i + window_size]
            # FFT 특징 추출
            fft_features = calculate_fft_features(window_data)
            feature_matrix.append(fft_features)
            labels.append(label_value)

    # 특징과 라벨을 데이터프레임으로 변환
    feature_df = pd.DataFrame(feature_matrix)
    label_series = pd.Series(labels, name='Label')
    # 특징 데이터프레임에 라벨 열 추가
    final_df = pd.concat([feature_df, label_series], axis=1)

    return final_df


# 데이터 준비 함수 호출
final_data = make_train_data(train_paths)

100%|██████████| 16/16 [00:10<00:00,  1.48it/s]


In [62]:
final_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,131,132,133,134,135,136,137,138,139,Label
0,158035.984085,5.317676e-12,2.151088e-12,1.434106e-12,1.329550e-12,0.000000,8.864835e-13,6.147785e-13,5.379843e-13,5.911640e-13,...,22.848561,25.699934,11.328859,4.616932,32.734629,23.617146,6.017430,3.284297,19.927120,0.0
1,158035.984085,5.317676e-12,2.151088e-12,1.434106e-12,1.329550e-12,0.000000,8.864835e-13,6.147785e-13,5.379843e-13,5.911640e-13,...,14.285115,29.613810,3.197130,12.189016,24.675020,20.380934,13.520367,9.515750,24.508596,0.0
2,158035.984085,5.317676e-12,2.151088e-12,1.434106e-12,1.329550e-12,0.000000,8.864835e-13,6.147785e-13,5.379843e-13,5.911640e-13,...,16.806290,24.389233,7.441504,17.517292,23.673408,12.483825,16.145985,7.325150,16.084586,0.0
3,158035.984085,5.317676e-12,2.151088e-12,1.434106e-12,1.329550e-12,0.000000,8.864835e-13,6.147785e-13,5.379843e-13,5.911640e-13,...,21.617446,19.706264,10.045878,14.290352,28.255316,16.733846,11.527360,3.128915,18.203007,0.0
4,158035.984085,5.317676e-12,2.151088e-12,1.434106e-12,1.329550e-12,0.000000,8.864835e-13,6.147785e-13,5.379843e-13,5.911640e-13,...,22.544123,21.524922,7.527611,10.601914,26.314243,17.982134,14.002049,8.348318,20.436946,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18448,157251.593585,7.167531e+02,5.350028e+02,2.947094e+02,6.542720e+01,94.924120,1.571916e+02,1.301944e+02,5.126652e+01,3.358796e+01,...,316.317290,369.135681,367.313623,306.759886,263.874012,215.653570,171.552706,103.622737,34.446946,402.0
18449,157318.428278,6.655432e+02,5.232310e+02,3.274581e+02,1.267063e+02,34.935586,1.258642e+02,1.421494e+02,9.974188e+01,2.984993e+01,...,203.579863,236.231351,238.109266,202.258620,181.649010,152.418103,149.516405,142.657150,82.980707,402.0
18450,157385.743188,6.113835e+02,5.032641e+02,3.490422e+02,1.806377e+02,31.619407,7.728452e+01,1.277920e+02,1.246883e+02,8.255368e+01,...,188.520811,216.221259,216.695028,183.312538,164.718164,137.496530,141.330370,146.465181,105.675066,402.0
18451,157450.011166,5.572807e+02,4.763237e+02,3.573711e+02,2.208389e+02,89.192859,2.175654e+01,8.999693e+01,1.178768e+02,1.077403e+02,...,134.374948,153.537432,155.379899,131.957020,118.369564,95.192057,109.444752,132.212681,126.828638,402.0


In [None]:
!pip install mljar-supervised

In [63]:
from supervised.automl import AutoML

train_x = final_data.drop(['Label'], axis=1)
train_y = final_data['Label']

In [64]:
from sklearn.preprocessing import MinMaxScaler

numeric_cols = train_x.columns

scaler = MinMaxScaler()

train_x[numeric_cols] = scaler.fit_transform(train_x[numeric_cols])
# test_[numeric_cols] = scaler.transform(test_[numeric_cols])

train_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,130,131,132,133,134,135,136,137,138,139
0,0.323975,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.001956,0.002709,0.003930,0.001643,0.000637,0.008184,0.006968,0.002187,0.001210,0.007561
1,0.323975,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.001934,0.001658,0.004534,0.000426,0.001793,0.006127,0.005985,0.005085,0.003764,0.009342
2,0.323975,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.002888,0.001967,0.003727,0.001061,0.002606,0.005872,0.003585,0.006099,0.002866,0.006067
3,0.323975,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.003179,0.002558,0.003005,0.001451,0.002114,0.007041,0.004877,0.004315,0.001146,0.006891
4,0.323975,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.002955,0.002671,0.003285,0.001074,0.001551,0.006546,0.005256,0.005271,0.003286,0.007759
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18448,0.278981,0.169347,0.281230,0.212763,0.068844,0.123916,0.255809,0.250054,0.105529,0.085325,...,0.029034,0.038721,0.056933,0.054924,0.046755,0.067155,0.065331,0.066128,0.042345,0.013207
18449,0.282814,0.157247,0.275042,0.236406,0.133322,0.045606,0.204828,0.273016,0.205313,0.075829,...,0.018394,0.024887,0.036421,0.035585,0.030804,0.046176,0.046113,0.057616,0.058348,0.032079
18450,0.286676,0.144451,0.264546,0.251989,0.190070,0.041277,0.125771,0.245440,0.256664,0.209715,...,0.017029,0.023039,0.033333,0.032380,0.027913,0.041857,0.041578,0.054454,0.059910,0.040904
18451,0.290362,0.131668,0.250384,0.258002,0.232370,0.116434,0.035406,0.172850,0.242642,0.273697,...,0.011173,0.016394,0.023659,0.023203,0.020074,0.030032,0.028721,0.042138,0.054066,0.049129


In [65]:
train_y = train_y.astype('int')
train_y

0          0
1          0
2          0
3          0
4          0
        ... 
18448    402
18449    402
18450    402
18451    402
18452    402
Name: Label, Length: 18453, dtype: int64

In [None]:
Cross_validation = {
    "validation_type": "kfold",
    "k_folds": 10,
    "shuffle": True,
    "random_seed": 112
}


automl = AutoML(mode="Compete", algorithms = ['Decision Tree', 'LightGBM', 'Xgboost', 'CatBoost'],
                n_jobs = -1, eval_metric='mae', validation_strategy=Cross_validation, ml_task = "regression")
automl.fit(train_x, train_y)

AutoML directory: AutoML_18
The task is regression with evaluation metric mae
AutoML will use algorithms: ['Decision Tree', 'LightGBM', 'Xgboost', 'CatBoost']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step simple_algorithms will try to check up to 3 models
1_DecisionTree mae 226.915504 trained in 19.73 seconds
2_DecisionTree mae 211.009811 trained in 22.09 seconds
3_DecisionTree mae 211.009811 trained in 25.52 seconds
* Step default_algorithms will try to check up to 3 models


In [51]:
import pandas as pd
from tqdm import tqdm
import numpy as np

def make_predict_data(test_paths, window_size=CFG['WINDOW_SIZE'], n_coefficients=10):
    '''
        This function is implemented assuming that most Test Samples have 500 Time Steps.
        Inference Window Size optimized for 500.
    '''
    feature_matrix = []

    for path in tqdm(test_paths):
        driver = str(path.split('/')[-1].split('.')[0].split('_')[1][0])
        path = '/content/drive/MyDrive/Colab Notebooks/2023_dacon_HDAI_본선/test' + (path[6:])

        data = pd.read_csv(path)
        data.drop(columns='Time[s]', inplace=True)

        # Assuming the test data already fits into a single window.
        window_data = data.iloc[:window_size]

        # If the test data has fewer rows than the window_size, pad with zeros.
        if len(data) < window_size:
            padding = np.zeros((window_size - len(data), data.shape[1]))
            window_data = np.vstack([data.values, padding])

        # FFT feature extraction.
        fft_features = calculate_fft_features(pd.DataFrame(window_data), n_coefficients=n_coefficients)
        feature_matrix.append(fft_features)

    return np.array(feature_matrix)

# Assuming test_paths is defined and CFG['WINDOW_SIZE'] is set.
test_features = make_predict_data(test_paths)


100%|██████████| 4048/4048 [00:31<00:00, 129.75it/s]


In [52]:
test_x = pd.DataFrame(test_features)

In [53]:
test_x = pd.DataFrame(test_features)
test_x[numeric_cols] = scaler.transform(test_x[numeric_cols])

test_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
0,0.626574,0.238463,0.281611,0.111980,0.293120,0.141576,0.241479,0.259027,0.138915,0.288418,...,0.555228,0.467777,0.087330,0.079347,0.310510,0.640153,0.391605,0.495730,0.555746,0.252760
1,0.152908,0.252482,0.278393,0.193475,0.110791,0.272712,0.288138,0.116452,0.145070,0.251330,...,0.294479,0.251594,0.151439,0.216210,0.103806,0.168426,0.064555,0.039395,0.054508,0.102047
2,0.217645,0.353584,0.334344,0.392226,0.570891,0.468662,0.348968,0.354756,0.250352,0.484266,...,0.554478,0.031749,0.071565,0.154571,0.086276,0.309651,0.075035,0.116042,0.128975,0.160225
3,0.323975,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.576738,0.029110,0.035302,0.060516,0.059643,0.078212,0.059230,0.052189,0.089801,0.102277
4,0.536870,0.423338,0.381474,0.248783,0.388181,0.319011,0.286340,0.436666,0.258044,0.319795,...,0.156346,0.186115,0.205620,0.162396,0.164263,0.195293,0.061707,0.086089,0.185050,0.063304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4043,0.651477,0.558502,0.621574,0.422791,0.589653,0.674308,0.534160,0.443197,0.488147,0.551028,...,0.582341,0.449313,0.076497,0.323568,0.141161,0.178436,0.045170,0.121954,0.103119,0.095682
4044,0.237704,0.253892,0.114875,0.221201,0.211252,0.157484,0.279412,0.102648,0.273296,0.111437,...,0.385411,0.015332,0.013692,0.013171,0.024522,0.035874,0.018123,0.002497,0.010018,0.028602
4045,0.266521,0.219798,0.387117,0.342284,0.229549,0.124888,0.253414,0.313504,0.222673,0.127174,...,0.248614,0.106035,0.063878,0.141038,0.073523,0.053077,0.029129,0.048694,0.096688,0.119438
4046,0.449289,0.422162,0.302333,0.456230,0.474237,0.313034,0.340929,0.256030,0.486683,0.404464,...,0.512369,0.397539,0.304870,0.089159,0.173512,0.251179,0.098746,0.186278,0.091352,0.146826


In [54]:
pred = automl.predict_all(test_x)

In [55]:
pred

Unnamed: 0,prediction
0,399.731285
1,402.030992
2,601.430848
3,403.140273
4,512.048481
...,...
4043,186.860439
4044,332.030565
4045,402.164495
4046,614.254946


In [59]:
a = (np.round(pred, 0)) / 100
b = (np.round(a)) * 100
b

Unnamed: 0,prediction
0,400.0
1,400.0
2,600.0
3,400.0
4,500.0
...,...
4043,200.0
4044,300.0
4045,400.0
4046,600.0


In [38]:
b.max()

prediction    800.0
dtype: float64

In [60]:
submit = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/2023_dacon_HDAI_본선/sample_submission.csv')

submit['weight'] = b

submit.to_csv('/content/drive/MyDrive/Colab Notebooks/2023_dacon_HDAI_본선/result/submit_11.csv', index=False)