# 라이브러리 및 seed고정

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install h2o

Collecting h2o
  Downloading h2o-3.46.0.2.tar.gz (265.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.3/265.3 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tabulate (from h2o)
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.46.0.2-py2.py3-none-any.whl size=265365378 sha256=9dee9c85b30e05b432a1a44cc35469db36d878822ba678e399a19b5242dad6ce
  Stored in directory: /root/.cache/pip/wheels/63/14/f4/9fff736a0df59884631031b604e7b000a70409f5b7adafc2c6
Successfully built h2o
Installing collected packages: tabulate, h2o
Successfully installed h2o-3.46.0.2 tabulate-0.9.0


In [3]:
import os
import pandas as pd
import numpy as np
import random

import warnings
warnings.filterwarnings('ignore')

import h2o
from h2o.automl import H2OAutoML
from sklearn.metrics import roc_auc_score


In [4]:
# 시드 고정 함수
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    h2o.no_progress()

seed_everything(42)

# h2o

In [5]:
# H2O 서버 초기화 및 메모리 설정 조정
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.22" 2024-01-16; OpenJDK Runtime Environment (build 11.0.22+7-post-Ubuntu-0ubuntu222.04.1); OpenJDK 64-Bit Server VM (build 11.0.22+7-post-Ubuntu-0ubuntu222.04.1, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp57p5t0w9
  JVM stdout: /tmp/tmp57p5t0w9/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp57p5t0w9/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.2
H2O_cluster_version_age:,19 days
H2O_cluster_name:,H2O_from_python_unknownUser_71hfe8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,29.97 Gb
H2O_cluster_total_cores:,96
H2O_cluster_allowed_cores:,96


In [6]:
train_df = pd.read_csv('/content/drive/MyDrive/웹 광고 클릭률/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/웹 광고 클릭률/test.csv')

In [7]:
# 불필요한 컬럼 및 결측치 비율이 높은 컬럼 제거
cols_to_drop = ['ID', 'F03', 'F15', 'F20', 'F24', 'F26', 'F27', 'F29']
train_df.drop(columns=cols_to_drop, inplace=True)
test_ids = test_df['ID']
test_df.drop(columns=cols_to_drop, inplace=True)

In [13]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28605391 entries, 0 to 28605390
Data columns (total 33 columns):
 #   Column  Dtype  
---  ------  -----  
 0   Click   int64  
 1   F01     object 
 2   F02     object 
 3   F04     float64
 4   F05     object 
 5   F06     int64  
 6   F07     object 
 7   F08     object 
 8   F09     object 
 9   F10     object 
 10  F11     float64
 11  F12     object 
 12  F13     object 
 13  F14     int64  
 14  F16     object 
 15  F17     object 
 16  F18     float64
 17  F19     float64
 18  F21     object 
 19  F22     object 
 20  F23     object 
 21  F25     object 
 22  F28     object 
 23  F30     object 
 24  F31     object 
 25  F32     float64
 26  F33     float64
 27  F34     object 
 28  F35     object 
 29  F36     float64
 30  F37     object 
 31  F38     float64
 32  F39     object 
dtypes: float64(8), int64(3), object(22)
memory usage: 7.0+ GB


In [8]:
# 데이터 타입 최적화
def reduce_mem_usage(df):
    """메모리 절약을 위해 데이터 프레임의 타입을 다운캐스트합니다."""
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [9]:
# H2O 프레임으로 변환
train_h2o = h2o.H2OFrame(train_df)
test_h2o = h2o.H2OFrame(test_df)

In [10]:
# X's & Y Split
Y = 'Click'
X = train_h2o.columns
X.remove(Y)

In [11]:
# H2O AutoML
aml = H2OAutoML(seed=42, max_runtime_secs=7200, balance_classes=True)
aml.train(x=X, y=Y, training_frame=train_h2o)


01:33:19.209: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.

Failed polling AutoML progress log: Local server has died unexpectedly. RIP.
Job request failed Local server has died unexpectedly. RIP., will retry after 3s.
Job request failed Local server has died unexpectedly. RIP., will retry after 3s.
Failed polling AutoML progress log: Local server has died unexpectedly. RIP.


H2OConnectionError: Local server has died unexpectedly. RIP.

In [None]:
# 리더보드 출력
lb = aml.leaderboard
print(lb)

In [None]:
# 예측
pred = aml.predict(test_h2o)

In [None]:
# 제출 파일 생성
submission = pd.read_csv('/content/drive/MyDrive/웹 광고 클릭률/sample_submission.csv')
submission['Click'] = h2o.as_list(pred['p1']).values

submission.to_csv('click_submission_h2o.csv', index=False)
submission


In [None]:
# 파일 다운로드
from google.colab import files
files.download('click_submission_h2o.csv')