In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries; the
# download loop below splits each entry on ':' and URL-decodes the second part.
# NOTE(review): the URL is a signed link carrying X-Goog-Date=20240829T123310Z
# and X-Goog-Expires=259200, so it has presumably expired - regenerate it
# before re-running this cell.
DATA_SOURCE_MAPPING = 'playground-series-s4e8:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F76727%2F9045607%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240829%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240829T123310Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D7880b796162fdda0745a833b42b74acc16bdf57f7d25cd1313b636d71b450c476b9f2f1ce4b577ffa0e142c33d5a49daa2747ffb05bd258a524b139da74dfba3656bfa5ef26259e4833616e893e35178d4ce76bd147b7842be18af8beb6f9386caa15a2a63d686d12ca15d9f40743fc626d95387cd8d89053665246e71e498110b2a35a1a4fca772107a52bba2e9a7dfd301054d1e822ae0484f4747643c02127b0c0175bb24cf14dcf93a047db161da0b9019597dfb87082dfe3623024029dd4c920a8f44d8ff70ffb335b795c264fde9d431490ba046bf6b9fc4d4457631975617b0f658225fd2657b9ccce6ea5b05ecfe8e1a8b0bf8a4a887c4ed78a3f9cc'

# Directory that downloaded data sources are extracted into.
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory created alongside the input directory and linked as ../working.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory> and
# unpack it there. A failed download is reported and the loop moves on.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry has the form "<directory>:<url-encoded download URL>". Split on
    # the first colon only so an extra ':' in the URL part cannot break the
    # two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length may be absent; treat that (or a zero value) as an
            # unknown size instead of failing on int(None) or dividing by zero.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            size_text = total_length if total_length else 'unknown'
            print(f'Downloading {directory}, {size_text} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_length:
                    # 50-character progress bar, clamped to its full width.
                    done = min(50, int(50 * dl / total_length))
                    progress = f"[{'=' * done}{' ' * (50 - done)}] "
                else:
                    progress = ''
                sys.stdout.write(f"\r{progress}{dl} bytes downloaded")
                sys.stdout.flush()
            # The tar branch reopens the temp file by name, so buffered bytes
            # must reach the file before extraction.
            tfile.flush()
            # NOTE(review): extractall() trusts the member paths stored in the
            # downloaded archive; only use this with trusted sources.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would overwrite the imported module.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must be handled before OSError because HTTPError is an OSError subclass.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Downloading playground-series-s4e8, 86301661 bytes compressed
Downloaded and uncompressed: playground-series-s4e8
Data source import complete.


In [2]:
!pip install ray==2.10.0
!pip install autogluon.tabular
!pip install -U ipywidgets

Collecting ray==2.10.0
  Downloading ray-2.10.0-cp310-cp310-manylinux2014_x86_64.whl.metadata (13 kB)
Downloading ray-2.10.0-cp310-cp310-manylinux2014_x86_64.whl (65.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.1/65.1 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ray
Successfully installed ray-2.10.0
Collecting autogluon.tabular
  Downloading autogluon.tabular-1.1.1-py3-none-any.whl.metadata (13 kB)
Collecting scipy<1.13,>=1.5.4 (from autogluon.tabular)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting autogluon.core==1.1.1 (from autogluon.tabular)
  Downloading autogluon.core-1.1.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.features==1.1.1 (from autogluon.tabular)
  Downloading autogluon.features-1.1.1-py3-none-any.whl.metadata (

In [3]:
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor

In [6]:
# Read the competition train and test files, using the `id` column as the index.
train_df = pd.read_csv(
    "/kaggle/input/playground-series-s4e8/train.csv",
    index_col='id',
)
# Disabled: load of an additional dataset from a separate input directory.
# orig_df = pd.read_csv("/kaggle/input/secondary-mushroom-dataset-data-set/MushroomDataset/secondary_data.csv", sep=";")
test_df = pd.read_csv(
    "/kaggle/input/playground-series-s4e8/test.csv",
    index_col='id',
)

In [7]:
# Number of training rows that repeat an earlier row exactly (the index is not compared).
train_df.duplicated(keep='first').sum()

0

In [8]:
# Keep only the first occurrence of each fully identical training row.
train_df = train_df.drop_duplicates()

In [9]:
# Name of the label column in train_df.
target = 'class'

In [10]:
# Every training column except the label, in the frame's column order.
features = train_df.columns.drop(target).to_list()
features

['cap-diameter',
 'cap-shape',
 'cap-surface',
 'cap-color',
 'does-bruise-or-bleed',
 'gill-attachment',
 'gill-spacing',
 'gill-color',
 'stem-height',
 'stem-width',
 'stem-root',
 'stem-surface',
 'stem-color',
 'veil-type',
 'veil-color',
 'has-ring',
 'ring-type',
 'spore-print-color',
 'habitat',
 'season']

In [11]:
# Features whose share of missing values in train_df is above 20 percent.
features_with_high_null_values = (
    train_df[features]
    .isna()
    .sum()
    .div(len(train_df))
    .mul(100)
    .loc[lambda null_pct: null_pct > 20]
    .index
    .to_list()
)
features_with_high_null_values

['cap-surface',
 'gill-spacing',
 'stem-root',
 'stem-surface',
 'veil-type',
 'veil-color',
 'spore-print-color']

In [12]:
# Feature columns stored with object dtype are treated as categorical.
categorical_features = list(
    train_df.loc[:, features].select_dtypes(include=['object']).columns
)
categorical_features

['cap-shape',
 'cap-surface',
 'cap-color',
 'does-bruise-or-bleed',
 'gill-attachment',
 'gill-spacing',
 'gill-color',
 'stem-root',
 'stem-surface',
 'stem-color',
 'veil-type',
 'veil-color',
 'has-ring',
 'ring-type',
 'spore-print-color',
 'habitat',
 'season']

In [13]:
# Every feature that is not categorical is numerical. Filtering `features` in
# order, instead of taking a set difference, keeps the resulting column order
# identical from one kernel session to the next.
numerical_features = [feature for feature in features if feature not in categorical_features]
numerical_features

['stem-height', 'cap-diameter', 'stem-width']

In [14]:
def cleaner(df, min_count=100, columns=None):
    """Normalise categorical columns of ``df`` in place and return ``df``.

    For every selected column:
      1. missing values are replaced by the label ``'missing'``;
      2. every label occurring fewer than ``min_count`` times in that column
         of ``df`` (including ``'missing'``) is replaced by ``'noise'``;
      3. the column is cast to the ``category`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    min_count : int, default 100
        Minimum number of occurrences a label needs in order to be kept.
    columns : iterable of str, optional
        Columns to clean. Defaults to the notebook-level
        ``categorical_features`` list.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Counts are taken from ``df`` alone, so two frames cleaned separately can
    end up with different sets of categories.
    """
    if columns is None:
        columns = categorical_features
    for col in columns:
        # Work on an object-dtype copy so the function can be re-run on a
        # column that is already categorical: filling a Categorical with a
        # label outside its categories would raise.
        values = df[col].astype(object).fillna('missing')
        # Per-row frequency of that row's own label.
        label_counts = values.map(values.value_counts())
        df[col] = values.mask(label_counts < min_count, 'noise').astype('category')

    return df

In [15]:
# Clean the categorical columns of both frames, train first.
# NOTE(review): cleaner() counts labels within the frame it receives, so the
# rare-label replacement is decided separately for train and test.
train_df, test_df = cleaner(train_df), cleaner(test_df)

In [16]:
# Fill missing cap diameters with the mean taken over train and test together.
cap_diameter_mean = pd.concat([train_df['cap-diameter'], test_df['cap-diameter']]).mean(numeric_only=True)
# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# the `df[col]` selection is chained in-place assignment: it emits a warning in
# pandas 2.x and leaves the frame unchanged under Copy-on-Write.
train_df['cap-diameter'] = train_df['cap-diameter'].fillna(cap_diameter_mean)
test_df['cap-diameter'] = test_df['cap-diameter'].fillna(cap_diameter_mean)

In [17]:
# Set up a binary-classification predictor for the 'class' label, evaluated
# with the 'mcc' metric.
predictor = TabularPredictor(
    label='class',
    eval_metric='mcc',
    problem_type='binary',
)

# Train with the 'best_quality' preset under a 10-hour limit (3600*10 seconds),
# skipping KNN models and passing num_gpus=1 to each model fit.
# fit() returns the predictor itself, so the name bound above stays valid.
predictor.fit(
    train_df,
    presets='best_quality',
    time_limit=3600*10,
    verbosity=2,
    excluded_model_types=['KNN'],
    ag_args_fit={'num_gpus': 1},
)

# Print and keep the training summary.
results = predictor.fit_summary()

No path specified. Models will be saved in: "AutogluonModels/ag-20240829_123944"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          12
Memory Avail:       79.65 GB / 83.48 GB (95.4%)
Disk Space Avail:   167.65 GB / 201.23 GB (83.3%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validati

[36m(_ray_fit pid=7906)[0m [1000]	valid_set's binary_logloss: 0.0364905	valid_set's mcc: 0.984262


[36m(_ray_fit pid=10766)[0m Dask dataframe query planning is disabled because dask-expr is not installed.
[36m(_ray_fit pid=10766)[0m You can install it with `pip install dask[dataframe]` or `conda install dask`.
[36m(_ray_fit pid=10766)[0m This will raise in a future version.
[36m(_ray_fit pid=10766)[0m [32m [repeated 2x across cluster][0m


[36m(_ray_fit pid=10766)[0m [1000]	valid_set's binary_logloss: 0.0359562	valid_set's mcc: 0.984478


[36m(_ray_fit pid=13551)[0m Dask dataframe query planning is disabled because dask-expr is not installed.
[36m(_ray_fit pid=13551)[0m You can install it with `pip install dask[dataframe]` or `conda install dask`.
[36m(_ray_fit pid=13551)[0m This will raise in a future version.
[36m(_ray_fit pid=13551)[0m [32m [repeated 2x across cluster][0m


[36m(_ray_fit pid=13551)[0m [1000]	valid_set's binary_logloss: 0.0362517	valid_set's mcc: 0.984477


[36m(_ray_fit pid=16328)[0m Dask dataframe query planning is disabled because dask-expr is not installed.
[36m(_ray_fit pid=16328)[0m You can install it with `pip install dask[dataframe]` or `conda install dask`.
[36m(_ray_fit pid=16328)[0m This will raise in a future version.
[36m(_ray_fit pid=16328)[0m [32m [repeated 2x across cluster][0m


[36m(_ray_fit pid=16328)[0m [1000]	valid_set's binary_logloss: 0.0364248	valid_set's mcc: 0.984384


[36m(_ray_fit pid=19105)[0m Dask dataframe query planning is disabled because dask-expr is not installed.
[36m(_ray_fit pid=19105)[0m You can install it with `pip install dask[dataframe]` or `conda install dask`.
[36m(_ray_fit pid=19105)[0m This will raise in a future version.
[36m(_ray_fit pid=19105)[0m [32m [repeated 2x across cluster][0m


[36m(_ray_fit pid=19105)[0m [1000]	valid_set's binary_logloss: 0.0370471	valid_set's mcc: 0.984116


[36m(_ray_fit pid=21884)[0m Dask dataframe query planning is disabled because dask-expr is not installed.
[36m(_ray_fit pid=21884)[0m You can install it with `pip install dask[dataframe]` or `conda install dask`.
[36m(_ray_fit pid=21884)[0m This will raise in a future version.
[36m(_ray_fit pid=21884)[0m [32m [repeated 2x across cluster][0m


[36m(_ray_fit pid=21884)[0m [1000]	valid_set's binary_logloss: 0.0360824	valid_set's mcc: 0.984567


[36m(_ray_fit pid=24661)[0m Dask dataframe query planning is disabled because dask-expr is not installed.
[36m(_ray_fit pid=24661)[0m You can install it with `pip install dask[dataframe]` or `conda install dask`.
[36m(_ray_fit pid=24661)[0m This will raise in a future version.
[36m(_ray_fit pid=24661)[0m [32m [repeated 2x across cluster][0m


[36m(_ray_fit pid=24661)[0m [1000]	valid_set's binary_logloss: 0.0372925	valid_set's mcc: 0.98418


[36m(_ray_fit pid=27418)[0m Dask dataframe query planning is disabled because dask-expr is not installed.
[36m(_ray_fit pid=27418)[0m You can install it with `pip install dask[dataframe]` or `conda install dask`.
[36m(_ray_fit pid=27418)[0m This will raise in a future version.
[36m(_ray_fit pid=27418)[0m [32m [repeated 2x across cluster][0m


[36m(_ray_fit pid=27418)[0m [1000]	valid_set's binary_logloss: 0.0365758	valid_set's mcc: 0.984454


[36m(_dystack pid=6892)[0m 	0.9846	 = Validation score   (mcc)
[36m(_dystack pid=6892)[0m 	5309.44s	 = Training   runtime
[36m(_dystack pid=6892)[0m 	555.75s	 = Validation runtime
[36m(_dystack pid=6892)[0m Fitting model: LightGBM_BAG_L1 ... Training model for up to 608.7s of the 3606.95s of remaining time.
[36m(_dystack pid=6892)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (1.0 workers, per: cpus=1, gpus=1, memory=1.08%)
[36m(_ray_fit pid=30419)[0m Dask dataframe query planning is disabled because dask-expr is not installed.
[36m(_ray_fit pid=30419)[0m 
[36m(_ray_fit pid=30419)[0m You can install it with `pip install dask[dataframe]` or `conda install dask`.
[36m(_ray_fit pid=30419)[0m This will raise in a future version.
[36m(_ray_fit pid=30419)[0m 
[36m(_ray_fit pid=30744)[0m Dask dataframe query planning is disabled because dask-expr is not installed.
[36m(_ray_fit pid=30744)[0m [32m [repeated 2x across cluster]

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val eval_metric  pred_time_val      fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L3   0.984879         mcc    2161.911935  20336.727506                0.421159          39.043306            3       True         15
1     ExtraTreesGini_BAG_L2   0.984869         mcc    1699.843977  17710.582196              104.941682         169.041391            2       True          9
2     ExtraTreesEntr_BAG_L2   0.984858         mcc    1713.449089  17725.945582              118.546794         184.404777            2       True         10
3   RandomForestEntr_BAG_L2   0.984825         mcc    1724.826225  17914.208343              129.923930         372.667537            2       True          8
4   RandomForestGini_BAG_L2   0.984816         mcc    1708.412703  17886.353657              113.510408         344.812852            2       True  

In [18]:
# Display the table of trained models with their validation scores and timings.
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,0.984879,mcc,2161.911935,20336.727506,0.421159,39.043306,3,True,15
1,ExtraTreesGini_BAG_L2,0.984869,mcc,1699.843977,17710.582196,104.941682,169.041391,2,True,9
2,ExtraTreesEntr_BAG_L2,0.984858,mcc,1713.449089,17725.945582,118.546794,184.404777,2,True,10
3,RandomForestEntr_BAG_L2,0.984825,mcc,1724.826225,17914.208343,129.92393,372.667537,2,True,8
4,RandomForestGini_BAG_L2,0.984816,mcc,1708.412703,17886.353657,113.510408,344.812852,2,True,7
5,NeuralNetFastAI_BAG_L2,0.984796,mcc,1623.905856,20995.484794,29.003561,3453.943989,2,True,11
6,LightGBMXT_BAG_L1,0.984788,mcc,1215.853803,13715.794357,1215.853803,13715.794357,1,True,1
7,WeightedEnsemble_L2,0.984788,mcc,1216.276582,13725.287068,0.422779,9.492712,2,True,4
8,NeuralNetTorch_BAG_L2,0.984786,mcc,1620.18643,18804.115685,25.284135,1262.57488,2,True,13
9,XGBoost_BAG_L2,0.984691,mcc,1609.783762,17854.097932,14.881467,312.557127,2,True,12


In [19]:
# Predicted class label for every row of the cleaned test frame.
y_pred = predictor.predict(test_df)

In [50]:
# Predicted class probabilities for every row of the cleaned test frame.
y_prop = predictor.predict_proba(test_df)

In [51]:
# Save the class probabilities to CSV.
# NOTE(review): index=False drops the frame's index; presumably that index is
# the test `id`, in which case the file has no id column - confirm this is intended.
y_prop.to_csv('submission_autogluon_pre_prop.csv', index=False)

In [48]:
sub = pd.read_csv('/kaggle/input/playground-series-s4e8/sample_submission.csv')
# `sub` has a default integer index, whereas `y_pred` is expected to carry the
# `id` index of test_df, so assigning the Series directly would not line up.
# Look each prediction up by id instead of copying values by position, so a
# difference in row order cannot silently mislabel rows.
sub['class'] = sub['id'].map(y_pred)
if sub['class'].isna().any():
    raise ValueError('sample_submission.csv contains ids that have no prediction')
sub.to_csv('submission_autogluon_pre.csv', index=False)

In [49]:
# Preview the first rows of the submission frame.
sub.head()

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e
