In [4]:
import numpy as np
import pandas as pd 
import os 
import random
import warnings
warnings.filterwarnings(action='ignore')

from autogluon.tabular import TabularDataset, TabularPredictor

In [7]:
# NOTE(review): not referenced by any active code in the visible cells
# (only by a commented-out load call) — confirm its purpose or remove it.
SAVE_PATH = "ocean180"

# CSV files read by the notebook.
TRAIN_PATH = "dataset/train_data.csv"
TEST_PATH = "dataset/test_data.csv"
SAMPLE_SUBMISSION_PATH = "dataset/submit_example.csv"
# Backward-compatible alias: the constant was originally introduced under this
# misspelled name, so keep it resolvable for any cell that still uses it.
SAMPLE_SUBISSION_PATH = SAMPLE_SUBMISSION_PATH

# CSV file that receives the test-set predictions.
SUBMISSION_PATH = "submission.csv"

# Name of the label column handed to TabularPredictor(label=...).
TARGET = "cover"

SEED = 2022


def seed_everything(seed=SEED):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global RNG with ``seed`` and
    sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)``.

    Args:
        seed: Integer seed value; defaults to the module-level ``SEED``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


# Apply the default seed once, as soon as the helper is defined.
seed_everything()

# --- AutoGluon predictor / fit settings ---
MODEL_PRESETS = "best_quality"
MODEL_EVAL_METRIC = 'rmse'
MODEL_TIME_LIMIT = 10 * 60  # forwarded to fit(time_limit=...)
MODEL_SAVE_PATH = "autogluon_model/"
MODEL_VERBOSE = 1

# --- Bagging / stacking settings forwarded to fit() ---
NUM_BAG_FOLDS, NUM_BAG_SETS, NUM_STACK_LEVELS = 5, 1, 1

In [8]:
# Read the training CSV and discard the columns that are not used for fitting.
train_data = pd.read_csv(TRAIN_PATH).drop(
    columns=['Unnamed: 0', 'YMD', 'Landsat_StartTime', 'PRODUCT_ID']
)

# Configure the predictor first ...
predictor = TabularPredictor(
    label=TARGET,
    path=MODEL_SAVE_PATH,
    eval_metric=MODEL_EVAL_METRIC,
    verbosity=MODEL_VERBOSE,
)

# ... then train it; the value returned by fit() is what later cells use.
predictor = predictor.fit(
    train_data,
    time_limit=MODEL_TIME_LIMIT,
    presets=MODEL_PRESETS,
    num_stack_levels=NUM_STACK_LEVELS,
    num_bag_sets=NUM_BAG_SETS,
    num_bag_folds=NUM_BAG_FOLDS,
)

AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and label-values can't be converted to int).
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'
	This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.
E0204 20:10:30.788828798   40597 chttp2_transport.cc:1103]   Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings"


In [9]:
# Display AutoGluon's summary of the fit: one row per trained model with its
# validation score, prediction/fit times, stack level and fit order.
predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                    model  score_val  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0     WeightedEnsemble_L2  -0.105616      12.404819  2921.666051                0.000498           0.193635            2       True          6
1  RandomForestMSE_BAG_L1  -0.107823      11.193334  2659.160835               11.193334        2659.160835            1       True          5
2       LightGBMXT_BAG_L1  -0.107910       1.210987   262.311581                1.210987         262.311581            1       True          3
3         LightGBM_BAG_L1  -0.143461       0.956117    46.219859                0.956117          46.219859            1       True          4
4   KNeighborsDist_BAG_L1  -0.148939      28.939413     2.209532               28.939413           2.209532            1       True          2
5   KNeighborsUnif_BAG_L1  -0.161443      27.883395     2.039090               2

{'model_types': {'KNeighborsUnif_BAG_L1': 'StackerEnsembleModel_KNN',
  'KNeighborsDist_BAG_L1': 'StackerEnsembleModel_KNN',
  'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L1': 'StackerEnsembleModel_LGB',
  'RandomForestMSE_BAG_L1': 'StackerEnsembleModel_RF',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel'},
 'model_performance': {'KNeighborsUnif_BAG_L1': -0.1614428157399724,
  'KNeighborsDist_BAG_L1': -0.14893901703520052,
  'LightGBMXT_BAG_L1': -0.10791012393769661,
  'LightGBM_BAG_L1': -0.1434612927505292,
  'RandomForestMSE_BAG_L1': -0.1078229188793768,
  'WeightedEnsemble_L2': -0.10561630541663561},
 'model_best': 'WeightedEnsemble_L2',
 'model_paths': {'KNeighborsUnif_BAG_L1': 'autogluon_model/models/KNeighborsUnif_BAG_L1/',
  'KNeighborsDist_BAG_L1': 'autogluon_model/models/KNeighborsDist_BAG_L1/',
  'LightGBMXT_BAG_L1': 'autogluon_model/models/LightGBMXT_BAG_L1/',
  'LightGBM_BAG_L1': 'autogluon_model/models/LightGBM_BAG_L1/',
  'RandomForestMSE_BAG_L1': 'a

In [16]:
# Load the test set and drop the same columns that were dropped from the training data.
test_data = pd.read_csv(TEST_PATH).drop(['Unnamed: 0', 'YMD', 'Landsat_StartTime', 'PRODUCT_ID'], axis=1)

# NOTE: to reuse the already-trained model in a fresh kernel instead of refitting,
# load it from the directory it was saved to:
#     predictor = TabularPredictor.load(MODEL_SAVE_PATH)
pred_test = predictor.predict(test_data)

# Write the predictions (row index + predicted value, no header row) to the
# configured submission file rather than a hard-coded filename.
pd.DataFrame(pred_test).to_csv(SUBMISSION_PATH, header=False)

In [18]:
# Score every trained model on test_data. The recorded output of this cell contains
# score_test values, so the call succeeded in this run rather than failing.
# NOTE(review): computing score_test presumably requires the TARGET column in test_data —
# confirm the test file really carries labels, and check why KNeighborsDist scores far
# better on test than on validation.
predictor.leaderboard(test_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,KNeighborsDist_BAG_L1,-0.009571,-0.148939,41.540526,28.939413,2.209532,41.540526,28.939413,2.209532,1,True,2
1,RandomForestMSE_BAG_L1,-0.039869,-0.107823,0.674801,11.193334,2659.160835,0.674801,11.193334,2659.160835,1,True,5
2,WeightedEnsemble_L2,-0.048087,-0.105616,3.002562,12.404819,2921.666051,0.003235,0.000498,0.193635,2,True,6
3,LightGBMXT_BAG_L1,-0.058692,-0.10791,2.324526,1.210987,262.311581,2.324526,1.210987,262.311581,1,True,3
4,KNeighborsUnif_BAG_L1,-0.125894,-0.161443,24.319644,27.883395,2.03909,24.319644,27.883395,2.03909,1,True,1
5,LightGBM_BAG_L1,-0.131257,-0.143461,1.784866,0.956117,46.219859,1.784866,0.956117,46.219859,1,True,4
