In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import gc
import pickle
import random
from pathlib import Path

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import mean_squared_error

In [2]:
class CFG:
    """Experiment-wide settings shared by the training and inference cells."""

    # --- run metadata ---
    VER = 1
    AUTHOR = 'takaito'
    METHOD = 'lightgbm'
    COMPETITION = 'ocean180'

    # --- data ---
    DATA_PATH = Path('dataset')  # set the path to the data directory here
    target_col = 'cover'

    # --- validation / reproducibility ---
    seed = 42
    num_fold = 3

    # --- LightGBM ---
    boosting_type = 'gbdt'
    lgb_params = dict(
        objective='regression',
        metric='rmse',
        learning_rate=0.1,
        boosting=boosting_type,
        seed=seed,
    )

In [3]:

def preprocessing(input_df: pd.DataFrame)->pd.DataFrame:
    """Return a copy of ``input_df`` that feature engineering can modify.

    Currently a placeholder: the copy is returned unchanged. New features are
    meant to be added to the copy so the caller's frame is never mutated.
    """
    features_df = input_df.copy(deep=True)
    # Add engineered features to `features_df` here.
    return features_df
def get_train_data()->pd.DataFrame:
    """Read ``train_data.csv`` from ``CFG.DATA_PATH`` and return it preprocessed.

    The index-dump column and the three identifier/date columns listed below
    are dropped before the frame is handed to ``preprocessing``.
    """
    dropped_columns = ['Unnamed: 0', 'YMD', 'Landsat_StartTime', 'PRODUCT_ID']
    csv_path = CFG.DATA_PATH / 'train_data.csv'
    raw_df = pd.read_csv(csv_path)
    return preprocessing(raw_df.drop(columns=dropped_columns))
def get_test_data() -> pd.DataFrame:
    """Read ``test_data.csv`` from ``CFG.DATA_PATH`` and return it preprocessed.

    Drops the same four columns as the training loader so both frames go
    through ``preprocessing`` with matching inputs.
    """
    dropped_columns = ['Unnamed: 0', 'YMD', 'Landsat_StartTime', 'PRODUCT_ID']
    csv_path = CFG.DATA_PATH / 'test_data.csv'
    raw_df = pd.read_csv(csv_path)
    return preprocessing(raw_df.drop(columns=dropped_columns))

In [4]:
def get_groupkfold(train, target_col, group_col, n_splits):
    """Assign every row of ``train`` to a ``GroupKFold`` validation fold.

    Args:
        train: DataFrame containing ``target_col`` and ``group_col``.
        target_col: name of the target column (passed to the splitter as ``y``).
        group_col: name of the column whose values define the groups.
        n_splits: number of folds.

    Returns:
        pd.Series indexed by positional row number (sorted ascending) whose
        value is the fold in which that row is used for validation.
    """
    splitter = GroupKFold(n_splits=n_splits)
    splits = splitter.split(train, train[target_col], train[group_col])
    # One constant-valued Series per fold, indexed by that fold's validation
    # positions; the training positions of each split are not needed here.
    per_fold = [
        pd.Series(fold_id, index=valid_positions)
        for fold_id, (_, valid_positions) in enumerate(splits)
    ]
    return pd.concat(per_fold).sort_index()
def lgb_training(train, features):
    """Train one LightGBM model per GroupKFold fold and report the OOF RMSE.

    Side effects (files are written to the current working directory):
        * ``folds.csv`` -- fold id of every training row.
        * ``{METHOD}_fold{k}_seed{seed}_ver{VER}.pkl`` -- pickled model per fold.
        * ``{METHOD}_feature_importance_seed{seed}_ver{VER}.csv`` -- importance
          summed over folds and normalised to sum to 1.
        * ``oof_pred_{METHOD}_seed{seed}_ver{VER}.npy`` -- out-of-fold predictions.
    The fold assignment is also stored on ``CFG.folds``, the 50 most important
    features are displayed and the out-of-fold RMSE is printed.

    Args:
        train: training DataFrame containing ``features``, ``CFG.target_col``
            and the ``'mesh20'`` grouping column.
        features: list of column names used as model inputs.
    """
    CFG.folds = get_groupkfold(train, CFG.target_col, 'mesh20', CFG.num_fold)
    CFG.folds.to_csv('folds.csv', index=False)
    # get_groupkfold returns fold ids sorted by positional row number. Compare
    # them as a NumPy array so the masks stay positional even when `train`
    # does not carry a default RangeIndex (a boolean Series would be aligned
    # by label and fail or mis-select in that case).
    fold_ids = CFG.folds.to_numpy()
    oof_pred = np.zeros(len(train), dtype=np.float64)
    df_importance = pd.DataFrame({'feature': features})
    df_importance['importance'] = 0
    for fold in range(CFG.num_fold):
        valid_mask = fold_ids == fold
        train_mask = ~valid_mask
        # Single .loc selection instead of chained train[mask][cols] indexing.
        x_train = train.loc[train_mask, features]
        y_train = train.loc[train_mask, CFG.target_col]
        x_valid = train.loc[valid_mask, features]
        y_valid = train.loc[valid_mask, CFG.target_col]
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_valid = lgb.Dataset(x_valid, y_valid)
        # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword
        # arguments were removed from `lgb.train` in LightGBM 4.0. When
        # upgrading, replace them with
        # `callbacks=[lgb.early_stopping(100), lgb.log_evaluation(50)]`.
        model = lgb.train(
            params = CFG.lgb_params,
            train_set = lgb_train,
            num_boost_round = 1000,
            valid_sets = [lgb_train, lgb_valid],
            early_stopping_rounds = 100,
            verbose_eval = 50,
        )
        model_path = f'{CFG.METHOD}_fold{fold}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        # Context manager guarantees the file is flushed and closed before the
        # next fold (the bare open() previously leaked the handle).
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)
        oof_pred[valid_mask] = model.predict(x_valid)

        # The model was trained on `features` in this order, so its importance
        # vector lines up row-for-row with df_importance.
        df_importance['importance'] += np.array(model.feature_importance())

    df_importance['importance'] = df_importance['importance'] / np.sum(df_importance['importance'])
    df_importance = df_importance.sort_values('importance', ascending=False)
    df_importance.to_csv(f'{CFG.METHOD}_feature_importance_seed{CFG.seed}_ver{CFG.VER}.csv', header=True, index=False)
    display(df_importance.head(50))
    score = np.sqrt(mean_squared_error(train[CFG.target_col], oof_pred))
    np.save(f'oof_pred_{CFG.METHOD}_seed{CFG.seed}_ver{CFG.VER}', oof_pred)
    print(f'CV Score: {score}')
    

In [5]:
def lgb_Predicting(test, features):
    """Predict ``test`` with every saved fold model and write the submission.

    Loads the per-fold models pickled by ``lgb_training`` from the current
    working directory, stores the raw per-fold predictions and writes their
    row-wise mean as the submission CSV.

    Side effects (files are written to the current working directory):
        * ``test_pred_{METHOD}_seed{seed}_ver{VER}.npy`` -- array of shape
          ``(len(test), CFG.num_fold)`` with one column per fold model.
        * ``submit_{METHOD}_seed{seed}_ver{VER}.csv`` -- fold-averaged
          prediction per row, written with the row index and without a header.

    Args:
        test: DataFrame containing every column named in ``features``.
        features: list of column names the models were trained on.
    """
    x_test = test[features]
    test_preds = np.zeros((len(test), CFG.num_fold))
    for fold in range(CFG.num_fold):
        model_path = f'{CFG.METHOD}_fold{fold}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        # NOTE: unpickling can execute arbitrary code; only load model files
        # that this notebook produced itself.
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)
        test_preds[:, fold] = model.predict(x_test)
    # Saved under a `test_pred_` prefix. This function previously reused the
    # `oof_pred_...` name, which silently overwrote the out-of-fold
    # predictions written by lgb_training with test predictions.
    np.save(f'test_pred_{CFG.METHOD}_seed{CFG.seed}_ver{CFG.VER}', test_preds)
    pd.DataFrame(np.mean(test_preds, axis=1)).to_csv(f'submit_{CFG.METHOD}_seed{CFG.seed}_ver{CFG.VER}.csv', header=False)

In [6]:
# Load both splits, derive the model inputs, then train and predict.
train = get_train_data()
test = get_test_data()
# Every column except the target and the 'mesh20' grouping key is a feature.
features = train.columns.drop([CFG.target_col, 'mesh20']).tolist()
lgb_training(train, features)
lgb_Predicting(test, features)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 879722
[LightGBM] [Info] Number of data points in the train set: 9425, number of used features: 3459
[LightGBM] [Info] Start training from score 0.211930
Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 0.0754667	valid_1's rmse: 0.198665
[100]	training's rmse: 0.0567183	valid_1's rmse: 0.200235
Early stopping, best iteration is:
[14]	training's rmse: 0.128005	valid_1's rmse: 0.19586




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 879849
[LightGBM] [Info] Number of data points in the train set: 9429, number of used features: 3459
[LightGBM] [Info] Start training from score 0.162047
Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 0.0770266	valid_1's rmse: 0.20184
[100]	training's rmse: 0.0561219	valid_1's rmse: 0.201596
[150]	training's rmse: 0.0448069	valid_1's rmse: 0.201289
Early stopping, best iteration is:
[75]	training's rmse: 0.0644221	valid_1's rmse: 0.200648




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 879851
[LightGBM] [Info] Number of data points in the train set: 9426, number of used features: 3459
[LightGBM] [Info] Start training from score 0.175582
Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 0.0767807	valid_1's rmse: 0.195513
[100]	training's rmse: 0.0572677	valid_1's rmse: 0.198365
Early stopping, best iteration is:
[40]	training's rmse: 0.0839509	valid_1's rmse: 0.194903


Unnamed: 0,feature,importance
5,depth_original,0.012403
15,hist_warm_sst,0.011111
19,warm_sst,0.010594
2,area,0.007494
270,MIN_GARI,0.007235
4,month,0.005943
12,depth,0.005943
13,fetch,0.005943
3005,MIN_IF_2017,0.005685
3,year,0.005168


CV Score: 0.19715192675748644


In [7]:
# Number of feature columns passed to training and prediction.
len(features)

3459

In [8]:
# (rows, columns) of the preprocessed training frame; the columns include the
# target and the 'mesh20' grouping column in addition to the features.
train.shape

(14140, 3461)

In [9]:
# (rows, columns) of the preprocessed test frame.
test.shape

(4039, 3460)

In [10]:
# NOTE(review): this echoes the complete feature list (several thousand
# names); consider `features[:20]` to keep the saved output readable.
features

['lat',
 'lon',
 'area',
 'year',
 'month',
 'depth_original',
 'cliff_length',
 'aicial_length',
 'beach_length',
 'coast_length',
 'coastal_dist',
 'cold_sst',
 'depth',
 'fetch',
 'hist_cold_sst',
 'hist_warm_sst',
 'river_area',
 'river_dist',
 'Salinity_annual',
 'warm_sst',
 'sst_annual',
 'sst_ymd',
 'sst_diff',
 'Blue',
 'Green',
 'Red',
 'NIR',
 'SWIR1',
 'TIRS1',
 'TIRS2',
 'SWIR2',
 'MSAVI',
 'EVI',
 'NBR',
 'NBR2',
 'NDMI',
 'NDVI',
 'NDWI',
 'SAVI',
 'TSAVI',
 'Alteration',
 'AVI',
 'BWDRVI',
 'Chlgreen',
 'Cigreen',
 'Cirededge',
 'Chlred_edge',
 'CVI',
 'CI',
 'CTVI',
 'CRI550',
 'D678_500',
 'DVIMSS',
 'DSWI',
 'DSWI_5',
 'Fe3',
 'Ferric_Oxides',
 'Ferrous_Silicates',
 'GEMI',
 'GVMI',
 'Gossan',
 'GARI',
 'GLI',
 'H',
 'IR550',
 'mCRIG',
 'mCRIRE',
 'MVI',
 'MCARI1',
 'MCARI2',
 'NLI',
 'NormG',
 'NormR',
 'PPR',
 'PSNDc2',
 'RDVI',
 'IF',
 'SLAVI',
 'SIPI2',
 'VARIgreen',
 'sunAzimuth',
 'sunElevation',
 'Date_Acquired',
 'Landsat_num',
 'MAX_AFRI1600',
 'MAX_AOT',
 '