In [1]:
%%writefile data_file.py
import os,sys,warnings,time,re,math,gc
warnings.filterwarnings("ignore")
from glob import glob
from pathlib import Path
import joblib
import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
from sklearn.metrics import roc_auc_score,auc
from sklearn.model_selection import train_test_split,StratifiedGroupKFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cgb

print("scrpit start")

path_to_train = "/home/sohail/Downloads/credit_risk/train"
path_to_test = "/home/sohail/Downloads/credit_risk/test"


all_train_files = glob(path_to_train+"/*.parquet")
all_test_files = glob(path_to_test+"/*.parquet")
train_files_df = pl.DataFrame({"index":range(len(all_train_files)),"path":all_train_files})
test_files_df = pl.DataFrame({"index":range(len(all_test_files)),"path":all_test_files})
train_files_df = (
    train_files_df
    .with_columns(
        (pl.col("path").str.split("/").list.get(-1)).alias("filename")
    )
    .sort(by="filename")
)
test_files_df = (
    test_files_df
    .with_columns(
        (pl.col("path").str.split("/").list.get(-1)).alias("filename")
    )
    .sort(by="filename")
)



train_base = (
    pl.read_parquet(path_to_train+"/train_base.parquet")
    .select(
        pl.col("case_id").cast(pl.UInt64),
        cs.contains("date").str.to_date().alias("Date"),
        pl.col("WEEK_NUM").cast(pl.UInt8).alias("week_num"),
        pl.col("target").cast(pl.UInt8)
    )
    .with_columns(
        pl.col("Date").dt.month().alias("month"),
        pl.col("Date").dt.weekday().alias("weekday"),
        pl.col("Date").dt.week().alias("week"),
        (pl.col("Date").dt.year() - 2018).cast(pl.UInt8).alias("year")
    )
    .select(~cs.contains("target"),cs.contains("target"))
)


test_base = (
    pl.read_parquet(path_to_test+"/test_base.parquet")
    .select(
        pl.col("case_id").cast(pl.UInt64),
        cs.contains("date").str.to_date().alias("Date"),
        pl.col("WEEK_NUM").cast(pl.UInt8).alias("week_num")
    )
    .with_columns(
        pl.col("Date").dt.month().alias("month"),
        pl.col("Date").dt.weekday().alias("weekday"),
        pl.col("Date").dt.week().alias("week"),
        (pl.col("Date").dt.year() - 2018).cast(pl.UInt8).alias("year"),
        pl.lit(0).cast(pl.UInt8).alias("target")
    )
    .select(~cs.contains("target"),cs.contains("target"))
)


def rename_cols(df:pl.DataFrame,name:str):
    if name == "intshallow":
        return df.rename(
            {
                "dpdmaxdatemonth_442T":"dpdmaxdatemonth_442D",
                "dpdmaxdatemonth_89T":"dpdmaxdatemonth_89D",
                "dpdmaxdateyear_596T":"dpdmaxdateyear_596D",
                "dpdmaxdateyear_896T":"dpdmaxdateyear_896D",
                "overdueamountmaxdatemonth_284T":"overdueamountmaxdatemonth_284D",
                "overdueamountmaxdatemonth_365T":"overdueamountmaxdatemonth_365D",
                "overdueamountmaxdateyear_2T":"overdueamountmaxdateyear_2D",
                "overdueamountmaxdateyear_994T":"overdueamountmaxdateyear_994D",
            }
        )
    elif name == "intdepth":
        return df.rename(
            {
                "pmts_month_158T":"pmts_month_158D",
                "pmts_month_706T":"pmts_month_706D",
                "pmts_year_1139T":"pmts_year_1139D",
                "pmts_year_507T":"pmts_year_507D",
            }
        )
    elif name == "extshallow":
        return df.rename(
            {
                "dpdmaxdatemonth_804T":"dpdmaxdatemonth_804D",
                "dpdmaxdateyear_742T":"dpdmaxdateyear_742D",
                "overdueamountmaxdatemonth_494T":"overdueamountmaxdatemonth_494D",
                "overdueamountmaxdateyear_432T":"overdueamountmaxdateyear_432D",
            }
        )
    else:
        return df
    
    
    

def convert_dtype(df:pl.DataFrame):
    return (
        df
        .select(
            cs.by_name("case_id").cast(pl.UInt64),
            cs.contains("num_group").cast(pl.UInt16),
            cs.ends_with("D").cast(pl.Date),
            (cs.ends_with("T","M") | (cs.ends_with("L") & cs.string())).cast(pl.String),
            (cs.ends_with("L") & cs.integer()).cast(pl.Int32),
            (cs.ends_with("L") & cs.float()).cast(pl.Float32),
            (cs.ends_with("P","A") & cs.unsigned_integer()).cast(pl.UInt32),
            (cs.ends_with("P","A") & cs.signed_integer()).cast(pl.Int32),
            (cs.ends_with("P","A") & cs.float()).cast(pl.Float32),
            pl.col(pl.Boolean)
        )
    )


def grouping(df:pl.DataFrame):
    return (
        df
        .group_by("case_id")
        .agg(
            cs.numeric().max().prefix("max_"),
            cs.numeric().mean().prefix("mean_"),
            cs.numeric().first().prefix("first_"),
            cs.numeric().last().prefix("last_"),
            cs.date().first().prefix("first_"),
            cs.date().last().prefix("last_"),
            (cs.string() | cs.boolean()).drop_nulls().mode().first().prefix("mode_"),
            (cs.string() | cs.boolean()).last().prefix("last_"),
            (cs.string() | cs.boolean()).first().prefix("first_")
        )
    )


def preprocess(
        string_name:str,
        prefix_string:str,
        train_base_df:pl.DataFrame,
        test_base_df:pl.DataFrame
        ):
    train_files_list = train_files_df.filter(pl.col("filename").str.contains(string_name))["path"].to_list()
    test_files_list = test_files_df.filter(pl.col("filename").str.contains(string_name))["path"].to_list()
    with pl.StringCache():
        train_df = (
            pl.concat(
                [
                    pl.read_parquet(train_file)
                    .pipe(rename_cols,prefix_string)
                    .pipe(convert_dtype)
                    .pipe(grouping)
                    for train_file in train_files_list
                ],
                parallel=False,
                rechunk=False
            )
        )
        test_schema = {"_".join(col_name.split("_")[1:]) if col_name != "case_id" else col_name:dtype for col_name,dtype in train_df.schema.items()}
        test_columns = {col_name for col_name in test_schema.keys()}
        if prefix_string == "staticexternal":
            train_df = (
                train_df
                .with_columns(
                    cs.contains("302T").str.split("%").list.gather([0,1])
                    .map_elements(
                        lambda x: (int(x[0]) + int(x[1].strip().split(" ")[1]))/200,return_dtype=pl.Float32
                    )
                )
            )
        
#         cat_cols += train_df.select((~(cs.numeric() | cs.date())).prefix(f"{prefix_string}_")).columns
#         num_cols += train_df.select((cs.numeric() | cs.date()).exclude("case_id").prefix(f"{prefix_string}_")).columns
        train_base_df = train_base_df.join(
            train_df
            .with_columns(pl.col(pl.String).cast(pl.Categorical).rank("dense"))
            .select(pl.col("case_id"),pl.all().exclude("case_id").shrink_dtype().prefix(f"{prefix_string}_")),
            on="case_id",
            how="left"
        )
        test_df = (
            pl.concat(
                [
                    pl.read_parquet(test_file)
                    .pipe(rename_cols,prefix_string)
                    .select(test_columns)
                    .cast(test_schema)
                    .pipe(grouping)
                    for test_file in test_files_list
                ],
                parallel=False,
                rechunk=False
            )
        )
        if prefix_string == "staticexternal":
            test_df = (
                test_df
                .with_columns(
                    cs.contains("302T").str.split("%").list.gather([0,1])
                    .map_elements(
                        lambda x: (int(x[0]) + int(x[1].strip().split(" ")[1]))/200,return_dtype=pl.Float32
                    )
                )
            )
        test_base_df = test_base_df.join(
            test_df
            .with_columns(pl.col(pl.String).cast(pl.Categorical).rank("dense"))
            .select(pl.col("case_id"),pl.all().exclude("case_id").shrink_dtype().prefix(f"{prefix_string}_")),
            on="case_id",
            how="left"
        )
    return train_base_df,test_base_df


string_list = ["applprev_1","applprev_2","static_0","static_cb","person_1","person_2","other_1","deposit_1","debitcard","bureau_a_1","bureau_a_2","bureau_b_1","bureau_b_2","registry_a","registry_b","registry_b"]
prefix_string_list = ["pastshallow","pastdepth","staticbase","staticexternal","personshallow","persondepth","othershallow","depositshallow","cardshallow","intshallow","intdepth","extshallow","extdepth","rega","regb","regc"]


count = 0
for string_name,prefix_name in zip(string_list,prefix_string_list):
    train_base,test_base = preprocess(string_name,prefix_name,train_base,test_base)
    print(f"done: {count+1} for {prefix_name}")
    count += 1
    gc.collect()

if test_base.height > 10:
    (
        train_base
        .with_columns(
            (pl.col(pl.Date) - pl.col("Date")).dt.total_days(),
            pl.col(pl.Date).exclude("Date").dt.weekday().prefix("weekdaydate_"),
            pl.col(pl.Date).exclude("Date").dt.month().prefix("monthdate_"),
            pl.col(pl.Date).exclude("Date").dt.week().prefix("weekdate_")
        )
        .drop("Date","case_id")
        .with_columns(pl.col(pl.Boolean).cast(pl.UInt8))
        .write_parquet("/home/sohail/Downloads/train_df.parquet")
    )
else:
    (
        train_base
        .with_columns(
            (pl.col(pl.Date) - pl.col("Date")).dt.total_days(),
            pl.col(pl.Date).exclude("Date").dt.weekday().prefix("weekdaydate_"),
            pl.col(pl.Date).exclude("Date").dt.month().prefix("monthdate_"),
            pl.col(pl.Date).exclude("Date").dt.week().prefix("weekdate_")
        )
        .drop("Date","case_id")
        .with_columns(pl.col(pl.Boolean).cast(pl.UInt8))
        [:10000]
        .write_parquet("/home/sohail/Downloads/train_df.parquet")
    )
    
    
    
(
    test_base
    .with_columns(
        (pl.col(pl.Date) - pl.col("Date")).dt.total_days(),
        pl.col(pl.Date).exclude("Date").dt.weekday().prefix("weekdaydate_"),
        pl.col(pl.Date).exclude("Date").dt.month().prefix("monthdate_"),
        pl.col(pl.Date).exclude("Date").dt.week().prefix("weekdate_")
    )
    .drop("Date")
    .with_columns(pl.col(pl.Boolean).cast(pl.UInt8))
    .write_parquet("/home/sohail/Downloads/test_df.parquet")
)

# all_date_cols = train_base.select(cs.contains(["weekdaydate","monthdate","weekdate"])).columns
# base_cols = ["week","year","month","weekday"]
# cat_cols += base_cols
# cat_cols += all_date_cols


print("first scrip done")

Writing data_file.py


In [2]:
!python data_file.py

scrpit start
done: 1 for pastshallow
done: 2 for pastdepth
done: 3 for staticbase
done: 4 for staticexternal
done: 5 for personshallow
done: 6 for persondepth
done: 7 for othershallow
done: 8 for depositshallow
done: 9 for cardshallow
done: 10 for intshallow
done: 11 for intdepth
done: 12 for extshallow
done: 13 for extdepth
done: 14 for rega
done: 15 for regb
done: 16 for regc
first scrip done


In [5]:
%%writefile lgb_training.py


import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
import gc,joblib
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold

class_params = {
    "objective":"binary",
    "boosting_type": "gbdt",
    "metric":"auc",
    "max_depth": 128,
    "learning_rate": 0.01,
    "n_estimators": 5000,
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 420,
    "reg_alpha": 0.15,
    "reg_lambda": 15,
    "extra_trees": True,
    "num_leaves": 256,
    "device": "gpu",
    "importance_type": "gain",
    "verbose": -1
}

cols = (
    pl.read_parquet("/home/sohail/Downolads/lgb_imp.parquet")
    .select(pl.col("index"),pl.mean_horizontal(pl.col(pl.NUMERIC_DTYPES)).alias("mean"))
    .filter(pl.col("mean") > 5000)
    ["index"].to_list()
)

total_df = (
    pl.read_parquet(
        "/home/sohail/Downloads/train_df.parquet",
        low_memory=True,
        columns=cols + ["week_num","target"]
    )
    .select(pl.all().shrink_dtype())
)

if total_df.height == 10000:
    class_params["n_estimators"] = 1500
    class_params["learning_rate"] = 0.1

week_num = total_df["week_num"]

total_df = total_df.drop("week_num")

gc.collect()

def filter_ind(df:pl.DataFrame,indexes:np.array):
    return (
        df
        .with_row_index()
        .filter(
            pl.col("index").is_in(indexes)
        )
        .drop("index")
    )


cv = StratifiedGroupKFold(n_splits=5,shuffle=True)

for i,(train_ind,valid_ind) in enumerate(
    cv.split(total_df,total_df.select("target"),
    groups=week_num)):
    print(f"Training start for LGBClassifier: {i+1}")
    lgb_model = lgb.LGBMClassifier(**class_params)
    lgb_model.fit(
        (
            total_df
            .drop("target")
            .pipe(filter_ind,train_ind)
        ),
        (
            total_df
            .select('target')
            .pipe(filter_ind,train_ind)
            .to_series()
            .to_numpy()
            .ravel()
        ),
        eval_set=[(
            (
                total_df
                .drop("target")
                .pipe(filter_ind,valid_ind)
            ),
            (
                total_df
                .select("target")
                .pipe(filter_ind,valid_ind)
                .to_series()
                .to_numpy()
                .ravel()
            )
        )],
        callbacks=[lgb.log_evaluation(500),lgb.early_stopping(800)]
    )
    joblib.dump(lgb_model,f"/home/sohail/Downloads/lgb_model_{i+1}.pkl")
    gc.collect()

Writing lgb_training.py


In [6]:
!python lgb_training.py

Training start for LGBClassifier: 1
Training until validation scores don't improve for 800 rounds
[500]	valid_0's auc: 0.646108
Early stopping, best iteration is:
[119]	valid_0's auc: 0.679254
Training start for LGBClassifier: 2
Training until validation scores don't improve for 800 rounds
[500]	valid_0's auc: 0.631871
Early stopping, best iteration is:
[73]	valid_0's auc: 0.671531
Training start for LGBClassifier: 3
Training until validation scores don't improve for 800 rounds
[500]	valid_0's auc: 0.65552
Early stopping, best iteration is:
[42]	valid_0's auc: 0.712347
Training start for LGBClassifier: 4
Training until validation scores don't improve for 800 rounds
[500]	valid_0's auc: 0.708935
Early stopping, best iteration is:
[58]	valid_0's auc: 0.749307
Training start for LGBClassifier: 5
Training until validation scores don't improve for 800 rounds
[500]	valid_0's auc: 0.66412
Early stopping, best iteration is:
[111]	valid_0's auc: 0.695804


In [7]:
%%writefile xgb_training.py


import numpy as np
import pandas as pd
import polars as pl
import xgboost as xgb
import gc,joblib
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold

xgb_params = {
    "objective": "binary:logistic",
    "n_estimators": 1500,
    "eval_metric": "auc",
    "seed": 420,
    "booster": "gbtree",
    "device": "cuda",
    "eta": 0.01,
    "gamma": 5,
    "max_depth": 128,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "lambda": 10,
    "alpha": 2,
    "max_leaves": 256,
    "enable_categorical": True
}

cols = (
    pl.read_parquet("/home/sohail/Downloads/xgb_imp.parquet")
    .select(pl.col("index"),pl.mean_horizontal(pl.col(pl.NUMERIC_DTYPES)).alias("mean"))
    .filter(pl.col("mean") > 0.00075)
    ["index"].to_list()
)

total_df = (
    pl.read_parquet(
        "/home/sohail/Downloads/train_df.parquet",
        low_memory=True,
        columns=cols + ["week_num","target"]
    )
    .select(pl.all().shrink_dtype())
)

if total_df.height == 10000:
    xgb_params["n_estimators"] = 1500
    xgb_params["eta"] = 0.1

week_num = total_df["week_num"]

total_df = total_df.drop("week_num")

gc.collect()

def filter_ind(df:pl.DataFrame,indexes:np.array):
    return (
        df
        .with_row_index()
        .filter(
            pl.col("index").is_in(indexes)
        )
        .drop("index")
    )

cv = StratifiedGroupKFold(n_splits=5,shuffle=True)

for i,(train_ind,valid_ind) in enumerate(
    cv.split(total_df,total_df.select("target"),
    groups=week_num)):
    print(f"Training start for XGBClassifier: {i+1}")
    early_stop = xgb.callback.EarlyStopping(rounds=800)
    log_eval = xgb.callback.EvaluationMonitor(period=500)
    xgb_model = xgb.XGBClassifier(**xgb_params)
    xgb_model.fit(
        (
            total_df
            .drop("target")
            .pipe(filter_ind,train_ind)
        ),
        (
            total_df
            .select("target")
            .pipe(filter_ind,train_ind)
        ),
        eval_set=[(
            (
                total_df
                .drop("target")
                .pipe(filter_ind,valid_ind)
            ),
            (
                total_df
                .select("target")
                .pipe(filter_ind,valid_ind)
            )
        )],
        callbacks=[early_stop,log_eval],
        verbose=False
    )
    joblib.dump(xgb_model,f"/home/sohail/Downloads/xgb_model_{i+1}.pkl")
    gc.collect()
    

Writing xgb_training.py


In [8]:
!python xgb_training.py

Training start for XGBClassifier: 1
[0]	validation_0-auc:0.51155
[500]	validation_0-auc:0.71277
[824]	validation_0-auc:0.71277
Training start for XGBClassifier: 2
[0]	validation_0-auc:0.55762
[500]	validation_0-auc:0.69426
[826]	validation_0-auc:0.69426
Training start for XGBClassifier: 3
[0]	validation_0-auc:0.57913
[500]	validation_0-auc:0.72077
[841]	validation_0-auc:0.72077
Training start for XGBClassifier: 4
[0]	validation_0-auc:0.56779
[500]	validation_0-auc:0.67582
[837]	validation_0-auc:0.67582
Training start for XGBClassifier: 5
[0]	validation_0-auc:0.57221
[500]	validation_0-auc:0.73395
[839]	validation_0-auc:0.73395


In [9]:
%%writefile cat_training.py


import numpy as np
import pandas as pd
import polars as pl
import catboost as cgb
import gc,joblib
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold

cat_params = {
    "eval_metric": "AUC",
    "task_type": "GPU",
    "iterations": 5000,
    "learning_rate": 0.01,
    "bootstrap_type": "Poisson",
    "random_seed": 420,
    "l2_leaf_reg": 15,
    "subsample": 0.8,
    "depth": 32,
    "max_leaves": 64,
    "grow_policy": "Lossguide",
    "od_type": "Iter",
    "od_wait": 1000,
    "verbose": 500
}

cols = (
    pl.read_parquet("/home/sohail/Downloads/cat_imp.parquet")
    .select(pl.col("index"),pl.mean_horizontal(pl.col(pl.NUMERIC_DTYPES)).alias("mean"))
    .filter(pl.col("mean") > 0.03)
    ["index"].to_list()
)

total_df = (
    pl.read_parquet(
        "/home/sohail/Downloads/train_df.parquet",
        low_memory=True,
        columns=cols + ["week_num","target"]
    )
    .select(pl.all().shrink_dtype())
)

if total_df.height == 10000:
    cat_params["iterations"] = 1500
    cat_params["learning_rate"] = 0.1

week_num = total_df["week_num"]

total_df = total_df.drop("week_num")

gc.collect()

def filter_ind(df:pl.DataFrame,indexes:np.array):
    return (
        df
        .with_row_index()
        .filter(
            pl.col("index").is_in(indexes)
        )
        .drop("index")
    )


cv = StratifiedGroupKFold(n_splits=5,shuffle=True)

for i,(train_ind,valid_ind) in enumerate(
    cv.split(total_df,total_df.select("target"),
    groups=week_num)):
    print(f"Training start for CatBoostClassifier: {i+1}")
    cat_model = cgb.CatBoostClassifier(**cat_params)
    cat_model.fit(
        (
            total_df
            .drop("target")
            .pipe(filter_ind,train_ind)
            .to_pandas()
        ),
        (
            total_df
            .select("target")
            .pipe(filter_ind,train_ind)
            .to_pandas()
        ),
        eval_set=[(
            (
                total_df
                .drop("target")
                .pipe(filter_ind,valid_ind)
                .to_pandas()
            ),
            (
                total_df
                .select("target")
                .pipe(filter_ind,valid_ind)
                .to_pandas()
            )
        )]
    )
    joblib.dump(cat_model,f"/home/sohail/Downloads/cat_model_{i+1}.pkl")
    gc.collect()

Writing cat_training.py


In [10]:
!python cat_training.py

Training start for CatBoostClassifier: 1
Default metric period is 5 because AUC is/are not implemented for GPU
0:	test: 0.5507124	best: 0.5507124 (0)	total: 33.1s	remaining: 13h 45m 58s
500:	test: 0.6924635	best: 0.7364079 (41)	total: 42.5s	remaining: 1m 24s
1000:	test: 0.6871579	best: 0.7364079 (41)	total: 52s	remaining: 25.9s
bestTest = 0.7364079356
bestIteration = 41
Shrink model to first 42 iterations.
Training start for CatBoostClassifier: 2
Default metric period is 5 because AUC is/are not implemented for GPU
0:	test: 0.5035299	best: 0.5035299 (0)	total: 21ms	remaining: 31.4s
500:	test: 0.6663445	best: 0.7215475 (33)	total: 9.79s	remaining: 19.5s
1000:	test: 0.6584135	best: 0.7215475 (33)	total: 19.3s	remaining: 9.61s
bestTest = 0.721547544
bestIteration = 33
Shrink model to first 34 iterations.
Training start for CatBoostClassifier: 3
Default metric period is 5 because AUC is/are not implemented for GPU
0:	test: 0.5038078	best: 0.5038078 (0)	total: 17.1ms	remaining: 25.6s
500:	t

In [None]:
%%writefile inference.py

import numpy as np
import pandas as pd
import polars as pl
import joblib,gc


test_df = (
    pl.read_parquet("/home/sohail/Downloads/test_df.parquet",low_memory=True)
    .select(pl.all().shrink_dtype())
)

lgb_cols = (
    pl.read_parquet("/home/sohail/Downloads/lgb_imp.parquet")
    .select(pl.col("index"),pl.mean_horizontal(pl.col(pl.NUMERIC_DTYPES)).alias("mean"))
    .filter(pl.col("mean") > 5000)
    ["index"].to_list()
)

xgb_cols = (
    pl.read_parquet("/home/sohail/Downloads/xgb_imp.parquet")
    .select(pl.col("index"),pl.mean_horizontal(pl.col(pl.NUMERIC_DTYPES)).alias("mean"))
    .filter(pl.col("mean") > 0.00075)
    ["index"].to_list()
)

cat_cols = (
    pl.read_parquet("/home/sohail/Downloads/cat_imp.parquet")
    .select(pl.col("index"),pl.mean_horizontal(pl.col(pl.NUMERIC_DTYPES)).alias("mean"))
    .filter(pl.col("mean") > 0.03)
    ["index"].to_list()
)

path_to_models = "home/sohail/Downloads"

test_case_ids = test_df["case_id"].to_list()
test_df.drop("target","case_id")

n_models = 5

y_pred = []

for n in range(n_models):
    model = joblib.load(f"{path_to_models}/lgb_model_{n+1}.pkl")
    y_pred.append(model.predict_proba(test_df.select(lgb_cols))[:,1])

y_pred = y_pred * 2

for n in range(n_models):
    model = joblib.load(f"{path_to_models}/lgb_model_{n+1}.pkl")
    y_pred.append(model.predict_proba(test_df.select(xgb_cols))[:,1])

y_pred = y_pred * 2


for n in range(n_models):
    model = joblib.load(f"{path_to_models}/lgb_model_{n+1}.pkl")
    y_pred.append(model.predict_proba(test_df.select(cat_cols).to_pandas())[:,1])

y_pred = y_pred * 4

y_pred = np.mean(y_pred,axis=0)

sub_df = pd.DataFrame({
    "case_id": test_case_ids,
    "score": y_pred
}).set_index("case_id")
sub_df.to_csv("./submission.csv")
sub_df
