In [27]:
import numpy as np
import pandas as pd
import polars as pl
import os, gc
from sklearn.preprocessing import LabelEncoder


pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

In [28]:
def calculate_woe_iv_categorical(feature, response):
    # Calculate the total number of events (positive responses) and non-events (negative responses)
    total_events = response.sum()
    total_non_events = response.count() - total_events
    
    # Create a new DataFrame with the feature and response values
    df = pd.DataFrame({'bin': feature.fillna('missing'), 'response': response})
    
    # Calculate the percentage of events and non-events for each bin of the feature
    bin_summary = df.groupby('bin')['response'].agg(['sum', 'count']).reset_index()
    bin_summary.columns = ['bin', 'events', 'total']
    bin_summary['non-events'] = (bin_summary['total'] - bin_summary['events']) 
    bin_summary['event_rate'] = (bin_summary['events'] / total_events)
    bin_summary['non-event_rate'] = (bin_summary['non-events'] / total_non_events) + 1e-10 # epsilon so that that the non event rate is not 0

    # Calculate the Weight of Evidence (WOE) and Information Value (IV) for each bin
    bin_summary['WOE'] = np.log1p(bin_summary['event_rate'] / bin_summary['non-event_rate'])
    bin_summary['IV'] = (bin_summary['event_rate'] - bin_summary['non-event_rate']) * bin_summary['WOE']

    # # Calculate the total Information Value (IV) for the feature
    total_IV = bin_summary['IV'].sum()
    
    return total_IV

def calculate_woe_iv_numeric(feature, response,quantiles = 50):
    # Calculate the total number of events (positive responses) and non-events (negative responses)
    total_events = response.sum()
    total_non_events = response.count() - total_events
    
    # Create a new DataFrame with the feature and response values
    df = pd.DataFrame({'feature': feature, 'response': response})
    
    # we want to support missing values
    df['bin'] = -1
    df.loc[df['feature'].notnull(),'bin'] = pd.qcut(df.loc[df['feature'].notnull(),'feature'], q=quantiles,duplicates='drop',labels=False)

    del df['feature']
    # Calculate the percentage of events and non-events for each bin of the feature
    bin_summary = df.groupby('bin')['response'].agg(['sum', 'count']).reset_index()
    bin_summary.columns = ['bin', 'events', 'total']
    bin_summary['non-events'] = (bin_summary['total'] - bin_summary['events']) 
    bin_summary['event_rate'] = (bin_summary['events'] / total_events)
    bin_summary['non-event_rate'] = (bin_summary['non-events'] / total_non_events) + 1e-10 # epsilon so that that the non event rate is not 0

    # Calculate the Weight of Evidence (WOE) and Information Value (IV) for each bin
    bin_summary['WOE'] = np.log1p(bin_summary['event_rate'] / bin_summary['non-event_rate'])
    bin_summary['IV'] = (bin_summary['event_rate'] - bin_summary['non-event_rate']) * bin_summary['WOE']

    # # Calculate the total Information Value (IV) for the feature
    total_IV = bin_summary['IV'].sum()
    
    return total_IV

def calculate_psi_categorical(old,new): 
    old = old.fillna("missing")
    new = new.fillna("missing")    
    
    bins = list(set(old.tolist()+new.tolist())) 
    bin_summary = pd.DataFrame(bins,columns=['bin'])
    bin_summary['prop_old'] = (bin_summary['bin'].apply(lambda x: (old==x).sum()) / len(old)) + 1e-10 # epsilon
    bin_summary['prop_new'] = (bin_summary['bin'].apply(lambda x: (new==x).sum()) / len(new)) + 1e-10 # epsilon

    
    return np.sum((bin_summary['prop_old'] - bin_summary['prop_new']) * np.log(bin_summary['prop_old']/bin_summary['prop_new']))

def calculate_psi_numeric(old,new,q=10): 

    old = pd.DataFrame(old,columns=['val'])
    new = pd.DataFrame(new,columns=['val'])
    
    # set up initial bins for missing values
    old['bin'] = -1
    new['bin'] = -1
    
    
    # return 0 in the event that theres only unique bin across both
    if (old['bin'].fillna(-9999).nunique() + new['bin'].fillna(-9999).nunique()) == 0:
        return 0
    else: 
        # assign each value to a quantile 
        old.loc[old.notnull(),'bin'] = pd.qcut(old.loc[old.notnull(),'val'], q=quantiles,duplicates='drop',labels=False)
        new.loc[new.notnull(),'bin'] = pd.qcut(new.loc[old.notnull(),'val'], q=quantiles,duplicates='drop',labels=False)
        
    
        bins = list(set(old['bin'].tolist()+new['bin'].tolist())) 
        bin_summary = pd.DataFrame(bins,columns=['bin'])
        bin_summary['prop_old'] = (bin_summary['bin'].apply(lambda x: (old['bin']==x).sum()) / len(old)) + 1e-10 # epsilon
        bin_summary['prop_new'] = (bin_summary['bin'].apply(lambda x: (new['bin']==x).sum()) / len(new)) + 1e-10 # epsilon
    
        return np.sum((bin_summary['prop_old'] - bin_summary['prop_new']) * np.log(bin_summary['prop_old']/bin_summary['prop_new']))

# Preprocessing

[Data Info](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data) <br>
[Discussion on how the data is setup](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/discussion/473950) <br>
[Starter Notebook](https://www.kaggle.com/code/jetakow/home-credit-2024-starter-notebook)
* depth=0 - These are static features directly tied to a specific case_id.
* depth=1 - Each case_id has an associated historical record, indexed by num_group1.
* depth=2 - Each case_id has an associated historical record, indexed by both num_group1 and num_group2.

In [29]:
class Aggregator:
    def __init__(self,numeric_cols,string_cols,date_cols,criteria):
        self.numeric_cols = numeric_cols
        self.string_cols  = string_cols
        self.date_cols    = date_cols
        self.criteria = criteria
        
    def num_expr(self,col):
        
        expr_max    = [pl.max(col).cast(pl.Float32).alias(f"{col}_MAX")]
        expr_min    = [pl.min(col).cast(pl.Float32).alias(f"{col}_MIN")]
        expr_last   = [pl.last(col).cast(pl.Float32).alias(f"{col}_LAST")]
        expr_mean   = [pl.mean(col).cast(pl.Float32).alias(f"{col}_MEAN")]
        expr_median = [pl.median(col).cast(pl.Float32).alias(f"{col}_MEDIAN")]
        expr_var    = [pl.var(col).cast(pl.Float32).alias(f"{col}_VAR")]

        return expr_max + expr_last + expr_mean + expr_median + expr_var + expr_min

    def date_expr(self,col):
        expr_max    = [pl.max(col).alias(f"{col}_MAX")]
        expr_last   = [pl.last(col).alias(f"{col}_LAST")]
        expr_mean   = [pl.mean(col).alias(f"{col}_MEAN")]

        return expr_max + expr_last + expr_mean 

    def str_expr(self,col):
        expr_max    = [pl.max(col).alias(f"{col}_MAX")]
        expr_last   = [pl.last(col).alias(f"{col}_LAST")]
        return expr_max + expr_last 

    def count_expr(self,col):
        expr_max    = [pl.max(col).alias(f"{col}_MAX_{self.criteria}")]

        return expr_max

    def get_exprs(self,df):
        expr = []
        new_date_cols = []
        new_str_cols = []
        for col in df.columns:
            if 'num_group' in col:
                expr.extend(self.count_expr(col))
            elif col in self.numeric_cols:
                expr.extend(self.num_expr(col))
            elif col in self.string_cols:
                new_str_cols.extend([f"{col}_MAX",f"{col}_LAST"])
                expr.extend(self.str_expr(col))
            elif col in self.date_cols:
                new_date_cols.extend([f"{col}_MAX",f"{col}_LAST",f"{col}_MEAN"])
                expr.extend(self.date_expr(col))
        
        return expr, new_date_cols, new_str_cols

In [42]:
class DatasetBuilder:
    """ This class is used to create the dataset """
    def __init__(self, 
                 n_samples   = None, 
                 parent_path = "/kaggle/input/home-credit-credit-risk-model-stability"):
        


        self.parent_path = parent_path
        self.n_samples = n_samples

        self.feat_info = pd.read_csv(f"{parent_path}/feature_definitions.csv")
        self.date_cols = []
        self.string_cols = []
        self.numeric_cols = []
        
        self.run()

    def explain_feat(self,feat_name:str):
        assert feat_name in self.feat_info['Variable'].unique(), "feature not found in feature info dataframe"
        return self.feat_info[self.feat_info['Variable']==feat_name]['Description'].values[0]

    def set_table_dtypes(self,df):
        for col in df.columns:
            
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int32))
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float32))
                if col not in self.numeric_cols:
                    self.numeric_cols.append(col)                
            elif (col[-1] in ("M",)) or (col in self.string_cols):
                df = df.with_columns(pl.col(col).cast(pl.String))
                if col not in self.string_cols:
                    self.string_cols.append(col)
            elif col[-1] in ("L","T"): # we dont know the transform needed, just going to assume its either float and if not, then string
                try:
                    df = df.with_columns(pl.col(col).cast(pl.Float32))
                    if col not in self.numeric_cols:
                        self.numeric_cols.append(col) 
                except:
                    df = df.with_columns(pl.col(col).cast(pl.String))
                    if col not in self.string_cols:
                        self.string_cols.append(col) 
                    continue
                
            elif col[-1] in ("D",) or (col in self.date_cols):
                df = df.with_columns(pl.col(col).cast(pl.Date))
                if col not in self.date_cols:
                    self.date_cols.append(col)
                
        return df

    def feature_engineer_dates(self,df,date_cols=None):
        if date_cols is None:
            date_cols = self.date_cols
        for col in date_cols:
            if col in df.columns:
                df = df.with_columns((pl.col("date_decision") - pl.col(col)).dt.total_days().alias(f'{col}_DAYS_SINCE'))  # days since
                df = df.drop(col)
        
        return df
  

    
    def create_base_dataset(self):
        
        # load in the training dataset 
        if self.n_samples:
            train = pl.read_parquet(f"{self.parent_path}/parquet_files/train/train_base.parquet") \
            .pipe(self.set_table_dtypes).sample(n=self.n_samples).with_columns(pl.lit('train').alias('partition'))
        else:
            train = pl.read_parquet(f"{self.parent_path}/parquet_files/train/train_base.parquet") \
            .pipe(self.set_table_dtypes).with_columns(pl.lit('train').alias('partition'))
        
        # load in the test dataset
        test =  pl.read_parquet(f"{self.parent_path}/parquet_files/test/test_base.parquet")\
                .pipe(self.set_table_dtypes).with_columns(pl.lit('test').alias('partition'))
        
        # concat train and test
        self.df = pl.concat([train,test],how='diagonal_relaxed')
        
        # get all case_ids
        self.case_ids = self.df.get_column('case_id').to_list()
        
        # store base cols
        self.base_df_cols = self.df.columns
        
        # features
        self.df = self.df.with_columns(self.df["date_decision"].dt.month().alias("dd_month"))
        self.df = self.df.with_columns(self.df["date_decision"].dt.year().alias("dd_year"))

    def read_in_files_with_criteria(self, criteria:str):
        train_df  = pl.concat([pl.read_parquet(f"{self.parent_path}/parquet_files/train/{x}").pipe(self.set_table_dtypes).filter(pl.col('case_id').is_in(self.case_ids))
                       for x in os.listdir(f"{self.parent_path}/parquet_files/train") if (criteria in x)],how='diagonal_relaxed')
        test_df  =  pl.concat([pl.read_parquet(f"{self.parent_path}/parquet_files/test/{x}").pipe(self.set_table_dtypes)
                       for x in os.listdir(f"{self.parent_path}/parquet_files/test") if (criteria in x)],how='diagonal_relaxed')
        
        # being in train partition doesnt gaurentee it is in the test partition, so we have to ensure it 
        columns_in_common = list(set(train_df.columns).intersection(test_df.columns))
        
        df = pl.concat([train_df.select(columns_in_common),
                         test_df.select(columns_in_common)],how='diagonal_relaxed')
        
        
        return df
       
        

    def evaluate_features(self,df):
        """
        1) calculates weight of evidence * information value for measuring predictive power
        2) iterates through months to get psi (currently trying to figure out why so many nan out)
        
        """
        feats = [x for x in df.columns if x not in self.base_df_cols]
        
        # predictive power - woe*iv
        woeivs  = []
        for col in feats:
            if col in self.string_cols:
                woeiv = calculate_woe_iv_categorical(df[col].to_pandas(), df['target'].to_pandas())
                woeivs.append(woeiv)
            else:
                woeiv = calculate_woe_iv_numeric(df[col].to_pandas(), df['target'].to_pandas())
                woeivs.append(woeiv)

        
#         # stability - psi and woe*iv
#         psi_res = {x:[] for x in feats}
#         woe_res = {x:[] for x in feats}
#         for i in range(len(year_months)-1):
#             psis = []
#             old = df.filter((pl.col("ym_decision") == year_months[i]))
#             new = df.filter((pl.col("ym_decision") == year_months[i+1]))
#             for col in feats:
#                 if col in self.string_cols:
#                     psi = calculate_psi_categorical(old[col].to_pandas(),new[col].to_pandas())
#                 else:
#                     psi = calculate_psi_numeric(old[col].to_pandas(),new[col].to_pandas())
                    
#                 psi_res[col].append(psi)
                    


            
        feature_scores = pd.DataFrame(feats,columns=['feature'])
        feature_scores['prop_null'] = feature_scores['feature'].apply(lambda feat: df[feat].to_pandas().isna().sum()) / len(self.df)
        feature_scores['woe_iv'] = woeivs
        
        # lots of these ended up as nulls, will deal with later
#         feature_scores['eligible_psi'] = feature_scores['feature'].apply(lambda feat: sum([0 if np.isnan(x) else 1 for x in psi_res[feat]]))
#         feature_scores['avg_psi'] = feature_scores['feature'].apply(lambda feat: np.nanmean(psi_res[feat]))
#         feature_scores['std_psi'] = feature_scores['feature'].apply(lambda feat: np.nanstd(psi_res[feat]))
#         feature_scores['max_psi'] = feature_scores['feature'].apply(lambda feat: np.max(psi_res[feat]))

        return feature_scores
        
    
    def select_features(self,df,score="woe_iv",top_k=150):
        feature_scores = self.evaluate_features(df)
        top_k = min(top_k,len(feature_scores)-1)
        chosen_features = feature_scores.sort_values(score,ascending=False).reset_index(drop=True).loc[:top_k,'feature'].to_list()
        print(f"selected {top_k}/{len(feature_scores)} features for the model dataset")
        return chosen_features

    
    def to_pandas(self,df_data):
        df_data = df_data.to_pandas()
        cat_cols = [x for x in df_data.columns if (x in self.string_cols) and (x not in self.base_df_cols)]
        df_data[cat_cols] = df_data[cat_cols].astype("category")
        
        return df_data, cat_cols

    def reduce_mem_usage(self,df):
        """ iterate through all the columns of a dataframe and modify the data type
            to reduce memory usage.        
        """
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

        for col in df.columns[7:]:
            col_type = df[col].dtype
            if str(col_type)=="category":
                continue

            if col_type not in self.string_cols:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)  
#                 else:
#                     if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
#                         df[col] = df[col].astype(np.float16)
#                     elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
#                         df[col] = df[col].astype(np.float32)
#                     else:
#                         df[col] = df[col].astype(np.float32)
#                 df[col] = df[col].replace([np.inf, -np.inf], np.nan)

            else:
                continue
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

        return df
     
    def process_depth0(self):
        """
        These files can be used as is except for the dates, so just collect them, do feature engineering on the dates, then 
        throw out the date columns, grab top k features, join back to base
        """
        depth0_criterias = ["static_0","static_cb_0"]
        df = self.df[['case_id','target','date_decision']]
        for criteria in depth0_criterias:
            df = df.join(self.read_in_files_with_criteria(criteria), on=['case_id'], how='left')
            
        df = self.feature_engineer_dates(df)
        depth0_feats = self.select_features(df,score="woe_iv",top_k=75)
        self.df = self.df.join(df[['case_id']+depth0_feats], on='case_id', how='left')   
    

    def process_depth1(self):
        """
        These files have one group; collect them, auto aggregate, do feature engineering on the dates,
        throw out the date columns, grab top k features, join back to base
        """
        depth1_criterias = ["applprev_1","other_1",
                            "tax_registry_a_1","tax_registry_b_1","tax_registry_c_1",
                            "credit_bureau_a_1","credit_bureau_b_1",
                            "deposit_1","person_1","debitcard_1"]
        df = self.df[['case_id','target','date_decision']]
        agg_dt_cols_coll = []
        for criteria in depth1_criterias:
            criteria_df = self.read_in_files_with_criteria(criteria)
            aggr = Aggregator(self.numeric_cols,self.string_cols,self.date_cols,f"{criteria.upper()}_DEPTH1")
            agg_expr, agg_dt_cols,agg_str_cols = aggr.get_exprs(criteria_df)
            agg_dt_cols_coll.extend(agg_dt_cols)
            self.string_cols.extend(agg_str_cols)
            criteria_df = criteria_df.group_by("case_id").agg(agg_expr)
            df = df.join(criteria_df, on=['case_id'], how='inner')
        
        df = self.feature_engineer_dates(df,date_cols=agg_dt_cols_coll)
        depth1_feats = self.select_features(df,score="woe_iv",top_k=75)
        self.df = self.df.join(df[['case_id']+depth1_feats], on='case_id', how='left') 
    
    def run(self):
        self.create_base_dataset()
        self.process_depth0()
        self.process_depth1()
        
    def get_datasets(self):
        ds,cat_cols = self.to_pandas(self.df)
        ds = self.reduce_mem_usage(ds)
        return {"train":ds[ds['partition']=='train'].reset_index(drop=True), 
                "test": ds[ds['partition']=='test'].reset_index(drop=True), 
                "features": [x for x in ds.columns if x not in self.base_df_cols],
                "cat_features": cat_cols}

In [43]:
ds = DatasetBuilder().get_datasets()

selected 75/219 features for the model dataset
selected 75/975 features for the model dataset
Memory usage of dataframe is 1151.66 MB
Memory usage after optimization is: 1148.75 MB
Decreased by 0.3%


In [44]:
print(ds['train'].shape)
ds['train']['target'].value_counts(normalize=True)

(1526659, 160)


target
0.0    0.968563
1.0    0.031437
Name: proportion, dtype: float64

In [45]:
# del DSBuilder
# gc.collect()

# Training XGBoost

In [46]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split,StratifiedGroupKFold
import xgboost as xgb
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK
from hyperopt.pyll import scope
from functools import partial

In [47]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

In [None]:
def get_base_params():
    base_params = {
        'max_cat_to_onehot': 4,
        'max_delta_step':0.7,
        'random_state': 117,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',

        # turn on when gpu 
        'device': 'cuda',
        'sampling_method':'gradient_based',
    }
    return base_params

In [48]:
search_space = {
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.2, 0.6), # col_sample reduces correlation b/c cols to a limit and reduces computation
    'colsample_bynode': hp.uniform('colsample_bynode', 0.2, 0.6),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 0.6),
    'gamma': hp.loguniform('gamma',np.log(0.00001), np.log(100)),
    'max_depth': scope.int(hp.uniform('max_depth', 2, 15)),
    'min_child_weight': hp.loguniform('min_child_weight', np.log(0.00001), np.log(100)),
#     'reg_alpha': hp.loguniform('reg_alpha', np.log(.00001), np.log(100)),
#     'reg_lambda':hp.loguniform('reg_lambda',np.log(.00001), np.log(100)),
    'scale_pos_weight': hp.uniform('scale_pos_weight',1, 20),
    'subsample': hp.uniform('subsample', 0.2, 0.5),
    'learning_rate' : hp.loguniform('learning_rate', np.log(0.00001), np.log(.5)),
    'n_estimators':scope.int(hp.uniform('n_estimators', 100, 1000)),
    
    
   
    
    # turn off when gpu
#     'n_jobs': 10,
#     'tree_method':'hist',

}

In [49]:
# do splits ahead of time to improve trial speed
k = 4
skf = StratifiedGroupKFold(n_splits=k)
idx = np.arange(len(ds['train']))
ds_train_dmatrix_splits = []
for train_idx, valid_idx in skf.split(idx,ds['train']['target'],groups = ds['train']['WEEK_NUM']): 
    dtrain = xgb.DMatrix(ds['train'].loc[train_idx,ds['features']], label=ds['train'].loc[train_idx,'target'],enable_categorical=True)
    dvalid = xgb.DMatrix(ds['train'].loc[valid_idx,ds['features']], label=ds['train'].loc[valid_idx,'target'],enable_categorical=True)
    ds_train_dmatrix_splits.append((dtrain,dvalid))

In [54]:
def trial_fn(params,
             splits = []):


    n_estimators = params.pop('n_estimators')
    scores = [] 
    for dtrain, dvalid in splits: 
        mod = xgb.train(params,dtrain, n_estimators)
        score = roc_auc_score(dvalid.get_label(),mod.predict(dvalid))
        scores.append(score)
    
    score = np.mean(scores) 

    return {"status": STATUS_OK, "loss": -score} # always minimizes

In [55]:
best_params = fmin(fn=partial(trial_fn, splits = ds_train_dmatrix_splits),
                    space=search_space,
                    algo=tpe.suggest,
                    max_evals=100,
                    timeout=60*60 # seconds
                  )
int_params = ['max_depth','n_estimators','max_cat_to_onehot']
bestp = get_base_params()
for k,v in best_params.items():
    if k in int params:
        bestp[k] = int(v)
    else:
        bestp[k] = v
bestp

100%|██████████| 10/10 [36:03<00:00, 216.32s/trial, best loss: -0.8005964928837767]


{'colsample_bylevel': 0,
 'colsample_bynode': 0,
 'colsample_bytree': 0,
 'gamma': 0,
 'learning_rate': 0,
 'max_depth': 7,
 'min_child_weight': 10,
 'n_estimators': 436,
 'scale_pos_weight': 1,
 'subsample': 0}

In [56]:
n_estimators = best_params.pop('n_estimators')
dtrain = xgb.DMatrix(ds['train'][ds['features']], label=ds['train']['target'],enable_categorical=True)
mod = xgb.train(best_params,dtrain, n_estimators)

# Submission


In [57]:
del ds['train']
gc.collect()

2546

In [58]:
dtest = xgb.DMatrix(ds['test'][ds['features']], enable_categorical=True)
ds['test']['score'] = mod.predict(dtest)

In [59]:
submission = ds['test'][['case_id','score']]
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,case_id,score
0,57543,0.031437
1,57549,0.031437
2,57551,0.031437
3,57552,0.031437
4,57569,0.031437
