In [1]:
import numpy as np
import pandas as pd
import polars as pl
import os, gc
from sklearn.preprocessing import LabelEncoder


pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

In [2]:
def calculate_woe_iv_categorical(feature, response):
    # Calculate the total number of events (positive responses) and non-events (negative responses)
    total_events = response.sum()
    total_non_events = response.count() - total_events
    
    # Create a new DataFrame with the feature and response values
    df = pd.DataFrame({'bin': feature.fillna('missing'), 'response': response})
    
    # Calculate the percentage of events and non-events for each bin of the feature
    bin_summary = df.groupby('bin')['response'].agg(['sum', 'count']).reset_index()
    bin_summary.columns = ['bin', 'events', 'total']
    bin_summary['non-events'] = (bin_summary['total'] - bin_summary['events']) 
    bin_summary['event_rate'] = (bin_summary['events'] / total_events)
    bin_summary['non-event_rate'] = (bin_summary['non-events'] / total_non_events) + 1e-10 # epsilon so that that the non event rate is not 0

    # Calculate the Weight of Evidence (WOE) and Information Value (IV) for each bin
    bin_summary['WOE'] = np.log1p(bin_summary['event_rate'] / bin_summary['non-event_rate'])
    bin_summary['IV'] = (bin_summary['event_rate'] - bin_summary['non-event_rate']) * bin_summary['WOE']

    # # Calculate the total Information Value (IV) for the feature
    total_IV = bin_summary['IV'].sum()
    
    return total_IV

def calculate_woe_iv_numeric(feature, response,quantiles = 25):
    # Calculate the total number of events (positive responses) and non-events (negative responses)
    total_events = response.sum()
    total_non_events = response.count() - total_events
    
    # Create a new DataFrame with the feature and response values
    df = pd.DataFrame({'feature': feature, 'response': response})
    
    # we want to support missing values
    df['bin'] = -1
    df.loc[df['feature'].notnull(),'bin'] = pd.qcut(df.loc[df['feature'].notnull(),'feature'], q=quantiles,duplicates='drop',labels=False)

    del df['feature']
    # Calculate the percentage of events and non-events for each bin of the feature
    bin_summary = df.groupby('bin')['response'].agg(['sum', 'count']).reset_index()
    bin_summary.columns = ['bin', 'events', 'total']
    bin_summary['non-events'] = (bin_summary['total'] - bin_summary['events']) 
    bin_summary['event_rate'] = (bin_summary['events'] / total_events)
    bin_summary['non-event_rate'] = (bin_summary['non-events'] / total_non_events) + 1e-10 # epsilon so that that the non event rate is not 0

    # Calculate the Weight of Evidence (WOE) and Information Value (IV) for each bin
    bin_summary['WOE'] = np.log1p(bin_summary['event_rate'] / bin_summary['non-event_rate'])
    bin_summary['IV'] = (bin_summary['event_rate'] - bin_summary['non-event_rate']) * bin_summary['WOE']

    # # Calculate the total Information Value (IV) for the feature
    total_IV = bin_summary['IV'].sum()
    
    return total_IV

def calculate_psi_categorical(old,new): 
    old = old.to_frame().fillna('missing')
    old.columns = ['bin']
    new = new.to_frame().fillna('missing')
    new.columns = ['bin']    
    
    old = old.groupby('bin').agg(count_old=('bin','count'))
    new = new.groupby('bin').agg(count_new=('bin','count'))
    
    bins = list(set(old['bin'].tolist()+new['bin'].tolist())) 
    bin_summary = pd.DataFrame(bins,columns=['bin'])
    bin_summary = bin_summary.merge(old[['bin','count_old']],on='bin',how='left')
    bin_summary = bin_summary.merge(new[['bin','count_new']],on='bin',how='left')
    bin_summary['prop_old'] = (bin_summary['count_old'].fillna(0) / len(old)) + 1e-10 # epsilon
    bin_summary['prop_new'] = (bin_summary['count_new'].fillna(0) / len(new)) + 1e-10 # epsilon

    return np.sum((bin_summary['prop_old'] - bin_summary['prop_new']) * np.log(bin_summary['prop_old']/bin_summary['prop_new']))

def calculate_psi_numeric(old,new,q=10): 

    old = old.to_frame()
    old.columns = ['val']
    new = new.to_frame()
    new.columns = ['val']
    
    # set up initial bins for missing values
    old['bin'] = -1
    new['bin'] = -1
    
    
    # return 0 in the event that theres less than 3 unique bin across both
    if (old['val'].fillna(-9999).nunique() + new['val'].fillna(-9999).nunique()) <= 2:
        return np.nan
    else: 
        # assign each value to a quantile 
        old.loc[old['val'].notnull(),'bin'] = pd.qcut(old.loc[old['val'].notnull(),'val'], q=q,duplicates='drop',labels=False)
        new.loc[new['val'].notnull(),'bin'] = pd.qcut(new.loc[new['val'].notnull(),'val'], q=q,duplicates='drop',labels=False)
        
        old = old.groupby('bin').agg(count_old=('bin','count'))
        new = new.groupby('bin').agg(count_new=('bin','count'))
        
        
        bins = list(set(old['bin'].tolist()+new['bin'].tolist())) 
        bin_summary = pd.DataFrame(bins,columns=['bin'])
        bin_summary = bin_summary.merge(old[['bin','count_old']],on='bin',how='left')
        bin_summary = bin_summary.merge(new[['bin','count_new']],on='bin',how='left')
        bin_summary['prop_old'] = (bin_summary['count_old'].fillna(0) / len(old)) + 1e-10 # epsilon
        bin_summary['prop_new'] = (bin_summary['count_new'].fillna(0) / len(new)) + 1e-10 # epsilon
    
        return np.sum((bin_summary['prop_old'] - bin_summary['prop_new']) * np.log(bin_summary['prop_old']/bin_summary['prop_new']))

# Preprocessing

[Data Info](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data) <br>
[Discussion on how the data is setup](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/discussion/473950) <br>
[Starter Notebook](https://www.kaggle.com/code/jetakow/home-credit-2024-starter-notebook)
* depth=0 - These are static features directly tied to a specific case_id.
* depth=1 - Each case_id has an associated historical record, indexed by num_group1.
* depth=2 - Each case_id has an associated historical record, indexed by both num_group1 and num_group2.

In [3]:
class Aggregator:
    def __init__(self,numeric_cols,string_cols,date_cols,criteria):
        self.numeric_cols = numeric_cols
        self.string_cols  = string_cols
        self.date_cols    = date_cols
        self.criteria = criteria
        
    def num_expr(self,col):
        
        expr_max    = [pl.max(col).alias(f"{col}_MAX_{self.criteria}")]
        expr_min    = [pl.min(col).alias(f"{col}_MIN_{self.criteria}")]
        expr_last   = [pl.last(col).alias(f"{col}_LAST_{self.criteria}")]
        expr_mean   = [pl.mean(col).alias(f"{col}_MEAN_{self.criteria}")]
        expr_median = [pl.median(col).alias(f"{col}_MEDIAN_{self.criteria}")]
        expr_var    = [pl.var(col).alias(f"{col}_VAR_{self.criteria}")]

        return expr_max + expr_last + expr_mean + expr_median + expr_var + expr_min

    def date_expr(self,col):
        expr_max    = [pl.max(col).alias(f"{col}_MAX_{self.criteria}")]
        expr_last   = [pl.last(col).alias(f"{col}_LAST_{self.criteria}")]
        expr_mean   = [pl.mean(col).alias(f"{col}_MEAN_{self.criteria}")]

        return expr_max + expr_last + expr_mean 

    def str_expr(self,col):
        expr_max    = [pl.max(col).alias(f"{col}_MAX_{self.criteria}")]
        expr_last   = [pl.last(col).alias(f"{col}_LAST_{self.criteria}")]
        return expr_max + expr_last 

    def count_expr(self,col):
        expr_max    = [pl.max(col).alias(f"{col}_MAX_{self.criteria}")]

        return expr_max

    def get_exprs(self,df):
        expr = []
        new_date_cols = []
        new_str_cols = []
        for col in df.columns:
            if 'num_group' in col:
                expr.extend(self.count_expr(col))
            elif col in self.numeric_cols:
                expr.extend(self.num_expr(col))
            elif col in self.string_cols:
                new_str_cols.extend([f"{col}_MAX_{self.criteria}",f"{col}_LAST_{self.criteria}"])
                expr.extend(self.str_expr(col))
            elif col in self.date_cols:
                new_date_cols.extend([f"{col}_MAX_{self.criteria}",f"{col}_LAST_{self.criteria}",f"{col}_MEAN_{self.criteria}"])
                expr.extend(self.date_expr(col))
        
        return expr, new_date_cols, new_str_cols

In [4]:
def filter_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Filters columns in the DataFrame based on null percentage and unique values for string columns.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with filtered columns.
    """
    for col in df.columns:
        if col not in ["case_id", "year", "month", "week_num", "target"]:
            null_pct = df[col].is_null().mean()

            if null_pct > 0.97:
                df = df.drop(col)
                print(f"dropped column {col} because too many nulls")
    for col in df.columns:
        if (col not in ["case_id", "year", "month", "week_num", "target"]) & (
            df[col].dtype == pl.String
        ):
            freq = df[col].n_unique()

            if (freq > 200) | (freq == 1):
                df = df.drop(col)
                print(f"dropped column {col} because of category size")

    return df

In [5]:
def reduce_polars_memory_usage(df: pl.DataFrame) -> pl.DataFrame:
    """
    Reduces memory usage of a DataFrame by converting column types.

    Args:
    - df (pl.DataFrame): DataFrame to optimize.
    - name (str): Name of the DataFrame.

    Returns:
    - pl.DataFrame: Optimized DataFrame.
    """
    og_mem = round(df.estimated_size('mb'), 4)

    int_types = [
        pl.Int8,
        pl.Int16,
        pl.Int32,
        pl.Int64,
        pl.UInt8,
        pl.UInt16,
        pl.UInt32,
        pl.UInt64,
    ]
    float_types = [pl.Float32, pl.Float64]

    for col in df.columns:
        if col == 'case_id':
            continue
        col_type = df[col].dtype
        if col_type in int_types + float_types:
            c_min = df[col].min()
            c_max = df[col].max()

            if c_min is not None and c_max is not None:
                if col_type in int_types:
                    if c_min >= 0:
                        if (
                            c_min >= np.iinfo(np.uint8).min
                            and c_max <= np.iinfo(np.uint8).max
                        ):
                            df = df.with_columns(df[col].cast(pl.UInt8))
                        elif (
                            c_min >= np.iinfo(np.uint16).min
                            and c_max <= np.iinfo(np.uint16).max
                        ):
                            df = df.with_columns(df[col].cast(pl.UInt16))
                        elif (
                            c_min >= np.iinfo(np.uint32).min
                            and c_max <= np.iinfo(np.uint32).max
                        ):
                            df = df.with_columns(df[col].cast(pl.UInt32))
                        elif (
                            c_min >= np.iinfo(np.uint64).min
                            and c_max <= np.iinfo(np.uint64).max
                        ):
                            df = df.with_columns(df[col].cast(pl.UInt64))
                    else:
                        if (
                            c_min >= np.iinfo(np.int8).min
                            and c_max <= np.iinfo(np.int8).max
                        ):
                            df = df.with_columns(df[col].cast(pl.Int8))
                        elif (
                            c_min >= np.iinfo(np.int16).min
                            and c_max <= np.iinfo(np.int16).max
                        ):
                            df = df.with_columns(df[col].cast(pl.Int16))
                        elif (
                            c_min >= np.iinfo(np.int32).min
                            and c_max <= np.iinfo(np.int32).max
                        ):
                            df = df.with_columns(df[col].cast(pl.Int32))
                        elif (
                            c_min >= np.iinfo(np.int64).min
                            and c_max <= np.iinfo(np.int64).max
                        ):
                            df = df.with_columns(df[col].cast(pl.Int64))
                elif col_type in float_types:
                    if (
                        c_min > np.finfo(np.float32).min
                        and c_max < np.finfo(np.float32).max
                    ):
                        df = df.with_columns(df[col].cast(pl.Float32))

    print(
        f"Memory of polars dataframe went from {og_mem}MB to {round(df.estimated_size('mb'), 4)}MB."
    )

    return df

In [6]:
def extract_lowercase(s):
    # Initialize an empty result string
    result = ""

    # Loop through each character in the string
    for char in s:
        # Check if the character is lowercase
        if char.islower() or char == '_' or char.isnumeric():
            result += char
        # Break the loop if a non-lowercase character is encountered (if desired)
        elif result:
            break

    return result

In [7]:
class DatasetBuilder:
    """ This class is used to create the dataset """
    def __init__(self, 
                 n_samples   = None, 
                 parent_path = "/kaggle/input/home-credit-credit-risk-model-stability",
                ):
        


        self.parent_path = parent_path
        self.n_samples = n_samples

        self.feat_info = pd.read_csv(f"{parent_path}/feature_definitions.csv")
        self.date_cols = []
        self.string_cols = []
        
        self.run()

    def explain_feat(self,feat_name:str):
        assert feat_name in self.feat_info['Variable'].unique(), "feature not found in feature info dataframe"
        return self.feat_info[self.feat_info['Variable']==feat_name]['Description'].values[0]

    def set_table_dtypes(self,df):
        for col in df.columns:
                    
            if col in ["case_id"]:
                df = df.with_columns(pl.col(col).cast(pl.Int32))
            elif col in  ["WEEK_NUM", "num_group1", "num_group2", "target"]:
                df = df.with_columns(pl.col(col).cast(pl.UInt16))            
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))                
            elif (col[-1] in ("M",)) or (col in self.string_cols):
                df = df.with_columns(pl.col(col).cast(pl.String))
                if col not in self.string_cols:
                    self.string_cols.append(col)
            elif col[-1] in ("L","T"): # we dont know the transform needed, just going to assume its either float and if not, then string
                try:
                    df = df.with_columns(pl.col(col).cast(pl.Float64))
                except:
                    df = df.with_columns(pl.col(col).cast(pl.String))
                    if col not in self.string_cols:
                        self.string_cols.append(col) 
                    continue
                
            elif col[-1] in ("D",) or (col in self.date_cols):
                df = df.with_columns(pl.col(col).cast(pl.Date))
                if col not in self.date_cols:
                    self.date_cols.append(col)
        return df

    def feature_engineer_dates(self,df,date_cols=None):
        if date_cols is None:
            date_cols = self.date_cols
        if 'date_decision' not in df.columns:
            df = df.join(self.df[['case_id','date_decision']],on='case_id')
        for col in date_cols:
            if col in df.columns:
                df = df.with_columns((pl.col("date_decision") - pl.col(col)).dt.total_days().alias(f'{col}_DAYS_SINCE'))
                df = df.drop(col)

        if 'date_decision' in df.columns:
            df = df.drop('date_decision')
            
        return df
  
    
    def create_base_dataset(self):
        
        # load in the training dataset 
        if self.n_samples is not None:
            train = pl.read_parquet(f"{self.parent_path}/parquet_files/train/train_base.parquet") \
            .sample(n=self.n_samples).with_columns(pl.lit('train').alias('partition'))
        else:
            train = pl.read_parquet(f"{self.parent_path}/parquet_files/train/train_base.parquet") \
            .with_columns(pl.lit('train').alias('partition'))
        
        # load in the test dataset
        test =  pl.read_parquet(f"{self.parent_path}/parquet_files/test/test_base.parquet")\
                .with_columns(pl.lit(0).alias('target'))\
                .with_columns(pl.lit('test').alias('partition'))        
        
        
        
        
        
        # concat train and test
        self.df = reduce_polars_memory_usage(pl.concat([train,test],how='vertical_relaxed').pipe(self.set_table_dtypes))
        
        # get all case_ids
        self.train_case_ids = train.get_column('case_id').to_list()
        self.test_case_ids  = test.get_column('case_id').to_list()
        
        # store base cols
        self.base_df_cols = self.df.columns
        
        del train
        del test
        gc.collect()

    def read_in_files_with_criteria(self, criteria:str):
        print(f"processing criteria {criteria}...")
        train_df  = pl.concat([pl.scan_parquet(f"{self.parent_path}/parquet_files/train/{x}", low_memory=True, rechunk=True)
                       for x in os.listdir(f"{self.parent_path}/parquet_files/train") if (criteria in x)],how='vertical_relaxed')
        test_df  =  pl.concat([pl.scan_parquet(f"{self.parent_path}/parquet_files/test/{x}", low_memory=True, rechunk=True)
                       for x in os.listdir(f"{self.parent_path}/parquet_files/test") if (criteria in x)],how='vertical_relaxed')
        

        # being in train partition doesnt gaurentee it is in the test partition, so we have to ensure it 
        columns_in_common = list(set(train_df.columns).intersection(set(test_df.columns)))

        df = pl.concat([train_df.select(columns_in_common),
                         test_df.select(columns_in_common)],how='vertical_relaxed') 
            
        del train_df
        del test_df 
        gc.collect()
        
        df = df.collect().pipe(self.set_table_dtypes).filter(pl.col('case_id').is_in(self.train_case_ids+self.test_case_ids))

        return df
        
    def optimize_polars_df(self,df):
        return reduce_polars_memory_usage(filter_cols(df))
       
        

    def evaluate_features(self,df:pl.DataFrame,
                          stability_scoring=False):
        """
        1) calculates weight of evidence * information value for measuring predictive power
        
        """
        feats = [x for x in df.columns if x not in self.base_df_cols]

        df = df.filter(pl.col("case_id").is_in(self.train_case_ids))
        n_row = len(self.df)
        if 'target' not in df.columns:
            df = df.join(self.df[['case_id','target']],on='case_id')
        
        # predictive power - woe*iv
        woeivs  = []
        for col in feats:
            if df[col].dtype == pl.String:
                woeiv = calculate_woe_iv_categorical(df[col].to_pandas(), df['target'].to_pandas())
                woeivs.append(woeiv)
            else:
                woeiv = calculate_woe_iv_numeric(df[col].to_pandas(), df['target'].to_pandas())
                woeivs.append(woeiv)
        

        feature_scores = pd.DataFrame(feats,columns=['feature'])
        feature_scores['categorical'] = feature_scores['feature'].isin(self.string_cols)
        feature_scores['prop_null'] = feature_scores['feature'].apply(lambda feat: df[feat].to_pandas().isna().sum()) / n_row
        feature_scores['woe_iv'] = woeivs       
    
        return feature_scores
        
    
    def select_features(self,df,score="woe_iv",threshold=0.02,dedup_agg=False):
        feature_scores = self.evaluate_features(df)
        start_n = len(feature_scores)
        
        if dedup_agg:
            feature_scores['feature_base_col'] = feature_scores['feature'].apply(extract_lowercase)
            feature_scores = feature_scores.sort_values(['feature_base_col',score],ascending=[True,False]).drop_duplicates(subset=['feature_base_col'])
        
        chosen_features = feature_scores[feature_scores[score]>=threshold]['feature'].unique().tolist()
        print(f"selected {len(chosen_features)}/{start_n} features for the model dataset")
        del feature_scores
        return chosen_features

    
    def to_pandas(self,df_data):
        df_data = df_data.to_pandas()
        cat_cols = [x for x in df_data.columns if (x in self.string_cols) and (x not in self.base_df_cols)]
        df_data[cat_cols] = df_data[cat_cols].astype("category")
        df_data = self.reduce_pandas_mem_usage(df_data)
        return df_data, cat_cols

    def reduce_pandas_mem_usage(self,df):
        """ iterate through all the columns of a dataframe and modify the data type
            to reduce memory usage.        
        """
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of pandas dataframe is {:.2f} MB'.format(start_mem))

        for col in [x for x in df.columns if x not in self.base_df_cols]:
            col_type = df[col].dtype
            if str(col_type)=="category":
                continue
                
            else:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)


        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

        return df
     
    def process_depth0(self):
        """
        These files can be used as is except for the dates, so just collect them, do feature engineering on the dates, then 
        throw out the date columns, grab top k features, join back to base
        """
        depth0_criterias = ["static_0","static_cb_0"]

        for criteria in depth0_criterias:
            df = self.read_in_files_with_criteria(criteria)
            df = self.optimize_polars_df(df)
            df = self.feature_engineer_dates(df)
            depth0_feats = self.select_features(df,score="woe_iv",threshold=0.03)
            self.df = self.df.join(df[['case_id']+depth0_feats], on='case_id', how='left')   
        
        del df
        gc.collect()

    def process_depth1(self):
        """
        These files have one group; collect them, auto aggregate, do feature engineering on the dates,
        throw out the date columns, grab top k features, join back to base
        """
        depth1_criterias = ["applprev_1","other_1",
                            "tax_registry_a_1","tax_registry_b_1","tax_registry_c_1",
                            "credit_bureau_a_1","credit_bureau_b_1",
                            "deposit_1","person_1","debitcard_1"]
        
        # all groups
        for criteria in depth1_criterias:
            df = self.df[['case_id','target','date_decision']]
            
            criteria_df = self.read_in_files_with_criteria(criteria)
            criteria_df = self.optimize_polars_df(criteria_df)
            aggr = Aggregator([x for x in criteria_df.columns if x not in self.string_cols+self.date_cols+self.base_df_cols],
                              self.string_cols,self.date_cols,
                              f"{criteria.upper()}_DEPTH1_ALL")
            agg_expr, agg_dt_cols, agg_str_cols = aggr.get_exprs(criteria_df)
 
            criteria_df = criteria_df.group_by("case_id").agg(agg_expr)
            df = df.join(criteria_df, on=['case_id'], how='left')
            df = self.feature_engineer_dates(df,date_cols=agg_dt_cols)    

            feats = self.select_features(df,score="woe_iv",dedup_agg=True,threshold=0.03)
            
            if len(feats)>0:
                self.string_cols.extend([x for x in agg_str_cols if x in feats])
                self.df = self.df.join(df[['case_id']+feats], on='case_id', how='left') 
 
            del criteria_df
            del df
            gc.collect()

    def process_depth2(self):
        """
        For now, just approach it like depth 2
        """
        depth2_criterias = ["applprev_2","person_2","credit_bureau_b_2"] # "credit_bureau_a_2",
        

        for criteria in depth2_criterias:
            df = self.df[['case_id','target','date_decision']]
            # all groups
            criteria_df = self.read_in_files_with_criteria(criteria)
            criteria_df = self.optimize_polars_df(criteria_df)
            aggr = Aggregator([x for x in criteria_df.columns if x not in self.string_cols+self.date_cols+self.base_df_cols],
                              self.string_cols,self.date_cols,
                              f"{criteria.upper()}_DEPTH2_ALL")
            agg_expr, agg_dt_cols, agg_str_cols = aggr.get_exprs(criteria_df)
 
            criteria_df = criteria_df.group_by("case_id").agg(agg_expr)
            df = df.join(criteria_df, on=['case_id'], how='left')
            df = self.feature_engineer_dates(df,date_cols=agg_dt_cols)    
            feats = self.select_features(df,score="woe_iv",dedup_agg=True)
            if len(feats)>0:
                self.string_cols.extend([x for x in agg_str_cols if x in feats])
                self.df = self.df.join(df[['case_id']+feats], on='case_id', how='left') 
 
            del criteria_df
            del df
            gc.collect()            
            

    def run(self):
        self.create_base_dataset()
        self.process_depth0()
        self.process_depth1()
        self.process_depth2()        
        
    def get_datasets(self):
        df,cat_cols = self.to_pandas(self.df)

        del self.df
        gc.collect()
        
        return {"train":df[df['partition']=='train'].reset_index(drop=True), 
                "test": df[df['partition']=='test'].reset_index(drop=True), 
                "features": [x for x in df.columns if x not in self.base_df_cols],
                "cat_features": cat_cols}
    


In [8]:
ds = DatasetBuilder().get_datasets()

Memory of polars dataframe went from 0.2386MB to 0.1814MB.
processing criteria static_0...
dropped column isbidproductrequest_292L because too many nulls
dropped column clientscnt_136L because too many nulls
dropped column lastotherinc_902A because too many nulls
dropped column lastrepayingdate_696D because too many nulls
dropped column payvacationpostpone_4187118D because too many nulls
dropped column interestrategrace_34L because too many nulls
dropped column lastotherlnsexpense_631A because too many nulls
dropped column lastdependentsnum_448L because too many nulls
dropped column equalityempfrom_62L because too many nulls
dropped column previouscontdistrict_112M because of category size
Memory of polars dataframe went from 11.1564MB to 6.2997MB.
selected 104/157 features for the model dataset
processing criteria static_cb_0...
dropped column fortoday_1092L because too many nulls
dropped column for3years_504L because too many nulls
dropped column forquarter_462L because too many null

In [9]:
print(ds['train'].shape)
ds['train']['target'].value_counts(normalize=True)

(10000, 288)


target
0    0.97
1    0.03
Name: proportion, dtype: float64

In [10]:
# del DSBuilder
# gc.collect()

In [11]:
ds['train']

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,partition,cntpmts24_3658933L,pctinstlsallpaidlate4d_3546849L,numinsttopaygrest_4493213L,daysoverduetolerancedd_3976961L,numinstpaidearly5dobd_4499205L,totinstallast1m_4525188A,lastrejectcommoditycat_161M,disbursedcredamount_1113A,numinstlswithdpd10_728L,avgdbdtollast24m_4525197P,cntincpaycont9m_3716944L,monthsannuity_845L,maxlnamtstart6m_4525199A,numinsttopaygr_769L,numinstlallpaidearly3d_817L,numrejects9m_859L,numinstpaidearly3dest_4493216L,mobilephncnt_593L,avgdbddpdlast3m_4187120P,maxdpdlast3m_392P,numinstmatpaidtearly2d_4499204L,numinstlswithoutdpd_562L,lastcancelreason_561M,avgdbddpdlast24m_3658932P,maxdebt4_972A,annuity_780A,avginstallast24m_3658937A,maxdbddpdlast1m_3658939P,numinstpaidearly3d_3546850L,mindbddpdlast24m_3658935P,maxannuity_159A,pctinstlsallpaidlate6d_3546844L,price_1097A,avgdpdtolclosure24_3658938P,sumoutstandtotalest_4493215A,numinstpaid_4499208L,maxdbddpdtollast12m_3658940P,avgpmtlast12m_4525200A,maxdpdinstlnum_3546846P,numinstpaidearly5dest_4493211L,pctinstlsallpaidearl3d_427L,avglnamtstart24m_4525187A,maxdpdlast9m_1059P,maxoutstandbalancel12m_4187113A,lastrejectcommodtypec_5251769M,numinstunpaidmax_3546851L,maxdpdlast6m_474P,totalsettled_863A,lastrejectreason_759M,numinstls_657L,amtinstpaidbefduel24m_4187115A,maxannuity_4075009A,pmtnum_254L,pctinstlsallpaidlate1d_3546856L,numcontrs3months_479L,sumoutstandtotal_3546847A,maxdpdtolerance_374P,avgmaxdpdlast9m_3716943P,maxdpdlast12m_727P,numinstlsallpaid_934L,numincomingpmts_3546848L,credamount_770A,lastst_736L,lastapprcredamount_781A,numinstpaidlate1d_3546852L,inittransactionamount_650A,avgoutstandbalancel6m_4187114A,maxdpdlast24m_143P,lastapprcommoditycat_1041M,lastrejectreasonclient_4145040M,numinstpaidearly_338L,maxpmtlast3m_4525190A,totaldebt_9A,annuitynextmonth_57A,maxdbddpdtollast6m_4187119P,numinstunpaidmaxest_4493212L,maininc_215A,maxdpdfrom6mto36m_3546853P,numinstregularpaid_973L,numinstpaidearly5d_1087L,currdebt_22A,eir_270L,lastrejectcredamount_222A,numinstlswithdpd5_4187116L,pctinstlsallpaidlat10d_839L,interestrate_311L,numinstregularpaidest_4493210L,lastapprcommoditytypec_5251766M,numinstpaidearlyest_4493214L,mindbdtollast24m_4525191P,numinstpaidlastcontr_4325080L,lastactivateddate_801D_DAYS_SINCE,firstclxcampaign_1125D_DAYS_SINCE,datefirstoffer_1144D_DAYS_SINCE,lastrejectdate_50D_DAYS_SINCE,maxdpdinstldate_3546855D_DAYS_SINCE,validfrom_1069D_DAYS_SINCE,lastdelinqdate_224D_DAYS_SINCE,firstdatedue_489D_DAYS_SINCE,lastapplicationdate_877D_DAYS_SINCE,datelastinstal40dpd_247D_DAYS_SINCE,dtlastpmtallstes_4499206D_DAYS_SINCE,datelastunpaid_3546854D_DAYS_SINCE,lastapprdate_640D_DAYS_SINCE,pmtaverage_3A,requesttype_4525192L,education_1103M,days90_310L,pmtaverage_4527227A,days120_123L,numberofqueries_373L,firstquarter_103L,pmtssum_45A,riskassesment_940T,contractssum_5085716L,pmtscount_423L,fourthquarter_440L,pmtcount_4955617L,pmtaverage_4955615A,days360_512L,days30_165L,days180_256L,secondquarter_766L,riskassesment_302T,assignmentdate_4955616D_DAYS_SINCE,assignmentdate_238D_DAYS_SINCE,birthdate_574D_DAYS_SINCE,responsedate_4527233D_DAYS_SINCE,dateofbirth_337D_DAYS_SINCE,responsedate_4917613D_DAYS_SINCE,annuity_853A_MAX_APPLPREV_1_DEPTH1_ALL,approvaldate_319D_LAST_APPLPREV_1_DEPTH1_ALL_DAYS_SINCE,byoccupationinc_3656910L_MAX_APPLPREV_1_DEPTH1_ALL,cancelreason_3545846M_LAST_APPLPREV_1_DEPTH1_ALL,creationdate_885D_LAST_APPLPREV_1_DEPTH1_ALL_DAYS_SINCE,credacc_actualbalance_314A_LAST_APPLPREV_1_DEPTH1_ALL,credacc_credlmt_575A_VAR_APPLPREV_1_DEPTH1_ALL,credacc_maxhisbal_375A_MIN_APPLPREV_1_DEPTH1_ALL,credacc_minhisbal_90A_MEAN_APPLPREV_1_DEPTH1_ALL,credamount_590A_VAR_APPLPREV_1_DEPTH1_ALL,currdebt_94A_MEAN_APPLPREV_1_DEPTH1_ALL,dateactivated_425D_LAST_APPLPREV_1_DEPTH1_ALL_DAYS_SINCE,downpmt_134A_VAR_APPLPREV_1_DEPTH1_ALL,dtlastpmt_581D_LAST_APPLPREV_1_DEPTH1_ALL_DAYS_SINCE,dtlastpmtallstes_3545839D_LAST_APPLPREV_1_DEPTH1_ALL_DAYS_SINCE,education_1138M_LAST_APPLPREV_1_DEPTH1_ALL,employedfrom_700D_MAX_APPLPREV_1_DEPTH1_ALL_DAYS_SINCE,familystate_726L_LAST_APPLPREV_1_DEPTH1_ALL,firstnonzeroinstldate_307D_MAX_APPLPREV_1_DEPTH1_ALL_DAYS_SINCE,isbidproduct_390L_VAR_APPLPREV_1_DEPTH1_ALL,mainoccupationinc_437A_MIN_APPLPREV_1_DEPTH1_ALL,maxdpdtolerance_577P_MEDIAN_APPLPREV_1_DEPTH1_ALL,num_group1_MAX_APPLPREV_1_DEPTH1_ALL,outstandingdebt_522A_MEAN_APPLPREV_1_DEPTH1_ALL,pmtnum_8L_MEDIAN_APPLPREV_1_DEPTH1_ALL,rejectreason_755M_LAST_APPLPREV_1_DEPTH1_ALL,rejectreasonclient_4145042M_LAST_APPLPREV_1_DEPTH1_ALL,revolvingaccount_394A_MIN_APPLPREV_1_DEPTH1_ALL,status_219L_LAST_APPLPREV_1_DEPTH1_ALL,tenor_203L_MEDIAN_APPLPREV_1_DEPTH1_ALL,amtdebitincoming_4809443A_MAX_OTHER_1_DEPTH1_ALL,amtdebitoutgoing_4809440A_MAX_OTHER_1_DEPTH1_ALL,amount_4527230A_VAR_TAX_REGISTRY_A_1_DEPTH1_ALL,num_group1_MAX_TAX_REGISTRY_A_1_DEPTH1_ALL,amount_4917619A_MAX_TAX_REGISTRY_B_1_DEPTH1_ALL,deductiondate_4917603D_LAST_TAX_REGISTRY_B_1_DEPTH1_ALL_DAYS_SINCE,pmtamount_36A_MEAN_TAX_REGISTRY_C_1_DEPTH1_ALL,processingdate_168D_MEAN_TAX_REGISTRY_C_1_DEPTH1_ALL_DAYS_SINCE,annualeffectiverate_199L_MEDIAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,classificationofcontr_400M_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL,contractst_545M_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL,contractst_964M_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL,credlmt_230A_VAR_CREDIT_BUREAU_A_1_DEPTH1_ALL,credlmt_935A_MEDIAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,dateofcredend_289D_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL_DAYS_SINCE,dateofcredend_353D_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL_DAYS_SINCE,dateofcredstart_181D_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL_DAYS_SINCE,dateofcredstart_739D_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL_DAYS_SINCE,dateofrealrepmt_138D_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL_DAYS_SINCE,debtoutstand_525A_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL,debtoverdue_47A_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL,description_351M_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL,dpdmax_139P_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,dpdmax_757P_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,dpdmaxdatemonth_442T_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,dpdmaxdatemonth_89T_MEDIAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,dpdmaxdateyear_596T_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,dpdmaxdateyear_896T_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,financialinstitution_382M_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL,financialinstitution_591M_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL,instlamount_768A_MIN_CREDIT_BUREAU_A_1_DEPTH1_ALL,instlamount_852A_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL,lastupdate_1112D_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL_DAYS_SINCE,lastupdate_388D_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL_DAYS_SINCE,monthlyinstlamount_332A_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL,monthlyinstlamount_674A_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,nominalrate_281L_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,nominalrate_498L_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,num_group1_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL,numberofcontrsvalue_358L_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL,numberofinstls_229L_VAR_CREDIT_BUREAU_A_1_DEPTH1_ALL,numberofinstls_320L_MIN_CREDIT_BUREAU_A_1_DEPTH1_ALL,numberofoutstandinstls_59L_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,numberofoverdueinstlmax_1039L_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,numberofoverdueinstlmax_1151L_MEDIAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,numberofoverdueinstlmaxdat_148D_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL_DAYS_SINCE,numberofoverdueinstlmaxdat_641D_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL_DAYS_SINCE,numberofoverdueinstls_725L_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL,outstandingamount_354A_MEDIAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,outstandingamount_362A_VAR_CREDIT_BUREAU_A_1_DEPTH1_ALL,overdueamount_659A_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,overdueamountmax2_14A_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,overdueamountmax2_398A_MEDIAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,overdueamountmax2date_1002D_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL_DAYS_SINCE,overdueamountmax2date_1142D_MAX_CREDIT_BUREAU_A_1_DEPTH1_ALL_DAYS_SINCE,overdueamountmax_155A_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,overdueamountmax_35A_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,overdueamountmaxdatemonth_284T_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,overdueamountmaxdatemonth_365T_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,overdueamountmaxdateyear_2T_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,overdueamountmaxdateyear_994T_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,refreshdate_3813885D_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL_DAYS_SINCE,residualamount_488A_VAR_CREDIT_BUREAU_A_1_DEPTH1_ALL,residualamount_856A_MEDIAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,totalamount_6A_MEAN_CREDIT_BUREAU_A_1_DEPTH1_ALL,totalamount_996A_MIN_CREDIT_BUREAU_A_1_DEPTH1_ALL,totaldebtoverduevalue_178A_MIN_CREDIT_BUREAU_A_1_DEPTH1_ALL,totaloutstanddebtvalue_39A_MIN_CREDIT_BUREAU_A_1_DEPTH1_ALL,amount_1115A_LAST_CREDIT_BUREAU_B_1_DEPTH1_ALL,contractdate_551D_MAX_CREDIT_BUREAU_B_1_DEPTH1_ALL_DAYS_SINCE,contractmaturitydate_151D_MEAN_CREDIT_BUREAU_B_1_DEPTH1_ALL_DAYS_SINCE,contractst_516M_MAX_CREDIT_BUREAU_B_1_DEPTH1_ALL,credlmt_1052A_MAX_CREDIT_BUREAU_B_1_DEPTH1_ALL,credlmt_228A_MAX_CREDIT_BUREAU_B_1_DEPTH1_ALL,credlmt_3940954A_MEDIAN_CREDIT_BUREAU_B_1_DEPTH1_ALL,credor_3940957M_MAX_CREDIT_BUREAU_B_1_DEPTH1_ALL,credquantity_1099L_MEAN_CREDIT_BUREAU_B_1_DEPTH1_ALL,credquantity_984L_MAX_CREDIT_BUREAU_B_1_DEPTH1_ALL,debtpastduevalue_732A_MAX_CREDIT_BUREAU_B_1_DEPTH1_ALL,debtvalue_227A_LAST_CREDIT_BUREAU_B_1_DEPTH1_ALL,dpd_550P_MAX_CREDIT_BUREAU_B_1_DEPTH1_ALL,dpdmax_851P_MEAN_CREDIT_BUREAU_B_1_DEPTH1_ALL,dpdmaxdatemonth_804T_MEAN_CREDIT_BUREAU_B_1_DEPTH1_ALL,dpdmaxdateyear_742T_MEAN_CREDIT_BUREAU_B_1_DEPTH1_ALL,installmentamount_833A_MAX_CREDIT_BUREAU_B_1_DEPTH1_ALL,instlamount_892A_LAST_CREDIT_BUREAU_B_1_DEPTH1_ALL,interesteffectiverate_369L_MEAN_CREDIT_BUREAU_B_1_DEPTH1_ALL,interestrateyearly_538L_MEDIAN_CREDIT_BUREAU_B_1_DEPTH1_ALL,lastupdate_260D_MEAN_CREDIT_BUREAU_B_1_DEPTH1_ALL_DAYS_SINCE,maxdebtpduevalodued_3940955A_MEAN_CREDIT_BUREAU_B_1_DEPTH1_ALL,num_group1_MAX_CREDIT_BUREAU_B_1_DEPTH1_ALL,numberofinstls_810L_MEDIAN_CREDIT_BUREAU_B_1_DEPTH1_ALL,overdueamountmax_950A_MEAN_CREDIT_BUREAU_B_1_DEPTH1_ALL,overdueamountmaxdatemonth_494T_MEAN_CREDIT_BUREAU_B_1_DEPTH1_ALL,overdueamountmaxdateyear_432T_MEAN_CREDIT_BUREAU_B_1_DEPTH1_ALL,pmtdaysoverdue_1135P_MAX_CREDIT_BUREAU_B_1_DEPTH1_ALL,pmtnumpending_403L_LAST_CREDIT_BUREAU_B_1_DEPTH1_ALL,residualamount_127A_MAX_CREDIT_BUREAU_B_1_DEPTH1_ALL,residualamount_3940956A_MIN_CREDIT_BUREAU_B_1_DEPTH1_ALL,totalamount_503A_VAR_CREDIT_BUREAU_B_1_DEPTH1_ALL,totalamount_881A_MEAN_CREDIT_BUREAU_B_1_DEPTH1_ALL,amount_416A_MAX_DEPOSIT_1_DEPTH1_ALL,contractenddate_991D_MEAN_DEPOSIT_1_DEPTH1_ALL_DAYS_SINCE,openingdate_313D_LAST_DEPOSIT_1_DEPTH1_ALL_DAYS_SINCE,birth_259D_MAX_PERSON_1_DEPTH1_ALL_DAYS_SINCE,empl_employedfrom_271D_MAX_PERSON_1_DEPTH1_ALL_DAYS_SINCE,empl_industry_691L_MAX_PERSON_1_DEPTH1_ALL,incometype_1044T_MAX_PERSON_1_DEPTH1_ALL,mainoccupationinc_384A_MAX_PERSON_1_DEPTH1_ALL,persontype_1072L_VAR_PERSON_1_DEPTH1_ALL,persontype_792L_VAR_PERSON_1_DEPTH1_ALL,relationshiptoclient_415T_MAX_PERSON_1_DEPTH1_ALL,relationshiptoclient_642T_MAX_PERSON_1_DEPTH1_ALL,last180dayturnover_1134A_MAX_DEBITCARD_1_DEPTH1_ALL,openingdate_857D_MAX_DEBITCARD_1_DEPTH1_ALL_DAYS_SINCE,num_group1_MAX_APPLPREV_2_DEPTH2_ALL,num_group2_MAX_APPLPREV_2_DEPTH2_ALL,num_group1_MAX_CREDIT_BUREAU_B_2_DEPTH2_ALL,num_group2_MAX_CREDIT_BUREAU_B_2_DEPTH2_ALL,pmts_date_1107D_MEAN_CREDIT_BUREAU_B_2_DEPTH2_ALL_DAYS_SINCE,pmts_dpdvalue_108P_MEAN_CREDIT_BUREAU_B_2_DEPTH2_ALL,pmts_pmtsoverdue_635A_MAX_CREDIT_BUREAU_B_2_DEPTH2_ALL
0,158067,2019-09-11,201909,36,0,train,1.0,0.000000,0.0,0.0,4.0,,a55475b1,140000.0,0.0,-6.0,0.0,6.0,,0.0,5.0,0.0,5.0,1.0,,0.0,6.0,6.0,a55475b1,-6.0,20852.800781,7136.0,3948.000000,,5.0,-6.0,3949.000000,0.000000,0.0,0.0,0.000000,6.0,,,,4.0,0.833496,,0.0,,a55475b1,0.0,0.0,23693.000000,a55475b1,0.0,3948.000000,,48.0,0.000000,0.0,0.000000,0.0,,0.0,6.0,6.0,140000.0,K,19000.0,0.0,,,0.0,P12_6_178,a55475b1,4.0,,0.000000,0.0,,0.0,24000.0,0.0,6.0,4.0,0.000000,0.419922,22000.000000,0.0,0.000000,0.419922,6.0,a55475b1,4.0,-6.0,6.0,876.0,756.0,755.0,3296.0,,,,861.0,892.0,,714.0,,892.0,7222.000000,PENSION_6,a55475b1,0.0,7224.0,0.0,0.0,0.0,,,,,0.0,,,0.0,0.0,0.0,0.0,,,-14.0,,-14.0,21072.0,,3949.000000,892.0,,a55475b1,892.0,,0.000000e+00,,,3.233333e+07,0.000000,876.0,0.000000e+00,714.0,714.0,P97_36_170,5108.0,MARRIED,861.0,0.0,4240.000000,0.0,2.0,0.000000,24.0,a55475b1,a55475b1,,K,24.0,,,,,,,,,,ea6782cc,a55475b1,a55475b1,,,,1104.0,1380.0,,1104.0,0.000000e+00,0.0,a55475b1,,0.000000,6.500000,,,2016.0,d6a7d943,a55475b1,,,,1089.0,,2499.000000,,45.000000,8.0,2.0,18.000,,,,0.0,,,,0.0,,,,0.000000,,,,0.000000,6.500000,,,2016.0,61.0,,,15426.400391,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,21072.0,,,RETIRED_PENSIONER,14400.0,0.000000,,,,,,2.0,2.0,,,,,
1,799895,2019-09-12,201909,36,0,train,,,,,,,a55475b1,13998.0,,,,,,,,0.0,,2.0,,0.0,,,a55475b1,,0.000000,1428.0,,,,,0.000000,,13998.0,,,,,,,,,,0.0,,a55475b1,,0.0,0.000000,a55475b1,0.0,,,36.0,,0.0,,0.0,,0.0,,,13998.0,D,,,13998.0,,0.0,a55475b1,a55475b1,,,0.000000,0.0,,,,0.0,,,0.000000,,60000.000000,,,,,a55475b1,,,,,,,1629.0,,,,,1629.0,,,,,,DEDUCTION_6,717ddd49,2.0,,3.0,14.0,5.0,17824.000000,,,7.0,5.0,,,14.0,0.0,5.0,13.0,,,,,-14.0,10696.0,,4650.000000,,1.0,a55475b1,1629.0,,0.000000e+00,,,0.000000e+00,0.000000,,0.000000e+00,,,P97_36_170,2007.0,SINGLE,1598.0,0.0,48000.000000,,1.0,0.000000,18.0,a55475b1,a55475b1,,D,18.0,,,1540898.25,6.0,,,2546.00,-14.0,,ea6782cc,a55475b1,a55475b1,,10000.0,-800.0,1112.0,1680.0,844.0,1173.0,2.194379e+05,0.0,a55475b1,0.000000,11.000000,9.000000,10.0,2018.0,2015.0,a55475b1,a55475b1,0.0,,-9.0,733.0,7366.171875,0.000000,,,8.0,2.0,0.000,43.0,41.0,0.000000,12.5,1176.0,,0.0,0.0,,0.0,0.000000,3731.197998,1176.0,,0.000000,3731.197998,8.500000,9.664062,2018.0,2015.0,62.0,,2169.800049,136580.000000,2.222785e+05,0.0,219437.859375,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10696.0,209.0,TRADE,PRIVATE_SECTOR_EMPLOYEE,22000.0,5.332031,5.332031,SIBLING,SIBLING,,,1.0,1.0,,,,,
2,1331073,2019-04-05,201904,13,0,train,4.0,0.277832,,112.0,,,a55475b1,19198.0,5.0,,6.0,18.0,,0.0,11.0,1.0,,4.0,-12.0,0.0,,14.0,P94_109_143,-13.0,7568.000000,1500.0,1412.599976,-1.0,11.0,-29.0,4078.400146,0.277832,19198.0,0.0,,,-1.0,,1.0,,0.611328,,0.0,8475.000000,a55475b1,0.0,0.0,23099.201172,P99_56_166,0.0,8475.000000,,24.0,0.277832,1.0,0.000000,100.0,0.0,0.0,13.0,15.0,19198.0,D,6930.0,5.0,19198.0,3059.566162,0.0,P100_96_175,P94_109_143,10.0,,0.000000,0.0,-1.0,,30000.0,0.0,6.0,5.0,0.000000,,10000.000000,5.0,0.277832,,,a55475b1,,,,186.0,,,50.0,4412.0,,,4672.0,50.0,4324.0,,4292.0,188.0,,,6b2ae0fa,11.0,,15.0,30.0,23.0,6590.000000,,,10.0,12.0,,,30.0,2.0,16.0,13.0,,,,12880.0,,12880.0,,1863.800049,,1.0,P94_109_143,50.0,,0.000000e+00,,,4.516664e+07,0.000000,,2.777336e+05,,,a55475b1,3916.0,,22.0,0.0,3600.000000,0.0,10.0,0.000000,6.0,P99_56_166,P94_109_143,,D,6.0,,,,,,,659.00,82.0,,a55475b1,a55475b1,a55475b1,,33000.0,-315.0,,,420.0,,1.401634e+05,0.0,a55475b1,0.000000,,,5.0,2018.0,,a55475b1,a55475b1,4260.0,,-11.0,,5732.399902,,,,63.0,59.0,,1.0,6.0,1.400391,,,865.0,0.0,,6.067782e+08,0.0,1052.829224,,,865.0,0.000000,,,4.800781,2018.0,,28.0,,26122.703125,,5.000000e+03,0.0,140163.390625,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,272.645996,503.0,1598.0,12880.0,386.0,OTHER,EMPLOYED,45800.0,5.332031,5.332031,PARENT,PARENT,,1598.0,10.0,2.0,,,,,
3,1472921,2019-08-02,201908,30,0,train,9.0,0.222168,,6.0,,,P109_133_183,66000.0,0.0,,10.0,9.0,,0.0,4.0,0.0,,5.0,5.0,6.0,,14.0,a55475b1,-6.0,85124.601562,7144.0,4886.399902,5.0,4.0,-30.0,76500.000000,0.111084,66000.0,2.0,,,6.0,,4.0,,0.444336,,6.0,84400.000000,a55475b1,0.0,6.0,104900.000000,P94_109_143,0.0,38094.070312,,12.0,0.333252,0.0,0.000000,6.0,2.0,6.0,6.0,11.0,66000.0,K,10798.0,3.0,,4883.333984,6.0,P12_6_178,P94_109_143,2.0,,0.000000,0.0,6.0,,70000.0,0.0,9.0,0.0,0.000000,0.449951,31600.000000,2.0,0.000000,0.449951,,a55475b1,,,6.0,185.0,,,641.0,77.0,,16.0,166.0,197.0,,,16.0,197.0,,,a55475b1,0.0,,0.0,5.0,1.0,850.000000,,,1.0,2.0,,,5.0,0.0,0.0,0.0,,,,12208.0,,12208.0,,10997.600586,,,P94_109_143,1044.0,,2.446221e+09,0.0,0.0,1.012947e+09,0.000000,,2.233333e+07,,,a55475b1,1905.0,,166.0,0.0,26000.000000,3.0,3.0,0.000000,6.0,P99_56_166,P94_109_143,760474304.0,D,6.0,,,,,,,850.00,-5.0,,ea6782cc,a55475b1,a55475b1,868027776.0,0.0,-1839.0,315.0,960.0,1083.0,740.0,1.162138e+06,0.0,a55475b1,0.000000,0.000000,5.000000,9.5,2018.0,2017.0,a55475b1,a55475b1,0.0,15.71875,-5.0,720.0,20936.599609,988.259033,,19.500000,8.0,3.0,,120.0,111.0,0.000000,0.0,,,0.0,0.0,,0.0,0.000000,0.000000,,,0.000000,0.000000,5.000000,9.500000,2018.0,2017.0,41.0,0.0,0.000000,11769.799805,1.200000e+06,0.0,0.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12208.0,,,EMPLOYED,64000.0,,,,,,,3.0,1.0,,,,,
4,2668316,2020-02-18,202002,59,0,train,24.0,0.000000,0.0,1.0,66.0,26063.435547,P52_56_90,100000.0,0.0,-6.0,13.0,60.0,61410.199219,0.0,72.0,0.0,72.0,1.0,-4.0,0.0,75.0,86.0,a55475b1,-6.0,100908.640625,7700.0,9222.400391,0.0,72.0,-22.0,49887.066406,0.000000,0.0,0.0,0.000000,84.0,0.0,10228.200195,,0.0,0.856934,,0.0,124236.242188,a55475b1,0.0,0.0,358917.906250,a55475b1,0.0,230561.906250,74720.0,24.0,0.000000,1.0,0.000000,0.0,0.0,0.0,84.0,74.0,100000.0,N,0.0,0.0,,34841.949219,0.0,a55475b1,a55475b1,66.0,17279.833984,0.000000,0.0,0.0,0.0,,0.0,84.0,0.0,0.000000,0.419922,5735.800293,0.0,0.000000,0.419922,84.0,a55475b1,66.0,-22.0,0.0,354.0,1479.0,4336.0,4164.0,,-11.0,,3096.0,-9.0,,,,-9.0,,PENSION_6,6b2ae0fa,2.0,17984.0,3.0,3.0,6.0,,,,,3.0,,,3.0,0.0,3.0,1.0,,,,,-14.0,26160.0,,8783.600586,384.0,20000.0,a55475b1,384.0,,0.000000e+00,0.0,0.0,9.381743e+08,0.000000,376.0,1.203699e+05,-12.0,-12.0,P97_36_170,17472.0,MARRIED,331.0,0.0,9500.000000,0.0,8.0,0.000000,12.0,a55475b1,a55475b1,800109184.0,K,12.0,0.0,0.0,,,,,,,34.869999,ea6782cc,a55475b1,a55475b1,,,-41.0,2108.0,2724.0,384.0,1957.0,1.691602e+04,0.0,a55475b1,0.000000,0.000000,5.800781,2.0,2019.0,2013.0,dcb42d2c,a55475b1,,,7.0,2011.0,8783.600586,3185.568359,42.000000,26.421875,17.0,17.0,74.625,14.0,2.0,0.000000,0.0,3974.0,,0.0,0.0,,0.0,0.000000,0.000000,3974.0,,0.000000,0.000000,5.800781,2.000000,2019.0,2013.0,70.0,,,47624.738281,9.600000e+04,0.0,16916.021484,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,26160.0,,,RETIRED_PENSIONER,44000.0,0.000000,,,,,,8.0,2.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1505897,2019-08-26,201908,33,0,train,8.0,0.000000,0.0,1.0,9.0,,a55475b1,57996.0,0.0,,0.0,9.0,,0.0,9.0,0.0,9.0,1.0,,0.0,9.0,15.0,a55475b1,-44.0,34396.000000,5000.0,4636.800293,,9.0,-90.0,18800.000000,0.000000,57996.0,0.0,0.000000,10.0,,,,9.0,0.899902,,0.0,,a55475b1,0.0,0.0,49661.386719,a55475b1,0.0,41731.386719,,12.0,0.000000,0.0,0.000000,0.0,,0.0,10.0,13.0,57996.0,K,34396.0,0.0,,,0.0,P159_130_59,a55475b1,9.0,,0.000000,0.0,,0.0,50000.0,0.0,10.0,9.0,0.000000,0.000000,,0.0,0.000000,0.000000,10.0,a55475b1,9.0,,10.0,648.0,,,,,,,642.0,673.0,,400.0,,673.0,,,a55475b1,1.0,,1.0,1.0,0.0,421.800018,,,1.0,0.0,,,1.0,1.0,1.0,0.0,,,,11776.0,,11776.0,,3830.000000,673.0,,a55475b1,673.0,,,,,,0.000000,648.0,,400.0,400.0,P97_36_170,4488.0,MARRIED,642.0,,50000.000000,0.0,0.0,0.000000,11.0,a55475b1,a55475b1,,K,11.0,,,,,,,421.75,161.0,,ea6782cc,a55475b1,a55475b1,,,,1356.0,1889.0,,1586.0,0.000000e+00,0.0,a55475b1,,0.000000,7.500000,,,2014.0,b619fa46,a55475b1,,,,1559.0,,11091.388672,,42.500000,8.0,2.0,,,,,0.0,,,,0.0,,,,0.000000,,,,0.000000,7.500000,,,2014.0,54.0,,,34396.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11776.0,,,PRIVATE_SECTOR_EMPLOYEE,30000.0,,,,,,,0.0,1.0,,,,,
9996,178111,2019-11-20,201911,46,0,train,7.0,0.000000,9.0,0.0,1.0,2919.400146,a55475b1,30000.0,0.0,-3.0,7.0,7.0,34978.000000,9.0,5.0,5.0,5.0,4.0,-2.0,0.0,5.0,7.0,P94_109_143,-3.0,34978.000000,3104.0,2919.400146,-4.0,5.0,-5.0,3000.000000,0.000000,0.0,0.0,26123.800781,7.0,0.0,2919.400146,,1.0,0.714355,,0.0,46705.800781,a55475b1,9.0,0.0,20582.000000,a55475b1,16.0,20435.800781,,12.0,0.000000,2.0,26123.800781,0.0,0.0,0.0,7.0,7.0,30000.0,D,32998.0,0.0,,33490.464844,0.0,P148_110_5,P94_109_143,1.0,2919.400146,26123.800781,2920.0,0.0,9.0,70000.0,0.0,7.0,1.0,26123.800781,0.419922,30000.000000,0.0,0.000000,0.419922,7.0,a55475b1,1.0,-5.0,7.0,217.0,,,49.0,,,,197.0,49.0,,18.0,,228.0,,DEDUCTION_6,6b2ae0fa,3.0,,4.0,16.0,6.0,,,,,9.0,,,16.0,0.0,6.0,5.0,,,,,-14.0,8632.0,,3913.400146,,,P94_109_143,49.0,,0.000000e+00,,,1.868206e+08,21943.392578,,0.000000e+00,,,a55475b1,1363.0,,19.0,0.0,2000.000000,0.0,8.0,26123.800781,9.0,a55475b1,P94_109_143,,D,9.0,,,6268368.00,11.0,,,,,55.619999,ea6782cc,a55475b1,a55475b1,50000000.0,10000.0,-366.0,551.0,853.0,285.0,601.0,3.748066e+04,0.0,a55475b1,1.666992,37.187500,7.800781,3.0,2019.0,2017.0,a55475b1,a55475b1,3842.0,0.00000,-7.0,544.0,3841.960205,0.000000,21.703125,,9.0,5.0,0.000,12.0,5.5,2.000000,0.0,383.0,253.0,0.0,0.0,1.510851e+08,0.0,28.958666,0.000000,415.0,253.0,28.958666,5484.120117,7.199219,3.000000,2019.0,2017.0,53.0,0.0,9950.780273,5200.000000,3.042640e+04,0.0,37480.660156,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8632.0,1363.0,OTHER,PRIVATE_SECTOR_EMPLOYEE,70000.0,4.800781,5.332031,OTHER_RELATIVE,OTHER_RELATIVE,,,8.0,2.0,,,,,
9997,2564587,2019-04-20,201904,15,0,train,25.0,0.130371,,19.0,,,a55475b1,98000.0,5.0,,26.0,76.0,,11.0,103.0,1.0,,2.0,-14.0,0.0,,130.0,a55475b1,-15.0,103193.906250,5996.0,9074.799805,-9.0,99.0,-54.0,16914.443359,0.115967,0.0,0.0,,,1.0,,25.0,,0.712402,,0.0,97338.757812,a55475b1,7.0,0.0,441921.718750,P99_56_166,48.0,215304.406250,,36.0,0.194214,0.0,44179.558594,17.0,0.0,1.0,117.0,140.0,98000.0,A,43380.0,27.0,,60405.339844,1.0,P148_110_5,P69_72_116,80.0,,44179.558594,8520.0,-1.0,,46000.0,5.0,5.0,3.0,44179.558594,0.419922,0.000000,11.0,0.094177,0.419922,,a55475b1,,,,131.0,1070.0,1886.0,147.0,49.0,47.0,267.0,4584.0,141.0,,,267.0,141.0,7388.266602,,a55475b1,0.0,,0.0,2.0,4.0,,,,,2.0,,,2.0,0.0,1.0,1.0,,,5248.0,24640.0,,24640.0,,5541.200195,,35960.0,P69_72_116,147.0,,0.000000e+00,,,3.222410e+08,2969.384033,,0.000000e+00,,,a55475b1,,,111.0,0.0,1949.400024,0.0,13.0,3398.427490,10.0,P99_56_166,P69_72_116,,D,10.0,,,,,,,,,,a55475b1,a55475b1,a55475b1,,,-181.0,,,549.0,,5.014662e+04,0.0,a55475b1,1.000000,,,4.5,2018.0,,a55475b1,a55475b1,,,-4.0,,4552.200195,,45.000000,,11.0,10.0,,12.0,6.5,1.500000,,,46.0,0.0,,9.394962e+07,0.0,8.300000,,,46.0,8.300000,,,4.500000,2018.0,,36.0,,,,4.338000e+04,0.0,50146.621094,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,24640.0,,,RETIRED_PENSIONER,47800.0,,,,,,,13.0,0.0,,,,,
9998,140158,2019-07-02,201907,26,0,train,5.0,0.000000,,0.0,,,a55475b1,80000.0,0.0,,6.0,5.0,,1.0,1.0,1.0,,1.0,0.0,0.0,,5.0,P198_89_166,-1.0,8502.799805,9120.0,1607.000000,0.0,1.0,-2.0,3040.000000,0.000000,0.0,0.0,,,0.0,,,,0.199951,,0.0,9640.400391,a55475b1,1.0,0.0,8060.000000,P99_56_166,6.0,8035.000000,,18.0,0.000000,1.0,1580.400024,0.0,0.0,0.0,5.0,6.0,80000.0,T,7786.0,0.0,,5580.399902,0.0,P159_130_59,P94_109_143,0.0,,1580.400024,0.0,0.0,,56000.0,0.0,5.0,0.0,1580.400024,0.449951,80000.000000,0.0,0.000000,0.449951,,a55475b1,,,5.0,154.0,,,7.0,,,,130.0,-14.0,,,,161.0,,,a55475b1,1.0,,1.0,1.0,0.0,,,,,0.0,,,1.0,1.0,1.0,1.0,,,,,,11624.0,,8071.200195,,,P94_109_143,7.0,,0.000000e+00,,,1.738287e+09,1522.958008,,0.000000e+00,,,a55475b1,2360.0,,-45.0,0.0,56000.000000,0.0,2.0,1580.400024,18.0,P99_56_166,P94_109_143,,D,18.0,,,,,,,,,,a55475b1,a55475b1,a55475b1,,,-20.0,,,161.0,,1.584338e+03,0.0,a55475b1,0.000000,,,2.0,2019.0,,a55475b1,a55475b1,,,-8.0,,1605.400024,,45.000000,,10.0,,,6.0,1.0,0.000000,,,,0.0,,,0.0,0.000000,,,,0.000000,,,2.000000,2019.0,,50.0,,,,8.502800e+03,0.0,1584.338013,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11624.0,2360.0,OTHER,PRIVATE_SECTOR_EMPLOYEE,60000.0,3.000000,4.500000,SPOUSE,SPOUSE,,,2.0,1.0,,,,,


In [None]:
submission = ds['test'][['case_id']]

# Training LGBM

In [12]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split,StratifiedGroupKFold
import lightgbm as lgb 
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK
from hyperopt.pyll import scope
from functools import partial

In [13]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

In [14]:
def get_base_params():
    base_params = {
        'boosting_type':'gbdt',
        'random_state': 117,
        'objective': 'binary',
        'metric': 'auc',
        'extra_trees':True,
        'verbose': -1,
        'max_bin': 64,
#         'device_type': 'gpu'
        
    }
    return base_params

In [15]:
search_space_setup = {
    'feature_fraction': hp.uniform('colsample_bynode', 0.3, 0.8),
    'max_depth': scope.int(hp.uniform('max_depth', 5, 20)),
    'l1_regularization': hp.loguniform('l1_regularization', np.log(.001), np.log(100)),
    'l2_regularization':hp.loguniform('l2_regularization',np.log(.001), np.log(100)),
    'cat_l2': hp.loguniform('cat_l2', np.log(.001), np.log(100)),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.3, 0.8),
    'bagging_freq': scope.int(hp.uniform('bagging_freq', 0, 5)),
    'learning_rate' : hp.loguniform('learning_rate', np.log(0.01), np.log(.5)),
#     'n_estimators':scope.int(hp.uniform('n_estimators', 500, 1500)),
    

}
search_space = get_base_params()
for k,v in search_space_setup.items():
    search_space[k] = v

In [29]:
## I AM DISABLING THIS BECAUSE I AM GOING TO DO TUNING ELSEWHERE



# # do splits ahead of time to improve trial speed
# k = 5

# lgbtrain = lgb.Dataset(ds['train'].loc[:,ds['features']], label=ds['train'].loc[:,'target'])
# test_X = ds['test'][ds['features']]
# splits = [(train_idx,valid_idx) for train_idx,valid_idx in 
#           StratifiedGroupKFold(n_splits=k).split(np.arange(ds['train'].shape[0]),
#                                                  ds['train']['target'],
#                                                  groups = ds['train']['WEEK_NUM'])]


# def trial_fn(params,
#              splits = None,
#              dataset = None):
    
#     num_boost_round = params.pop('n_estimators')
#     cv_results = lgb.cv(
#         params,
#         dataset,
#         num_boost_round=50,
#         folds=splits,
#         seed = 117
#     ) 
    
#     score = cv_results['valid auc-mean'][-1] 
#     return {"status": STATUS_OK, "loss": -score} # always minimizes


# best_params = fmin(fn=partial(trial_fn, splits = splits, dataset = lgbtrain),
#                     space=search_space,
#                     algo=tpe.suggest,
#                     max_evals=10,
#                     timeout=60*60*2 # seconds
#                   )
# int_params = ['max_depth','n_estimators','bagging_freq']
# bestp = get_base_params()
# for k,v in best_params.items():
#     if k in int_params:
#         bestp[k] = int(v)
#     else:
#         bestp[k] = v
# bestp

100%|██████████| 10/10 [09:16<00:00, 55.61s/trial, best loss: -0.7904817038435111]


{'boosting_type': 'gbdt',
 'random_state': 117,
 'objective': 'binary',
 'metric': 'auc',
 'extra_trees': True,
 'verbose': -1,
 'max_bin': 64,
 'bagging_fraction': 0.6615111203742043,
 'bagging_freq': 4,
 'cat_l2': 0.4303012850161522,
 'colsample_bynode': 0.30799275380454566,
 'l1_regularization': 0.09818609605701412,
 'l2_regularization': 45.88388390697673,
 'learning_rate': 0.06583892942324936,
 'max_depth': 15,
 'n_estimators': 849}

In [None]:
bestp = {'boosting_type': 'gbdt',
 'random_state': 117,
 'objective': 'binary',
 'metric': 'auc',
 'extra_trees': True,
 'verbose': -1,
 'max_bin': 64,
 'bagging_fraction': 0.6615111203742043,
 'bagging_freq': 4,
 'cat_l2': 0.4303012850161522,
 'colsample_bynode': 0.30799275380454566,
 'l1_regularization': 0.09818609605701412,
 'l2_regularization': 45.88388390697673,
 'learning_rate': 0.06583892942324936,
 'max_depth': 15,
 'n_estimators': 849}

In [30]:
gbm = lgb.train(
    bestp,
    lgbtrain 
)



In [None]:
# del ds
# gc.collect()

# Submission


In [31]:
submission['score'] = gbm.predict(test_X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['score'] = gbm.predict(test_X)


In [32]:
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,case_id,score
0,57543,0.002805
1,57549,0.013698
2,57551,0.001894
3,57552,0.011688
4,57569,0.018598
