<font size=6><strong>Home Credit</strong></font>

### prepare

In [1]:
from IPython.core.interactiveshell import InteractiveShell
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import warnings
import time
import gc
import re
from itertools import combinations

from sklearn.model_selection import train_test_split,StratifiedKFold,GridSearchCV
from sklearn.metrics import roc_auc_score

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPRegressor

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import Ridge,Lasso,ElasticNet,Lars,LassoLars,OrthogonalMatchingPursuit
from sklearn.linear_model import LogisticRegression,TweedieRegressor,Perceptron,PassiveAggressiveRegressor

from sklearn.ensemble import VotingRegressor,RandomForestRegressor,AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor,ExtraTreesRegressor,GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor,StackingRegressor
from lightgbm import LGBMClassifier

%matplotlib inline
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
warnings.filterwarnings('ignore')
sklearn.set_config(display="diagram")

In [2]:
def reduce_mem_usage(df, ignore_cols=['SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV']):
    """
    Reduce memory usage of dataframe by converting digital data types without 
    losing data information
    ----------
    Args:
    - df (pandas.DataFrame): The input data frame to be optimized.
    - ignore_cols (list, optional): A list of column names to be ignored during 
      memory optimization. Default value is ['SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV'].

    Returns: 
    ----------
    pandas.DataFrame: The optimized dataframe.
    """
    
    # Initialize memory usage of dataframe
    start_mem = df.memory_usage().sum() / 1024**2
    print('Initial memory usage: {:.2f} MB'.format(start_mem))
    # Remove specific columns
    cols = [ col for col in df.columns if col not in ignore_cols]
    # Loop through each column
    for col in cols:
        col_type = df[col].dtype
        
        # If data type is an integer
        if col_type != object and col_type.name != 'category' and 'datetime' not in col_type.name:
            c_min = df[col].min()
            c_max = df[col].max()
            
            # If both the minimum and maximum values can be represented by a smaller data type
            if str(col_type)[:3] == 'int' and c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif str(col_type)[:3] == 'int' and c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif str(col_type)[:3] == 'int' and c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            elif str(col_type)[:3] == 'int' and c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)
            
            # If both the minimum and maximum values can be represented by a smaller data type
            elif str(col_type)[:5] == 'float' and c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif str(col_type)[:5] == 'float' and c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                pass

    # Output optimized memory usage
    end_mem = df.memory_usage().sum() / 1024**2
    print('Optimized memory usage: {:.2f} MB'.format(end_mem))
    return df

In [3]:
def select_low_cardinality_numeric_features(df, label_col, threshold=5):
    """
    Selects numerical features except for the label_col and checks if the number of 
    unique values is less than the threshold.If yes, the name of the feature will be 
    added to the list low_cardinality_feats and returned.

    Args:
    ----------
    - df(pandas.DataFrame):The dataset containing features and target variable.
    - label_col(str):The name of the label
    - threshold(int):The threshold of the number of unique values. Features with unique 
      values less than this threshold will be considered as "low cardinality features". 
      The default value is 5.

    Returns:
    ----------
    low_cardinality_feats(list):The list of names of numerical features with low cardinality.
    """
    numeric_feats = df.select_dtypes(include='number').columns.tolist()
    low_cardinality_feats = []
    for feat in numeric_feats:
        if feat == label_col:
            continue
        if df[feat].nunique() <= threshold:
            low_cardinality_feats.append(feat)
    return low_cardinality_feats

In [62]:
def cross_validate_with_feature_importance(models, data, labels, n_folds=5):
    """
    Performs cross-validation for each model and outputs the ROC AUC score for each fold.
    If the model supports feature importance evaluation, it outputs the relative importance 
    ranking for each feature.

    Args:
    ----------
    - models: a list of models to evaluate
    - data: the input data for the models
    - labels: the target labels for the input data
    - n_folds: the number of folds to use in cross-validation

    Returns: 
    ----------
    - trained_models(list):a list of trained models
    """  
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    trained_models = []
    
    for model_idx, model in enumerate(models):
        print(f"Model {model_idx + 1}: {type(model).__name__}")
        total_feature_importance = None
        scores = []
        
        for fold, (train_idx, val_idx) in enumerate(skf.split(data, labels)):
            print(f"Fold {fold + 1}")
            X_train, y_train = data.iloc[train_idx], labels.iloc[train_idx].values
            X_val, y_val = data.iloc[val_idx], labels.iloc[val_idx].values

            start_time = time.time()
            model.fit(X_train, y_train)
            train_time = time.time() - start_time
            print(f"Training time: {train_time:.2f}s")
            
            if fold == 0 and train_time > 360:
                print(f"Model training skipped because it took {train_time:.2f}s")
                break

            if hasattr(model, "predict_proba"):
                y_pred = model.predict_proba(X_val)[:, 1]
            else:
                y_pred = model.predict(X_val)
            score = roc_auc_score(y_val, y_pred)
            scores.append(score)
            print(f"Validation ROC AUC score: {score:.4f}")

            if hasattr(model, "feature_importances_"):
                fold_feature_importance = pd.Series(model.feature_importances_, index=data.columns)
                if total_feature_importance is None:
                    total_feature_importance = fold_feature_importance
                else:
                    total_feature_importance += fold_feature_importance

        if total_feature_importance is not None:
            top_features = total_feature_importance.sort_values(ascending=False)[:15]
            print("Top 15 most important features:")
            print(top_features)
        if scores:
            print(f"Mean ROC AUC score: {np.mean(scores):.4f} ± {np.std(scores):.4f}")
        else:
            print("No validation scores obtained.")
            
        trained_models.append(model)
        
    return trained_models

In [5]:
def load_data(data_set_name:str):
    return pd.read_csv(f"./data/{data_set_name}.csv")

In [6]:
def trans_inf_days(df):
    """
    Identifies samples with a time value of 365243.0, which is obviously anomalous, 
    and replaces them with NaN.
    Based on the official explanation(https://www.kaggle.com/c/home-credit-default-risk/discussion/57248).

    Args:
    ----------
    - df(pandas.DataFrame):Data frame containing the time feature to be transformed

    Returns:
    ----------
    - df(pandas.DataFrame):Data frame with transformed time feature
    """
    df['isinf_DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].where(df['DAYS_EMPLOYED'] == 365243.0,0).where(df['DAYS_EMPLOYED'] != 365243.0,1)
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].where(df['DAYS_EMPLOYED'] != 365243.0, np.nan)
    return df

In [7]:
def missing_values_summary(df):
    """
    A function that calculates the summary of missing values for each column in a 
    pandas DataFrame.

    Args:
    ----------
    df (pandas.DataFrame): The input DataFrame
    Returns:
    ----------
    mis_val_table_ren_columns (pandas.DataFrame): A DataFrame that displays the number and 
    percentage of missing values for each column that has missing values, sorted in descending 
    order by percentage. Also displays the data type of each column.
    """

    mis_val = df.isnull().sum()
    mis_val_percent = 100*df.isnull().sum() / len(df)
    mis_val_table =pd.concat([mis_val, mis_val_percent], axis=1)   
    mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[mis_val_table_ren_columns.iloc[:,1] !=0].sort_values(
    '% of Total Values', ascending = False).round(1)
    mis_val_table_ren_columns = mis_val_table_ren_columns.merge(df.dtypes.rename('dtypes').to_frame(),left_index=True, right_index=True)
    print('Your selected dataframe has ' + str(df.shape[1])+ ' columns.\n'
         "There are " + str(mis_val_table_ren_columns.shape[0])+ ' columns that have missing values.')
    return mis_val_table_ren_columns

In [8]:
def transform_features(data, num_pipeline, cat_number_pipeline, cat_object_pipeline):
    """
    This function selects numeric and object columns from input dataframe,
    applies specified pipeline strategies to each column type, and returns the transformed dataframe.

    Args:
    ----------
    - data (pandas.DataFrame): Input dataframe
    - num_pipeline (sklearn.pipeline.Pipeline): Pipeline strategy for numeric columns
    - cat_object_pipeline (sklearn.pipeline.Pipeline): Pipeline strategy for object columns
    - cat_number_pipeline (sklearn.pipeline.Pipeline): Pipeline strategy for low-cardinality numeric columns

    Returns:
    ----------
    - transformed_data (pandas.DataFrame): Transformed dataframe
    """
    if 'TARGET' in data.columns:
        data.remove('TARGET')
        
    num_cols = []
    
    cate_cols = data.select_dtypes(include='object').columns.tolist()
    cate_cols.extend(select_low_cardinality_numeric_features(data, 'TARGET'))

    num_cols.extend(data.columns.difference(cate_cols))
    
    cate_cols_object = data[cate_cols].select_dtypes(include=['object']).columns
    cate_cols_number = data[cate_cols].select_dtypes(include=['number']).columns

    processing = ColumnTransformer([
        ('num', num_pipeline, num_cols),
        ('cate_object',cat_object_pipeline,cate_cols_object),
        ('cate_number', cat_number_pipeline, cate_cols_number)
    ], remainder='passthrough')

    transformed_data = processing.fit_transform(data)
    transformed_data = pd.DataFrame(transformed_data, columns=processing.get_feature_names_out())
    
    return transformed_data

In [9]:
def concat_df_by_name(name_str,all_vars):
    """
    Concatenates dataframes with specific name in a dictionary of variables into a single 
    dataframe.

    Args:
    ----------
    - name_str(str): A string to match for the target dataframes' name.
    - all_vars(dict): A dictionary containing variables, where dataframes with matching 
    name_str will be concatenated.
    
    Returns:
    ----------
    - result(pd.DataFrame): A concatenated dataframe of all dataframes with name_str in 
    their name. If no such dataframe is found, an empty dataframe is returned.
    """
    df_list = []
    keys_to_delete = []  # stores the key-value pairs to be deleted
    for var_name, var_value in all_vars.items():
        if isinstance(var_value, pd.DataFrame) and name_str in var_name:
            df_list.append(var_value)
            keys_to_delete.append(var_name)  # records the keys to be deleted
    for key in keys_to_delete:  # deletes the key-value pairs after the loop ends
        del all_vars[key]
    if not df_list:
        return pd.DataFrame()
    result = pd.concat(df_list, axis=1)
    gc.collect()
    return result

### load data

In [10]:
# Using the kaggle command to obtain data and decompress it
# &&kaggle competitions download -c home-credit-default-risk\
# &&mkdir data&&unzip *.zip -d ./data/

In [11]:
application = load_data('application_train')
application = reduce_mem_usage(application)
application = trans_inf_days(application)

Initial memory usage: 286.23 MB
Optimized memory usage: 93.55 MB


In [12]:
application.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 123 entries, SK_ID_CURR to isinf_DAYS_EMPLOYED
dtypes: float16(61), float32(4), float64(1), int16(2), int32(1), int64(1), int8(37), object(16)
memory usage: 95.9+ MB


### Split data

In [13]:
train_set, test_set = train_test_split(application, test_size=0.2, random_state= 42)
del application
gc.collect()

0

### quick glance

In [14]:
app = train_set.copy()
app.dtypes.value_counts()
missing_values_summary(app)
del app
gc.collect()

float16    61
int8       37
object     16
float32     4
int16       2
int64       1
float64     1
int32       1
dtype: int64

Your selected dataframe has 123 columns.
There are 68 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,dtypes
COMMONAREA_MODE,171929,69.9,float16
COMMONAREA_AVG,171929,69.9,float16
COMMONAREA_MEDI,171929,69.9,float16
NONLIVINGAPARTMENTS_MEDI,170868,69.5,float16
NONLIVINGAPARTMENTS_MODE,170868,69.5,float16
NONLIVINGAPARTMENTS_AVG,170868,69.5,float16
FONDKAPREMONT_MODE,168286,68.4,object
LIVINGAPARTMENTS_MEDI,168196,68.4,float16
LIVINGAPARTMENTS_AVG,168196,68.4,float16
LIVINGAPARTMENTS_MODE,168196,68.4,float16


0

### config cell code to execute

In [15]:
# Control whether the code is executed
baseline = True
add_more_information = True
is_use_calculated_ratiofeature = True
add_ratio_feature = True
delete_invaild_feature = False
linear_model=True
tree_model=True
ensemble_without_missing=False
ensemble_with_missing=True

### baseline
*Run the model roughly to see the effect*

In [16]:
# prepare data
app = train_set.drop('TARGET', axis=1)
app_labels = train_set['TARGET'].copy()

In [17]:
if baseline:
    # Define process strategies
    num_pipeline = make_pipeline(
                    SimpleImputer(strategy='median'),
                    StandardScaler()
                    )
    cate_number_pipeline = make_pipeline(
                    FunctionTransformer(lambda X: X.astype(str),feature_names_out='one-to-one'),
                    SimpleImputer(strategy='constant', fill_value='UKN'),
                    OneHotEncoder(handle_unknown='ignore')
    )
    cate_object_pipeline = make_pipeline(
                    SimpleImputer(strategy='constant', fill_value='UKN'),
                    OneHotEncoder(handle_unknown='ignore')
    )

    transformed_data = transform_features(app, num_pipeline, cate_number_pipeline, cate_object_pipeline)

    # Define model
    lr = LinearRegression()
    sgd = SGDRegressor()
    tree = DecisionTreeRegressor()
    gnb = GaussianNB()
    nn = MLPRegressor()
    models = [lr, sgd, tree, gnb, nn]

    # cross validation
    cross_validate_with_feature_importance(models, transformed_data, app_labels)

    # clean memory
    del transformed_data,num_pipeline,cate_number_pipeline,cate_object_pipeline
    gc.collect()

Model 1: LinearRegression
Fold 1
Training time: 1.64s
Validation ROC AUC score: 0.7497
Fold 2
Training time: 1.61s
Validation ROC AUC score: 0.7515
Fold 3
Training time: 1.61s
Validation ROC AUC score: 0.7447
Fold 4
Training time: 1.63s
Validation ROC AUC score: 0.7428
Fold 5
Training time: 1.61s
Validation ROC AUC score: 0.7418
Mean ROC AUC score: 0.7461 ± 0.0038
Model 2: SGDRegressor
Fold 1
Training time: 0.94s
Validation ROC AUC score: 0.7404
Fold 2
Training time: 0.94s
Validation ROC AUC score: 0.7384
Fold 3
Training time: 0.95s
Validation ROC AUC score: 0.7382
Fold 4
Training time: 0.81s
Validation ROC AUC score: 0.7342
Fold 5
Training time: 0.82s
Validation ROC AUC score: 0.6831
Mean ROC AUC score: 0.7269 ± 0.0220
Model 3: DecisionTreeRegressor
Fold 1
Training time: 14.09s
Validation ROC AUC score: 0.5378
Fold 2
Training time: 13.97s
Validation ROC AUC score: 0.5318
Fold 3
Training time: 14.01s
Validation ROC AUC score: 0.5336
Fold 4
Training time: 13.76s
Validation ROC AUC score

[LinearRegression(),
 SGDRegressor(),
 DecisionTreeRegressor(),
 GaussianNB(),
 MLPRegressor()]

24

### add more information

In [18]:
def process_bureau():
    # load bureau data
    bureau = load_data('bureau')
    bureau = reduce_mem_usage(bureau)

    # Remove 17 records with obvious errors in the update time
    bureau = bureau[bureau.DAYS_CREDIT_UPDATE <= 0]
    # Total number of records returned by the credit check/Number of active loans in the credit report/
    # Number of closed loans in the credit report/Number of sold loans in the credit report/
    # Number of bad loans in the credit report/Length of time since the earliest loan in the credit report/
    # Length of time since the most recent loan in the credit report/Number of loans with past due payments in the credit report/
    # Longest number of days past due for any loan in the credit report/Number of loans paid off early/
    # Number of loans due for payment/Number of loans with delayed payments/Maximum amount of past due payments in the credit report/
    # Total number of loan extensions/Total amount of loan repayments/Total amount of outstanding loan balances/Maximum credit limit/
    # Total amount of overdue loan payments/Number of consumer loans/Number of credit cards/Number of auto loans/
    # Number of mortgage loans/Number of micro loans/Number of other loans/Total amount borrowed for consumer loans/
    # Total amount borrowed for credit cards/Total amount borrowed for auto loans/Total amount borrowed for mortgage loans/
    # Total amount borrowed for micro loans/Total amount borrowed for other loans/Total amount outstanding for consumer loans/
    # Total amount outstanding for credit card loans/Total amount outstanding for auto loans/Total amount outstanding for mortgage loans/
    # Total amount outstanding for micro loans/Total amount outstanding for other loans/Total amount overdue for consumer loans/
    # Total amount overdue for credit card loans/Total amount overdue for auto loans/Total amount overdue for mortgage loans/
    # Total amount overdue for micro loans/Total amount overdue for other loans/Date of most recent credit report update.
    bureau_features = pd.DataFrame()
    bureau_features['BUREAU_NUM'] = bureau[['SK_ID_CURR','SK_ID_BUREAU']].groupby(['SK_ID_CURR']).count()
    bureau_features['BUREAU_ACTIVE_NUM'] = bureau[bureau['CREDIT_ACTIVE']=='Active'][['SK_ID_CURR','SK_ID_BUREAU']].groupby(['SK_ID_CURR']).count() 
    bureau_features['BUREAU_COLSED_NUM'] = bureau[bureau['CREDIT_ACTIVE']=='Closed'][['SK_ID_CURR','SK_ID_BUREAU']].groupby(['SK_ID_CURR']).count()
    bureau_features['BUREAU_SOLD_NUM'] = bureau[bureau['CREDIT_ACTIVE']=='Sold'][['SK_ID_CURR','SK_ID_BUREAU']].groupby(['SK_ID_CURR']).count()
    bureau_features['BUREAU_BAD_DEBT_NUM'] = bureau[bureau['CREDIT_ACTIVE']=='Bad debt'][['SK_ID_CURR','SK_ID_BUREAU']].groupby(['SK_ID_CURR']).count()
    bureau_features['BUREAU_MINDAYS_APPLICATION'] = bureau[['SK_ID_CURR','DAYS_CREDIT']].groupby(['SK_ID_CURR']).min()
    bureau_features['BUREAU_MAXDAYS_APPLICATION'] = bureau[['SK_ID_CURR','DAYS_CREDIT']].groupby(['SK_ID_CURR']).max()
    bureau_features['BUREAU_NUM_OVERDUE'] = bureau[bureau['CREDIT_DAY_OVERDUE'] > 0][['SK_ID_CURR','CREDIT_DAY_OVERDUE']].groupby(['SK_ID_CURR']).count()
    bureau_features['BUREAU_MAXDAY_OVERDUE'] = bureau[bureau['CREDIT_DAY_OVERDUE'] > 0][['SK_ID_CURR','CREDIT_DAY_OVERDUE']].groupby(['SK_ID_CURR']).max()
    bureau_features['BUREAU_NUM_PREPAY'] = bureau[bureau['DAYS_CREDIT_ENDDATE'] > bureau['DAYS_ENDDATE_FACT'] ][['SK_ID_CURR','DAYS_ENDDATE_FACT']].groupby(['SK_ID_CURR']).count() 
    bureau_features['BUREAU_NUM_NORMAL'] = bureau[bureau['DAYS_CREDIT_ENDDATE'] == bureau['DAYS_ENDDATE_FACT'] ][['SK_ID_CURR','DAYS_ENDDATE_FACT']].groupby(['SK_ID_CURR']).count() 
    bureau_features['BUREAU_NUM_DELAY'] = bureau[bureau['DAYS_CREDIT_ENDDATE'] < bureau['DAYS_ENDDATE_FACT'] ][['SK_ID_CURR','DAYS_ENDDATE_FACT']].groupby(['SK_ID_CURR']).count()
    bureau_features['BUREAU_MAXAMT_OVERDUE'] = bureau[bureau['AMT_CREDIT_MAX_OVERDUE'] > 0][['SK_ID_CURR','AMT_CREDIT_MAX_OVERDUE']].groupby(['SK_ID_CURR']).max()
    bureau_features['BUREAU_PROLONG_NUM'] = bureau[['SK_ID_CURR','CNT_CREDIT_PROLONG']].groupby(['SK_ID_CURR']).max()
    bureau_features['BUREAU_LOAN_AMT'] = bureau[['SK_ID_CURR','AMT_CREDIT_SUM']].groupby(['SK_ID_CURR']).sum()
    bureau_features['BUREAU_DEBT_AMT'] = bureau[['SK_ID_CURR','AMT_CREDIT_SUM_DEBT']].groupby(['SK_ID_CURR']).sum()
    bureau_features['BUREAU_LIMITAMT_MAX'] = bureau[['SK_ID_CURR','AMT_CREDIT_SUM_LIMIT']].groupby(['SK_ID_CURR']).max()
    bureau_features['BUREAU_SUMAMT_OVERDUE'] = bureau[['SK_ID_CURR','AMT_CREDIT_SUM_OVERDUE']].groupby(['SK_ID_CURR']).sum()
    bureau_features['BUREAU_NUM_CONSUMER'] = bureau[bureau['CREDIT_TYPE'] == 'Consumer credit' ][['SK_ID_CURR','CREDIT_TYPE']].groupby(['SK_ID_CURR']).count()
    bureau_features['BUREAU_NUM_CARD'] = bureau[bureau['CREDIT_TYPE'] == 'Credit card' ][['SK_ID_CURR','CREDIT_TYPE']].groupby(['SK_ID_CURR']).count()
    bureau_features['BUREAU_NUM_CAR'] = bureau[bureau['CREDIT_TYPE'] == 'Car loan' ][['SK_ID_CURR','CREDIT_TYPE']].groupby(['SK_ID_CURR']).count()
    bureau_features['BUREAU_NUM_MORTGAGE'] = bureau[bureau['CREDIT_TYPE'] == 'Mortgage' ][['SK_ID_CURR','CREDIT_TYPE']].groupby(['SK_ID_CURR']).count()
    bureau_features['BUREAU_NUM_MiCROLOAN'] = bureau[bureau['CREDIT_TYPE'] == 'Microloan' ][['SK_ID_CURR','CREDIT_TYPE']].groupby(['SK_ID_CURR']).count()
    bureau_features['BUREAU_NUM_OTHER'] = bureau[~bureau.CREDIT_TYPE.isin(['Consumer credit','Credit card','Car loan','Mortgage','Microloan']) ][['SK_ID_CURR','CREDIT_TYPE']].groupby(['SK_ID_CURR']).count()
    bureau_features['BUREAU_AMT_CONSUMER'] = bureau[bureau['CREDIT_TYPE'] == 'Consumer credit' ][['SK_ID_CURR','AMT_CREDIT_SUM']].groupby(['SK_ID_CURR']).sum()
    bureau_features['BUREAU_AMT_CARD'] = bureau[bureau['CREDIT_TYPE'] == 'Credit card' ][['SK_ID_CURR','AMT_CREDIT_SUM']].groupby(['SK_ID_CURR']).sum()
    bureau_features['BUREAU_AMT_CAR'] = bureau[bureau['CREDIT_TYPE'] == 'Car loan' ][['SK_ID_CURR','AMT_CREDIT_SUM']].groupby(['SK_ID_CURR']).sum()
    bureau_features['BUREAU_AMT_MORTGAGE'] = bureau[bureau['CREDIT_TYPE'] == 'Mortgage' ][['SK_ID_CURR','AMT_CREDIT_SUM']].groupby(['SK_ID_CURR']).sum() 
    bureau_features['BUREAU_AMT_MiCROLOAN'] = bureau[bureau['CREDIT_TYPE'] == 'Microloan' ][['SK_ID_CURR','AMT_CREDIT_SUM']].groupby(['SK_ID_CURR']).sum() 
    bureau_features['BUREAU_AMT_OTHER'] = bureau[~bureau.CREDIT_TYPE.isin(['Consumer credit','Credit card','Car loan','Mortgage','Microloan']) ][['SK_ID_CURR','AMT_CREDIT_SUM']].groupby(['SK_ID_CURR']).sum()
    bureau_features['BUREAU_DEBTAMT_CONSUMER'] = bureau[bureau['CREDIT_TYPE'] == 'Consumer credit' ][['SK_ID_CURR','AMT_CREDIT_SUM_DEBT']].groupby(['SK_ID_CURR']).sum()                 
    bureau_features['BUREAU_DEBTAMT_CARD'] = bureau[bureau['CREDIT_TYPE'] == 'Credit card' ][['SK_ID_CURR','AMT_CREDIT_SUM_DEBT']].groupby(['SK_ID_CURR']).sum()                  
    bureau_features['BUREAU_DEBTAMT_CAR'] = bureau[bureau['CREDIT_TYPE'] == 'Car loan' ][['SK_ID_CURR','AMT_CREDIT_SUM_DEBT']].groupby(['SK_ID_CURR']).sum()
    bureau_features['BUREAU_DEBTAMT_MORTGAGE'] = bureau[bureau['CREDIT_TYPE'] == 'Mortgage' ][['SK_ID_CURR','AMT_CREDIT_SUM_DEBT']].groupby(['SK_ID_CURR']).sum()
    bureau_features['BUREAU_DEBTAMT_MiCROLOAN'] = bureau[bureau['CREDIT_TYPE'] == 'Microloan' ][['SK_ID_CURR','AMT_CREDIT_SUM_DEBT']].groupby(['SK_ID_CURR']).sum()
    bureau_features['BUREAU_DEBTAMT_OTHER'] = bureau[~bureau.CREDIT_TYPE.isin(['Consumer credit','Credit card','Car loan','Mortgage','Microloan']) ][['SK_ID_CURR','AMT_CREDIT_SUM_DEBT']].groupby(['SK_ID_CURR']).sum() 
    bureau_features['BUREAU_OVERDUEAMT_CONSUMER'] = bureau[bureau['CREDIT_TYPE'] == 'Consumer credit' ][['SK_ID_CURR','AMT_CREDIT_SUM_OVERDUE']].groupby(['SK_ID_CURR']).sum()
    bureau_features['BUREAU_OVERDUEAMT_CARD'] = bureau[bureau['CREDIT_TYPE'] == 'Credit card' ][['SK_ID_CURR','AMT_CREDIT_SUM_OVERDUE']].groupby(['SK_ID_CURR']).sum()                 
    bureau_features['BUREAU_OVERDUEAMT_CAR'] = bureau[bureau['CREDIT_TYPE'] == 'Car loan' ][['SK_ID_CURR','AMT_CREDIT_SUM_OVERDUE']].groupby(['SK_ID_CURR']).sum()                  
    bureau_features['BUREAU_OVERDUEAMT_MORTGAGE'] = bureau[bureau['CREDIT_TYPE'] == 'Mortgage' ][['SK_ID_CURR','AMT_CREDIT_SUM_OVERDUE']].groupby(['SK_ID_CURR']).sum()                 
    bureau_features['BUREAU_OVERDUEAMT_MiCROLOAN'] = bureau[bureau['CREDIT_TYPE'] == 'Microloan' ][['SK_ID_CURR','AMT_CREDIT_SUM_OVERDUE']].groupby(['SK_ID_CURR']).sum()                   
    bureau_features['BUREAU_OVERDUEAMT_OTHER'] = bureau[~bureau.CREDIT_TYPE.isin(['Consumer credit','Credit card','Car loan','Mortgage','Microloan']) ][['SK_ID_CURR','AMT_CREDIT_SUM_OVERDUE']].groupby(['SK_ID_CURR']).sum()              
    bureau_features['BUREAU_LAST_UPDATE_DAYS'] = bureau[['SK_ID_CURR','DAYS_CREDIT_UPDATE']].groupby(['SK_ID_CURR']).max()    
    print(bureau_features.shape)
    return bureau_features

In [28]:
def process_bureau_balance():
    # load bureau_balance data
    bureau_balance = load_data('bureau_balance')
    bureau_balance = reduce_mem_usage(bureau_balance)
    bureau = load_data('bureau')
    bureau = reduce_mem_usage(bureau)

    tmp = bureau[['SK_ID_BUREAU','SK_ID_CURR']]
    bureau_balance_union = pd.merge(bureau_balance, tmp, how='left', on='SK_ID_BUREAU')
    bureau_balance_union['SK_ID_CURR'] = bureau_balance_union['SK_ID_CURR'].fillna(0).astype('int64')
    bureau_balance_union = bureau_balance_union[bureau_balance_union.STATUS.isin(['0','1','2','3','4','5'])]
    # Number of loans with delinquency status 0, 1, 2, 3, 4, 5 in the most recent 1 month credit report 
    # Number of loans with delinquency status 0, 1, 2, 3, 4, 5 in the most recent 1-3 months credit report 
    # Number of loans with delinquency status 0, 1, 2, 3, 4, 5 in the most recent 3-6 months credit report 
    # Number of loans with delinquency status 0, 1, 2, 3, 4, 5 in the most recent 6-12 months credit report 
    # Number of loans with delinquency status 0, 1, 2, 3, 4, 5 in the most recent 12-24 months credit report 
    # Number of loans with delinquency status 0, 1, 2, 3, 4, 5 in the most recent 24-36 months credit report 
    # Number of loans with delinquency status 0, 1, 2, 3, 4, 5 in the most recent 36+ months credit report
    bureau_balance = pd.DataFrame()

    tmp = bureau_balance_union[bureau_balance_union.MONTHS_BALANCE >= -1][['SK_ID_BUREAU','SK_ID_CURR','STATUS']].groupby(['SK_ID_BUREAU','SK_ID_CURR'], as_index=False).max()
    bureau_balance['BUREAU_OVERDUE_NUM_1_0'] = tmp[tmp.STATUS =='0'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_1_1'] = tmp[tmp.STATUS == '1'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_1_2'] = tmp[tmp.STATUS == '2'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_1_3'] = tmp[tmp.STATUS == '3'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_1_4'] = tmp[tmp.STATUS == '4'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_1_5'] = tmp[tmp.STATUS == '5'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    
    tmp = bureau_balance_union[ (bureau_balance_union.MONTHS_BALANCE < -1) & (bureau_balance_union.MONTHS_BALANCE >= -3) ][['SK_ID_BUREAU','SK_ID_CURR','STATUS']].groupby(['SK_ID_BUREAU','SK_ID_CURR'], as_index=False).max()
    bureau_balance['BUREAU_OVERDUE_NUM_3_0'] = tmp[tmp.STATUS =='0'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_3_1'] = tmp[tmp.STATUS == '1'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_3_2'] = tmp[tmp.STATUS == '2'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_3_3'] = tmp[tmp.STATUS == '3'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_3_4'] = tmp[tmp.STATUS == '4'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_3_5'] = tmp[tmp.STATUS == '5'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    
    tmp = bureau_balance_union[ (bureau_balance_union.MONTHS_BALANCE < -3) & (bureau_balance_union.MONTHS_BALANCE >= -6) ][['SK_ID_BUREAU','SK_ID_CURR','STATUS']].groupby(['SK_ID_BUREAU','SK_ID_CURR'], as_index=False).max()
    bureau_balance['BUREAU_OVERDUE_NUM_6_0'] = tmp[tmp.STATUS =='0'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_6_1'] = tmp[tmp.STATUS == '1'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_6_2'] = tmp[tmp.STATUS == '2'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_6_3'] = tmp[tmp.STATUS == '3'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_6_4'] = tmp[tmp.STATUS == '4'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_6_5'] = tmp[tmp.STATUS == '5'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    
    tmp = bureau_balance_union[ (bureau_balance_union.MONTHS_BALANCE < -6) & (bureau_balance_union.MONTHS_BALANCE >= -12) ][['SK_ID_BUREAU','SK_ID_CURR','STATUS']].groupby(['SK_ID_BUREAU','SK_ID_CURR'], as_index=False).max()
    bureau_balance['BUREAU_OVERDUE_NUM_12_0'] = tmp[tmp.STATUS =='0'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_12_1'] = tmp[tmp.STATUS == '1'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_12_2'] = tmp[tmp.STATUS == '2'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_12_3'] = tmp[tmp.STATUS == '3'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_12_4'] = tmp[tmp.STATUS == '4'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_12_5'] = tmp[tmp.STATUS == '5'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    
    tmp = bureau_balance_union[ (bureau_balance_union.MONTHS_BALANCE < -12) & (bureau_balance_union.MONTHS_BALANCE >= -24) ][['SK_ID_BUREAU','SK_ID_CURR','STATUS']].groupby(['SK_ID_BUREAU','SK_ID_CURR'], as_index=False).max()
    bureau_balance['BUREAU_OVERDUE_NUM_24_0'] = tmp[tmp.STATUS =='0'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_24_1'] = tmp[tmp.STATUS == '1'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_24_2'] = tmp[tmp.STATUS == '2'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_24_3'] = tmp[tmp.STATUS == '3'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_24_4'] = tmp[tmp.STATUS == '4'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_24_5'] = tmp[tmp.STATUS == '5'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    
    tmp = bureau_balance_union[ (bureau_balance_union.MONTHS_BALANCE < -24) & (bureau_balance_union.MONTHS_BALANCE >= -36) ][['SK_ID_BUREAU','SK_ID_CURR','STATUS']].groupby(['SK_ID_BUREAU','SK_ID_CURR'], as_index=False).max()
    bureau_balance['BUREAU_OVERDUE_NUM_36_0'] = tmp[tmp.STATUS =='0'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_36_1'] = tmp[tmp.STATUS == '1'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_36_2'] = tmp[tmp.STATUS == '2'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_36_3'] = tmp[tmp.STATUS == '3'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_36_4'] = tmp[tmp.STATUS == '4'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_36_5'] = tmp[tmp.STATUS == '5'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    
    tmp = bureau_balance_union[ bureau_balance_union.MONTHS_BALANCE < -36][['SK_ID_BUREAU','SK_ID_CURR','STATUS']].groupby(['SK_ID_BUREAU','SK_ID_CURR'], as_index=False).max()
    bureau_balance['BUREAU_OVERDUE_NUM_36plus_0'] = tmp[tmp.STATUS =='0'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_36plus_1'] = tmp[tmp.STATUS == '1'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_36plus_2'] = tmp[tmp.STATUS == '2'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_36plus_3'] = tmp[tmp.STATUS == '3'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_36plus_4'] = tmp[tmp.STATUS == '4'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    bureau_balance['BUREAU_OVERDUE_NUM_36plus_5'] = tmp[tmp.STATUS == '5'][['SK_ID_BUREAU','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()

    print(bureau_balance.shape)
    return bureau_balance

In [31]:
def process_pre():
    previous_application = load_data('previous_application')
    previous_application = reduce_mem_usage(previous_application)

    previous_application['RATE_INTEREST_ACTUAL'] = ((previous_application.AMT_ANNUITY * previous_application.CNT_PAYMENT) - previous_application.AMT_CREDIT) / previous_application.AMT_CREDIT
    previous_application.loc[previous_application.RATE_INTEREST_ACTUAL == -1.0,'RATE_INTEREST_ACTUAL'] = np.nan

    pre = pd.DataFrame()
    # Total number of past loans
    # Total amount of past loans
    # Total monthly payment of past loans
    # Total number of past credit card loans
    # Total amount of past credit card loans
    # Total monthly payment of past credit card loans
    # Total number of past cash loans
    # Total amount of past cash loans
    # Total monthly payment of past cash loans
    # Total number of past other loans
    # Total amount of past other loans
    # Total monthly payment of past other loans
    # Total number of past credit loans
    # Total amount of past credit loans
    # Total monthly payment of past credit loans
    # Total number of past car loans
    # Total amount of past car loans
    # Total monthly payment of past car loans
    # Total number of past approved loans
    # Total amount of past approved loans
    # Total monthly payment of past approved loans
    # Total number of past cancelled loans
    # Total amount of past cancelled loans
    # Total monthly payment of past cancelled loans
    # Total number of past rejected loans
    # Total amount of past rejected loans
    # Total monthly payment of past rejected loans
    # Total number of past unused loans
    # Total amount of past unused loans
    # Total monthly payment of past unused loans
    # Number of applications rejected due to HC
    # Number of applications rejected due to LIMIT
    # Number of applications rejected due to SCO
    # Number of applications rejected due to SCOFR
    # Number of applications rejected due to XNA
    # Number of applications rejected due to VERIF
    # Number of applications rejected due to SYSTEM
    # Maximum interest rate of past applications
    # Minimum interest rate of past applications
    # Average interest rate of past applications
    # Current number of active loans
    # Total amount of currently active loans
    # Total monthly payment of currently active loans
    # Number of currently active loans in arrears
    # Total amount of currently active loans in arrears
    # Total monthly payment of currently active loans in arrears
    # Number of loans in the past 3, 6 months, 1 year, 2 years, 3 years, 3+ years rejected, approved, total amount, monthly payment, pending.

    pre['PRE_CREDIT_NUM'] = previous_application[['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_CREDIT_AMT'] = previous_application[['SK_ID_CURR','AMT_CREDIT']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_ANNUITY_AMT'] = previous_application[['SK_ID_CURR','AMT_ANNUITY']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_POS_NUM'] = previous_application[previous_application['NAME_PORTFOLIO'] == 'POS'][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_CREDIT_POS_AMT'] = previous_application[previous_application['NAME_PORTFOLIO'] == 'POS'][['SK_ID_CURR','AMT_CREDIT']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_POS_ANNUITYAMT'] = previous_application[previous_application['NAME_PORTFOLIO'] == 'POS'][['SK_ID_CURR','AMT_ANNUITY']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_CASH_NUM'] = previous_application[previous_application['NAME_PORTFOLIO'] == 'Cash'][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_CREDIT_CASH_AMT'] = previous_application[previous_application['NAME_PORTFOLIO'] == 'Cash'][['SK_ID_CURR','AMT_CREDIT']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_CASH_ANNUITYAMT'] = previous_application[previous_application['NAME_PORTFOLIO'] == 'Cash'][['SK_ID_CURR','AMT_ANNUITY']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_XNA_NUM'] = previous_application[previous_application['NAME_PORTFOLIO'] == 'XNA'][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_CREDIT_XNA_AMT'] = previous_application[previous_application['NAME_PORTFOLIO'] == 'XNA'][['SK_ID_CURR','AMT_CREDIT']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_XNA_ANNUITYAMT'] = previous_application[previous_application['NAME_PORTFOLIO'] == 'XNA'][['SK_ID_CURR','AMT_ANNUITY']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_Cards_NUM'] = previous_application[previous_application['NAME_PORTFOLIO'] == 'Cards'][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_CREDIT_Cards_AMT'] = previous_application[previous_application['NAME_PORTFOLIO'] == 'Cards'][['SK_ID_CURR','AMT_CREDIT']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_Cards_ANNUITYAMT'] = previous_application[previous_application['NAME_PORTFOLIO'] == 'Cards'][['SK_ID_CURR','AMT_ANNUITY']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_Cars_NUM'] = previous_application[previous_application['NAME_PORTFOLIO'] == 'Cars'][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_CREDIT_Cars_AMT'] = previous_application[previous_application['NAME_PORTFOLIO'] == 'Cars'][['SK_ID_CURR','AMT_CREDIT']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_Cars_ANNUITYAMT'] = previous_application[previous_application['NAME_PORTFOLIO'] == 'Cars'][['SK_ID_CURR','AMT_ANNUITY']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_Approved_NUM'] = previous_application[previous_application['NAME_CONTRACT_STATUS'] == 'Approved'][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_CREDIT_Approved_AMT'] = previous_application[previous_application['NAME_CONTRACT_STATUS'] == 'Approved'][['SK_ID_CURR','AMT_CREDIT']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_Approved_ANNUITYAMT'] = previous_application[previous_application['NAME_CONTRACT_STATUS'] == 'Approved'][['SK_ID_CURR','AMT_ANNUITY']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_Canceled_NUM'] = previous_application[previous_application['NAME_CONTRACT_STATUS'] == 'Canceled'][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_CREDIT_Canceled_AMT'] = previous_application[previous_application['NAME_CONTRACT_STATUS'] == 'Canceled'][['SK_ID_CURR','AMT_CREDIT']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_Canceled_ANNUITYAMT'] = previous_application[previous_application['NAME_CONTRACT_STATUS'] == 'Canceled'][['SK_ID_CURR','AMT_ANNUITY']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_Refused_NUM'] = previous_application[previous_application['NAME_CONTRACT_STATUS'] == 'Refused'][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_CREDIT_Refused_AMT'] = previous_application[previous_application['NAME_CONTRACT_STATUS'] == 'Refused'][['SK_ID_CURR','AMT_CREDIT']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_Refused_ANNUITYAMT'] = previous_application[previous_application['NAME_CONTRACT_STATUS'] == 'Refused'][['SK_ID_CURR','AMT_ANNUITY']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_Unused_NUM'] = previous_application[previous_application['NAME_CONTRACT_STATUS'] == 'Unused offer'][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_CREDIT_Unused_AMT'] = previous_application[previous_application['NAME_CONTRACT_STATUS'] == 'Unused offer'][['SK_ID_CURR','AMT_CREDIT']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_CREDIT_Unused_ANNUITYAMT'] = previous_application[previous_application['NAME_CONTRACT_STATUS'] == 'Unused offer'][['SK_ID_CURR','AMT_ANNUITY']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_HC_Refused_NUM'] = previous_application[(previous_application['NAME_CONTRACT_STATUS'] == 'Refused') & (previous_application['CODE_REJECT_REASON'] == 'HC')][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_LIMIT_Refused_NUM'] = previous_application[(previous_application['NAME_CONTRACT_STATUS'] == 'Refused') & (previous_application['CODE_REJECT_REASON'] == 'LIMIT')][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_SCO_Refused_NUM'] = previous_application[(previous_application['NAME_CONTRACT_STATUS'] == 'Refused') & (previous_application['CODE_REJECT_REASON'] == 'SCO')][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_SCOFR_Refused_NUM'] = previous_application[(previous_application['NAME_CONTRACT_STATUS'] == 'Refused') & (previous_application['CODE_REJECT_REASON'] == 'SCOFR')][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_XNA_Refused_NUM'] = previous_application[(previous_application['NAME_CONTRACT_STATUS'] == 'Refused') & (previous_application['CODE_REJECT_REASON'] == 'XNA')][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_VERIF_Refused_NUM'] = previous_application[(previous_application['NAME_CONTRACT_STATUS'] == 'Refused') & (previous_application['CODE_REJECT_REASON'] == 'VERIF')][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_SYSTEM_Refused_NUM'] = previous_application[(previous_application['NAME_CONTRACT_STATUS'] == 'Refused') & (previous_application['CODE_REJECT_REASON'] == 'SYSTEM')][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_MAX_INTEREST_RATE'] = previous_application[['SK_ID_CURR','RATE_INTEREST_ACTUAL']].groupby(['SK_ID_CURR']).max()
    pre['PRE_MIN_INTEREST_RATE'] = previous_application[['SK_ID_CURR','RATE_INTEREST_ACTUAL']].groupby(['SK_ID_CURR']).min()
    pre['PRE_AVG_INTEREST_RATE'] = previous_application[['SK_ID_CURR','RATE_INTEREST_ACTUAL']].groupby(['SK_ID_CURR']).mean()
    pre['PRE_REPAY_NUM'] = previous_application[previous_application.DAYS_TERMINATION == 365243.0][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_REPAY_AMT'] = previous_application[previous_application.DAYS_TERMINATION == 365243.0][['SK_ID_CURR','AMT_APPLICATION']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_REPAY_ANNUITYAMT'] = previous_application[previous_application.DAYS_TERMINATION == 365243.0][['SK_ID_CURR','AMT_ANNUITY']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_REAPY_OVERDUR_NUM'] = previous_application[(previous_application.DAYS_TERMINATION == 365243.0) & (previous_application.DAYS_LAST_DUE != 365243.0)][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pre['PRE_REAPY_OVERDUR_AMT'] = previous_application[(previous_application.DAYS_TERMINATION == 365243.0) & (previous_application.DAYS_LAST_DUE != 365243.0)][['SK_ID_CURR','AMT_APPLICATION']].groupby(['SK_ID_CURR']).sum()
    pre['PRE_REAPY_OVERDUR_ANNUITYAMT'] = previous_application[(previous_application.DAYS_TERMINATION == 365243.0) & (previous_application.DAYS_LAST_DUE != 365243.0)][['SK_ID_CURR','AMT_ANNUITY']].groupby(['SK_ID_CURR']).sum()

    print(pre.shape)
    return pre

In [32]:
def process_pos():
    POS_CASH_balance = load_data('POS_CASH_balance')
    POS_CASH_balance = reduce_mem_usage(POS_CASH_balance)

    pos = pd.DataFrame()
    # Total number of loans in POS
    # Number of POS loans that have been fully repaid
    # Number of POS loans that are currently being repaid
    # Maximum number of days overdue for fully repaid POS loans
    # Number of fully repaid POS loans that have experienced overdue payments
    # Maximum number of days overdue for POS loans that are currently being repaid
    # Number of POS loans that are currently being repaid and have experienced overdue payments
    # Number of POS loans with overdue payments in the past 6 months within 0, 7, 14, 30, 90, and 90+ days
    # Number of POS loans with overdue payments in the past 7-12 months within 0, 7, 14, 30, 90, and 90+ days
    # Number of POS loans with overdue payments in the past 13-24 months within 0, 7, 14, 30, 90, and 90+ days
    # Number of POS loans with overdue payments in the past 24-36 months within 0, 7, 14, 30, 90, and 90+ days
    # Number of POS loans with overdue payments in the past 36+ months within 0, 7, 14, 30, 90, and 90+ days
    # Number of POS loans currently overdue and the maximum number of days overdue

    pos['POS_CREDIT_NUM'] = POS_CASH_balance[['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).nunique()
    pos['POS_FINISH_NUM'] = POS_CASH_balance[POS_CASH_balance.NAME_CONTRACT_STATUS == 'Completed'][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).nunique()
    pos['POS_REPAY_NUM'] = POS_CASH_balance[ (POS_CASH_balance.MONTHS_BALANCE == -1) & (POS_CASH_balance.NAME_CONTRACT_STATUS == 'Active')][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).nunique()
    tmp = pd.DataFrame(POS_CASH_balance[POS_CASH_balance.NAME_CONTRACT_STATUS == 'Completed']['SK_ID_PREV'].unique(), columns=['SK_ID_PREV'])
    tmp = pd.merge(POS_CASH_balance, tmp, how='inner', on='SK_ID_PREV')
    pos['POS_DAYS_MAXOVERDUE_FINISH'] = tmp[['SK_ID_CURR','SK_DPD_DEF']].groupby(['SK_ID_CURR']).max()
    pos['POS_NUM_MAXOVERDUE_FINISH'] = tmp[tmp.SK_DPD_DEF > 0][['SK_ID_PREV','SK_ID_CURR']].groupby('SK_ID_CURR').nunique()
    tmp = pd.DataFrame(POS_CASH_balance[(POS_CASH_balance.MONTHS_BALANCE == -1) & (POS_CASH_balance.NAME_CONTRACT_STATUS == 'Active')]['SK_ID_PREV'].unique(), columns=['SK_ID_PREV'])
    tmp = pd.merge(POS_CASH_balance, tmp, how='inner', on='SK_ID_PREV')
    pos['POS_DAYS_MAXOVERDUE_REPAY'] = tmp[['SK_ID_CURR','SK_DPD_DEF']].groupby(['SK_ID_CURR']).max()
    pos['POS_NUM_MAXOVERDUE_REPAY'] = tmp[tmp.SK_DPD_DEF > 0][['SK_ID_PREV','SK_ID_CURR']].groupby('SK_ID_CURR').nunique()

    tmp = POS_CASH_balance[(POS_CASH_balance.MONTHS_BALANCE >= -6)][['SK_ID_PREV','SK_ID_CURR','SK_DPD_DEF']].groupby(['SK_ID_PREV','SK_ID_CURR'], as_index=False).max()
    pos['POS_OVERDUENUM_6_0'] = tmp[tmp.SK_DPD_DEF ==0][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_6_7'] = tmp[(tmp.SK_DPD_DEF > 0) & (tmp.SK_DPD_DEF <= 7)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_6_14'] = tmp[(tmp.SK_DPD_DEF > 7) & (tmp.SK_DPD_DEF <= 14)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_6_30'] = tmp[(tmp.SK_DPD_DEF > 14) & (tmp.SK_DPD_DEF <= 30)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_6_90'] = tmp[(tmp.SK_DPD_DEF > 30) & (tmp.SK_DPD_DEF <= 90)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_6_90plus'] = tmp[tmp.SK_DPD_DEF > 90][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()

    tmp = POS_CASH_balance[(POS_CASH_balance.MONTHS_BALANCE >= -12) & (POS_CASH_balance.MONTHS_BALANCE < -6)][['SK_ID_PREV','SK_ID_CURR','SK_DPD_DEF']].groupby(['SK_ID_PREV','SK_ID_CURR'], as_index=False).max()
    pos['POS_OVERDUENUM_12_0'] = tmp[tmp.SK_DPD_DEF ==0][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_12_7'] = tmp[(tmp.SK_DPD_DEF > 0) & (tmp.SK_DPD_DEF <= 7)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_12_14'] = tmp[(tmp.SK_DPD_DEF > 7) & (tmp.SK_DPD_DEF <= 14)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_12_30'] = tmp[(tmp.SK_DPD_DEF > 14) & (tmp.SK_DPD_DEF <= 30)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_12_90'] = tmp[(tmp.SK_DPD_DEF > 30) & (tmp.SK_DPD_DEF <= 90)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_12_90plus'] = tmp[tmp.SK_DPD_DEF > 90][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()

    tmp = POS_CASH_balance[(POS_CASH_balance.MONTHS_BALANCE >= -24) & (POS_CASH_balance.MONTHS_BALANCE < -12)][['SK_ID_PREV','SK_ID_CURR','SK_DPD_DEF']].groupby(['SK_ID_PREV','SK_ID_CURR'], as_index=False).max()
    pos['POS_OVERDUENUM_24_0'] = tmp[tmp.SK_DPD_DEF ==0][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_24_7'] = tmp[(tmp.SK_DPD_DEF > 0) & (tmp.SK_DPD_DEF <= 7)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_24_14'] = tmp[(tmp.SK_DPD_DEF > 7) & (tmp.SK_DPD_DEF <= 14)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_24_30'] = tmp[(tmp.SK_DPD_DEF > 14) & (tmp.SK_DPD_DEF <= 30)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_24_90'] = tmp[(tmp.SK_DPD_DEF > 30) & (tmp.SK_DPD_DEF <= 90)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_24_90plus'] = tmp[tmp.SK_DPD_DEF > 90][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()

    tmp = POS_CASH_balance[(POS_CASH_balance.MONTHS_BALANCE >= -36) & (POS_CASH_balance.MONTHS_BALANCE < -24)][['SK_ID_PREV','SK_ID_CURR','SK_DPD_DEF']].groupby(['SK_ID_PREV','SK_ID_CURR'], as_index=False).max()
    pos['POS_OVERDUENUM_36_0'] = tmp[tmp.SK_DPD_DEF ==0][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_36_7'] = tmp[(tmp.SK_DPD_DEF > 0) & (tmp.SK_DPD_DEF <= 7)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_36_14'] = tmp[(tmp.SK_DPD_DEF > 7) & (tmp.SK_DPD_DEF <= 14)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_36_30'] = tmp[(tmp.SK_DPD_DEF > 14) & (tmp.SK_DPD_DEF <= 30)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_36_90'] = tmp[(tmp.SK_DPD_DEF > 30) & (tmp.SK_DPD_DEF <= 90)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_36_90plus'] = tmp[tmp.SK_DPD_DEF > 90][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()

    tmp = POS_CASH_balance[POS_CASH_balance.MONTHS_BALANCE < -36][['SK_ID_PREV','SK_ID_CURR','SK_DPD_DEF']].groupby(['SK_ID_PREV','SK_ID_CURR'], as_index=False).max()
    pos['POS_OVERDUENUM_36plus_0'] = tmp[tmp.SK_DPD_DEF ==0][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_36plus_7'] = tmp[(tmp.SK_DPD_DEF > 0) & (tmp.SK_DPD_DEF <= 7)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_36plus_14'] = tmp[(tmp.SK_DPD_DEF > 7) & (tmp.SK_DPD_DEF <= 14)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_36plus_30'] = tmp[(tmp.SK_DPD_DEF > 14) & (tmp.SK_DPD_DEF <= 30)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_36plus_90'] = tmp[(tmp.SK_DPD_DEF > 30) & (tmp.SK_DPD_DEF <= 90)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    pos['POS_OVERDUENUM_36plus_90plus'] = tmp[tmp.SK_DPD_DEF > 90][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()

    pos['POS_NUM_OVERDUE_STILL'] = POS_CASH_balance[(POS_CASH_balance.MONTHS_BALANCE == -1) & (POS_CASH_balance.SK_DPD_DEF > 30)][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    pos['POS_DAYS_MAXOVERDUE_STILL'] = POS_CASH_balance[(POS_CASH_balance.MONTHS_BALANCE == -1) & (POS_CASH_balance.SK_DPD_DEF > 30)][['SK_ID_CURR','SK_DPD_DEF']].groupby(['SK_ID_CURR']).max()

    print(pos.shape)
    return pos

In [37]:
def process_inst():
    # load installments_payments data
    installments_payments = load_data('installments_payments')
    installments_payments = reduce_mem_usage(installments_payments)
    # There are many installment repayment records with a scheduled payment amount of 67.5, and there are a large number of 
    # overdue records in these, which is clearly problematic. We will remove these data.
    installments_payments = installments_payments[installments_payments.AMT_INSTALMENT > 100]
    installments_payments['DAYS_DIFF'] = installments_payments['DAYS_ENTRY_PAYMENT'] - installments_payments['DAYS_INSTALMENT']

    inst = pd.DataFrame()
    # Number of loans with installment payments
    # Number of loans with installment payments that have been repaid in the last 6 months
    # Number of loans with installment payments that have been overdue in the last 6 months
    # Number of loans with installment payments that have been overdue for 7 days or less in the last 6 months
    # Number of loans with installment payments that have been overdue for 8-14 days in the last 6 months
    # Number of loans with installment payments that have been overdue for 15-30 days in the last 6 months
    # Number of loans with installment payments that have been overdue for 31-90 days in the last 6 months
    # Number of loans with installment payments that have been overdue for 90 days or more in the last 6 months
    # Number of loans with installment payments that have been repaid in the last 6-12 months
    # Number of loans with installment payments that have been overdue in the last 6-12 months
    # Number of loans with installment payments that have been overdue for 7 days or less in the last 6-12 months
    # Number of loans with installment payments that have been overdue for 8-14 days in the last 6-12 months
    # Number of loans with installment payments that have been overdue for 15-30 days in the last 6-12 months
    # Number of loans with installment payments that have been overdue for 31-90 days in the last 6-12 months
    # Number of loans with installment payments that have been overdue for 90 days or more in the last 6-12 months
    # Number of loans with installment payments that have been repaid in the last 12-24 months
    # Number of loans with installment payments that have been overdue in the last 12-24 months
    # Number of loans with installment payments that have been overdue for 7 days or less in the last 12-24 months
    # Number of loans with installment payments that have been overdue for 8-14 days in the last 12-24 months
    # Number of loans with installment payments that have been overdue for 15-30 days in the last 12-24 months
    # Number of loans with installment payments that have been overdue for 31-90 days in the last 12-24 months
    # Number of loans with installment payments that have been overdue for 90 days or more in the last 12-24 months
    # Number of loans with installment payments that have been repaid in the last 24-36 months
    # Number of loans with installment payments that have been overdue in the last 24-36 months
    # Number of loans with installment payments that have been overdue for 7 days or less in the last 24-36 months
    # Number of loans with installment payments that have been overdue for 8-14 days in the last 24-36 months
    # Number of loans with installment payments that have been overdue for 15-30 days in the last 24-36 months
    # Number of loans with installment payments that have been overdue for 31-90 days in the last 24-36 months
    # Number of loans with installment payments that have been overdue for 90 days or more in the last 24-36 months
    # Number of loans with installment payments that have been repaid in the last 36+ months
    # Number of loans with installment payments that have been overdue in the last 36+ months
    # Number of loans with overdue payments within 7 days in the most recent 36+ months of installment repayment.
    # Number of loans with overdue payments between 8-14 days in the most recent 36+ months of installment repayment.
    # Number of loans with overdue payments between 15-30 days in the most recent 36+ months of installment repayment.
    # Number of loans with overdue payments between 31-90 days in the most recent 36+ months of installment repayment.
    # Number of loans with overdue payments over 90 days in the most recent 36+ months of installment repayment.
    # The overdue amount in the most recent 6 months of installment repayment with overdue payments.
    # The overdue amount in the most recent 6-12 months of installment repayment with overdue payments.
    # The overdue amount in the most recent 12-24 months of installment repayment with overdue payments.
    # The overdue amount in the most recent 24-36 months of installment repayment with overdue payments.
    # The overdue amount in the most recent 36+ months of installment repayment with overdue payments.
    # The number of loans that are still overdue in installment repayment.
    # The total number of overdue periods for loans that are still overdue in installment repayment.
    # The maximum overdue days for loans that are still overdue in installment repayment.
    # The total overdue amount for loans that are still overdue in installment repayment.

    inst['INST_NUM'] = installments_payments[['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_6m'] = installments_payments[installments_payments.DAYS_INSTALMENT >= -180][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_6m_all'] = installments_payments[(installments_payments.DAYS_INSTALMENT >= -180) & (installments_payments.DAYS_DIFF > 0)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_6m_7d'] = installments_payments[(installments_payments.DAYS_INSTALMENT >= -180) & (installments_payments.DAYS_DIFF > 0) & (installments_payments.DAYS_DIFF <= 7)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_6m_14d'] = installments_payments[(installments_payments.DAYS_INSTALMENT >= -180) & (installments_payments.DAYS_DIFF > 7) & (installments_payments.DAYS_DIFF <= 14)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_6m_30d'] = installments_payments[(installments_payments.DAYS_INSTALMENT >= -180) & (installments_payments.DAYS_DIFF > 14) & (installments_payments.DAYS_DIFF <= 30)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_6m_90d'] = installments_payments[(installments_payments.DAYS_INSTALMENT >= -180) & (installments_payments.DAYS_DIFF > 30) & (installments_payments.DAYS_DIFF <= 90)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_6m_90dplus'] = installments_payments[(installments_payments.DAYS_INSTALMENT >= -180) & (installments_payments.DAYS_DIFF > 90)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()

    inst['INST_NUM_12m'] = installments_payments[(installments_payments.DAYS_INSTALMENT < -180) & (installments_payments.DAYS_INSTALMENT >= -360)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_12m_all'] = installments_payments[(installments_payments.DAYS_INSTALMENT < -180) & (installments_payments.DAYS_INSTALMENT >= -360) & (installments_payments.DAYS_DIFF > 0)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_12m_7d'] = installments_payments[(installments_payments.DAYS_INSTALMENT < -180) & (installments_payments.DAYS_INSTALMENT >= -360) & (installments_payments.DAYS_DIFF > 0) & (installments_payments.DAYS_DIFF <= 7)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_12m_14d'] = installments_payments[(installments_payments.DAYS_INSTALMENT < -180) & (installments_payments.DAYS_INSTALMENT >= -360) & (installments_payments.DAYS_DIFF > 7) & (installments_payments.DAYS_DIFF <= 14)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_12m_30d'] = installments_payments[(installments_payments.DAYS_INSTALMENT < -180) & (installments_payments.DAYS_INSTALMENT >= -360) & (installments_payments.DAYS_DIFF > 14) & (installments_payments.DAYS_DIFF <= 30)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_12m_90d'] = installments_payments[(installments_payments.DAYS_INSTALMENT < -180) & (installments_payments.DAYS_INSTALMENT >= -360) & (installments_payments.DAYS_DIFF > 30) & (installments_payments.DAYS_DIFF <= 90)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_12m_90dplus'] = installments_payments[(installments_payments.DAYS_INSTALMENT < -180) & (installments_payments.DAYS_INSTALMENT >= -360) & (installments_payments.DAYS_DIFF > 90)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()

    inst['INST_NUM_24m'] = installments_payments[(installments_payments.DAYS_INSTALMENT <= -720) & (installments_payments.DAYS_INSTALMENT < -360)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_24m_all'] = installments_payments[(installments_payments.DAYS_INSTALMENT <= -720) & (installments_payments.DAYS_INSTALMENT < -360) & (installments_payments.DAYS_DIFF > 0)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_24m_7d'] = installments_payments[(installments_payments.DAYS_INSTALMENT <= -720) & (installments_payments.DAYS_INSTALMENT < -360) & (installments_payments.DAYS_DIFF > 0) & (installments_payments.DAYS_DIFF <= 7)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_24m_14d'] = installments_payments[(installments_payments.DAYS_INSTALMENT <= -720) & (installments_payments.DAYS_INSTALMENT < -360) & (installments_payments.DAYS_DIFF > 7) & (installments_payments.DAYS_DIFF <= 14)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_24m_30d'] = installments_payments[(installments_payments.DAYS_INSTALMENT <= -720) & (installments_payments.DAYS_INSTALMENT < -360) & (installments_payments.DAYS_DIFF > 14) & (installments_payments.DAYS_DIFF <= 30)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_24m_90d'] = installments_payments[(installments_payments.DAYS_INSTALMENT <= -720) & (installments_payments.DAYS_INSTALMENT < -360) & (installments_payments.DAYS_DIFF > 30) & (installments_payments.DAYS_DIFF <= 90)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_24m_90dplus'] = installments_payments[(installments_payments.DAYS_INSTALMENT <= -720) & (installments_payments.DAYS_INSTALMENT < -360) & (installments_payments.DAYS_DIFF > 90)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()

    inst['INST_NUM_36m'] = installments_payments[(installments_payments.DAYS_INSTALMENT >= -1080) & (installments_payments.DAYS_INSTALMENT < -720)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_36m_all'] = installments_payments[(installments_payments.DAYS_INSTALMENT >= -1080) & (installments_payments.DAYS_INSTALMENT < -720) & (installments_payments.DAYS_DIFF > 0)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_36m_7d'] = installments_payments[(installments_payments.DAYS_INSTALMENT >= -1080) & (installments_payments.DAYS_INSTALMENT < -720) & (installments_payments.DAYS_DIFF > 0) & (installments_payments.DAYS_DIFF <= 7)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_36m_14d'] = installments_payments[(installments_payments.DAYS_INSTALMENT >= -1080) & (installments_payments.DAYS_INSTALMENT < -720) & (installments_payments.DAYS_DIFF > 7) & (installments_payments.DAYS_DIFF <= 14)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_36m_30d'] = installments_payments[(installments_payments.DAYS_INSTALMENT >= -1080) & (installments_payments.DAYS_INSTALMENT < -720) & (installments_payments.DAYS_DIFF > 14) & (installments_payments.DAYS_DIFF <= 30)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_36m_90d'] = installments_payments[(installments_payments.DAYS_INSTALMENT >= -1080) & (installments_payments.DAYS_INSTALMENT < -720) & (installments_payments.DAYS_DIFF > 30) & (installments_payments.DAYS_DIFF <= 90)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_36m_90dplus'] = installments_payments[(installments_payments.DAYS_INSTALMENT >= -1080) & (installments_payments.DAYS_INSTALMENT < -720) & (installments_payments.DAYS_DIFF > 90)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()

    inst['INST_NUM_36mplus'] = installments_payments[installments_payments.DAYS_INSTALMENT < -1080][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_36mplus_all'] = installments_payments[installments_payments.DAYS_INSTALMENT < -1080 & (installments_payments.DAYS_DIFF > 0)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_36mplus_7d'] = installments_payments[installments_payments.DAYS_INSTALMENT < -1080 & (installments_payments.DAYS_DIFF > 0) & (installments_payments.DAYS_DIFF <= 7)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_36mplus_14d'] = installments_payments[installments_payments.DAYS_INSTALMENT < -1080 & (installments_payments.DAYS_DIFF > 7) & (installments_payments.DAYS_DIFF <= 14)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_36mplus_30d'] = installments_payments[installments_payments.DAYS_INSTALMENT < -1080 & (installments_payments.DAYS_DIFF > 14) & (installments_payments.DAYS_DIFF <= 30)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_36mplus_90d'] = installments_payments[installments_payments.DAYS_INSTALMENT < -1080 & (installments_payments.DAYS_DIFF > 30) & (installments_payments.DAYS_DIFF <= 90)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()
    inst['INST_NUM_36mplus_90dplus'] = installments_payments[installments_payments.DAYS_INSTALMENT < -1080 & (installments_payments.DAYS_DIFF > 90)][['SK_ID_CURR','SK_ID_PREV']].groupby('SK_ID_CURR').nunique()

    inst['INST_AMT_6m'] = installments_payments[(installments_payments.DAYS_INSTALMENT >= -180) & (installments_payments.DAYS_DIFF > 0)][['SK_ID_CURR','AMT_PAYMENT']].groupby('SK_ID_CURR').sum()
    inst['INST_AMT_12m'] = installments_payments[(installments_payments.DAYS_INSTALMENT < -180) & (installments_payments.DAYS_INSTALMENT >= -360) & (installments_payments.DAYS_DIFF > 0)][['SK_ID_CURR','AMT_PAYMENT']].groupby('SK_ID_CURR').sum()
    inst['INST_AMT_24m'] = installments_payments[(installments_payments.DAYS_INSTALMENT < -360) & (installments_payments.DAYS_INSTALMENT >= -720) & (installments_payments.DAYS_DIFF > 0)][['SK_ID_CURR','AMT_PAYMENT']].groupby('SK_ID_CURR').sum()
    inst['INST_AMT_36m'] = installments_payments[(installments_payments.DAYS_INSTALMENT < -720) & (installments_payments.DAYS_INSTALMENT >= -1080) & (installments_payments.DAYS_DIFF > 0)][['SK_ID_CURR','AMT_PAYMENT']].groupby('SK_ID_CURR').sum()
    inst['INST_AMT_36mplus'] = installments_payments[(installments_payments.DAYS_INSTALMENT < -1080) & (installments_payments.DAYS_DIFF > 0)][['SK_ID_CURR','AMT_PAYMENT']].groupby('SK_ID_CURR').sum()

    inst['INST_NUM_STILL'] = installments_payments[installments_payments.DAYS_ENTRY_PAYMENT.isnull()][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).nunique()
    inst['INST_NUM_SEQ_STILL'] = installments_payments[installments_payments.DAYS_ENTRY_PAYMENT.isnull()][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    inst['INST_DAYS_MAX_STILL'] = installments_payments[installments_payments.DAYS_ENTRY_PAYMENT.isnull()][['SK_ID_CURR','DAYS_INSTALMENT']].groupby(['SK_ID_CURR']).min()
    inst['INST_AMT_STILL'] = installments_payments[installments_payments.DAYS_ENTRY_PAYMENT.isnull()][['SK_ID_CURR','AMT_INSTALMENT']].groupby(['SK_ID_CURR']).sum()

    print(inst.shape)
    return inst

In [38]:
def process_credit_balance():
    # load credit_card_balance data
    credit_card_balance = load_data('credit_card_balance')
    credit_card_balance = reduce_mem_usage(credit_card_balance)

    credit = pd.DataFrame()

    # Number of credit cards
    # Number of credit cards in different statuses
    # Credit card usage duration
    # Monthly balance of credit cards in the last 1, 3, 6, 12, 24, 36, and 36+ months
    # Withdrawal amount of credit cards in the last 1, 3, 6, 12, 24, 36, and 36+ months
    # POS amount of credit cards in the last 1, 3, 6, 12, 24, 36, and 36+ months
    # ATM amount of credit cards in the last 1, 3, 6, 12, 24, 36, and 36+ months
    # Maximum, minimum, and average credit limit of credit cards
    # Number of credit card transactions that were overdue for 0, 7, 14, 30, 90, and 90+ days in the last 6 months
    # Number of credit card transactions that were overdue for 0, 7, 14, 30, 90, and 90+ days between 7-12 months ago
    # Number of credit card transactions that were overdue for 0, 7, 14, 30, 90, and 90+ days between 13-24 months ago
    # Number of credit card transactions that were overdue for 0, 7, 14, 30, 90, and 90+ days between 24-36 months ago
    # Number of credit card transactions that were overdue for 0, 7, 14, 30, 90, and 90+ days more than 36 months ago
    # Number of credit card transactions that are currently overdue, and the maximum number of days overdue.

    credit['CREDIT_NUM'] = credit_card_balance[['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).nunique()
    credit['CREDIT_NUM_ACTIVE'] = credit_card_balance[credit_card_balance.NAME_CONTRACT_STATUS == 'Active'][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).nunique()
    credit['CREDIT_NUM_COMPLETED'] = credit_card_balance[credit_card_balance.NAME_CONTRACT_STATUS == 'Completed'][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).nunique()
    credit['CREDIT_NUM_SIGNED'] = credit_card_balance[credit_card_balance.NAME_CONTRACT_STATUS == 'Signed'][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).nunique()
    credit['CREDIT_NUM_DEMAND'] = credit_card_balance[credit_card_balance.NAME_CONTRACT_STATUS == 'Demand'][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).nunique()
    credit['CREDIT_NUM_SENT'] = credit_card_balance[credit_card_balance.NAME_CONTRACT_STATUS == 'Sent proposal'][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).nunique()
    credit['CREDIT_NUM_REFUSED'] = credit_card_balance[credit_card_balance.NAME_CONTRACT_STATUS == 'Refused'][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).nunique()
    tmp = credit_card_balance[['SK_ID_PREV','SK_ID_CURR','MONTHS_BALANCE']].groupby(['SK_ID_PREV','SK_ID_CURR'], as_index=False).count()
    credit['CREDIT_MONTHS_MAX'] = tmp[['SK_ID_CURR','MONTHS_BALANCE']].groupby('SK_ID_CURR').max()
    credit['CREDIT_MONTHS_MIN'] = tmp[['SK_ID_CURR','MONTHS_BALANCE']].groupby('SK_ID_CURR').min()
    credit['CREDIT_MONTHS_AVG'] = tmp[['SK_ID_CURR','MONTHS_BALANCE']].groupby('SK_ID_CURR').mean()
    credit['CREDIT_MONTHS_SUM'] = tmp[['SK_ID_CURR','MONTHS_BALANCE']].groupby('SK_ID_CURR').sum()
    credit['CREDIT_AMT_1m'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE == -1][['SK_ID_CURR','AMT_BALANCE']].groupby(['SK_ID_CURR']).mean()
    credit['CREDIT_AMT_3m'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -3][['SK_ID_CURR','AMT_BALANCE']].groupby(['SK_ID_CURR']).mean()
    credit['CREDIT_AMT_6m'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -6][['SK_ID_CURR','AMT_BALANCE']].groupby(['SK_ID_CURR']).mean()
    credit['CREDIT_AMT_12m'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -12][['SK_ID_CURR','AMT_BALANCE']].groupby(['SK_ID_CURR']).mean()
    credit['CREDIT_AMT_24m'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -24][['SK_ID_CURR','AMT_BALANCE']].groupby(['SK_ID_CURR']).mean()
    credit['CREDIT_AMT_36m'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -36][['SK_ID_CURR','AMT_BALANCE']].groupby(['SK_ID_CURR']).mean()
    credit['CREDIT_AMT_36mplus'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE < -36][['SK_ID_CURR','AMT_BALANCE']].groupby(['SK_ID_CURR']).mean()
    credit['CREDIT_AMT_1m_CURRENT'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE == -1][['SK_ID_CURR','AMT_DRAWINGS_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_3m_CURRENT'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -3][['SK_ID_CURR','AMT_DRAWINGS_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_6m_CURRENT'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -6][['SK_ID_CURR','AMT_DRAWINGS_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_12m_CURRENT'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -12][['SK_ID_CURR','AMT_DRAWINGS_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_24m_CURRENT'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -24][['SK_ID_CURR','AMT_DRAWINGS_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_36m_CURRENT'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -36][['SK_ID_CURR','AMT_DRAWINGS_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_36mplus_CURRENT'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE < -36][['SK_ID_CURR','AMT_DRAWINGS_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_1m_POS'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE == -1][['SK_ID_CURR','AMT_DRAWINGS_POS_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_3m_POS'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -3][['SK_ID_CURR','AMT_DRAWINGS_POS_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_6m_POS'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -6][['SK_ID_CURR','AMT_DRAWINGS_POS_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_12m_POS'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -12][['SK_ID_CURR','AMT_DRAWINGS_POS_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_24m_POS'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -24][['SK_ID_CURR','AMT_DRAWINGS_POS_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_36m_POS'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -36][['SK_ID_CURR','AMT_DRAWINGS_POS_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_36mplus_POS'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE < -36][['SK_ID_CURR','AMT_DRAWINGS_POS_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_1m_ATM'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE == -1][['SK_ID_CURR','AMT_DRAWINGS_ATM_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_3m_ATM'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -3][['SK_ID_CURR','AMT_DRAWINGS_ATM_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_6m_ATM'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -6][['SK_ID_CURR','AMT_DRAWINGS_ATM_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_12m_ATM'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -12][['SK_ID_CURR','AMT_DRAWINGS_ATM_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_24m_ATM'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -24][['SK_ID_CURR','AMT_DRAWINGS_ATM_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_36m_ATM'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE >= -36][['SK_ID_CURR','AMT_DRAWINGS_ATM_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_AMT_36mplus_ATM'] = credit_card_balance[credit_card_balance.MONTHS_BALANCE < -36][['SK_ID_CURR','AMT_DRAWINGS_ATM_CURRENT']].groupby(['SK_ID_CURR']).sum()
    credit['CREDIT_LIMITAMT_MAX'] = credit_card_balance[credit_card_balance.NAME_CONTRACT_STATUS == 'Active'][['SK_ID_CURR','AMT_CREDIT_LIMIT_ACTUAL']].groupby(['SK_ID_CURR']).max()
    credit['CREDIT_LIMITAMT_MIN'] = credit_card_balance[credit_card_balance.NAME_CONTRACT_STATUS == 'Active'][['SK_ID_CURR','AMT_CREDIT_LIMIT_ACTUAL']].groupby(['SK_ID_CURR']).min()
    tmp = credit_card_balance[(credit_card_balance.MONTHS_BALANCE >= -6)][['SK_ID_PREV','SK_ID_CURR','SK_DPD_DEF']].groupby(['SK_ID_PREV','SK_ID_CURR'], as_index=False).max()
    credit['CREDIT_OVERDUENUM_6_0'] = tmp[tmp.SK_DPD_DEF ==0][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_6_7'] = tmp[(tmp.SK_DPD_DEF > 0) & (tmp.SK_DPD_DEF <= 7)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_6_14'] = tmp[(tmp.SK_DPD_DEF > 7) & (tmp.SK_DPD_DEF <= 14)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_6_30'] = tmp[(tmp.SK_DPD_DEF > 14) & (tmp.SK_DPD_DEF <= 30)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_6_90'] = tmp[(tmp.SK_DPD_DEF > 30) & (tmp.SK_DPD_DEF <= 90)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_6_90plus'] = tmp[tmp.SK_DPD_DEF > 90][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()

    tmp = credit_card_balance[(credit_card_balance.MONTHS_BALANCE >= -12) & (credit_card_balance.MONTHS_BALANCE < -6)][['SK_ID_PREV','SK_ID_CURR','SK_DPD_DEF']].groupby(['SK_ID_PREV','SK_ID_CURR'], as_index=False).max()
    credit['CREDIT_OVERDUENUM_12_0'] = tmp[tmp.SK_DPD_DEF ==0][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_12_7'] = tmp[(tmp.SK_DPD_DEF > 0) & (tmp.SK_DPD_DEF <= 7)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_12_14'] = tmp[(tmp.SK_DPD_DEF > 7) & (tmp.SK_DPD_DEF <= 14)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_12_30'] = tmp[(tmp.SK_DPD_DEF > 14) & (tmp.SK_DPD_DEF <= 30)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_12_90'] = tmp[(tmp.SK_DPD_DEF > 30) & (tmp.SK_DPD_DEF <= 90)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_12_90plus'] = tmp[tmp.SK_DPD_DEF > 90][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()

    tmp = credit_card_balance[(credit_card_balance.MONTHS_BALANCE >= -24) & (credit_card_balance.MONTHS_BALANCE < -12)][['SK_ID_PREV','SK_ID_CURR','SK_DPD_DEF']].groupby(['SK_ID_PREV','SK_ID_CURR'], as_index=False).max()
    credit['CREDIT_OVERDUENUM_24_0'] = tmp[tmp.SK_DPD_DEF ==0][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_24_7'] = tmp[(tmp.SK_DPD_DEF > 0) & (tmp.SK_DPD_DEF <= 7)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_24_14'] = tmp[(tmp.SK_DPD_DEF > 7) & (tmp.SK_DPD_DEF <= 14)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_24_30'] = tmp[(tmp.SK_DPD_DEF > 14) & (tmp.SK_DPD_DEF <= 30)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_24_90'] = tmp[(tmp.SK_DPD_DEF > 30) & (tmp.SK_DPD_DEF <= 90)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_24_90plus'] = tmp[tmp.SK_DPD_DEF > 90][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()

    tmp = credit_card_balance[(credit_card_balance.MONTHS_BALANCE >= -36) & (credit_card_balance.MONTHS_BALANCE < -24)][['SK_ID_PREV','SK_ID_CURR','SK_DPD_DEF']].groupby(['SK_ID_PREV','SK_ID_CURR'], as_index=False).max()
    credit['CREDIT_OVERDUENUM_36_0'] = tmp[tmp.SK_DPD_DEF ==0][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_36_7'] = tmp[(tmp.SK_DPD_DEF > 0) & (tmp.SK_DPD_DEF <= 7)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_36_14'] = tmp[(tmp.SK_DPD_DEF > 7) & (tmp.SK_DPD_DEF <= 14)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_36_30'] = tmp[(tmp.SK_DPD_DEF > 14) & (tmp.SK_DPD_DEF <= 30)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_36_90'] = tmp[(tmp.SK_DPD_DEF > 30) & (tmp.SK_DPD_DEF <= 90)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_36_90plus'] = tmp[tmp.SK_DPD_DEF > 90][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()

    tmp = credit_card_balance[credit_card_balance.MONTHS_BALANCE < -36][['SK_ID_PREV','SK_ID_CURR','SK_DPD_DEF']].groupby(['SK_ID_PREV','SK_ID_CURR'], as_index=False).max()
    credit['CREDIT_OVERDUENUM_36plus_0'] = tmp[tmp.SK_DPD_DEF ==0][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_36plus_7'] = tmp[(tmp.SK_DPD_DEF > 0) & (tmp.SK_DPD_DEF <= 7)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_36plus_14'] = tmp[(tmp.SK_DPD_DEF > 7) & (tmp.SK_DPD_DEF <= 14)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_36plus_30'] = tmp[(tmp.SK_DPD_DEF > 14) & (tmp.SK_DPD_DEF <= 30)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_36plus_90'] = tmp[(tmp.SK_DPD_DEF > 30) & (tmp.SK_DPD_DEF <= 90)][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    credit['CREDIT_OVERDUENUM_36plus_90plus'] = tmp[tmp.SK_DPD_DEF > 90][['SK_ID_PREV','SK_ID_CURR']].groupby(['SK_ID_CURR']).count()
    # credit['CREDIT_NUM_OVERDUE_STILL'] = credit_card_balance[(credit_card_balance.MONTHS_BALANCE == -1) & (credit_card_balance.SK_DPD_DEF > 30)][['SK_ID_CURR','SK_ID_PREV']].groupby(['SK_ID_CURR']).count()
    # credit['CREDIT_DAYS_MAXOVERDUE_STILL'] = credit_card_balance[(credit_card_balance.MONTHS_BALANCE == -1) & (credit_card_balance.SK_DPD_DEF > 30)][['SK_ID_CURR','SK_DPD_DEF']].groupby(['SK_ID_CURR']).max()


    print(credit.shape)
    return credit

In [39]:
def merge_info(df,ls):
    """
    Merge additional information stored in a list of DataFrames to the input 
    DataFrame based on 'SK_ID_CURR' column.
    Args:
    ----------
    df (pandas.DataFrame): Input DataFrame with 'SK_ID_CURR' column as the index.
    ls (list): A list of DataFrames containing additional information to be merged.

    Returns:
    ----------
    pandas.DataFrame: A merged DataFrame with additional information based on 'SK_ID_CURR' column.

    """
    print('Shape before merging: {}'.format(df.shape))
    res = df.set_index('SK_ID_CURR')
    for extra in ls:
        res = pd.merge(res, extra, how = 'left', left_index=True, right_index=True)
        del extra
    print('Shape after merging: {}'.format(res.shape))
    return res

In [40]:
# prepare data
bureau_extra = process_bureau()
bureau_balance_extra = process_bureau_balance()
pre_extra = process_pre()
pos_extra = process_pos()
inst_extra = process_inst()
credit_extra = process_credit_balance()
extra_info = [ bureau_extra, bureau_balance_extra, pre_extra, pos_extra, inst_extra, credit_extra ]

app_extra = merge_info(app, extra_info)
print("app_extra's shape:{}".format(app_extra.shape))

del bureau_extra,bureau_balance_extra,pre_extra,pos_extra,inst_extra,credit_extra,extra_info,app
gc.collect()

Initial memory usage: 222.62 MB
Optimized memory usage: 126.04 MB
(305810, 43)
Initial memory usage: 624.85 MB
Optimized memory usage: 442.60 MB
Initial memory usage: 222.62 MB
Optimized memory usage: 126.04 MB
(84483, 42)
Initial memory usage: 471.48 MB
Optimized memory usage: 321.75 MB
(338857, 46)
Initial memory usage: 610.43 MB
Optimized memory usage: 314.76 MB
(337252, 39)
Initial memory usage: 830.41 MB
Optimized memory usage: 415.20 MB
(339572, 45)
Initial memory usage: 673.88 MB
Optimized memory usage: 318.63 MB
(103558, 71)
Shape before merging: (246008, 122)
Shape after merging: (246008, 407)
app_extra's shape:(246008, 407)


1173

In [41]:
if add_more_information:
    # Define process strategies
    num_pipeline = make_pipeline(
                    SimpleImputer(strategy='median'),
                    StandardScaler()
                    )
    cate_number_pipeline = make_pipeline(
                    FunctionTransformer(lambda X: X.astype(str),feature_names_out='one-to-one'),
                    SimpleImputer(strategy='constant', fill_value='UKN'),
                    OneHotEncoder(handle_unknown='ignore')
    )
    cate_object_pipeline = make_pipeline(
                    SimpleImputer(strategy='constant', fill_value='UKN'),
                    OneHotEncoder(handle_unknown='ignore')
    )

    transformed_data = transform_features(app_extra, num_pipeline, cate_number_pipeline, cate_object_pipeline)

    # Define model
    lr = LinearRegression()
    sgd = SGDRegressor()
    tree = DecisionTreeRegressor()
    gnb = GaussianNB()
    nn = MLPRegressor()
    models = [lr, sgd, tree, gnb, nn]

    # cross validation
    cross_validate_with_feature_importance(models, transformed_data, app_labels)

    # clean memory
    del transformed_data,num_pipeline,cate_number_pipeline,cate_object_pipeline
    gc.collect()

Model 1: LinearRegression
Fold 1
Training time: 9.16s
Validation ROC AUC score: 0.7689
Fold 2
Training time: 8.93s
Validation ROC AUC score: 0.7671
Fold 3
Training time: 8.84s
Validation ROC AUC score: 0.7648
Fold 4
Training time: 8.83s
Validation ROC AUC score: 0.7646
Fold 5
Training time: 8.79s
Validation ROC AUC score: 0.7625
Mean ROC AUC score: 0.7656 ± 0.0022
Model 2: SGDRegressor
Fold 1
Training time: 2.13s
Validation ROC AUC score: 0.5023
Fold 2
Training time: 2.12s
Validation ROC AUC score: 0.4709
Fold 3
Training time: 2.12s
Validation ROC AUC score: 0.5307
Fold 4
Training time: 2.11s
Validation ROC AUC score: 0.5165
Fold 5
Training time: 2.12s
Validation ROC AUC score: 0.4986
Mean ROC AUC score: 0.5038 ± 0.0200
Model 3: DecisionTreeRegressor
Fold 1
Training time: 47.96s
Validation ROC AUC score: 0.5460
Fold 2
Training time: 47.88s
Validation ROC AUC score: 0.5456
Fold 3
Training time: 47.43s
Validation ROC AUC score: 0.5470
Fold 4
Training time: 46.96s
Validation ROC AUC score

[LinearRegression(),
 SGDRegressor(),
 DecisionTreeRegressor(),
 GaussianNB(),
 MLPRegressor()]

24

### feature transform and add ploy feature

In [42]:
def evaluate_features(df, label, n):
    """
    Evaluate and select the best n features for a machine learning model from a DataFrame 
    of features and a label.
    Args:
    df (pandas.DataFrame): A DataFrame of features.
    label (pandas.Series): A Series containing the label for each sample.
    n (int): The number of top features to select.

    Returns:
    numpy.ndarray: An array of the n top features selected based on their feature importance.
    """
    # Replace missing values with the median value of each column
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    for col in df.columns:
        median_value = df[col].median()
        df[col].fillna(median_value,inplace=True)

    # Scale the features
    scaler = StandardScaler()
    scaled_df = scaler.fit_transform(df)

    # Replace missing values with 0 after scaling
    scaled_df = np.nan_to_num(scaled_df)

    # Fit the decision tree regressor
    clf = DecisionTreeRegressor(random_state=0)
    clf.fit(scaled_df, label)

    feature_importance = pd.Series(clf.feature_importances_, index=df.columns)
    top_features = feature_importance.sort_values(ascending=False)[:n]
    return top_features.index.values

In [43]:
def generate_ratio_features(train_features, label, n):
    """
    Generate ratio features by dividing any two columns in a feature dataset and selecting 
    the best n features.
    Args:
    train_features (pandas.DataFrame): A DataFrame containing the training features.
    label (pandas.Series): A Series containing the label for each sample in the training data.
    n (int): The number of top ratio features to select.

    Returns:
    top_ratio_features(pandas.Index): the best n new interaction features
    """
    # Generate all combinations of feature pairs
    features_combinations = list(combinations(train_features.columns, 2))
    features_combinations = [(a, b) if a < b else (b, a) for a, b in features_combinations]
    features_combinations = list(set(features_combinations))
    
    # Generate ratio features for each feature pair and select the top n
    top_ratio_features = pd.DataFrame()
    for i in range(0, len(features_combinations), 100):
        start_idx = i
        end_idx = min(i + 100, len(features_combinations))
        ratio_features = pd.DataFrame()
        for feature_pair in features_combinations[start_idx:end_idx]:
            numerator, denominator = feature_pair
            ratio_feature_name = '{}_div_{}'.format(numerator.replace("_", ""), denominator.replace("_", ""))
            if ratio_feature_name in ratio_features.columns:
                continue
            ratio_features[ratio_feature_name] = train_features[numerator] / train_features[denominator]
        ratio_features_merged = pd.concat([top_ratio_features, ratio_features], axis=1)
        top_features = evaluate_features(ratio_features_merged, label, n)
        top_ratio_features = ratio_features_merged[top_features]
    return top_ratio_features.columns

In [44]:
def gen_inter_feature(df, features):
    """
    This function generates interaction features by dividing pairs of existing features.

    Args:

    df: the pandas DataFrame containing the original features.
    features: a list of strings representing the pairs of features to be divided.
    Returns:

    A pandas DataFrame that contains the original features plus the newly generated interaction features.
    """
    dic = {}
    for col in df.columns:
        dic[col] = col.replace('_','')
    df.rename(columns = dic, inplace=True)
    
    result = pd.DataFrame()
    for feature in features:
        numerator, denominator = feature.split('_div_')
        result[feature] = df[numerator] / df[denominator]
        result.replace([np.inf, -np.inf], 999, inplace=True) # The situation where the denominator has 0
    res = pd.concat([df, result], axis=1)
    return res

In [45]:
# Code of this cell takes too long to run. You can directly use the following cell.
if not is_use_calculated_ratiofeature:
    # Time related data
    days_cols = []
    for col in app_extra.columns:
        if 'DAYS' in col:
            days_cols.append(col)
    days_inter_list = generate_ratio_features(app_extra[days_cols],app_labels,5).tolist()
    # Amount related data
    amt_cols = []
    for col in app_extra.columns:
        if 'AMT' in col and 'AMT_REQ' not in col:
            amt_cols.append(col)
    amt_inter_list = generate_ratio_features(app_extra[amt_cols],app_labels,15).tolist()
    # Number related data
    num_cols = []
    for col in app_extra.columns:
        if 'NUM' in col:
            num_cols.append(col)
    num_inter_list = generate_ratio_features(app_extra[num_cols],app_labels,15).tolist()

    # Select the most relevant 35 ratio features
    selected_ratio_feature = days_inter_list + amt_inter_list + num_inter_list

    del days_cols,amt_cols,num_cols,days_inter_list,amt_inter_list,num_inter_list
    gc.collect()

In [50]:
if is_use_calculated_ratiofeature:
    # the ratio features calculated by upper code
    selected_ratio_feature = [
    'DAYSBIRTH_div_DAYSEMPLOYED', 
    'DAYSBIRTH_div_DAYSIDPUBLISH',
    'DAYSEMPLOYED_div_DAYSREGISTRATION', 
    'DAYSBIRTH_div_DAYSREGISTRATION',
    'DAYSEMPLOYED_div_DAYSIDPUBLISH',
    'AMTANNUITY_div_AMTCREDIT', 'AMTANNUITY_div_AMTGOODSPRICE',
    'PRECREDITANNUITYAMT_div_PRECREDITApprovedAMT',
    'AMTANNUITY_div_PRECREDITApprovedANNUITYAMT',
    'PRECREDITAMT_div_PRECREDITANNUITYAMT',
    'BUREAUDEBTAMT_div_BUREAULOANAMT', 'AMTCREDIT_div_PRECREDITApprovedAMT',
    'PRECREDITPOSAMT_div_PRECREDITPOSANNUITYAMT',
    'AMTINCOMETOTAL_div_PRECREDITPOSANNUITYAMT',
    'AMTANNUITY_div_BUREAUAMTCONSUMER', 'AMTANNUITY_div_AMTINCOMETOTAL',
    'AMTINCOMETOTAL_div_PRECREDITPOSAMT',
    'AMTINCOMETOTAL_div_BUREAULOANAMT',
    'AMTINCOMETOTAL_div_PRECREDITApprovedAMT',
    'AMTCREDIT_div_AMTINCOMETOTAL',
    'BUREAUNUMCONSUMER_div_PRECREDITNUM', 'BUREAUNUM_div_POSFINISHNUM',
    'BUREAUNUM_div_PRECREDITNUM', 'BUREAUNUM_div_PRECREDITPOSNUM',
    'POSOVERDUENUM120_div_PRECREDITNUM',
    'INSTNUM36mplus7d_div_PRECREDITCASHNUM',
    'BUREAUACTIVENUM_div_BUREAUNUMCARD',
    'POSOVERDUENUM240_div_PRECREDITNUM', 'BUREAUNUMNORMAL_div_PRECREDITNUM',
    'BUREAUNUM_div_BUREAUNUMDELAY', 'INSTNUM36m_div_INSTNUM36mplus90dplus',
    'PRECREDITApprovedNUM_div_PRECREDITPOSNUM',
    'PRECREDITNUM_div_PRECREDITXNANUM',
    'BUREAUOVERDUENUM120_div_PRECREDITNUM', 'BUREAUCOLSEDNUM_div_INSTNUM'
                             ]

In [51]:
# prepare data
app_extra_add = gen_inter_feature(app_extra, selected_ratio_feature)
print("app_extra_add's shape:{}".format(app_extra_add.shape))

app_extra_add's shape:(246008, 442)


In [52]:
if add_ratio_feature:    
    # Define process strategies
    num_pipeline = make_pipeline(
                    SimpleImputer(strategy='median'),
                    StandardScaler()
                    )
    cate_number_pipeline = make_pipeline(
                    FunctionTransformer(lambda X: X.astype(str),feature_names_out='one-to-one'),
                    SimpleImputer(strategy='constant', fill_value='UKN'),
                    OneHotEncoder(handle_unknown='ignore')
    )
    cate_object_pipeline = make_pipeline(
                    SimpleImputer(strategy='constant', fill_value='UKN'),
                    OneHotEncoder(handle_unknown='ignore')
    )

    transformed_data = transform_features(app_extra_add, num_pipeline, cate_number_pipeline, cate_object_pipeline)

    # Define model
    lr = LinearRegression()
    sgd = SGDRegressor()
    tree = DecisionTreeRegressor()
    gnb = GaussianNB()
    nn = MLPRegressor()
    models = [lr, sgd, tree, gnb, nn]

    # cross validation
    cross_validate_with_feature_importance(models, transformed_data, app_labels)

    # clean memory
    del app_extra,transformed_data,num_pipeline,cate_number_pipeline,cate_object_pipeline
    gc.collect()

Model 1: LinearRegression
Fold 1
Training time: 10.33s
Validation ROC AUC score: 0.7725
Fold 2
Training time: 9.98s
Validation ROC AUC score: 0.7683
Fold 3
Training time: 9.87s
Validation ROC AUC score: 0.7663
Fold 4
Training time: 9.95s
Validation ROC AUC score: 0.7666
Fold 5
Training time: 9.81s
Validation ROC AUC score: 0.7640
Mean ROC AUC score: 0.7675 ± 0.0028
Model 2: SGDRegressor
Fold 1
Training time: 5.37s
Validation ROC AUC score: 0.4865
Fold 2
Training time: 2.19s
Validation ROC AUC score: 0.4831
Fold 3
Training time: 2.19s
Validation ROC AUC score: 0.4896
Fold 4
Training time: 2.20s
Validation ROC AUC score: 0.5612
Fold 5
Training time: 2.20s
Validation ROC AUC score: 0.4507
Mean ROC AUC score: 0.4942 ± 0.0363
Model 3: DecisionTreeRegressor
Fold 1
Training time: 65.05s
Validation ROC AUC score: 0.5453
Fold 2
Training time: 66.75s
Validation ROC AUC score: 0.5444
Fold 3
Training time: 67.33s
Validation ROC AUC score: 0.5466
Fold 4
Training time: 66.41s
Validation ROC AUC scor

[LinearRegression(),
 SGDRegressor(),
 DecisionTreeRegressor(),
 GaussianNB(),
 MLPRegressor()]

793

### delete invaild feature

In [57]:
if delete_invaild_feature:
    # Define process strategies
    num_pipeline = make_pipeline(
                    SimpleImputer(strategy='median'),
                    StandardScaler()
                    )
    cate_number_pipeline = make_pipeline(
                    FunctionTransformer(lambda X: X.astype(str),feature_names_out='one-to-one'),
                    SimpleImputer(strategy='constant', fill_value='UKN'),
                    OneHotEncoder(handle_unknown='ignore')
    )
    cate_object_pipeline = make_pipeline(
                    SimpleImputer(strategy='constant', fill_value='UKN'),
                    OneHotEncoder(handle_unknown='ignore')
    )

    transformed_data = transform_features(app_extra_add, num_pipeline, cate_number_pipeline, cate_object_pipeline)

    # delete invaild feature
    erg = ExtraTreesRegressor(n_jobs=-1)
    erg = erg.fit(transformed_data, app_labels)
    model = SelectFromModel(erg, prefit=True)
    transformed_data_select = model.transform(transformed_data)
    transformed_data_select = pd.DataFrame(transformed_data_select, columns=model.get_feature_names_out(input_features=transformed_data.columns))
    
    # Define model
    lr = LinearRegression()
    sgd = SGDRegressor()
    tree = DecisionTreeRegressor()
    gnb = GaussianNB()
    nn = MLPRegressor()
    models = [lr, sgd, tree, gnb, nn]

    # cross validation
    cross_validate_with_feature_importance(models, transformed_data_select, app_labels)

    # clean memory
    del erg,model,transformed_data,transformed_data_select
    gc.collect()

Model 1: LinearRegression
Fold 1
Training time: 2.21s
Validation ROC AUC score: 0.7733
Fold 2
Training time: 2.19s
Validation ROC AUC score: 0.7687
Fold 3
Training time: 2.01s
Validation ROC AUC score: 0.7674
Fold 4
Training time: 2.04s
Validation ROC AUC score: 0.7637
Fold 5
Training time: 2.15s
Validation ROC AUC score: 0.7629
Mean ROC AUC score: 0.7672 ± 0.0038
Model 2: SGDRegressor
Fold 1
Training time: 1.08s
Validation ROC AUC score: 0.5090
Fold 2
Training time: 1.07s
Validation ROC AUC score: 0.5316
Fold 3
Training time: 1.07s
Validation ROC AUC score: 0.5248
Fold 4
Training time: 1.05s
Validation ROC AUC score: 0.5062
Fold 5
Training time: 1.06s
Validation ROC AUC score: 0.4680
Mean ROC AUC score: 0.5079 ± 0.0221
Model 3: DecisionTreeRegressor
Fold 1
Training time: 49.75s
Validation ROC AUC score: 0.5458
Fold 2
Training time: 51.97s
Validation ROC AUC score: 0.5482
Fold 3
Training time: 49.28s
Validation ROC AUC score: 0.5478
Fold 4
Training time: 49.54s
Validation ROC AUC score

[LinearRegression(),
 SGDRegressor(),
 DecisionTreeRegressor(),
 GaussianNB(),
 MLPRegressor()]

1744

### try more model and fine tune

#### linear model

In [58]:
def linear_importance_rank(model,data):
    coef = model.coef_
    sorted_idx = np.argsort(np.abs(coef))[::-1]
    print("{}_Feature importance ranking:".format(type(model).__name__))
    for idx in sorted_idx:
        print(f"Feature {data.columns[idx]}: {coef[idx]}")

In [59]:
if linear_model:
    # Define process strategies
    num_pipeline = make_pipeline(
                    SimpleImputer(strategy='median'),
                    StandardScaler()
                    )
    cate_number_pipeline = make_pipeline(
                    FunctionTransformer(lambda X: X.astype(str),feature_names_out='one-to-one'),
                    SimpleImputer(strategy='constant', fill_value='UKN'),
                    OneHotEncoder(handle_unknown='ignore')
    )
    cate_object_pipeline = make_pipeline(
                    SimpleImputer(strategy='constant', fill_value='UKN'),
                    OneHotEncoder(handle_unknown='ignore')
    )

    transformed_data = transform_features(app_extra_add, num_pipeline, cate_number_pipeline, cate_object_pipeline)

    # Define models
    ridge = Ridge(alpha=750)
    lasso = Lasso(alpha=1e-5)
    ela_net = ElasticNet(alpha=5e-5, l1_ratio=0.6)
    # This algorithm can only select linearly related features, nonlinearity is discarded, and there may be interactions between features
    lars = Lars(n_nonzero_coefs=110)  
    lassolars = LassoLars(alpha=1e-9,max_iter=500)
    # This algorithm can only select linearly correlated features, and nonlinearity is discarded. The difference from LARS is that it assumes that features are independent of each other
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=300)
    lr = LogisticRegression(C=0.5)
    tr = TweedieRegressor(power=0,alpha=1e-5, link='identity' )
    sgd = SGDRegressor(loss='huber',penalty='elasticnet', 
                       alpha=1e-4, l1_ratio=0.1, learning_rate='adaptive',
                       early_stopping=True, validation_fraction=0.2,n_iter_no_change=5
                      )
    ppt = Perceptron(penalty='elasticnet',alpha = 1e-9,l1_ratio = 0.5,
                     early_stopping = True,class_weight = 'balanced',eta0 = 0.01
                    )
    par = PassiveAggressiveRegressor(C=1e-5, loss='squared_epsilon_insensitive')
    
    models = [ridge,lasso,ela_net,lars,lassolars,omp,lr,tr,sgd,ppt,par]
    
    # cross validation
    cross_validate_with_feature_importance(models, transformed_data, app_labels)
    
    linear_importance_rank(lars, transformed_data)
    linear_importance_rank(omp, transformed_data)
    
    del transformed_data,num_pipeline,cate_number_pipeline,cate_object_pipeline
    gc.collect()

Model 1: Ridge
Fold 1
Training time: 2.40s
Validation ROC AUC score: 0.7756
Fold 2
Training time: 2.35s
Validation ROC AUC score: 0.7704
Fold 3
Training time: 2.30s
Validation ROC AUC score: 0.7691
Fold 4
Training time: 2.28s
Validation ROC AUC score: 0.7674
Fold 5
Training time: 2.44s
Validation ROC AUC score: 0.7666
Mean ROC AUC score: 0.7698 ± 0.0032
Model 2: Lasso
Fold 1
Training time: 50.65s
Validation ROC AUC score: 0.7753
Fold 2
Training time: 49.51s
Validation ROC AUC score: 0.7700
Fold 3
Training time: 54.51s
Validation ROC AUC score: 0.7685
Fold 4
Training time: 50.10s
Validation ROC AUC score: 0.7672
Fold 5
Training time: 50.48s
Validation ROC AUC score: 0.7663
Mean ROC AUC score: 0.7694 ± 0.0032
Model 3: ElasticNet
Fold 1
Training time: 45.15s
Validation ROC AUC score: 0.7755
Fold 2
Training time: 44.87s
Validation ROC AUC score: 0.7701
Fold 3
Training time: 45.24s
Validation ROC AUC score: 0.7690
Fold 4
Training time: 45.17s
Validation ROC AUC score: 0.7672
Fold 5
Training

[Ridge(alpha=750),
 Lasso(alpha=1e-05),
 ElasticNet(alpha=5e-05, l1_ratio=0.6),
 Lars(n_nonzero_coefs=110),
 LassoLars(alpha=1e-09),
 OrthogonalMatchingPursuit(n_nonzero_coefs=300),
 LogisticRegression(C=0.5),
 TweedieRegressor(alpha=1e-05, link='identity', power=0),
 SGDRegressor(early_stopping=True, l1_ratio=0.1, learning_rate='adaptive',
              loss='huber', penalty='elasticnet', validation_fraction=0.2),
 Perceptron(alpha=1e-09, class_weight='balanced', early_stopping=True, eta0=0.01,
            l1_ratio=0.5, penalty='elasticnet'),
 PassiveAggressiveRegressor(C=1e-05, loss='squared_epsilon_insensitive')]

Lars_Feature importance ranking:
Feature cate_object__FLAGOWNCAR_N: 28714231330523.375
Feature cate_object__FLAGOWNCAR_Y: 28714231330523.37
Feature num__BUREAUCOLSEDNUM: 0.061592451879517326
Feature num__EXTSOURCE2: -0.03178440769044325
Feature num__EXTSOURCE3: -0.030596245336610303
Feature cate_object__CODEGENDER_M: 0.02475963025848808
Feature num__BUREAUNUMCONSUMER: -0.023313311974000994
Feature num__BUREAUACTIVENUM: 0.021228727124668334
Feature num__BUREAUNUM: -0.020357938052310415
Feature cate_number__INSTNUM6mall_nan: -0.016688840478310603
Feature num__AMTANNUITY_div_AMTGOODSPRICE: 0.013551921330208806
Feature num__PREAVGINTERESTRATE: 0.012461509806521802
Feature num__AMTANNUITY_div_AMTCREDIT: -0.012080382187731711
Feature num__BUREAUNUMNORMAL: -0.011312778011849744
Feature num__BUREAUNUMPREPAY: -0.01115591530115882
Feature num__EXTSOURCE1: -0.010474292026161182
Feature num__POSCREDITNUM: -0.01024584487228955
Feature cate_number__FLAGDOCUMENT3_0: -0.009141337421527847
Feature cate

31

#### tree model

In [60]:
if tree_model:
    # Define process strategies
    num_pipeline = make_pipeline(
                    SimpleImputer(strategy='median'),
                    StandardScaler()
                    )
    cate_number_pipeline = make_pipeline(
                    FunctionTransformer(lambda X: X.astype(str),feature_names_out='one-to-one'),
                    SimpleImputer(strategy='constant', fill_value='UKN'),
                    OneHotEncoder(handle_unknown='ignore')
    )
    cate_object_pipeline = make_pipeline(
                    SimpleImputer(strategy='constant', fill_value='UKN'),
                    OneHotEncoder(handle_unknown='ignore')
    )

    transformed_data = transform_features(app_extra_add, num_pipeline, cate_number_pipeline, cate_object_pipeline)

    # Define models
    tree = DecisionTreeRegressor(splitter='random', 
                                 criterion='squared_error', 
                                 max_depth=10, 
                                 min_samples_split=100
                                )
    # cross validation
    cross_validate_with_feature_importance([tree], transformed_data, app_labels)
    
    del transformed_data,num_pipeline,cate_number_pipeline,cate_object_pipeline
    gc.collect()

Model 1: DecisionTreeRegressor
Fold 1
Training time: 11.28s
Validation ROC AUC score: 0.7164
Fold 2
Training time: 10.88s
Validation ROC AUC score: 0.7187
Fold 3
Training time: 10.73s
Validation ROC AUC score: 0.7171
Fold 4
Training time: 10.84s
Validation ROC AUC score: 0.7144
Fold 5
Training time: 10.69s
Validation ROC AUC score: 0.7117
Top 15 most important features:
num__EXTSOURCE2                                    1.378729
num__EXTSOURCE3                                    1.263636
num__PREAVGINTERESTRATE                            0.171551
cate_number__AMTREQCREDITBUREAUHOUR_nan            0.117828
num__EXTSOURCE1                                    0.097117
cate_object__CODEGENDER_M                          0.065021
cate_object__CODEGENDER_F                          0.057641
num__DAYSEMPLOYED                                  0.041893
num__PREMAXINTERESTRATE                            0.040285
num__DAYSBIRTH                                     0.036923
cate_object__NAMEEDUCATIONT

[DecisionTreeRegressor(max_depth=10, min_samples_split=100, splitter='random')]

24

#### ensemble model

##### Sensitivity to missing values

In [64]:
if ensemble_without_missing:
    # Define process strategies
    num_pipeline = make_pipeline(
                    SimpleImputer(strategy='median'),
                    StandardScaler()
                    )
    cate_number_pipeline = make_pipeline(
                    FunctionTransformer(lambda X: X.astype(str),feature_names_out='one-to-one'),
                    SimpleImputer(strategy='constant', fill_value='UKN'),
                    OneHotEncoder(handle_unknown='ignore')
    )
    cate_object_pipeline = make_pipeline(
                    SimpleImputer(strategy='constant', fill_value='UKN'),
                    OneHotEncoder(handle_unknown='ignore')
    )

    transformed_data = transform_features(app_extra_add, num_pipeline, cate_number_pipeline, cate_object_pipeline)

    # Define models
    reg1 = Ridge(alpha=750)
    reg2 = Lasso(alpha=1e-5)
    reg3 = ElasticNet(alpha=5e-5, l1_ratio=0.6)
    reg4 = Lars(n_nonzero_coefs=110)
    reg5 = LassoLars(alpha=1e-9,max_iter=500)
    reg6 = OrthogonalMatchingPursuit(n_nonzero_coefs=300)
    reg7 = TweedieRegressor(power=0,alpha=1e-5, link='identity' )
    reg8 = PassiveAggressiveRegressor(C=1e-5, loss='squared_epsilon_insensitive')

    vote = VotingRegressor([('reg1', reg1),('reg2', reg2),
                          ('reg3', reg3),('reg4', reg4),
                          ('reg5', reg5),('reg6', reg6),
                          ('reg7', reg7),('reg8', reg8),
                         ])
    rf = RandomForestRegressor(n_estimators=200, max_depth=10, min_samples_split=100 )
    ada = AdaBoostRegressor()

    base_tree = DecisionTreeRegressor(splitter='random', 
                                      criterion='squared_error', 
                                      max_depth=10, 
                                      min_samples_split=100
                                     )
    bg = BaggingRegressor(estimator=base_tree)
    et = ExtraTreesRegressor()
    gbdt = GradientBoostingRegressor()
    
    ridge = Ridge(alpha=750)
    dt = DecisionTreeRegressor(splitter='random', 
                               criterion='squared_error', 
                               max_depth=10, 
                               min_samples_split=100)
    estimators = [
        ('rg', ridge),
        ('dt', dt)]
    stack = StackingRegressor(
        estimators=estimators,
        final_estimator=RandomForestRegressor(n_estimators=10,
                                              random_state=42)
    )

    models = [vote,rf,ada,bg,et,gbdt,stack]
    
    # cross validation
    cross_validate_with_feature_importance(models, transformed_data, app_labels)
    
    del transformed_data,num_pipeline,cate_number_pipeline,cate_object_pipeline
    gc.collect()  

Model 1: VotingRegressor
Fold 1
Training time: 131.09s
Validation ROC AUC score: 0.7752
Fold 2
Training time: 125.80s
Validation ROC AUC score: 0.7703
Fold 3
Training time: 122.31s
Validation ROC AUC score: 0.7694
Fold 4
Training time: 121.45s
Validation ROC AUC score: 0.7675
Fold 5
Training time: 114.61s
Validation ROC AUC score: 0.7664
Mean ROC AUC score: 0.7698 ± 0.0030
Model 2: RandomForestRegressor
Fold 1
Training time: 2207.45s
Model training skipped because it took 2207.45s
No validation scores obtained.
Model 3: AdaBoostRegressor
Fold 1
Training time: 77.91s
Validation ROC AUC score: 0.6857
Fold 2
Training time: 33.26s
Validation ROC AUC score: 0.6893
Fold 3
Training time: 45.94s
Validation ROC AUC score: 0.6801
Fold 4
Training time: 46.27s
Validation ROC AUC score: 0.6825
Fold 5
Training time: 52.25s
Validation ROC AUC score: 0.6764
Top 15 most important features:
num__EXTSOURCE3                          2.619442
num__EXTSOURCE2                          2.088814
num__PREAVGINT

[VotingRegressor(estimators=[('reg1', Ridge(alpha=750)),
                             ('reg2', Lasso(alpha=1e-05)),
                             ('reg3', ElasticNet(alpha=5e-05, l1_ratio=0.6)),
                             ('reg4', Lars(n_nonzero_coefs=110)),
                             ('reg5', LassoLars(alpha=1e-09)),
                             ('reg6',
                              OrthogonalMatchingPursuit(n_nonzero_coefs=300)),
                             ('reg7',
                              TweedieRegressor(alpha=1e-05, link='identity',
                                               power=0)),
                             ('reg8',
                              PassiveAggressiveRegressor(C=1e-05,
                                                         loss='squared_epsilon_insensitive'))]),
 RandomForestRegressor(max_depth=10, min_samples_split=100, n_estimators=200),
 AdaBoostRegressor(),
 BaggingRegressor(estimator=DecisionTreeRegressor(max_depth=10,
                     

0

##### Not sensitive to missing values

In [65]:
if ensemble_with_missing:
    # Define process strategies
    num_pipeline = make_pipeline(
                StandardScaler()
                )
    cate_number_pipeline = make_pipeline(
                OneHotEncoder(handle_unknown='ignore')
                )
    cate_object_pipeline = make_pipeline(
                OneHotEncoder(handle_unknown='ignore')
                )
    
    transformed_data = transform_features(app_extra_add, num_pipeline, cate_number_pipeline, cate_object_pipeline)
    transformed_data = transformed_data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

    # grid search
#     lgbm_parameters = {
#               'num_leaves':[20,30,40], 
#               'subsample_for_bin':[100,1000],
#               'n_estimators':[100,1000,10000], 
#               'colsample_bytree':[0.9,0.5,0.1], 
#              }
#     gs = GridSearchCV(clf, lgbm_parameters, scoring='roc_auc', n_jobs=-1, cv=3 )
#     gs.fit(transformed_data, app_labels)
#     gs.best_estimator_
#     gs.best_score_
#     gs.best_params_
    
    # Define models
    hist = HistGradientBoostingRegressor(loss='poisson',max_iter=1000,max_leaf_nodes=50, 
                                        min_samples_leaf=20, max_depth=8, learning_rate=0.02,
                                        random_state=42 )
    lgbm = LGBMClassifier(boosting_type='gbdt', 
                     num_leaves=20,
                     max_depth = 10,
                     n_estimators=9000,
                     subsample_for_bin=1000,
                     learning_rate=0.01,
                     colsample_bytree=0.1,
                     subsample=0.75,
                     min_split_gain=0.05,
                     min_child_samples=300
                    )

    models = [hist, lgbm]
    
    # cross validation
    cross_validate_with_feature_importance(models, transformed_data, app_labels)
    
    del transformed_data,num_pipeline,cate_number_pipeline,cate_object_pipeline
    gc.collect()  

Model 1: HistGradientBoostingRegressor
Fold 1
Training time: 136.07s
Validation ROC AUC score: 0.7862
Fold 2
Training time: 127.38s
Validation ROC AUC score: 0.7819
Fold 3
Training time: 161.12s
Validation ROC AUC score: 0.7826
Fold 4
Training time: 158.06s
Validation ROC AUC score: 0.7823
Fold 5
Training time: 130.25s
Validation ROC AUC score: 0.7777
Mean ROC AUC score: 0.7821 ± 0.0027
Model 2: LGBMClassifier
Fold 1
Training time: 161.18s
Validation ROC AUC score: 0.7957
Fold 2
Training time: 161.48s
Validation ROC AUC score: 0.7909
Fold 3
Training time: 154.56s
Validation ROC AUC score: 0.7876
Fold 4
Training time: 156.44s
Validation ROC AUC score: 0.7883
Fold 5
Training time: 153.19s
Validation ROC AUC score: 0.7889
Top 15 most important features:
num__AMTANNUITY_div_AMTCREDIT                      16851
num__EXTSOURCE3                                    12362
num__DAYSBIRTH                                     10532
num__EXTSOURCE2                                    10523
num__EXTSOU

[HistGradientBoostingRegressor(learning_rate=0.02, loss='poisson', max_depth=8,
                               max_iter=1000, max_leaf_nodes=50,
                               random_state=42),
 LGBMClassifier(colsample_bytree=0.1, learning_rate=0.01, max_depth=10,
                min_child_samples=300, min_split_gain=0.05, n_estimators=9000,
                num_leaves=20, subsample=0.75, subsample_for_bin=1000)]

49

### Evaluate model on test set

In [68]:
# prepare data
app_train = train_set.drop('TARGET', axis=1)
app_train_labels = train_set['TARGET'].copy()

app_test = test_set.drop('TARGET', axis=1)
app_test_labels = test_set['TARGET'].copy()

bureau_extra = process_bureau()
bureau_balance_extra = process_bureau_balance()
pre_extra = process_pre()
pos_extra = process_pos()
inst_extra = process_inst()
credit_extra = process_credit_balance()
extra_info = [ bureau_extra, bureau_balance_extra, pre_extra, pos_extra, inst_extra, credit_extra ]

app_train_extra = merge_info(app_train, extra_info)
app_test_extra = merge_info(app_test, extra_info)

del bureau_extra,bureau_balance_extra,pre_extra,pos_extra,inst_extra,credit_extra,extra_info
del app_train,app_test
gc.collect()

app_train_extra_add = gen_inter_feature(app_train_extra, selected_ratio_feature)
app_test_extra_add = gen_inter_feature(app_test_extra, selected_ratio_feature)


# Define process strategies
num_pipeline = make_pipeline(
            StandardScaler()
            )
cate_number_pipeline = make_pipeline(
            OneHotEncoder(handle_unknown='ignore')
            )
cate_object_pipeline = make_pipeline(
            OneHotEncoder(handle_unknown='ignore')
            )

# transform train set and test set

num_cols = []

cate_cols = app_train_extra_add.select_dtypes(include='object').columns.tolist()
cate_cols.extend(select_low_cardinality_numeric_features(app_train_extra_add, 'TARGET'))

num_cols.extend(app_train_extra_add.columns.difference(cate_cols))

cate_cols_object = app_train_extra_add[cate_cols].select_dtypes(include=['object']).columns
cate_cols_number = app_train_extra_add[cate_cols].select_dtypes(include=['number']).columns

processing = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cate_object',cate_object_pipeline,cate_cols_object),
    ('cate_number', cate_number_pipeline, cate_cols_number)
], remainder='passthrough')

processing = processing.fit(app_train_extra_add)

transformed_data_train = processing.transform(app_train_extra_add)
transformed_data_test = processing.transform(app_test_extra_add)

transformed_data_train = pd.DataFrame(transformed_data_train, columns=processing.get_feature_names_out())
transformed_data_test = pd.DataFrame(transformed_data_test, columns=processing.get_feature_names_out())

transformed_data_train = transformed_data_train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
transformed_data_test = transformed_data_test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# Define models
lgbm = LGBMClassifier(boosting_type='gbdt', 
                 num_leaves=20,
                 max_depth = 10,
                 n_estimators=9000,
                 subsample_for_bin=1000,
                 learning_rate=0.01,
                 colsample_bytree=0.1,
                 subsample=0.75,
                 min_split_gain=0.05,
                 min_child_samples=300
                )

# train model on train set and evaluate on test set
lgbm.fit(transformed_data_train,app_train_labels)
y_pred = lgbm.predict_proba(transformed_data_test)[:, 1]
roc_auc_score(app_test_labels, y_pred)

Initial memory usage: 222.62 MB
Optimized memory usage: 126.04 MB
(305810, 43)
Initial memory usage: 624.85 MB
Optimized memory usage: 442.60 MB
Initial memory usage: 222.62 MB
Optimized memory usage: 126.04 MB
(84483, 42)
Initial memory usage: 471.48 MB
Optimized memory usage: 321.75 MB
(338857, 46)
Initial memory usage: 610.43 MB
Optimized memory usage: 314.76 MB
(337252, 39)
Initial memory usage: 830.41 MB
Optimized memory usage: 415.20 MB
(339572, 45)
Initial memory usage: 673.88 MB
Optimized memory usage: 318.63 MB
(103558, 71)
Shape before merging: (246008, 122)
Shape after merging: (246008, 407)
Shape before merging: (61503, 122)
Shape after merging: (61503, 407)


477

0.7938342026178067

In [95]:
del app_extra_add,app_labels,app_test_extra,app_test_extra_add,app_test_labels,app_train_extra
del app_train_extra_add,app_train_labels,processing
gc.collect()

869

### Final model

In [96]:
# prepare data
application = load_data('application_train')
application = reduce_mem_usage(application)
application_test = load_data('application_test')
application_test = reduce_mem_usage(application_test)

app_train = application.drop('TARGET', axis=1)
app_train_labels = application['TARGET'].copy()

app_test = application_test

bureau_extra = process_bureau()
bureau_balance_extra = process_bureau_balance()
pre_extra = process_pre()
pos_extra = process_pos()
inst_extra = process_inst()
credit_extra = process_credit_balance()
extra_info = [ bureau_extra, bureau_balance_extra, pre_extra, pos_extra, inst_extra, credit_extra ]

app_train_extra = merge_info(app_train, extra_info)
app_test_extra = merge_info(app_test, extra_info)

del bureau_extra,bureau_balance_extra,pre_extra,pos_extra,inst_extra,credit_extra,extra_info
del app_train,application_test
gc.collect()

app_train_extra_add = gen_inter_feature(app_train_extra, selected_ratio_feature)
app_test_extra_add = gen_inter_feature(app_test_extra, selected_ratio_feature)

# Define process strategies
num_pipeline = make_pipeline(
            StandardScaler()
            )
cate_number_pipeline = make_pipeline(
            OneHotEncoder(handle_unknown='ignore')
            )
cate_object_pipeline = make_pipeline(
            OneHotEncoder(handle_unknown='ignore')
            )

# transform train set and test set

num_cols = []

cate_cols = app_train_extra_add.select_dtypes(include='object').columns.tolist()
cate_cols.extend(select_low_cardinality_numeric_features(app_train_extra_add, 'TARGET'))

num_cols.extend(app_train_extra_add.columns.difference(cate_cols))

cate_cols_object = app_train_extra_add[cate_cols].select_dtypes(include=['object']).columns
cate_cols_number = app_train_extra_add[cate_cols].select_dtypes(include=['number']).columns

processing = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cate_object',cate_object_pipeline,cate_cols_object),
    ('cate_number', cate_number_pipeline, cate_cols_number)
], remainder='passthrough')

processing = processing.fit(app_train_extra_add)

transformed_data_train = processing.transform(app_train_extra_add)
transformed_data_test = processing.transform(app_test_extra_add)

transformed_data_train = pd.DataFrame(transformed_data_train, columns=processing.get_feature_names_out())
transformed_data_test = pd.DataFrame(transformed_data_test, columns=processing.get_feature_names_out())

transformed_data_train = transformed_data_train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
transformed_data_test = transformed_data_test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# Define models
lgbm = LGBMClassifier(boosting_type='gbdt', 
                 num_leaves=20,
                 max_depth = 10,
                 n_estimators=9000,
                 subsample_for_bin=1000,
                 learning_rate=0.01,
                 colsample_bytree=0.1,
                 subsample=0.75,
                 min_split_gain=0.05,
                 min_child_samples=300
                )

# train model on all data
lgbm.fit(transformed_data_train,app_train_labels)

Initial memory usage: 286.23 MB
Optimized memory usage: 93.55 MB
Initial memory usage: 45.00 MB
Optimized memory usage: 14.78 MB
Initial memory usage: 222.62 MB
Optimized memory usage: 126.04 MB
(305810, 43)
Initial memory usage: 624.85 MB
Optimized memory usage: 442.60 MB
Initial memory usage: 222.62 MB
Optimized memory usage: 126.04 MB
(84483, 42)
Initial memory usage: 471.48 MB
Optimized memory usage: 321.75 MB
(338857, 46)
Initial memory usage: 610.43 MB
Optimized memory usage: 314.76 MB
(337252, 39)
Initial memory usage: 830.41 MB
Optimized memory usage: 415.20 MB
(339572, 45)
Initial memory usage: 673.88 MB
Optimized memory usage: 318.63 MB
(103558, 71)
Shape before merging: (307511, 121)
Shape after merging: (307511, 406)
Shape before merging: (48744, 121)
Shape after merging: (48744, 406)


27

### submit result

In [110]:
y_pred = lgbm.predict_proba(transformed_data_test)[:, 1]

In [107]:
application_test = load_data('application_test')
application_test = reduce_mem_usage(application_test)

Initial memory usage: 45.00 MB
Optimized memory usage: 14.78 MB


In [111]:
application_test['TARGET'] = y_pred

In [114]:
application_test.shape

(48744, 122)

In [113]:
application_test[['SK_ID_CURR', 'TARGET']].to_csv('./data/submission.csv', index=False)

In [120]:
!kaggle competitions submit -c home-credit-default-risk -f ./data/submission.csv -m "first version"

zsh:1: command not found: kaggle
