In [1]:
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import joblib
from functools import partial

%matplotlib inline
import seaborn as sns
import category_encoders as ce

def _get_last_k_applications_feature_name(feature_name, number, suffix):
    return 'application_previous_application_{}_last_{}_applications_{}'.format(feature_name, number, suffix)


def get_last_k_credits_features(merged_sorted, numbers_of_applications):
    features = pd.DataFrame({'SK_ID_CURR': merged_sorted['SK_ID_CURR'].unique()})
    feature_list = ['annuity_diff', 'annuity_ratio', 'credit_diff', 'credit_ratio', 'the_same_contract_type',
                        'the_same_type_suite', 'the_same_weekday', 'hour_diff']

    for number in numbers_of_applications:
        table_tail = merged_sorted.groupby('SK_ID_CURR').tail(number)
        tail_groupby = table_tail.groupby('SK_ID_CURR')
        g = tail_groupby[feature_list].agg('mean')

        g = g.rename(axis='columns', mapper=partial(_get_last_k_applications_feature_name, number=number,
                                        suffix='mean')).reset_index()

        features = features.merge(g, how='left', on=['SK_ID_CURR'])
    return features

In [2]:
DIR = '/Users/xiaohehe/Desktop/hkust/24-25fall/fintech/project1/data'
#description = pd.read_csv(os.path.join(DIR,'HomeCredit_columns_description.csv'),encoding = 'latin1')
application = pd.read_csv(os.path.join(DIR, 'application_test.csv'))
previous_application = pd.read_csv(os.path.join(DIR, 'previous_application.csv'))
features = pd.DataFrame({'SK_ID_CURR': application['SK_ID_CURR']})

common_columns = [col for col in application.columns if col in previous_application.columns]
application_common = application[common_columns]
merged_tables = previous_application[common_columns + ['DAYS_DECISION']].merge(application_common, on='SK_ID_CURR',
                                                                               how='right', suffixes=('_previous', '_current'))
#合并相同的列，以观察两个表格的差异

merged_sorted = merged_tables.sort_values(['SK_ID_CURR', 'DAYS_DECISION']) 

merged_sorted['annuity_diff'] = merged_sorted['AMT_ANNUITY_current'] - merged_sorted['AMT_ANNUITY_previous']
merged_sorted['annuity_ratio'] = merged_sorted['AMT_ANNUITY_current'] / merged_sorted['AMT_ANNUITY_previous']
merged_sorted['credit_diff'] = merged_sorted['AMT_CREDIT_current'] - merged_sorted['AMT_CREDIT_previous']
merged_sorted['credit_ratio'] = merged_sorted['AMT_CREDIT_current'] / merged_sorted['AMT_CREDIT_previous']

merged_sorted['the_same_contract_type'] = (
    merged_sorted['NAME_CONTRACT_TYPE_previous'] == merged_sorted['NAME_CONTRACT_TYPE_current']).astype(int)
merged_sorted['the_same_weekday'] = (merged_sorted['WEEKDAY_APPR_PROCESS_START_previous'] == merged_sorted['WEEKDAY_APPR_PROCESS_START_current']).astype(int)
merged_sorted['hour_diff'] = merged_sorted['HOUR_APPR_PROCESS_START_previous'] - merged_sorted['HOUR_APPR_PROCESS_START_current']
merged_sorted['the_same_type_suite'] = (merged_sorted['NAME_TYPE_SUITE_previous'] == merged_sorted['NAME_TYPE_SUITE_current']
                                       ).astype(int)
merged_sorted['the_same_type_suite'][merged_sorted['NAME_TYPE_SUITE_previous'].isnull()] = 1

g = get_last_k_credits_features(merged_sorted, numbers_of_applications=[1,3,5,10])

# 观察最近k个申请的几何数据

features = features.merge(g, on=['SK_ID_CURR'], how='left')
features


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  merged_sorted['the_same_type_suite'][merged_sorted['NAME_TYPE_SUITE_previous'].isnull()] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ret

Unnamed: 0,SK_ID_CURR,application_previous_application_annuity_diff_last_1_applications_mean,application_previous_application_annuity_ratio_last_1_applications_mean,application_previous_application_credit_diff_last_1_applications_mean,application_previous_application_credit_ratio_last_1_applications_mean,application_previous_application_the_same_contract_type_last_1_applications_mean,application_previous_application_the_same_type_suite_last_1_applications_mean,application_previous_application_the_same_weekday_last_1_applications_mean,application_previous_application_hour_diff_last_1_applications_mean,application_previous_application_annuity_diff_last_3_applications_mean,...,application_previous_application_the_same_weekday_last_5_applications_mean,application_previous_application_hour_diff_last_5_applications_mean,application_previous_application_annuity_diff_last_10_applications_mean,application_previous_application_annuity_ratio_last_10_applications_mean,application_previous_application_credit_diff_last_10_applications_mean,application_previous_application_credit_ratio_last_10_applications_mean,application_previous_application_the_same_contract_type_last_10_applications_mean,application_previous_application_the_same_type_suite_last_10_applications_mean,application_previous_application_the_same_weekday_last_10_applications_mean,application_previous_application_hour_diff_last_10_applications_mean
0,100001,16609.500,5.203872,545013.000,23.912221,0.0,0.0,0.0,-5.0,16609.5000,...,0.00,-5.00,16609.50000,5.203872,5.450130e+05,23.912221,0.00,0.0,0.00,-5.000
1,100005,,,222768.000,inf,1.0,1.0,1.0,1.0,12556.8000,...,0.50,1.50,12556.80000,3.608826,2.026912e+05,inf,0.50,1.0,0.50,1.500
2,100013,,,663264.000,inf,1.0,1.0,0.0,2.0,55828.8000,...,0.25,0.50,58298.80500,9.466408,5.171299e+05,inf,0.50,0.5,0.25,0.500
3,100028,,,1575000.000,inf,1.0,1.0,0.0,-1.0,42021.7650,...,0.00,-0.20,40926.91500,6.498295,1.482080e+06,inf,0.20,0.8,0.00,-0.200
4,100038,7603.290,1.310799,117004.500,1.230099,1.0,1.0,0.0,2.0,14284.8450,...,0.00,0.50,14284.84500,2.099781,3.249495e+05,3.992279,0.50,1.0,0.00,0.500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,3251.070,1.228588,157860.000,1.619788,1.0,1.0,0.0,-2.0,3251.0700,...,0.00,-2.00,3251.07000,1.228588,1.578600e+05,1.619788,1.00,1.0,0.00,-2.000
48740,456222,17337.735,2.189817,413559.000,2.980134,1.0,1.0,1.0,0.0,24018.0450,...,0.75,1.25,24940.60875,6.093606,5.237089e+05,11.910617,0.25,1.0,0.75,1.250
48741,456223,20564.550,2.626820,203704.335,2.830299,0.0,1.0,1.0,8.0,19004.4225,...,0.50,8.00,19004.42250,2.366803,1.824832e+05,2.439620,0.00,1.0,0.50,8.000
48742,456224,,,450000.000,inf,1.0,1.0,0.0,1.0,10324.8675,...,0.00,4.00,13641.78375,3.007298,3.224214e+05,inf,0.40,0.6,0.00,4.000


In [3]:
features.to_parquet('/Users/xiaohehe/Desktop/hkust/24-25fall/fintech/project1/features/app_pre_test.parquet')