In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, confusion_matrix, r2_score
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [2]:
users = pd.read_csv("user_spec.csv")
loans = pd.read_csv("loan_result.csv")

In [3]:
joined = loans.join(users.set_index("application_id"), on="application_id")
joined = joined.dropna(subset = ["loan_limit", "loan_rate"])

In [4]:
mean_income = joined.yearly_income.mean()
joined.yearly_income = joined.yearly_income.fillna(mean_income)

In [5]:
median_birth_year = joined.birth_year.median()
joined.birth_year = joined.birth_year.fillna(median_birth_year)
joined.birth_year = pd.to_datetime(joined.birth_year, format = "%Y")

In [6]:
joined[[
    "personal_rehabilitation_yn", "personal_rehabilitation_complete_yn", 
    "existing_loan_cnt", "existing_loan_amt"
]] = joined[[
    "personal_rehabilitation_yn", "personal_rehabilitation_complete_yn", 
    "existing_loan_cnt", "existing_loan_amt"
]].fillna(0)

In [7]:
joined.company_enter_month = joined.company_enter_month.fillna(202206)
joined.company_enter_month = joined.company_enter_month.astype("str").str[:6]
temp = joined.company_enter_month.str[:6]
temp = pd.to_datetime(temp, format = "%Y%m")
joined.company_enter_month = temp

In [8]:
joined.loanapply_insert_time = pd.to_datetime(joined.loanapply_insert_time)
joined.user_id = joined.user_id.astype(int)

In [9]:
gender_na = joined[joined.gender.isna()]
gender_not_na = joined[~joined.gender.isna()]
total_user_num = len(gender_not_na.user_id.unique())
by_user = gender_not_na.groupby("user_id").max()
num_males = by_user.gender.sum()
prob = num_males / total_user_num
na_user_id_list = gender_na.user_id.unique()
np.random.seed(0)
random_array = np.random.binomial(1, prob, len(na_user_id_list))
random_gender = dict(zip(na_user_id_list, random_array))
na_user_genders = joined.loc[joined.user_id.isin(na_user_id_list)].groupby(joined.user_id).max()
not_na_id = na_user_genders[~na_user_genders.gender.isna()].user_id
not_na_gender = na_user_genders[~na_user_genders.gender.isna()].gender
non_random_gender = dict(zip(not_na_id, not_na_gender))
joined.gender = joined.gender.fillna(joined.user_id.map(non_random_gender))
joined.gender = joined.gender.fillna(joined.user_id.map(random_gender))

In [10]:
joined['birth_year'] = joined['birth_year'].apply(lambda x: int(str(x)[0:4]))
joined['yearly_income'] = joined['yearly_income'].apply(lambda t: np.log10(1+t))
joined['desired_amount'] = joined['desired_amount'].apply(lambda t: np.log10(1+t))
joined['existing_loan_amt'] = joined['existing_loan_amt'].apply(lambda t: np.log10(1+t))
joined['existing_loan_cnt'] = joined['existing_loan_cnt'].apply(lambda t: int(float(t)))

In [11]:
credit_tree_joined = pd.concat(
    [
        joined,
        pd.get_dummies(joined.income_type, prefix='income'),
        pd.get_dummies(joined.houseown_type, prefix='house')
    ],
    axis=1
).reset_index(drop=True).drop(
    [
        "application_id", "loanapply_insert_time",
        'insert_time',"bank_id", "product_id",
        'income_type','loan_rate','loan_limit',
        'is_applied','user_id','company_enter_month',
        'purpose','employment_type','houseown_type'
    ],
    axis=1
)

In [12]:
train_credit = credit_tree_joined[credit_tree_joined['credit_score'].notna()]
train_null_credit = credit_tree_joined[credit_tree_joined['credit_score'].isna()]

In [13]:
credit_tree_X = train_credit.drop(['credit_score'], axis=1)
credit_tree_y = train_credit['credit_score']
credit_tree_X_train, credit_tree_X_test, credit_tree_y_train, credit_tree_y_test = train_test_split(
    credit_tree_X, credit_tree_y, test_size=0.3, random_state=44
)
credit_tree_model = DecisionTreeRegressor(random_state=44)
credit_tree_model.fit(credit_tree_X_train, credit_tree_y_train)

In [14]:
r2_score(credit_tree_y_test, credit_tree_model.predict(credit_tree_X_test))

0.955760685174084

In [15]:
credit_predict = credit_tree_model.predict(train_null_credit.drop(['credit_score'],axis = 1))
null_credit = joined[joined['credit_score'].isna()]
null_credit['credit_score'] = credit_predict
null_credit['credit_score']= null_credit['credit_score']//10*10

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_credit['credit_score'] = credit_predict
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_credit['credit_score']= null_credit['credit_score']//10*10


In [16]:
data = pd.concat([joined[joined['credit_score'].notna()],null_credit])

In [17]:
cleaned_up = data.copy()

In [18]:
data=data[[
    'loan_limit','loan_rate','credit_score',
    'company_enter_month','desired_amount','income_type',
    'insert_time','user_id','application_id', 'is_applied'
]]
data['earnedincome']=(data['income_type']=='EARNEDINCOME')*1
data.insert_time = pd.to_datetime(data.insert_time)
del data ['income_type']
data = data.sort_values(by=['user_id','insert_time'])

In [19]:
user_array=data['user_id'].values
timestamp_array=data['insert_time'].dt.date.values
application_array=data['application_id'].values
array=np.zeros(len(data))
recent_user=0
recent_application=0
i = 0
store = float(0)

In [20]:
for u in user_array:
    if u==recent_user:
        if application_array[i]==recent_application:
            array[i]=store
        else:
            store=0
            for t in time_list:
                if timestamp_array[i]==t:
                    store=store+1
                else:
                    store=store+0.9**((timestamp_array[i]-t).days)
                    
            array[i]=store
            time_list.append(timestamp_array[i])
            recent_application=application_array[i]
    else:
        time_list=[]
        store=0
        array[i]=store
        time_list.append(timestamp_array[i])
        recent_user=user_array[i]
        recent_application=application_array[i]
    i=i+1

In [21]:
data['recent_app_count'] = array

In [22]:
problem1_train=data[data['insert_time']<'2022-06-01']
problem1_train=problem1_train.sort_values(by=['is_applied', 'loan_rate'])
problem1_train=problem1_train.drop_duplicates(['application_id'],keep='last')

problem1_test=data[data['insert_time']>='2022-06-01']
problem1_test=problem1_test.sort_values(by = ['loan_rate'])
problem1_test=problem1_test.drop_duplicates(['application_id'], keep = 'last')
test = problem1_test.copy()

In [23]:
problem1_train_label=problem1_train['is_applied']
del problem1_train['is_applied']

In [24]:
del problem1_train["insert_time"]
del problem1_train["user_id"]
del problem1_train["application_id"]
del problem1_test["insert_time"]
del problem1_test["user_id"]
del problem1_test["application_id"]
del problem1_test["is_applied"]

In [25]:
problem1_model = CatBoostClassifier(iterations=4,
                           depth=15,
                           learning_rate=0.83,
                           loss_function='Logloss',
                           verbose=True)

In [26]:
problem1_model.fit(problem1_train,problem1_train_label)

0:	learn: 0.4364161	total: 368ms	remaining: 1.1s
1:	learn: 0.3894163	total: 738ms	remaining: 738ms
2:	learn: 0.3720893	total: 1.03s	remaining: 343ms
3:	learn: 0.3652599	total: 1.32s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x2b09af400>

In [27]:
applied_ratio = problem1_train_label.mean()
preds_proba = problem1_model.predict_proba(problem1_test)
preds_proba = preds_proba[:, 1]
problem1_test.loc[:, "predict"] = preds_proba > applied_ratio

In [28]:
idx = problem1_test.loc[problem1_test["predict"]].index

In [29]:
predict_app= data.loc[idx]

In [30]:
predict_app.loc[:, ["sum", "is_enough", "predict"]] = 0

In [31]:
predict_app = predict_app[[
    "application_id", "loan_limit", "loan_rate", "is_applied", 
    "desired_amount", "sum", "is_enough", "predict"
]]

In [32]:
predict_app = predict_app.sort_values(by = ["application_id", "loan_rate"])

In [33]:
predict_app.iloc[0, 5] = predict_app.iloc[0, 1]

In [34]:
for i in range (1, len(predict_app)):
    predict_app.iloc[i, 5] = predict_app.iloc[i, 1]
    if (predict_app.iloc[i, 0] == predict_app.iloc[i-1, 0]):
        predict_app.iloc[i, 5] += predict_app.iloc[i-1, 5]

In [35]:
predict_app["is_enough"] = predict_app["sum"] > predict_app.desired_amount
predict_app.iloc[0, 7] = 1
predict_app.iloc[1, 7] = 1

In [36]:
for i in range(2, len(predict_app)):
    if (predict_app.iloc[i, 0] == predict_app.iloc[i-1, 0]):
        if (not predict_app.iloc[i, 6]):
            predict_app.iloc[i, 7] = 1
        elif (not predict_app.iloc[i-1, 6]):
            predict_app.iloc[i, 7] = 1
        elif (not predict_app.iloc[i-2, 6]):
            predict_app.iloc[i, 7] = 1
    else:
        predict_app.iloc[i, 7] = 1

In [37]:
applied_index = predict_app.loc[predict_app.predict == 1].index

In [38]:
cleaned_up.loc[:, "predict"] = 0
cleaned_up.loc[applied_index, "predict"] = 1

In [39]:
prediction_data = pd.read_csv("predict_data.csv")

In [40]:
prediction_data

Unnamed: 0,application_id,product_id,is_applied
0,4,220,
1,4,191,
2,8,29,
3,8,159,
4,8,85,
...,...,...,...
3255189,2167778,258,
3255190,2167791,29,
3255191,2167822,149,
3255192,2167822,157,


In [41]:
df = pd.merge(prediction_data, cleaned_up, on = ["application_id", "product_id"])

In [42]:
df.loc[:, "is_applied_x"] = df.predict

In [43]:
df = df[["application_id", "product_id", "is_applied_x"]]

In [47]:
df.columns = [["appliction_id", "product_id", "is_applied"]]

In [48]:
df.to_csv("final.csv")