In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from sklearn.model_selection import StratifiedKFold

In [2]:
train_data = pd.read_csv('./train_dataset/train_public.csv')
test_public = pd.read_csv('./test_public.csv')
train_internet = pd.read_csv('./train_dataset/train_internet.csv', encoding='gb2312')

In [3]:
train_internet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 42 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   loan_id                   750000 non-null  int64  
 1   user_id                   750000 non-null  int64  
 2   total_loan                750000 non-null  int64  
 3   year_of_loan              750000 non-null  int64  
 4   interest                  750000 non-null  float64
 5   monthly_payment           750000 non-null  float64
 6   class                     750000 non-null  object 
 7   sub_class                 750000 non-null  object 
 8   work_type                 750000 non-null  object 
 9   employer_type             750000 non-null  object 
 10  industry                  750000 non-null  object 
 11  work_year                 706153 non-null  object 
 12  house_exist               750000 non-null  int64  
 13  house_loan_status         750000 non-null  i

### 特征工程 

#### work_year, class 

In [4]:
work_year_dict = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10,
}

train_data['work_year'] = train_data['work_year'].map(work_year_dict)
test_public['work_year'] = test_public['work_year'].map(work_year_dict)
train_data['work_year'] = train_data['work_year'].fillna(-1)
test_public['work_year'] = test_public['work_year'].fillna(-1)
train_internet['work_year'] = train_internet['work_year'].map(work_year_dict)
train_internet['work_year'] = train_internet['work_year'].fillna(-1)

In [5]:
class_dict = {
    'A': 1,
    'B': 2,
    'C': 3,
    'D': 4,
    'E': 5,
    'F': 6,
    'G': 7,
}

train_data['class'] = train_data['class'].map(class_dict)
test_public['class'] = test_public['class'].map(class_dict)
train_internet['class'] = train_internet['class'].map(class_dict)

#### issue_date, earlies_credit_mon

In [6]:
train_data['issue_date'] = pd.to_datetime(train_data['issue_date'])
test_public['issue_date'] = pd.to_datetime(test_public['issue_date'])
train_internet['issue_date'] = pd.to_datetime(train_internet['issue_date'])

train_data['issue_date_month'] = train_data['issue_date'].dt.month
test_public['issue_date_month'] = train_data['issue_date'].dt.month
train_internet['issue_date_month'] = train_internet['issue_date'].dt.month

train_data['issue_date_dayofweek'] = train_data['issue_date'].dt.dayofweek
test_public['issue_date_dayofweek'] = train_data['issue_date'].dt.dayofweek
train_internet['issue_date_dayofweek'] = train_internet['issue_date'].dt.dayofweek


In [7]:
for data in [train_data, test_public, train_internet]: 
    data['earlies_credit_mon'] = pd.to_datetime(data['earlies_credit_mon'],format='%Y-%m-%d',errors='coerce')
    

In [8]:
train_data['earlies_credit_mon'] = pd.to_datetime(train_data['earlies_credit_mon'])
train_data['earlies_credit_Mon'] = train_data['earlies_credit_mon'].dt.month
train_data['earlies_credit_Year'] = train_data['earlies_credit_mon'].dt.year

In [9]:
train_data['earlies_credit_Mon']

0       12
1        4
2       10
3        6
4        5
        ..
9995     2
9996     5
9997     2
9998    10
9999     2
Name: earlies_credit_Mon, Length: 10000, dtype: int64

In [10]:
test_public['earlies_credit_mon'] = pd.to_datetime(test_public['earlies_credit_mon'])
test_public['earlies_credit_Mon'] = test_public['earlies_credit_mon'].dt.month
test_public['earlies_credit_Year'] = test_public['earlies_credit_mon'].dt.year

train_internet['earlies_credit_mon'] = pd.to_datetime(train_internet['earlies_credit_mon'])
train_internet['earlies_credit_Mon'] = train_internet['earlies_credit_mon'].dt.month
train_internet['earlies_credit_Year'] = train_internet['earlies_credit_mon'].dt.year

In [11]:
col_to_drop = ['issue_date', 'earlies_credit_mon', 'issue_date_month', 'earlies_credit_Mon','earlies_credit_Year']
train_data = train_data.drop(col_to_drop, axis=1)
test_public = test_public.drop(col_to_drop, axis=1)
train_internet = train_internet.drop(col_to_drop, axis=1 )

#### employer_type, industry

In [12]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ['employer_type', 'industry']
for col in cat_cols:
    lbl = LabelEncoder().fit(train_data[col])
    train_data[col] = lbl.transform(train_data[col])
    test_public[col] = lbl.transform(test_public[col])
    train_internet[col] = lbl.transform(train_internet[col])

In [13]:
train_data['industry']

0       13
1       13
2        3
3       10
4        2
        ..
9995     7
9996     4
9997     2
9998     4
9999     2
Name: industry, Length: 10000, dtype: int32

#### Internet表补充 

In [14]:
tr_cols = set(train_data.columns)
same_col = list(tr_cols.intersection(set(train_internet.columns)))
train_inteSame = train_internet[same_col].copy()

Inte_add_cos = list(tr_cols.difference(set(same_col)))
for col in Inte_add_cos:
    train_inteSame[col] = np.nan

In [15]:
train_inteSame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 38 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   f0                        712202 non-null  float64
 1   debt_loan_ratio           749770 non-null  float64
 2   house_exist               750000 non-null  int64  
 3   recircle_b                750000 non-null  int64  
 4   use                       750000 non-null  int64  
 5   monthly_payment           750000 non-null  float64
 6   pub_dero_bankrup          749625 non-null  float64
 7   interest                  750000 non-null  float64
 8   title                     749999 non-null  float64
 9   early_return              750000 non-null  int64  
 10  year_of_loan              750000 non-null  int64  
 11  scoring_low               750000 non-null  int64  
 12  user_id                   750000 non-null  int64  
 13  post_code                 749999 non-null  f

### 模型 

In [17]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

In [18]:
from sklearn.model_selection import ShuffleSplit

def train_model(data_, test_, y_, folds_):
    oof_preds = np.zeros(data_.shape[0])
    sub_preds = np.zeros(test_.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in data_.columns if f not in ['loan_id', 'user_id', 'isDefault'] ]
    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(data_)):
        trn_x, trn_y = data_[feats].iloc[trn_idx], y_.iloc[trn_idx]
        val_x, val_y = data_[feats].iloc[val_idx], y_.iloc[val_idx]
        clf = LGBMClassifier(
            n_estimators=4000,
            learning_rate=0.08,
            num_leaves=2**5,
            colsample_bytree=.65,
            subsample=.9,
            max_depth=5,
#             max_bin=250,
            reg_alpha=.3,
            reg_lambda=.3,
            min_split_gain=.01,
            min_child_weight=2,
            silent=-1,
            verbose=-1,
        )
        
        clf.fit(trn_x, trn_y, 
                eval_set= [(trn_x, trn_y), (val_x, val_y)], 
                eval_metric='auc', verbose=100, early_stopping_rounds=50  #30
               )

        oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_[feats], num_iteration=clf.best_iteration_)[:, 1] / folds_.n_splits
        
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()
        
    print('Full AUC score %.6f' % roc_auc_score(y_, oof_preds)) 
    
    test_['isDefault'] = sub_preds

    return oof_preds, test_[['loan_id', 'isDefault']], feature_importance_df
    
def display_importances(feature_importance_df_):
    # Plot feature importances
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(
        by="importance", ascending=False)[:50].index
    
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    
    plt.figure(figsize=(8,10))
    sns.barplot(x="importance", y="feature", 
                data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

In [19]:
train_label = train_data['isDefault']
folds = KFold(n_splits=5, shuffle=True, random_state=546789)

In [20]:
oof_preds, IntePre, importances = train_model(train_data, train_inteSame, train_label, folds)

Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.955788	training's binary_logloss: 0.221644	valid_1's auc: 0.889459	valid_1's binary_logloss: 0.293621
Early stopping, best iteration is:
[75]	training's auc: 0.943318	training's binary_logloss: 0.237837	valid_1's auc: 0.890434	valid_1's binary_logloss: 0.29229
Fold  1 AUC : 0.890434
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.952455	training's binary_logloss: 0.227972	valid_1's auc: 0.88807	valid_1's binary_logloss: 0.290497
Early stopping, best iteration is:
[73]	training's auc: 0.941306	training's binary_logloss: 0.241807	valid_1's auc: 0.889551	valid_1's binary_logloss: 0.289273
Fold  2 AUC : 0.889551
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.951544	training's binary_logloss: 0.225107	valid_1's auc: 0.87341	valid_1's binary_logloss: 0.313308
Early stopping, best iteration is:
[81]	training's auc: 0.945248	training's 

In [21]:
IntePre['isDef'] = train_internet['is_default']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
from sklearn.metrics import roc_auc_score
roc_auc_score(IntePre['isDef'],IntePre.isDefault)

0.729664607024783

In [23]:
InteId = IntePre.loc[IntePre.isDefault<0.05, 'loan_id'].tolist()

train_data['dataSourse'] = 1
test_public['dataSourse'] = 1
train_inteSame['dataSourse'] = 0
train_inteSame['isDefault'] = train_internet['is_default']
use_te = train_inteSame[train_inteSame.loan_id.isin( InteId )].copy()
data = pd.concat([ train_data,test_public,use_te]).reset_index(drop=True)


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74765 entries, 0 to 74764
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   loan_id                   74765 non-null  int64  
 1   user_id                   74765 non-null  int64  
 2   total_loan                74765 non-null  float64
 3   year_of_loan              74765 non-null  int64  
 4   interest                  74765 non-null  float64
 5   monthly_payment           74765 non-null  float64
 6   class                     74765 non-null  int64  
 7   employer_type             74765 non-null  int32  
 8   industry                  74765 non-null  int32  
 9   work_year                 74765 non-null  float64
 10  house_exist               74765 non-null  int64  
 11  censor_status             74765 non-null  int64  
 12  use                       74765 non-null  int64  
 13  post_code                 74764 non-null  float64
 14  region

In [None]:
# plt.figure(figsize=(16,6))
# plt.title("Distribution of Default values IntePre")
# sns.distplot(IntePre['isDefault'],color="black", kde=True,bins=120, label='train_data')
# # sns.distplot(train_inte[col],color="red", kde=True,bins=120, label='train_inte')
# plt.legend();plt.show()

In [26]:
train = data[data['isDefault'].notna()]
test  = data[data['isDefault'].isna()]
# for col in ['sub_class', 'work_type']:
#     del train[col]
#     del test[col]


# del data
# del train_data,test_public


y = train['isDefault']
folds = KFold(n_splits=5, shuffle=True, random_state=546789)
oof_preds, test_preds, importances = train_model(train, test, y, folds)

Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.993511	training's binary_logloss: 0.0329506	valid_1's auc: 0.985983	valid_1's binary_logloss: 0.0448928
Early stopping, best iteration is:
[74]	training's auc: 0.991818	training's binary_logloss: 0.0352189	valid_1's auc: 0.986134	valid_1's binary_logloss: 0.0446655
Fold  1 AUC : 0.986134
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[32]	training's auc: 0.990056	training's binary_logloss: 0.0404838	valid_1's auc: 0.984362	valid_1's binary_logloss: 0.0446579
Fold  2 AUC : 0.984362
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.992912	training's binary_logloss: 0.0342843	valid_1's auc: 0.986089	valid_1's binary_logloss: 0.0424979
Early stopping, best iteration is:
[141]	training's auc: 0.995357	training's binary_logloss: 0.030763	valid_1's auc: 0.98629	valid_1's binary_logloss: 0.0422579
Fold  3 AUC : 0.986290
Trainin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [27]:
test_preds

Unnamed: 0,loan_id,isDefault
10000,1000575,0.007282
10001,1028125,0.057512
10002,1010694,0.005172
10003,1026712,0.008515
10004,1002895,0.007363
...,...,...
14995,1008856,0.341777
14996,1016651,0.009579
14997,1024140,0.003448
14998,1014316,0.007802


In [None]:
test_preds.rename({'loan_id': 'id'}, axis=1)[['id', 'isDefault']].to_csv('./submit/test1.csv', index=False)

In [None]:
test_preds

In [None]:
tmp = test_preds['isDefault']
c = np.sum(tmp>1)
c