In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score,precision_score,recall_score,confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth',None)

In [2]:
from catboost import CatBoostClassifier

In [3]:
train=pd.read_csv(r".\Train\Train\train_Data.csv")

In [4]:
trainb=pd.read_excel(r".\Train\Train\train_bureau.xlsx")

In [5]:
test=pd.read_excel(r".\Test\test_Data.xlsx")

In [6]:
testb=pd.read_csv(r".\Test\test_bureau.csv")

In [8]:
train.columns

Index(['ID', 'Frequency', 'InstlmentMode', 'LoanStatus', 'PaymentMode',
       'BranchID', 'Area', 'Tenure', 'AssetCost', 'AmountFinance',
       'DisbursalAmount', 'EMI', 'DisbursalDate', 'MaturityDAte', 'AuthDate',
       'AssetID', 'ManufacturerID', 'SupplierID', 'LTV', 'SEX', 'AGE',
       'MonthlyIncome', 'City', 'State', 'ZiPCODE', 'Top-up Month'],
      dtype='object')

In [12]:
trainb['ACCOUNT-STATUS'].value_counts()

Closed                         320255
Active                         201897
Delinquent                      32457
Written Off                      2937
Suit Filed                       2062
Settled                           626
Restructured                      511
SUIT FILED (WILFUL DEFAULT)        70
WILFUL DEFAULT                     27
Sold/Purchased                      1
Cancelled                           1
Name: ACCOUNT-STATUS, dtype: int64

In [7]:
trainb=trainb.drop_duplicates(subset=['ID','DISBURSED-DT','ACCT-TYPE'])

In [8]:
testb=testb.drop_duplicates(subset=['ID','DISBURSED-DT','ACCT-TYPE'])

In [42]:
trainb=trainb.replace("nan",None)
testb=testb.replace("nan",None)

  mask = arr == x


In [9]:
def freqconvert(frq):
    if frq=='Half Yearly': return 1/6
    elif frq=='Monthly' : return 1.0
    elif frq=='Quatrly' : return 1/3
    elif frq=='BI-Monthly' : return 1/2
    else : return 1.0 

In [91]:
def preprocessingdata(data,bureau):
    data['freqn']=data['Frequency'].apply(freqconvert)
    data['interest']=data['EMI']*data['freqn']*data['Tenure']
    data['intoamt']=data['interest']/data['DisbursalAmount']
    data['anninc']=data['MonthlyIncome']*12
    data['amttoinc']=data['DisbursalAmount']/data['anninc']
    data['MaturityDAte']=pd.to_datetime(data['MaturityDAte'])
    data['DisbursalDate']=pd.to_datetime(data['DisbursalDate'])
    data['tmtom']=(data['MaturityDAte']-data['DisbursalDate'])/np.timedelta64(1,'M')
    data['payrej']=data['PaymentMode'].apply(lambda x : 1 if "Reject" in x else 0)
    bureau=bureau.replace(np.nan,None)
    bureau['CURRENT-BAL']=bureau['CURRENT-BAL'].astype(str)
    bureau['CURRENT-BAL']=bureau['CURRENT-BAL'].fillna(0)
    bureau['CURRENT-BAL']=bureau['CURRENT-BAL'].apply(lambda x : x.replace(",",""))
    bureau['CURRENT-BAL']=bureau['CURRENT-BAL'].astype(int)
    idgrpbal=bureau.groupby('ID')['CURRENT-BAL'].sum()
    idgrpaccstt=bureau.groupby('ID')['ACCOUNT-STATUS'].value_counts().unstack().fillna(0)
    idgrpaccstt['bad']=idgrpaccstt['Delinquent']+idgrpaccstt['Written Off']
    idgrpaccstt.reset_index(inplace=True)
    idgrpaccstt=idgrpaccstt[['ID','bad']]
    temp=pd.merge(data,bureau,on='ID',how='left')
    temp['DISBURSED-DT']=pd.to_datetime(temp['DISBURSED-DT'])
    temp['sameloan']=np.where((temp['DisbursalDate']==temp['DISBURSED-DT']) & (temp['SELF-INDICATOR']==True),1,0)
    temp2=temp[temp['sameloan']==1]
    temp2=temp2[['ID','ACCT-TYPE']]
    tempfinal=pd.merge(temp,temp2,on=['ID','ACCT-TYPE'],how='inner')
    tempfinal['datediff']=(tempfinal['DISBURSED-DT']-tempfinal['DisbursalDate'])/np.timedelta64(1,'M')
    tempfinal=tempfinal[tempfinal['datediff']>0]
    tempfinal=tempfinal[tempfinal['SELF-INDICATOR']==True]
    idgrpnwlndf=tempfinal.groupby('ID')['datediff'].mean()
    idgrpnwlndf=idgrpnwlndf.to_frame(name='mttonewloan')
    datafinal=data.merge(idgrpbal,on='ID',how='left').merge(idgrpaccstt,on='ID',how='left').merge(idgrpnwlndf,on='ID',how='left')
    datafinal.loc[:,'AGE']=datafinal.loc[:,'AGE'].fillna(datafinal.loc[:,'AGE'].mean())
    datafinal.loc[:,'SEX']=datafinal.loc[:,'SEX'].fillna('X')
    datafinal.loc[:,'tmtom']=datafinal.loc[:,'tmtom'].fillna(9999999)
    datafinal.loc[:,'mttonewloan']=datafinal.loc[:,'mttonewloan'].fillna(-9999)
    datafinal.loc[:,'amttoinc']=datafinal.loc[:,'amttoinc'].fillna(datafinal.loc[:,'amttoinc'].mean())
    datafinal.loc[:,'anninc']=datafinal.loc[:,'anninc'].fillna(datafinal.loc[:,'anninc'].mean())
    return datafinal

In [92]:
traindata=preprocessingdata(train,trainb)
testdata=preprocessingdata(test,testb)

In [137]:
fli=['Tenure','LoanStatus','AGE','LTV','intoamt','amttoinc','anninc','payrej',\
     'CURRENT-BAL','bad','mttonewloan','tmtom','Top-up Month']

In [138]:
traindata1=traindata[fli]

In [139]:
xtrain, xval, ytrain, yval=train_test_split(traindata1.iloc[:,:-1].values,traindata1.iloc[:,-1:].values,test_size=0.2,random_state=42)

In [142]:
model=CatBoostClassifier(random_strength=0.1,loss_function='MultiClass',eval_metric='TotalF1',leaf_estimation_method='Newton',\
                        learning_rate=0.11,depth=5,iterations=183)

In [143]:
model.fit(xtrain,ytrain,cat_features=[1,8],eval_set=(xval,yval))

0:	learn: 0.7893338	test: 0.7964225	best: 0.7964225 (0)	total: 117ms	remaining: 21.4s
1:	learn: 0.7896832	test: 0.7965194	best: 0.7965194 (1)	total: 232ms	remaining: 21s
2:	learn: 0.7937270	test: 0.7993488	best: 0.7993488 (2)	total: 348ms	remaining: 20.9s
3:	learn: 0.7937270	test: 0.7993488	best: 0.7993488 (2)	total: 464ms	remaining: 20.7s
4:	learn: 0.7936011	test: 0.7993488	best: 0.7993488 (2)	total: 582ms	remaining: 20.7s
5:	learn: 0.7936011	test: 0.7993488	best: 0.7993488 (2)	total: 717ms	remaining: 21.2s
6:	learn: 0.7935828	test: 0.7992760	best: 0.7993488 (2)	total: 847ms	remaining: 21.3s
7:	learn: 0.7936011	test: 0.7993488	best: 0.7993488 (2)	total: 978ms	remaining: 21.4s
8:	learn: 0.7937549	test: 0.7994759	best: 0.7994759 (8)	total: 1.1s	remaining: 21.2s
9:	learn: 0.7937549	test: 0.7994759	best: 0.7994759 (8)	total: 1.2s	remaining: 20.8s
10:	learn: 0.7936320	test: 0.7992899	best: 0.7994759 (8)	total: 1.33s	remaining: 20.9s
11:	learn: 0.7936394	test: 0.7992899	best: 0.7994759 (8)	

94:	learn: 0.8097330	test: 0.8145998	best: 0.8146259 (93)	total: 12.1s	remaining: 11.2s
95:	learn: 0.8097002	test: 0.8145486	best: 0.8146259 (93)	total: 12.2s	remaining: 11.1s
96:	learn: 0.8096569	test: 0.8147281	best: 0.8147281 (96)	total: 12.3s	remaining: 10.9s
97:	learn: 0.8097159	test: 0.8146774	best: 0.8147281 (96)	total: 12.4s	remaining: 10.8s
98:	learn: 0.8098192	test: 0.8148453	best: 0.8148453 (98)	total: 12.6s	remaining: 10.7s
99:	learn: 0.8098449	test: 0.8150104	best: 0.8150104 (99)	total: 12.7s	remaining: 10.5s
100:	learn: 0.8099137	test: 0.8149840	best: 0.8150104 (99)	total: 12.8s	remaining: 10.4s
101:	learn: 0.8099316	test: 0.8149679	best: 0.8150104 (99)	total: 13s	remaining: 10.3s
102:	learn: 0.8098859	test: 0.8148413	best: 0.8150104 (99)	total: 13.1s	remaining: 10.2s
103:	learn: 0.8101235	test: 0.8149767	best: 0.8150104 (99)	total: 13.2s	remaining: 10s
104:	learn: 0.8102488	test: 0.8148277	best: 0.8150104 (99)	total: 13.4s	remaining: 9.92s
105:	learn: 0.8102045	test: 0.8

<catboost.core.CatBoostClassifier at 0x1a732b32ac0>

In [144]:
model.feature_importances_.tolist()

[11.837492797332695,
 20.673345668134232,
 1.8186891048982932,
 3.0371117928490383,
 9.069556740439374,
 1.8887074358603468,
 3.6919574874242906,
 0.16864689696673246,
 10.798769734244905,
 2.1497406397058456,
 24.23410455161549,
 10.631877150528743]

In [145]:
fli=['Tenure','LoanStatus','AGE','LTV','intoamt','amttoinc','anninc','payrej',\
     'CURRENT-BAL','bad','mttonewloan','tmtom']

In [146]:
testdata1=testdata[fli]

In [147]:
testpred=model.predict(testdata1)

In [148]:
subfile=pd.read_csv(r"sample_submission_ejm25Dc.csv")

In [149]:
subfile['Top-up Month']=testpred.tolist()

In [150]:
subfile['Top-up Month']=subfile['Top-up Month'].astype(str)

In [151]:
subfile['Top-up Month']=subfile['Top-up Month'].apply(lambda x : x[2:])

In [152]:
subfile['Top-up Month']=subfile['Top-up Month'].apply(lambda x : x[:-2])

In [153]:
subfile['Top-up Month']=subfile['Top-up Month'].apply(lambda x : x.strip())

In [154]:
subfile['Top-up Month'].value_counts(normalize=True)

No Top-up Service    0.966972
> 48 Months          0.021092
24-30 Months         0.003391
36-48 Months         0.002781
18-24 Months         0.002577
30-36 Months         0.002509
12-18 Months         0.000678
Name: Top-up Month, dtype: float64

In [155]:
subfile.to_csv(".\SubResult\Catboost.csv",index=False)

In [156]:
valpred=model.predict(xval)

In [157]:
f1_score(yval,valpred,average='macro')

0.40244536616575566