In [4]:
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
app_train = pd.read_csv('/content/application_train.csv')
app_test = pd.read_csv('/content/application_test.csv')
print(app_train.shape)
print(app_test.shape)

(19395, 122)
(21188, 121)


In [5]:
app_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19395 entries, 0 to 19394
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(85), int64(21), object(16)
memory usage: 18.1+ MB


In [6]:
app_test.head(3)

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0


In [7]:
missing = app_train.isnull().sum()
print("Max number of missing values in a certain feature:",missing.max())
print("Total number of missing values:",missing.sum())

Max number of missing values in a certain feature: 13562
Total number of missing values: 575557


In [8]:
train = app_train.loc[:,missing.index[missing<10000]]
train.shape

(19395, 92)

In [9]:
test = app_test.loc[:,(missing.index[missing<10000]).drop('TARGET')]
test.shape

(21188, 91)

In [10]:
test_id=test['SK_ID_CURR']
test_id

0        100001
1        100005
2        100013
3        100028
4        100038
          ...  
21183    253712
21184    253717
21185    253720
21186    253731
21187    253734
Name: SK_ID_CURR, Length: 21188, dtype: int64

In [11]:
miss = test.isnull().sum()
print("Max number of missing values in a certain feature:",miss.max())
print("Total number of missing values:",miss.sum())

Max number of missing values in a certain feature: 10411
Total number of missing values: 220225


In [12]:
y_train = train.TARGET
x_train = train.drop(['TARGET'],axis=1)
print(y_train.shape)
x_train.shape

(19395,)


(19395, 91)

In [13]:
fdata = pd.concat([x_train,test])
wdata = pd.concat([x_train,test])
wdata.shape

(40583, 91)

In [14]:
col_obj = wdata.columns[wdata.dtypes == 'object']
for i in col_obj:
    wdata.loc[:,i],_ = pd.factorize(wdata.loc[:,i])
wdata.head(3) 

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,0,0,0,0,0,202500.0,406597.5,24700.5,351000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,1,0,1,0,270000.0,1293502.5,35698.5,1129500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,1,0,1,0,0,67500.0,135000.0,6750.0,135000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
data = wdata.fillna(0)

In [16]:
data = data.drop(['SK_ID_CURR'],axis=1)
xtrain = data.iloc[0:len(x_train),:]
xtest = data.iloc[len(x_train):len(data),:] 
print(xtrain.shape)
print(xtest.shape)

(19395, 90)
(21188, 90)


In [17]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(xtrain)
x_scaled = scaler.transform(xtrain)
x_scaled = pd.DataFrame(x_scaled)
x_scaled.isna().sum().sum()

0

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(x_scaled, y_train, test_size=0.2, random_state=18)
from sklearn.linear_model import LogisticRegression
lreg = LogisticRegression(max_iter=1000).fit(X_train, Y_train)

In [19]:
y_pred = lreg.predict_proba(X_val)
from sklearn.metrics import roc_auc_score
print("Validation ROC AUC:",roc_auc_score(Y_val, y_pred[:,1]))

Validation ROC AUC: 0.7043822297177686


In [20]:
xs_test = scaler.transform(xtest)
y_pred = lreg.predict_proba(xs_test)
y=pd.DataFrame(y_pred[:,1])
y.columns=['TARGET']

In [21]:
outcome = pd.concat([test_id,y],axis=1)
outcome.head(3)

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.037548
1,100005,0.166211
2,100013,0.050604


In [22]:
outcome.to_csv (r'/content/pred_test.csv', index = False, header=True)

In [23]:
import lightgbm as lgb
lgb_train = lgb.Dataset(data=X_train, label=Y_train)
lgb_eval = lgb.Dataset(data=X_val, label=Y_val)
params = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 
          'learning_rate': 0.01, 'num_leaves': 48, 'num_iteration': 5000, 'verbose': 0 ,
          'colsample_bytree':.8, 'subsample':.9, 'max_depth':7, 'reg_alpha':.1, 'reg_lambda':.1, 
          'min_split_gain':.01, 'min_child_weight':1}
model = lgb.train(params, lgb_train, valid_sets=lgb_eval, early_stopping_rounds=150, verbose_eval=200)



Training until validation scores don't improve for 150 rounds.
[200]	valid_0's auc: 0.726638
[400]	valid_0's auc: 0.728178
Early stopping, best iteration is:
[328]	valid_0's auc: 0.729126


In [24]:
xtrain0 = wdata.iloc[0:len(x_train),:]
xtrain0=xtrain0.drop(['SK_ID_CURR'],axis=1)
fd = pd.concat([y_train,xtrain0],axis=1)
cormat = fd.corr()
vars = cormat.index[abs(cormat.TARGET)>0.01]
vars = vars.drop('TARGET') 
print(vars)
print(len(vars))

Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_GOODS_PRICE', 'NAME_INCOME_TYPE',
       'NAME_EDUCATION_TYPE', 'NAME_HOUSING_TYPE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_EMP_PHONE',
       'FLAG_WORK_PHONE', 'FLAG_PHONE', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS',
       'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
       'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
       'REG_REGION_NOT_LIVE_REGION', 'REG_CITY_NOT_LIVE_CITY',
       'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_2',
       'EXT_SOURCE_3', 'FLOORSMAX_AVG', 'YEARS_BEGINEXPLUATATION_MODE',
       'FLOORSMAX_MODE', 'FLOORSMAX_MEDI', 'HOUSETYPE_MODE', 'TOTALAREA_MODE',
       'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE',
       'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE',
       'DEF_60_CNT_SOCIAL_CIRCLE', 'DAY

In [25]:
fdata = wdata.loc[:,vars]
fdata = fdata.fillna(0)
xtrain = fdata.iloc[0:len(x_train),:]
xtest = fdata.iloc[len(x_train):len(fdata),:] 
print(xtrain.shape)
print(xtest.shape)
scaler = preprocessing.StandardScaler().fit(xtrain)
x_scaled = scaler.transform(xtrain)
x_scaled = pd.DataFrame(x_scaled)

(19395, 50)
(21188, 50)


In [26]:
X_train, X_val, Y_train, Y_val = train_test_split(x_scaled, y_train, test_size=0.2, random_state=18)
lreg = LogisticRegression(max_iter=1000).fit(X_train, Y_train)
y_pred = lreg.predict_proba(X_val)
print("Validation ROC AUC:",roc_auc_score(Y_val, y_pred[:,1]))

Validation ROC AUC: 0.7064104052734468


In [27]:
scaler = preprocessing.MaxAbsScaler().fit(xtrain)
x_scaled = scaler.transform(xtrain)
x_scaled = pd.DataFrame(x_scaled)
X_train, X_val, Y_train, Y_val = train_test_split(x_scaled, y_train, test_size=0.2, random_state=18)
lreg = LogisticRegression(max_iter=1000).fit(X_train, Y_train)
y_pred = lreg.predict_proba(X_val)
print("Validation ROC AUC:",roc_auc_score(Y_val, y_pred[:,1]))

Validation ROC AUC: 0.708212064741724


In [28]:
scaler = preprocessing.MinMaxScaler().fit(xtrain)
x_scaled = scaler.transform(xtrain)
x_scaled = pd.DataFrame(x_scaled)
X_train, X_val, Y_train, Y_val = train_test_split(x_scaled, y_train, test_size=0.2, random_state=18)
lreg = LogisticRegression(max_iter=1000).fit(X_train, Y_train)
y_pred = lreg.predict_proba(X_val)
print("Validation ROC AUC:",roc_auc_score(Y_val, y_pred[:,1]))

Validation ROC AUC: 0.7083671997091457


In [29]:
wdata = pd.concat([x_train,test])
col_obj = wdata.columns[wdata.dtypes == 'object']
for i in col_obj:
    wdata.loc[:,i],_ = pd.factorize(wdata.loc[:,i])
wdata = wdata.drop(['SK_ID_CURR'],axis=1)
fdata = wdata.loc[:,vars]
print(fdata.isna().sum().sum())
fdata.head(3)

126744


Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_GOODS_PRICE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,...,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,FLAG_DOCUMENT_6,FLAG_DOCUMENT_11,FLAG_DOCUMENT_14,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_YEAR
0,0,0,0,202500.0,406597.5,351000.0,0,0,0,0.018801,...,2.0,-1134.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,1,0,270000.0,1293502.5,1129500.0,1,1,0,0.003541,...,0.0,-828.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,0,67500.0,135000.0,135000.0,0,0,0,0.010032,...,0.0,-815.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(fdata)
cols = fdata.columns
idata = imp.transform(fdata)
idata = pd.DataFrame(idata,columns=cols)

In [31]:
xtrain = idata.iloc[0:len(x_train),:]
xtest = idata.iloc[len(x_train):len(idata),:] 
scaler = preprocessing.StandardScaler().fit(xtrain)
x_scaled = scaler.transform(xtrain)
x_scaled = pd.DataFrame(x_scaled)
X_train, X_val, Y_train, Y_val = train_test_split(x_scaled, y_train, test_size=0.2, random_state=18)
lreg = LogisticRegression(max_iter=1000).fit(X_train, Y_train)
y_pred = lreg.predict_proba(X_val)
print("Validation ROC AUC:",roc_auc_score(Y_val, y_pred[:,1]))

Validation ROC AUC: 0.7191942879876044


In [32]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0).fit(X_train, Y_train)
y_pred = RF.predict_proba(X_val)
print("Validation ROC AUC:",roc_auc_score(Y_val, y_pred[:,1]))

Validation ROC AUC: 0.7077114451536026
