In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import plotly.offline as py
import plotly.graph_objs as go

import cufflinks as cf
# Put cufflinks into offline mode right after importing it.
cf.go_offline()

# List the contents of the input directory so the available files are visible.
input_dir = "../input"
print(os.listdir(input_dir))

In [None]:
# Load the train and test application tables from the input directory.
application_train, application_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/application_train.csv', '../input/application_test.csv')
)

In [None]:
# Show the training table's (rows, columns) shape, then preview its first five rows.
train_shape = application_train.shape
print(train_shape)
application_train.head(n=5)

In [None]:
# Show the test table's (rows, columns) shape, then preview its first five rows.
test_shape = application_test.shape
print(test_shape)
application_test.head(n=5)

In [None]:
# Summarise missing values per training column: absolute count and percentage
# of rows, each sorted from most to least missing.
train_null_counts = application_train.isnull().sum()
train_row_count = len(application_train)

total = train_null_counts.sort_values(ascending=False)
percent = (train_null_counts / train_row_count * 100).sort_values(ascending=False)

missing_application_train_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_application_train_data.head(70)

In [None]:
# Summarise missing values per test column: absolute count and percentage
# of rows, each sorted from most to least missing.
test_null_counts = application_test.isnull().sum()
test_row_count = len(application_test)

total = test_null_counts.sort_values(ascending=False)
percent = (test_null_counts / test_row_count * 100).sort_values(ascending=False)

missing_application_test_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_application_test_data.head(70)

In [None]:
# Pick the columns whose training-set missing percentage is below 45%, then
# restrict both tables to that column list (in the order of the summary table).
low_missing_mask = missing_application_train_data['Percent'] < 45.0
cols = list(missing_application_train_data.loc[low_missing_mask].index)

application_train = application_train.filter(items=cols)
application_test = application_test.filter(items=cols)

application_train

In [None]:
# Count loans per TARGET class in a fixed class order (0 first, then 1) so that
# each count is paired with the intended label regardless of which class is more
# frequent. value_counts() alone orders by frequency, which would silently swap
# the hard-coded labels below if class 1 ever outnumbered class 0.
# NOTE(review): assumes TARGET == 0 means repaid and TARGET == 1 means not
# repaid — confirm against the dataset description.
target = application_train["TARGET"].value_counts().reindex([0, 1], fill_value=0)

# Donut chart (hole=0.5) of the two class counts.
trace = go.Pie(
    labels=['Repayed', 'Not Repayed'],
    values=target,
    hole=0.5
)
data = [trace]

layout = go.Layout(
    title="Loan Repayed or not",
    titlefont=dict(
        size=20
    )
)

# Render the figure inline through plotly's offline mode.
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Tally how many training columns there are of each dtype.
column_dtypes = application_train.dtypes
column_dtypes.value_counts()

In [None]:
# Number of distinct values in each object-typed training column.
object_columns = application_train.select_dtypes(include='object')
object_columns.apply(pd.Series.nunique, axis=0)

In [None]:
# Integer-encode every object column whose `unique()` has at most two entries.
# The encoder is fitted on the training column and the same mapping is then
# applied to the matching test column.
label_encoder = LabelEncoder()
label_encoder_count = 0

for col in application_train.columns:
    train_column = application_train[col]
    if train_column.dtype != 'object':
        continue
    if len(train_column.unique()) > 2:
        continue

    application_train[col] = label_encoder.fit_transform(train_column)
    application_test[col] = label_encoder.transform(application_test[col])
    label_encoder_count += 1

print('%d columns were label encoded.' % label_encoder_count)

In [None]:
# One-hot encode the remaining categorical columns of both tables.
application_train, application_test = (
    pd.get_dummies(frame) for frame in (application_train, application_test)
)

In [None]:
# Shapes of the train and test tables after encoding.
tuple(frame.shape for frame in (application_train, application_test))

In [None]:
# Set the labels aside before aligning: the inner join on columns keeps only
# columns present in both tables (presumably TARGET is absent from the test
# table and would otherwise be lost — verify).
train_target = application_train['TARGET']

aligned_train, aligned_test = application_train.align(application_test, join='inner', axis=1)
application_train, application_test = aligned_train, aligned_test

In [None]:
# Shapes of the train and test tables after column alignment.
tuple(table.shape for table in (application_train, application_test))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# train_test_split is imported from sklearn.model_selection: the old
# sklearn.cross_validation module is not available in the scikit-learn
# releases that provide sklearn.impute.SimpleImputer, so importing both
# from one installation fails.
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
#from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
#from sklearn.neighbors import KNeighborsClassifier

# Features: every aligned training column except the SK_ID_CURR identifier.
X = application_train.drop('SK_ID_CURR', axis=1)
y = train_target

# Hold out 30% of the rows for validation. The fixed random_state makes the
# split (and every metric computed from it) reproducible across runs, and
# stratify=y keeps the class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the mean imputer on the training part only, then apply it to both parts.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

# Same pattern for standardisation: statistics come from the training part only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fit a logistic regression with class_weight='balanced' on the training split
# and predict hard class labels for the validation split.
lr = LogisticRegression(class_weight='balanced')
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

# Alternative models, kept as comments instead of inside a bare string literal:
# a string as the last expression of a cell is evaluated and echoed as the
# cell's output. Enable the matching imports before uncommenting any of these.
#
# xgb = XGBClassifier()
# xgb.fit(X_train, y_train)
# pred = xgb.predict(X_test)
#
# rf = RandomForestClassifier(n_estimators=200)
# rf.fit(X_train, y_train)
# pred = rf.predict(X_test)
#
# knn = KNeighborsClassifier(n_neighbors=3)
# knn.fit(X_train, y_train)
# pred = knn.predict(X_test)

In [None]:
# The classification report and confusion matrix are computed from the hard
# label predictions.
print(classification_report(y_test, pred), '\n')
print('Confusion Matrix:\n TN FP\n FN TP\n\n', confusion_matrix(y_test, pred), '\n')

# ROC AUC measures how well the model ranks samples, so it is scored on the
# predicted probability of the second class (column 1 of predict_proba) rather
# than on 0/1 labels, which would collapse the curve to a single threshold.
# The score is a fraction between 0 and 1, so it is printed without a '%' sign.
pred_proba = lr.predict_proba(X_test)[:, 1]
print('ROC AUC Score = {}'.format(np.round(roc_auc_score(y_test, pred_proba), 2)))

In [None]:
# Keep the test identifiers for the submission and drop them from the feature table.
SK_ID_CURR = application_test['SK_ID_CURR']
df_test = application_test.drop(labels='SK_ID_CURR', axis='columns')

In [None]:
# Transform the test features with the imputer and scaler that were fitted on
# the training split. Re-fitting them on the test table would fill and scale
# the test features with different statistics than the ones the model was
# trained on, so the learned coefficients would no longer match the inputs.
df_test = imputer.transform(df_test)
df_test = scaler.transform(df_test)

# Class probabilities for every test row (one column per class).
lr_pred = lr.predict_proba(df_test)

In [None]:
# Second column of the predicted probabilities (the values used for the submission).
second_class_proba = lr_pred[:, 1]
second_class_proba

In [None]:
# Assemble the submission table: test identifiers next to the second-column
# predicted probability, then preview the first five rows.
submission_columns = {
    'SK_ID_CURR': SK_ID_CURR,
    'TARGET': lr_pred[:, 1],
}
submission = pd.DataFrame(submission_columns)

submission.head(n=5)

In [None]:
# Write the submission table to CSV without the index column.
output_path = 'submission_less_features.csv'
submission.to_csv(output_path, index=False)