In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
#machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC

In [None]:
# Load the application dataset from the relative ./Data directory into `data`,
# the frame every later cell works on.
data=pd.read_csv('./Data/application_data.csv')

## Data Analysis

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# (rows, columns) of the frame as loaded.
data.shape

In [None]:
# Distinct dtypes that occur among the columns.
data.dtypes.unique()

In [None]:
# Missing-value count per column, most incomplete columns first.
# (DataFrame.value_counts() would instead count distinct *row patterns* of the
# boolean null mask, which is unreadable on a wide frame and does not tell
# which columns contain nulls.)
data.isnull().sum().sort_values(ascending=False)

In [None]:
# Names of the columns stored with the `object` dtype; reused by later cells.
obj_cols = data.select_dtypes(include='object').columns
obj_cols

In [None]:
# How many object-dtype columns there are (1-tuple).
obj_cols.shape

In [None]:
# Names of the columns stored as int64; reused by later cells.
int_cols = data.select_dtypes(include='int64').columns
int_cols

In [None]:
# How many int64 columns there are (1-tuple).
int_cols.shape

In [None]:
# Names of the columns stored as float64; reused by later cells.
float_cols = data.select_dtypes(include='float64').columns
float_cols

In [None]:
# How many float64 columns there are (1-tuple).
float_cols.shape

In [None]:
# Column count per dtype group, followed by their total, as a sanity check
# that the three groups account for the columns of the frame.
count_float_cols, count_int_cols, count_obj_cols = (
    len(group) for group in (float_cols, int_cols, obj_cols)
)
dtype_counts = (count_float_cols, count_int_cols, count_obj_cols)
for dtype_count in dtype_counts:
    print(dtype_count)
print(sum(dtype_counts))

In [None]:
# Number of distinct values (a NaN counts as one value) in each float column.
{name: len(data[name].unique()) for name in float_cols}

In [None]:
# Number of distinct values (a NaN counts as one value) in each object column.
{name: len(data[name].unique()) for name in obj_cols}

In [None]:
# Print every object column's name and its distinct values, framed by rules.
for col in obj_cols:
    print(
        "--------------------------------------------------",
        col,
        data[col].unique(),
        "--------------------------------------------------",
        sep="\n",
    )

In [None]:
# Print every int64 column's name and its distinct values, framed by rules.
for col in int_cols:
    print(
        "--------------------------------------------------",
        col,
        data[col].unique(),
        "--------------------------------------------------",
        sep="\n",
    )

In [None]:
# Frequency of each CODE_GENDER value; plotted in the next cell.
code_gender = data['CODE_GENDER'].value_counts()

In [None]:
# Bar chart of the CODE_GENDER frequencies computed above.
code_gender.plot.bar()

In [None]:
# First pruning pass: remove the external-source scores, the *_AVG / *_MODE /
# *_MEDI column families, and three categorical fields.
first_pass_drop = [
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
    'NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'OCCUPATION_TYPE',
    # *_AVG
    'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG',
    'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG',
    'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG',
    'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG',
    # *_MODE
    'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
    'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE',
    'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE',
    'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE',
    'TOTALAREA_MODE',
    # *_MEDI
    'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI',
    'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI',
    'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI',
    'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI',
]
data = data.drop(columns=first_pass_drop)

In [None]:
# Column labels that remain after the first drop.
data.columns

In [None]:
# (rows, columns) after the first drop.
data.shape

In [None]:
# Second pruning pass: remove demographic fields, DAYS_* offsets, contact
# flags and region/city mismatch flags.
# "FLAG_EMAIL" was listed twice in the original list; each label now appears
# exactly once.
second_pass_drop = [
    "CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "CNT_CHILDREN",
    "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE",
    "NAME_TYPE_SUITE", "REGION_POPULATION_RELATIVE",
    "DAYS_BIRTH", "DAYS_EMPLOYED", "DAYS_REGISTRATION", "DAYS_ID_PUBLISH",
    "OWN_CAR_AGE", "CNT_FAM_MEMBERS",
    "FLAG_MOBIL", "FLAG_EMP_PHONE", "FLAG_WORK_PHONE", "FLAG_CONT_MOBILE",
    "FLAG_PHONE", "FLAG_EMAIL",
    "REGION_RATING_CLIENT", "REGION_RATING_CLIENT_W_CITY",
    "REG_REGION_NOT_LIVE_REGION", "REG_REGION_NOT_WORK_REGION",
    "LIVE_REGION_NOT_WORK_REGION", "REG_CITY_NOT_LIVE_CITY",
    "REG_CITY_NOT_WORK_CITY", "LIVE_CITY_NOT_WORK_CITY",
]
data.drop(columns=second_pass_drop, inplace=True)

In [None]:
# Third pruning pass: remove the building-description categoricals, the
# social-circle counters, the FLAG_DOCUMENT_* flags and the credit-bureau
# enquiry counters.
third_pass_drop = [
    'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE',
    'EMERGENCYSTATE_MODE',
    'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
    'DAYS_LAST_PHONE_CHANGE',
    'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
    'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9',
    'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12',
    'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15',
    'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18',
    'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21',
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
]
data = data.drop(columns=third_pass_drop)

In [None]:
# Column labels that survive all three drops.
data.columns

In [None]:
# (rows, columns) after all three drops.
data.shape

In [None]:
# Per-column dtype and non-null count summary of the reduced frame.
data.info()

In [None]:
# Descriptive statistics of the reduced frame.
data.describe()

In [None]:
def missing(df1):
    """Summarise the missing values of every column in ``df1``.

    Returns a DataFrame indexed by column name with two columns:
    ``Missing_Number`` (count of null cells) and ``Missing_Percent`` (that
    count as a percentage of the number of rows), ordered from the most to
    the least incomplete column.
    """
    null_mask = df1.isnull()
    null_counts = null_mask.sum().sort_values(ascending=False)
    null_share = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)
    return pd.concat(
        {'Missing_Number': null_counts, 'Missing_Percent': null_share},
        axis=1,
    )

In [None]:
# Null count per column before imputation.
data.isnull().sum()

In [None]:
# Impute missing values column by column: numeric columns get their median,
# every other column gets its most frequent value.
#
# The filled column is assigned back explicitly. `data[i].fillna(...,
# inplace=True)` operates on a column selection; pandas 2.x warns about it and
# under Copy-on-Write the parent frame is left unchanged.
numeric_columns = set(data.select_dtypes('number').columns)
for i in data.columns:
    if i in numeric_columns:
        fill_value = data[i].median()
    else:
        modes = data[i].mode()
        if modes.empty:
            # Column is entirely null: there is no value to impute with.
            continue
        fill_value = modes[0]
    data[i] = data[i].fillna(fill_value)

# Show a small preview instead of printing the whole frame.
data.head()

In [None]:
# Null count per column after imputation.
data.isnull().sum()

In [None]:
# Preview the first rows after imputation.
data.head()

In [None]:
# Separate the feature columns by kind: numeric predictors (the TARGET label
# is excluded) versus object-dtype categoricals.
numerical = data.drop(columns=['TARGET']).select_dtypes('number').columns
categorical = data.select_dtypes('object').columns

print(f'Numerical Columns:  {numerical}')
print('\n')
print(f'Categorical Columns: {categorical}')

In [None]:
# Bar chart of how many columns have each dtype.
# The values are passed as `x=` explicitly: seaborn's plotting functions take
# their variables as keyword arguments, and a positional Series is either
# deprecated or treated as wide-form `data`, depending on the seaborn version.
sns.countplot(x=data.dtypes.map(str))
plt.show()

In [None]:
from sklearn import preprocessing

# Replace the NAME_INCOME_TYPE strings with integer codes; the fitted encoder
# is kept in `label_encoder`.
label_encoder = preprocessing.LabelEncoder()
income_type_codes = label_encoder.fit_transform(data['NAME_INCOME_TYPE'])
data['NAME_INCOME_TYPE'] = income_type_codes

data['NAME_INCOME_TYPE']

In [None]:
from sklearn import preprocessing

# Replace the ORGANIZATION_TYPE strings with integer codes; the fitted encoder
# is kept in `label_encoder` (a fresh instance, replacing any earlier one).
label_encoder = preprocessing.LabelEncoder()
organization_type_codes = label_encoder.fit_transform(data['ORGANIZATION_TYPE'])
data['ORGANIZATION_TYPE'] = organization_type_codes

data['ORGANIZATION_TYPE']

In [None]:
# Per-column summary of every numeric column: central tendency, shape of the
# distribution, remaining nulls, and the 1.5*IQR fences with the number of
# values falling outside them. The fences were previously computed but never
# reported.
for col in data.select_dtypes('number').columns:
    series = data[col]
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    IQR = q3 - q1
    llp = q1 - 1.5 * IQR  # lower fence
    ulp = q3 + 1.5 * IQR  # upper fence
    outlier_count = int(((series < llp) | (series > ulp)).sum())
    print('column name', col)
    print('mean:', series.mean())
    print('mode:', series.mode()[0])
    print('median:', series.median())
    print('skewness:', series.skew())
    print('kurtosis:', series.kurtosis())
    print('null_value count:', series.isnull().sum())
    print('IQR fences (lower, upper):', llp, ulp)
    print('outlier count:', outlier_count)
    print('\n')

In [None]:
# Outlier search: one box plot per numeric column.
# `subplots` must be a bool (or an iterable of column groups); the integer 1
# is rejected by current pandas. The grid is sized from the number of numeric
# columns so the layout always has room for every plot.
box_grid_cols = 3
box_numeric_count = data.select_dtypes('number').shape[1]
box_grid_rows = max(1, -(-box_numeric_count // box_grid_cols))  # ceiling division
data.plot(kind='box', subplots=True, layout=(box_grid_rows, box_grid_cols), figsize=(25, 11))
plt.show()

In [None]:
# Distribution (histogram + KDE) of every numeric column, one panel each.
# - The grid is created once with plt.subplots and sized from the column
#   count; previously a single full-figure axes was created and then a
#   hard-coded 6x7 grid was layered over it.
# - sns.distplot is deprecated; histplot with kde=True and stat="density" is
#   its documented replacement.
dist_columns = data.select_dtypes('number').columns
dist_grid_cols = 3
dist_grid_rows = max(1, -(-len(dist_columns) // dist_grid_cols))  # ceiling division
fig, axes = plt.subplots(dist_grid_rows, dist_grid_cols, figsize=(30, 25), squeeze=False)
panels = axes.ravel()
for panel, column_name in zip(panels, dist_columns):
    sns.histplot(data[column_name], kde=True, stat="density", ax=panel)
# Hide the panels of the grid that have no column to show.
for panel in panels[len(dist_columns):]:
    panel.set_visible(False)

plt.show()

In [None]:
# Pairwise correlation matrix of the columns of `data`, kept in `cor`.
cor = data.corr()
cor

In [None]:
# Annotated heatmap of the pairwise correlations between the columns.
correlations = data.corr()
ax = sns.heatmap(correlations, annot=True)
plt.show()

In [None]:
# Separate the label vector from the feature matrix.
y = data["TARGET"]
X = data.drop(columns=["TARGET"])

In [None]:
# (rows, feature columns) of the model input.
X.shape

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the partition
# reproducible. train_test_split is already imported in the first cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

X_train.shape, X_test.shape

In [None]:
# Fit a 10-tree random forest on the training partition (seeded for
# reproducibility). RandomForestClassifier is already imported in the first
# cell; fit() returns the estimator itself, which is displayed as the cell
# output.
rf_Classifier = RandomForestClassifier(n_estimators=10, random_state=0)
rf_Classifier.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out rows, kept in `y_pred` for the metrics below.
y_pred = rf_Classifier.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix (rows: true labels, columns: predictions) followed by the
# overall accuracy on the held-out rows.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Classification report: per-class precision, recall and F1 on the held-out rows.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
# Final (rows, columns) of the processed frame, including TARGET.
data.shape

In [None]:
# Final (rows, columns) of the feature matrix.
X.shape

In [None]:
# Length of the label vector (1-tuple).
y.shape