### Import Packages

In [2]:
import pandas as pd

### Import Data 

In [3]:
application = pd.read_csv("original_data/application_record.csv")
credit = pd.read_csv("original_data/credit_record.csv")

### Data Exploration

In [4]:
application.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [5]:
credit.head()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


We first drop duplicate IDs in application records and keeping the last one.

The duplicate IDs in the credit record is not dropped because it keeps records of multiple months, meaning that duplication is expected and needed.

In [6]:
application = application.drop_duplicates('ID', keep='last')

In [7]:
application.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 438510 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438510 non-null  int64  
 1   CODE_GENDER          438510 non-null  object 
 2   FLAG_OWN_CAR         438510 non-null  object 
 3   FLAG_OWN_REALTY      438510 non-null  object 
 4   CNT_CHILDREN         438510 non-null  int64  
 5   AMT_INCOME_TOTAL     438510 non-null  float64
 6   NAME_INCOME_TYPE     438510 non-null  object 
 7   NAME_EDUCATION_TYPE  438510 non-null  object 
 8   NAME_FAMILY_STATUS   438510 non-null  object 
 9   NAME_HOUSING_TYPE    438510 non-null  object 
 10  DAYS_BIRTH           438510 non-null  int64  
 11  DAYS_EMPLOYED        438510 non-null  int64  
 12  FLAG_MOBIL           438510 non-null  int64  
 13  FLAG_WORK_PHONE      438510 non-null  int64  
 14  FLAG_PHONE           438510 non-null  int64  
 15  FLAG_EMAIL       

In [8]:
credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   ID              1048575 non-null  int64 
 1   MONTHS_BALANCE  1048575 non-null  int64 
 2   STATUS          1048575 non-null  object
dtypes: int64(2), object(1)
memory usage: 24.0+ MB


In [9]:
# calculating the number of matching records from the two records
matched_id = set(application["ID"]) & set(credit["ID"])
print(f"Number of matched records: {len(matched_id)}")

Number of matched records: 36457


In [10]:
application_cat_cols = application.select_dtypes(include=["object"]).columns
application_num_cols = application.select_dtypes(exclude=["object"]).columns
credit_cat_cols = credit.select_dtypes(include=["object"]).columns
credit_num_cols = credit.select_dtypes(exclude=["object"]).columns

In [11]:
# count of the each possible values for categorical variables in application records
print("Application Records")
print("-------------------")
for col in application_cat_cols:
    value_set = set(application[col])
    value_dict = {val: 0 for val in value_set}
    for val in value_set:
        count = application.loc[application[col] == val, col].count()
        value_dict[val] = count
    print(f"{col}: {value_dict}")

# count of the each possible values for categorical variables in credit records
print("\nCredit Records")
print("---------------")
for col in credit_cat_cols:
    value_set = set(credit[col])
    value_dict = {val: 0 for val in value_set}
    for val in value_set:
        count = credit.loc[credit[col] == val, col].count()
        value_dict[val] = count
    print(f"{col}: {value_dict}")

Application Records
-------------------
CODE_GENDER: {'M': 144098, 'F': 294412}
FLAG_OWN_CAR: {'N': 275428, 'Y': 163082}
FLAG_OWN_REALTY: {'N': 134467, 'Y': 304043}
NAME_INCOME_TYPE: {'Pensioner': 75483, 'Commercial associate': 100739, 'Working': 226087, 'Student': 17, 'State servant': 36184}
NAME_EDUCATION_TYPE: {'Incomplete higher': 14849, 'Lower secondary': 4051, 'Higher education': 117509, 'Secondary / secondary special': 301789, 'Academic degree': 312}
NAME_FAMILY_STATUS: {'Civil marriage': 36524, 'Widow': 19671, 'Separated': 27249, 'Single / not married': 55268, 'Married': 299798}
NAME_HOUSING_TYPE: {'House / apartment': 393788, 'Co-op apartment': 1539, 'Office apartment': 3922, 'Municipal apartment': 14213, 'With parents': 19074, 'Rented apartment': 5974}
OCCUPATION_TYPE: {nan: 0, 'Cooking staff': 8074, 'Accountants': 15982, 'Drivers': 26085, 'Secretaries': 2044, 'Realty agents': 1041, 'High skill tech staff': 17287, 'Managers': 35483, 'Cleaning staff': 5845, 'Sales staff': 4109

The number of missing values in each column from application records is shown as follows:

In [13]:
application.isnull().sum()

ID                          0
CODE_GENDER                 0
FLAG_OWN_CAR                0
FLAG_OWN_REALTY             0
CNT_CHILDREN                0
AMT_INCOME_TOTAL            0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
NAME_HOUSING_TYPE           0
DAYS_BIRTH                  0
DAYS_EMPLOYED               0
FLAG_MOBIL                  0
FLAG_WORK_PHONE             0
FLAG_PHONE                  0
FLAG_EMAIL                  0
OCCUPATION_TYPE        134187
CNT_FAM_MEMBERS             0
dtype: int64

The number of missing values in each column from application records is shown as follows:

In [14]:
credit.isnull().sum()

ID                0
MONTHS_BALANCE    0
STATUS            0
dtype: int64