In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the two raw inputs; both paths are relative to the notebook's working directory.
apps = pd.read_csv('application_record.csv')
credit = pd.read_csv('credit_record.csv')

# Initial Exploration
## application_record.csv

In [None]:
# Column dtypes and non-null counts.
# Observed when this was written: 438,557 rows, with OCCUPATION_TYPE null for a considerable number of them.
apps.info()

In [None]:
apps.head()

In [None]:
apps.describe()

In [None]:
# Flip the sign of both day-count columns so the lengths of time read in a more intuitive
# direction, then add the age expressed in years (one decimal place).
# NOTE: the columns are overwritten in place, so running this cell twice flips the sign back.
apps['DAYS_EMPLOYED'] = -apps['DAYS_EMPLOYED']
apps['DAYS_BIRTH'] = -apps['DAYS_BIRTH']
apps['AGE_YEARS'] = apps['DAYS_BIRTH'].div(365).round(1)

In [None]:
apps.head()

In [None]:
apps.shape

In [None]:
# Number of distinct IDs. Compare with apps.shape: fewer distinct IDs than rows means some IDs are duplicated.
apps['ID'].nunique(dropna=False)

In [None]:
# Every row whose ID occurs more than once, ordered by ID so the repeats sit next to each other.
# Inspecting them shows that rows sharing an ID clearly do not describe the same person.
apps.loc[apps['ID'].duplicated(keep=False)].sort_values(by='ID')

### Gender

In [None]:
apps.CODE_GENDER.value_counts()

### Car Ownership

In [None]:
apps.FLAG_OWN_CAR.value_counts()

### Gender/Car Ownership Crosstab

In [None]:
pd.crosstab(apps.CODE_GENDER, apps.FLAG_OWN_CAR)

### Own Realty

In [None]:
apps.FLAG_OWN_REALTY.value_counts()

### Children

In [None]:
apps.CNT_CHILDREN.value_counts()

In [None]:
# Rows reporting more than 10 children.
# Author's observation from this output: a number of applicants have entered more than one
# application, resulting in multiple application IDs for the same person.
apps[apps.CNT_CHILDREN > 10]

In [None]:
apps.CNT_CHILDREN.describe()

### Income Total

In [None]:
# Summary statistics, each value rendered as a fixed-point string.
apps['AMT_INCOME_TOTAL'].describe().map('{:f}'.format)

### Income Type

In [None]:
apps.NAME_INCOME_TYPE.value_counts()

### Education Type

In [None]:
apps.NAME_EDUCATION_TYPE.value_counts()

### Family Status

In [None]:
apps.NAME_FAMILY_STATUS.value_counts()

### Housing Type

In [None]:
apps.NAME_HOUSING_TYPE.value_counts()

### Age in Days/Years

In [None]:
apps.DAYS_BIRTH.describe()

In [None]:
apps.AGE_YEARS.describe()

### Days Employed

In [None]:
apps.DAYS_EMPLOYED.describe()

### Mobile Phone Ownership

In [None]:
apps.FLAG_MOBIL.value_counts() # 100% of people own a mobile phone. Is this because the application is made through a mobile phone?

### Phone Ownership (Landline?)

In [None]:
apps.FLAG_PHONE.value_counts()

In [None]:
apps.FLAG_PHONE.mean()

### Phone Ownership - Work Line

In [None]:
apps.FLAG_WORK_PHONE.value_counts()

In [None]:
apps.FLAG_WORK_PHONE.mean()

### E-Mail Ownership

In [None]:
apps.FLAG_EMAIL.value_counts()

In [None]:
apps.FLAG_EMAIL.mean()

### Occupation Type

In [None]:
apps.OCCUPATION_TYPE.value_counts()

In [None]:
apps[apps['OCCUPATION_TYPE'].isna()]

In [None]:
# Income-type breakdown for the rows whose occupation type is null.
apps.loc[apps['OCCUPATION_TYPE'].isna(), 'NAME_INCOME_TYPE'].value_counts()

In [None]:
# Income-type breakdown for the rows that do have an occupation type.
apps.loc[apps['OCCUPATION_TYPE'].notna(), 'NAME_INCOME_TYPE'].value_counts()

In [None]:
apps[apps.NAME_INCOME_TYPE == 'Pensioner']

In [None]:
# Temporarily lift pandas' row/column display limits so the printed counts for every
# (occupation type, income type) pair are shown without truncation.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(apps.groupby(by='OCCUPATION_TYPE')['NAME_INCOME_TYPE'].value_counts())

### Count Family Members

In [None]:
apps.CNT_FAM_MEMBERS.describe()

## credit_record.csv

In [None]:
credit.info()

In [None]:
credit.head()

In [None]:
# Summary statistics, each value rendered as a fixed-point string.
credit['MONTHS_BALANCE'].describe().map('{:f}'.format)

In [None]:
credit.STATUS.value_counts()

`STATUS` codes:  
0: 1-29 days past due  
1: 30-59 days past due  
2: 60-89 days past due  
3: 90-119 days past due  
4: 120-149 days past due  
5: Overdue for more than 150 days, including bad debts and write-offs  

C: Paid off that month  
X: No loan for the month  

In [None]:
credit.shape

In [None]:
# Preview of the column names in lower case (nothing is assigned here).
list(map(str.lower, apps.columns))

In [None]:
# Lower-cased copy of the column labels; `apps` itself is left unchanged.
# NOTE(review): `columns_lower` is not referenced again in the visible notebook.
columns_lower = (apps.columns).str.lower()

In [None]:
# Group the credit records by ID.
# NOTE(review): `grouped` is not referenced again in the visible notebook.
grouped = credit.groupby('ID')

In [None]:
# Reshape to one row per ID and one column per MONTHS_BALANCE value, with STATUS as the cell value.
# DataFrame.pivot raises ValueError if any (ID, MONTHS_BALANCE) pair occurs more than once.
pivot_tb = credit.pivot(index = 'ID', columns = 'MONTHS_BALANCE', values = 'STATUS')

In [None]:
pivot_tb

In [None]:
# Re-read both files to start again from the raw data: the exploration above overwrote
# DAYS_EMPLOYED / DAYS_BIRTH and added AGE_YEARS on `apps` in place.
apps = pd.read_csv('application_record.csv')
credit = pd.read_csv('credit_record.csv')

In [None]:
def prep_applications(apps):
    '''
    Prepare the dataframe read from application_record.csv for analysis.

    The frame is modified in place (existing callers rely on that) and is also returned.

    Steps:
    1. Convert all column names to lower case.
    2. Replace nulls in occupation_type with 'Other'.
    3. Add employed_years, derived from days_employed (see the rules in the body).
    4. Add age: days_birth converted to whole years, rounded down.
    5. Convert every "Y" and "N" throughout the dataframe into 1 and 0.

    Parameters
    ----------
    apps : pandas.DataFrame
        Must contain OCCUPATION_TYPE, DAYS_EMPLOYED and DAYS_BIRTH columns (any letter case).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after the changes above.
    '''
    # Convert column names to lower case
    apps.columns = apps.columns.str.lower()

    # Fill null occupation types with 'Other'. The result is assigned back to the column:
    # calling fillna(..., inplace=True) on apps['occupation_type'] acts on a temporary Series,
    # which warns in recent pandas and leaves the frame untouched under Copy-on-Write.
    apps['occupation_type'] = apps['occupation_type'].fillna('Other')

    # Convert days employed to years employed (always stored as floats):
    #   below -365          -> whole years, rounded
    #   from -365 up to < 0 -> fractional years, left unrounded
    #   0 or above          -> days / 365, rounded
    # Vectorised, so a missing value stays NaN instead of raising inside round().
    days = apps['days_employed']
    apps['employed_years'] = np.where(
        days < -365,
        np.round(days / -365),
        np.where(days < 0, days / -365, np.round(days / 365)),
    )

    # Convert days_birth to age in completed years
    apps['age'] = np.floor(apps['days_birth'] / -365)

    # Convert Yes and Nos to 1s and 0s across the whole data frame
    apps.replace({'Y': 1, 'N': 0}, inplace=True)

    return apps

In [None]:
# Run the cleaning step. The function edits `apps` in place and returns that same frame,
# so binding the result makes the data flow explicit without changing what is displayed.
apps = prep_applications(apps)
apps

In [None]:
apps.employed_years.describe()

In [None]:
# Distribution of years employed. A title and axis labels let the figure stand on its own,
# and plt.show() keeps hist()'s (counts, bin edges, patches) return value out of the cell output.
fig, ax = plt.subplots()
ax.hist(apps.employed_years)
ax.set(title='Years employed', xlabel='employed_years', ylabel='Number of applications')
plt.show()

Pensioners show roughly 1,000 years of working history, which cannot be real. We will set those rows aside, compute the median number of years worked for each occupation_type / name_income_type combination from the remaining rows, and then use those medians to replace the ~1,000-year values.

In [None]:
# Rows whose employment length is implausible (more than 1000 years).
apps.loc[apps['employed_years'] > 1000]

It turns out that all of the pensioners fall into the 'Other' category for occupation type, so occupation type offers no further distinction for them.

In [None]:
# Occupation-type counts among the rows with fewer than 1000 employed years.
apps.loc[apps['employed_years'] < 1000, 'occupation_type'].value_counts()

In [None]:
apps.head()

In [None]:
# Keep only the first row seen for each id, then order the result by id.
apps.drop_duplicates(subset=['id']).sort_values(by='id')

In [2]:
from anthony_wrangle import get_reports_data

In [3]:
import pandas as pd
import numpy as np

In [4]:
apps = pd.read_csv('application_record.csv')

In [5]:
credit = pd.read_csv('credit_record.csv')

In [8]:
# get_reports_data comes from the project-local anthony_wrangle module. It is given the path
# of the credit CSV and returns three objects, each displayed in the cells that follow.
expanded, score, full_history = get_reports_data('credit_record.csv')

In [9]:
expanded

Unnamed: 0,id,0-29,120-149,30-59,60-89,90-119,bad_debt,no_debt,paid_off,months_active
0,5001711,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,5001712,10.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,19
2,5001713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,22
3,5001714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,15
4,5001715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,60
...,...,...,...,...,...,...,...,...,...,...
45980,5150482,12.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,18
45981,5150483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,18
45982,5150484,12.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,13
45983,5150485,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [10]:
score

Unnamed: 0,id,0-29,120-149,30-59,60-89,90-119,bad_debt,no_debt,paid_off,months_active,score
0,5001711,3.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,4,5.0
1,5001712,10.0,0.0,0.0,0.0,0.0,0.0,9.0,-0.0,19,38.0
2,5001713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-44.0,22,-22.0
3,5001714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-30.0,15,-15.0
4,5001715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-120.0,60,-60.0
...,...,...,...,...,...,...,...,...,...,...,...
45980,5150482,12.0,0.0,0.0,0.0,0.0,0.0,6.0,-0.0,18,36.0
45981,5150483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-36.0,18,-18.0
45982,5150484,12.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.0,13,26.0
45983,5150485,2.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,2,4.0


In [11]:
full_history

Unnamed: 0,id,account_months,status,1month_ago,2month_ago,3month_ago,4month_ago,5month_ago,6month_ago,7month_ago,...,51month_ago,52month_ago,53month_ago,54month_ago,55month_ago,56month_ago,57month_ago,58month_ago,59month_ago,60month_ago
0,5001711,3,0,0,0,X,,,,,...,,,,,,,,,,
1,5001712,18,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2,5001713,21,X,X,X,X,X,X,X,X,...,,,,,,,,,,
3,5001714,14,X,X,X,X,X,X,X,X,...,,,,,,,,,,
4,5001715,59,X,X,X,X,X,X,X,X,...,X,X,X,X,X,X,X,X,X,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45980,5150482,28,0,0,0,0,0,0,0,0,...,,,,,,,,,,
45981,5150483,17,X,X,X,X,X,X,X,X,...,,,,,,,,,,
45982,5150484,12,0,0,0,0,0,0,0,0,...,,,,,,,,,,
45983,5150485,1,0,0,,,,,,,...,,,,,,,,,,
