# Week 1 - Data Wrangling

### import packages

In [1]:
#Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly as py
import datetime as dt
import scipy as scp
from scipy import stats
from scipy.stats import ttest_ind

import warnings
warnings.filterwarnings('ignore')

### create dataFrame

In [2]:
# Read application_record.csv into a DataFrame; the path is relative to the working directory.
applications = pd.read_csv('../Data/application_record.csv')

In [3]:
# Read credit_record.csv into a DataFrame; the path is relative to the working directory.
records = pd.read_csv('../Data/credit_record.csv')

In [4]:
applications.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [5]:
# it appears that there may be some duplicate records. deal with this later.

In [6]:
# check for missing values
applications.isna().sum()

ID                          0
CODE_GENDER                 0
FLAG_OWN_CAR                0
FLAG_OWN_REALTY             0
CNT_CHILDREN                0
AMT_INCOME_TOTAL            0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
NAME_HOUSING_TYPE           0
DAYS_BIRTH                  0
DAYS_EMPLOYED               0
FLAG_MOBIL                  0
FLAG_WORK_PHONE             0
FLAG_PHONE                  0
FLAG_EMAIL                  0
OCCUPATION_TYPE        134203
CNT_FAM_MEMBERS             0
dtype: int64

In [7]:
# drop missing values
# Removes, in place, every row that contains at least one NaN. In the isna() summary
# above, OCCUPATION_TYPE is the only column with missing values (134,203 rows), so
# those are the rows that disappear here.
# NOTE(review): this discards every applicant without a recorded occupation - confirm
# that dropping them (rather than labelling them, e.g. 'Unknown') is intended.
applications.dropna(inplace = True)

In [8]:
applications.isna().sum()

ID                     0
CODE_GENDER            0
FLAG_OWN_CAR           0
FLAG_OWN_REALTY        0
CNT_CHILDREN           0
AMT_INCOME_TOTAL       0
NAME_INCOME_TYPE       0
NAME_EDUCATION_TYPE    0
NAME_FAMILY_STATUS     0
NAME_HOUSING_TYPE      0
DAYS_BIRTH             0
DAYS_EMPLOYED          0
FLAG_MOBIL             0
FLAG_WORK_PHONE        0
FLAG_PHONE             0
FLAG_EMAIL             0
OCCUPATION_TYPE        0
CNT_FAM_MEMBERS        0
dtype: int64

In [9]:
# Subset the data: leave out the phone / e-mail flag columns, which most likely
# won't affect the DV. `applications` itself is not modified.
applications2 = applications.drop(
    columns=['FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL']
)

In [10]:
# Rename the remaining columns to short camelCase names (in place on applications2).
applications2.rename(
    columns={
        'CODE_GENDER': 'gender',
        'FLAG_OWN_CAR': 'ownsCar',
        'FLAG_OWN_REALTY': 'ownsRealty',
        'CNT_CHILDREN': 'numChildren',
        'AMT_INCOME_TOTAL': 'totalIncome',
        'NAME_INCOME_TYPE': 'incomeType',
        'NAME_EDUCATION_TYPE': 'eduLvl',
        'NAME_FAMILY_STATUS': 'famStatus',
        'NAME_HOUSING_TYPE': 'housingType',
        'DAYS_BIRTH': 'daysBirth',
        'DAYS_EMPLOYED': 'daysEmpl',
        'OCCUPATION_TYPE': 'occupation',
        'CNT_FAM_MEMBERS': 'famSize',
    },
    inplace=True,
)

In [11]:
# Convert daysBirth to age in years for easier interpretation.
# daysBirth holds whole-day counts (negative in the data preview), so it is divided
# directly; routing it through a timedelta and back yields the same numbers.
# Dividing by -365.25 flips the sign, and np.ceil rounds any partial year up.
# NOTE(review): the employment conversion uses 365.2425 days per year - consider
# using one constant for both conversions.
applications2['ageYrs'] = np.ceil(applications2['daysBirth'] / -365.25)
# Drop the source column now that ageYrs replaces it.
applications2.drop(columns='daysBirth', inplace=True)

In [12]:
# Express employment length in years so it is in the same units as age.
# daysEmpl is negated before dividing; any result that is still below zero is
# raised to 0 by clip().
applications2['yrsEmpl'] = (-applications2['daysEmpl'] / 365.2425).clip(lower=0)
# Drop the source column now that yrsEmpl replaces it.
applications2.drop(columns='daysEmpl', inplace=True)

In [13]:
# Drop duplicate applicants: rows that agree on every column except ID are treated
# as duplicates and only the first occurrence is kept.
# .copy() gives applications3 its own data, so columns added to it later are not
# written to a slice of applications2 (which would raise SettingWithCopyWarning).
applications3 = applications2.drop_duplicates(
    subset=applications2.columns.drop('ID'), keep='first'
).copy()

In [14]:
applications3.head()

Unnamed: 0,ID,gender,ownsCar,ownsRealty,numChildren,totalIncome,incomeType,eduLvl,famStatus,housingType,occupation,famSize,ageYrs,yrsEmpl
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,Security staff,2.0,59.0,3.104787
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,Sales staff,1.0,53.0,8.353354
10,5008815,M,Y,Y,0,270000.0,Working,Higher education,Married,House / apartment,Accountants,2.0,47.0,2.10545
13,5008819,M,Y,Y,0,135000.0,Commercial associate,Secondary / secondary special,Married,House / apartment,Laborers,2.0,49.0,3.269061
19,5008825,F,Y,N,0,130500.0,Working,Incomplete higher,Married,House / apartment,Accountants,2.0,30.0,3.019911


In [15]:
# Make sure no pair of numeric variables is too strongly related (|r| > .9).
# Only the numeric columns are passed to corr(): the frame still contains string
# columns (gender, ownsCar, ...), and pandas >= 2.0 raises on them instead of
# silently leaving them out.
applications3.select_dtypes(include='number').corr()

Unnamed: 0,ID,numChildren,totalIncome,famSize,ageYrs,yrsEmpl
ID,1.0,-0.002885,-0.006742,-0.002113,-0.004054,-0.006438
numChildren,-0.002885,1.0,-0.01482,0.893676,-0.237324,-0.065774
totalIncome,-0.006742,-0.01482,1.0,-0.014715,0.065162,0.03104
famSize,-0.002113,0.893676,-0.014715,1.0,-0.17174,-0.032965
ageYrs,-0.004054,-0.237324,0.065162,-0.17174,1.0,0.346357
yrsEmpl,-0.006438,-0.065774,0.03104,-0.032965,0.346357,1.0


In [None]:
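# No pair exceeds the .9 cutoff, but numChildren and famSize are strongly correlated (r = 0.89), so they should not be treated as independent of each other. All other pairs are only weakly to moderately related (|r| < 0.35).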

In [None]:
# dummy code the binary variables (gender, ownsCar, ownsRealty) as 0/1

def genderRecode (series):
    """Dummy-code one gender label.

    Despite the parameter name, `series` is a single value: the function is
    applied element by element.

    Returns 0 for "F", 1 for "M", and None for anything else.
    """
    for label, code in (("F", 0), ("M", 1)):
        if series == label:
            return code
    return None
# Add the 0/1 gender column. Labels genderRecode does not match come back as None,
# so they would show up as missing values in genderR.
applications3['genderR'] = applications3['gender'].apply(genderRecode)


In [None]:
def carRecode (series):
    """Dummy-code one car-ownership flag.

    Despite the parameter name, `series` is a single value: the function is
    applied element by element.

    Returns 0 for "N", 1 for "Y", and None for anything else.
    """
    for label, code in (("N", 0), ("Y", 1)):
        if series == label:
            return code
    return None
# Add the 0/1 car-ownership column. Labels carRecode does not match come back as
# None, so they would show up as missing values in ownsCarR.
applications3['ownsCarR'] = applications3['ownsCar'].apply(carRecode)


In [None]:
def realtyRecode (series):
    """Dummy-code one realty-ownership flag.

    Despite the parameter name, `series` is a single value: the function is
    applied element by element.

    Returns 0 for "N", 1 for "Y", and None for anything else.
    """
    for label, code in (("N", 0), ("Y", 1)):
        if series == label:
            return code
    return None
# Add the 0/1 realty-ownership column. Labels realtyRecode does not match come back
# as None, so they would show up as missing values in ownsRealtyR.
applications3['ownsRealtyR'] = applications3['ownsRealty'].apply(realtyRecode)

In [16]:
applications3.head()

Unnamed: 0,ID,gender,ownsCar,ownsRealty,numChildren,totalIncome,incomeType,eduLvl,famStatus,housingType,occupation,famSize,ageYrs,yrsEmpl
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,Security staff,2.0,59.0,3.104787
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,Sales staff,1.0,53.0,8.353354
10,5008815,M,Y,Y,0,270000.0,Working,Higher education,Married,House / apartment,Accountants,2.0,47.0,2.10545
13,5008819,M,Y,Y,0,135000.0,Commercial associate,Secondary / secondary special,Married,House / apartment,Laborers,2.0,49.0,3.269061
19,5008825,F,Y,N,0,130500.0,Working,Incomplete higher,Married,House / apartment,Accountants,2.0,30.0,3.019911


In [None]:
# Check out what we have for records
records.head()

In [None]:
records.STATUS.value_counts()

In [None]:
# STATUS will be recoded to a binary flag: 1 = bad, 0 = good

In [None]:
# find unique months balance values
records['MONTHS_BALANCE'].unique()

In [None]:
# look at unique status types for applicant's records
records['STATUS'].unique()

In [None]:
# recode status types
def status(s):
    """Collapse one raw STATUS code into a binary flag.

    Returns 0 for "C", "X", "0" and "1"; 1 for "2", "3", "4" and "5";
    None for any other value (only the string codes are recognised).
    """
    if s in ("C", "X", "0", "1"):
        return 0
    if s in ("2", "3", "4", "5"):
        return 1
    return None

In [None]:
# Recode STATUS to the 0/1 flag, overwriting the raw codes.
# The guard makes this cell safe to re-run: once the column is numeric it has
# already been recoded, and a second pass through status() would turn every value
# into None because status() only recognises the original string codes.
if not pd.api.types.is_numeric_dtype(records['STATUS']):
    records['STATUS'] = records['STATUS'].apply(status)

In [None]:
records.head()

In [None]:
# Merge the datasets: inner join of the de-duplicated applicants with the credit
# records on ID, so only IDs present in both frames are kept.
merged = applications3.merge(records, how="inner", on="ID")

In [None]:
merged.head()

# Independent t-test

## Is there a difference in total income between M and F in this dataset?

### Test Assumptions

In [None]:
# Normality of the DV (totalIncome)

In [None]:
# Histogram of applicant income, used to eyeball the normality of the DV.
num_bins = 250
n, bins, patches = plt.hist(
    applications3['totalIncome'],
    bins=num_bins,
    facecolor='green',
    alpha=0.8,
)
plt.xlabel('Income')
plt.ylabel('Frequency')
plt.title('Applicant Income Dist')

In [None]:
# Density histogram of income with a KDE curve on top.
# sns.distplot has been deprecated since seaborn 0.11; histplot with kde=True and
# stat='density' is the replacement recommended for this kind of plot.
sns.histplot(applications3['totalIncome'], kde=True, stat='density')

In [None]:
applications3.totalIncome[applications3.gender == 'M'].hist()

In [None]:
applications3.totalIncome[applications3.gender == 'F'].hist()

In [None]:
# Independent two-sample t-test on totalIncome: gender 'M' vs. gender 'F'.
# NOTE(review): ttest_ind assumes equal group variances by default; consider
# equal_var=False (Welch's test) if the two groups' variances differ.
ttest_ind(
    applications3.loc[applications3['gender'] == 'M', 'totalIncome'],
    applications3.loc[applications3['gender'] == 'F', 'totalIncome'],
)

# p value is <.05, so there is a significant difference in income between males and females in this dataset. Outliers in income may be influencing this analysis.

In [None]:
# Look for outliers
applications3.boxplot('totalIncome')

## Is there a difference in total income between those who own cars and those who don't in this dataset?

In [None]:
applications3.totalIncome[applications3.ownsCar == 'Y'].hist()

In [None]:
applications3.totalIncome[applications3.ownsCar == 'N'].hist()

In [None]:
# Independent two-sample t-test on totalIncome: ownsCar 'Y' vs. ownsCar 'N'.
# NOTE(review): ttest_ind assumes equal group variances by default; consider
# equal_var=False (Welch's test) if the two groups' variances differ.
ttest_ind(
    applications3.loc[applications3['ownsCar'] == 'Y', 'totalIncome'],
    applications3.loc[applications3['ownsCar'] == 'N', 'totalIncome'],
)

### p value is <.05, so there is a significant difference in income between applicants who own a car and those who don't in this dataset. 