# Week 2 Data Wrangling

### Import Packages

In [1]:
#Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly as py
import datetime as dt
import scipy as scp
from scipy import stats
from scipy.stats import ttest_ind

import warnings
warnings.filterwarnings('ignore')

### Load Datasets

In [2]:
applications = pd.read_csv('../Data/application_record.csv')

In [3]:
records = pd.read_csv('../Data/credit_record.csv')

In [4]:
applications.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [5]:
records.head()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [6]:
applications.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438557 non-null  int64  
 1   CODE_GENDER          438557 non-null  object 
 2   FLAG_OWN_CAR         438557 non-null  object 
 3   FLAG_OWN_REALTY      438557 non-null  object 
 4   CNT_CHILDREN         438557 non-null  int64  
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAME_INCOME_TYPE     438557 non-null  object 
 7   NAME_EDUCATION_TYPE  438557 non-null  object 
 8   NAME_FAMILY_STATUS   438557 non-null  object 
 9   NAME_HOUSING_TYPE    438557 non-null  object 
 10  DAYS_BIRTH           438557 non-null  int64  
 11  DAYS_EMPLOYED        438557 non-null  int64  
 12  FLAG_MOBIL           438557 non-null  int64  
 13  FLAG_WORK_PHONE      438557 non-null  int64  
 14  FLAG_PHONE           438557 non-null  int64  
 15  FLAG_EMAIL       

In [7]:
records.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   ID              1048575 non-null  int64 
 1   MONTHS_BALANCE  1048575 non-null  int64 
 2   STATUS          1048575 non-null  object
dtypes: int64(2), object(1)
memory usage: 24.0+ MB


#### There are more data points in records frame than applications frame. merge the 2 on ID.

In [8]:
# subset data - drop variables that most likely won't affect the DV.
applications2 = applications.drop(['FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL'], axis=1)

In [9]:
# rename columns 
applications2.rename(columns={'CODE_GENDER': 'gender', 'FLAG_OWN_CAR' : 'ownsCar', 'FLAG_OWN_REALTY' : 'ownsRealty', 'CNT_CHILDREN' : 'numChildren', 'AMT_INCOME_TOTAL' : 'totalIncome', 'NAME_INCOME_TYPE' : 'incomeType', 'NAME_EDUCATION_TYPE' : 'eduLvl', 'NAME_FAMILY_STATUS': 'famStatus', 'NAME_HOUSING_TYPE' : 'housingType', 'DAYS_BIRTH' : 'daysBirth', 'DAYS_EMPLOYED' : 'daysEmpl', 'OCCUPATION_TYPE' : 'occupation', 'CNT_FAM_MEMBERS' : 'famSize'}, inplace = True)

In [10]:
# check for missing values
applications2.isna().sum()

ID                  0
gender              0
ownsCar             0
ownsRealty          0
numChildren         0
totalIncome         0
incomeType          0
eduLvl              0
famStatus           0
housingType         0
daysBirth           0
daysEmpl            0
occupation     134203
famSize             0
dtype: int64

In [11]:
# replace missing values for occupation type with "Not identified"
applications2['occupation'].fillna(value='Not identified', inplace=True)

In [12]:
# convert daysBirth to Age in years for clarity
applications2['ageYrs'] = np.ceil(pd.to_timedelta(applications2['daysBirth'], unit='D').dt.days / -365.25)
applications2.drop('daysBirth', axis = 1, inplace = True)

In [13]:
# convert daysEmpl to yrsEmpl to keep in the same units
applications2['yrsEmpl']=-applications2['daysEmpl']/365.2425
applications2.loc[applications2['yrsEmpl']<0,'yrsEmpl']=0
applications2.drop('daysEmpl', axis=1, inplace=True)

In [14]:
# Create unemployed indicator - according to source explanation, positive daysEmpl are unemployed. 1=has a job, 0=unemployed
applications2['UNEMPLOYED']=0
applications2.loc[-applications2['yrsEmpl']<0,'UNEMPLOYED']=1

In [15]:
applications2.head()

Unnamed: 0,ID,gender,ownsCar,ownsRealty,numChildren,totalIncome,incomeType,eduLvl,famStatus,housingType,occupation,famSize,ageYrs,yrsEmpl,UNEMPLOYED
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,Not identified,2.0,33.0,12.435574,1
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,Not identified,2.0,33.0,12.435574,1
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,Security staff,2.0,59.0,3.104787,1
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,Sales staff,1.0,53.0,8.353354,1
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,Sales staff,1.0,53.0,8.353354,1


In [16]:
applications2.UNEMPLOYED.unique()

array([1, 0], dtype=int64)

In [17]:
applications2.UNEMPLOYED.value_counts()

1    363228
0     75329
Name: UNEMPLOYED, dtype: int64

In [18]:
# merge the datasets
merged = pd.merge(applications2, records, on="ID")

In [19]:
merged.head()

Unnamed: 0,ID,gender,ownsCar,ownsRealty,numChildren,totalIncome,incomeType,eduLvl,famStatus,housingType,occupation,famSize,ageYrs,yrsEmpl,UNEMPLOYED,MONTHS_BALANCE,STATUS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,Not identified,2.0,33.0,12.435574,1,0,C
1,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,Not identified,2.0,33.0,12.435574,1,-1,C
2,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,Not identified,2.0,33.0,12.435574,1,-2,C
3,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,Not identified,2.0,33.0,12.435574,1,-3,C
4,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,Not identified,2.0,33.0,12.435574,1,-4,C


In [20]:
# check data types of all variables
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 777715 entries, 0 to 777714
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ID              777715 non-null  int64  
 1   gender          777715 non-null  object 
 2   ownsCar         777715 non-null  object 
 3   ownsRealty      777715 non-null  object 
 4   numChildren     777715 non-null  int64  
 5   totalIncome     777715 non-null  float64
 6   incomeType      777715 non-null  object 
 7   eduLvl          777715 non-null  object 
 8   famStatus       777715 non-null  object 
 9   housingType     777715 non-null  object 
 10  occupation      777715 non-null  object 
 11  famSize         777715 non-null  float64
 12  ageYrs          777715 non-null  float64
 13  yrsEmpl         777715 non-null  float64
 14  UNEMPLOYED      777715 non-null  int64  
 15  MONTHS_BALANCE  777715 non-null  int64  
 16  STATUS          777715 non-null  object 
dtypes: float64

In [21]:
# make sure continuous data is not too related (multicollinearity present if >.6/.7)
applications2.corr()

Unnamed: 0,ID,numChildren,totalIncome,famSize,ageYrs,yrsEmpl,UNEMPLOYED
ID,1.0,-0.005178,0.011179,-0.001862,0.005072,0.012096,0.002274
numChildren,-0.005178,1.0,0.019177,0.884781,-0.349072,0.038843,0.242624
totalIncome,0.011179,0.019177,1.0,0.011454,-0.053882,0.075149,0.141016
famSize,-0.001862,0.884781,0.011454,1.0,-0.306144,0.059437,0.235051
ageYrs,0.005072,-0.349072,-0.053882,-0.306144,1.0,-0.015476,-0.622177
yrsEmpl,0.012096,0.038843,0.075149,0.059437,-0.015476,1.0,0.412679
UNEMPLOYED,0.002274,0.242624,0.141016,0.235051,-0.622177,0.412679,1.0


#### IVs famSize and numChildren are highly positively correlated, but the others are not. This makes sense, as the more children there are in a family, the family size gets bigger.

In [22]:
# look at unique status types for applicant's records
merged['STATUS'].unique()

array(['C', '1', '0', 'X', '5', '4', '3', '2'], dtype=object)

#### From source explanation: C= account paid (up-to-date), X= 0 balance for the month, 0= payment overdue 1-29 days, 1= payment overdue 30-59 days, 2= payment overdue 60-89 days, 3= payment overdue 90-119 days, 4= overdue 120-149 days, 5= overdue >150 days. We will call anything more than 60 days payment past due delinquent/bad. 

In [23]:
merged.STATUS.value_counts()

C    329536
0    290654
X    145950
1      8747
5      1527
2       801
3       286
4       214
Name: STATUS, dtype: int64

In [24]:
# find unique months balance values
merged['MONTHS_BALANCE'].unique()

array([  0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9, -10, -11, -12,
       -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25,
       -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38,
       -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51,
       -52, -53, -54, -55, -56, -57, -58, -59, -60], dtype=int64)

## recode status types to account for 60 day cutoff. 1 = 'high risk' for any month where the individual's balance is >60 days overdue, and 0 is 'low risk'.

In [25]:
merged['GoodBad'] = merged['STATUS']
merged = merged.replace({'GoodBad' :{'C' : 'Good','X' : 'Good','0' : 'Good', '1' : 'Good','2' : 'Bad','3' : 'Bad','4' : 'Bad','5' : 'Bad'}})

In [26]:
mergedRes = merged.value_counts(subset=['ID', 'GoodBad']).unstack(fill_value=0).reset_index()

In [27]:
mergedRes.head()

GoodBad,ID,Bad,Good
0,5008804,0,16
1,5008805,0,15
2,5008806,0,30
3,5008808,0,5
4,5008809,0,5


In [28]:
#decide approval/rejection status based on applicant history majority
mergedRes.loc[(mergedRes['Good'] / mergedRes['Bad'] >= 1), 'ApprStatus'] = 1
mergedRes.loc[(mergedRes['Good'] / mergedRes['Bad'] < 1 ), 'ApprStatus'] = 0
mergedRes['ApprStatus'] = mergedRes['ApprStatus'].astype(int)
# combine datasets
merged3 = merged.merge(mergedRes, how='inner', on=['ID'])

In [29]:
merged3.head()

Unnamed: 0,ID,gender,ownsCar,ownsRealty,numChildren,totalIncome,incomeType,eduLvl,famStatus,housingType,...,famSize,ageYrs,yrsEmpl,UNEMPLOYED,MONTHS_BALANCE,STATUS,GoodBad,Bad,Good,ApprStatus
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,2.0,33.0,12.435574,1,0,C,Good,0,16,1
1,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,2.0,33.0,12.435574,1,-1,C,Good,0,16,1
2,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,2.0,33.0,12.435574,1,-2,C,Good,0,16,1
3,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,2.0,33.0,12.435574,1,-3,C,Good,0,16,1
4,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,2.0,33.0,12.435574,1,-4,C,Good,0,16,1


In [30]:
# Extract how many months account has been open for
merged4=pd.DataFrame(merged.groupby(['ID'])['MONTHS_BALANCE'].agg(min)).reset_index()

# Rename column
merged4.rename(columns={'MONTHS_BALANCE':'ACCOUNT_LENGTH'}, inplace=True)

# Make values positive
merged4['ACCOUNT_LENGTH']=-merged4['ACCOUNT_LENGTH']



In [31]:
merged4.head()

Unnamed: 0,ID,ACCOUNT_LENGTH
0,5008804,15
1,5008805,14
2,5008806,29
3,5008808,4
4,5008809,26


In [32]:
merged3.head()

Unnamed: 0,ID,gender,ownsCar,ownsRealty,numChildren,totalIncome,incomeType,eduLvl,famStatus,housingType,...,famSize,ageYrs,yrsEmpl,UNEMPLOYED,MONTHS_BALANCE,STATUS,GoodBad,Bad,Good,ApprStatus
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,2.0,33.0,12.435574,1,0,C,Good,0,16,1
1,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,2.0,33.0,12.435574,1,-1,C,Good,0,16,1
2,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,2.0,33.0,12.435574,1,-2,C,Good,0,16,1
3,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,2.0,33.0,12.435574,1,-3,C,Good,0,16,1
4,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,2.0,33.0,12.435574,1,-4,C,Good,0,16,1


In [33]:
# convert continuous variables to integers
merged3.numChildren = merged.numChildren.astype(int)
merged3.totalIncome = merged.totalIncome.astype(int)
merged3.ageYrs = merged.ageYrs.astype(int)
merged3.yrsEmpl = merged.yrsEmpl.astype(int)
merged3.famSize = merged.famSize.astype(int)

In [34]:
# sort dataFrame by ID so that each unique applicants' info appears together
mergedID = merged3.sort_values('ID')

In [35]:
# verify that acct length for one ID number matches the number of unique ID records
mergedID.head(17)

Unnamed: 0,ID,gender,ownsCar,ownsRealty,numChildren,totalIncome,incomeType,eduLvl,famStatus,housingType,...,famSize,ageYrs,yrsEmpl,UNEMPLOYED,MONTHS_BALANCE,STATUS,GoodBad,Bad,Good,ApprStatus
0,5008804,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,...,2,33,12,1,0,C,Good,0,16,1
15,5008804,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,...,2,33,12,1,-15,X,Good,0,16,1
14,5008804,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,...,2,33,12,1,-14,0,Good,0,16,1
13,5008804,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,...,2,33,12,1,-13,1,Good,0,16,1
12,5008804,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,...,2,33,12,1,-12,C,Good,0,16,1
10,5008804,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,...,2,33,12,1,-10,C,Good,0,16,1
9,5008804,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,...,2,33,12,1,-9,C,Good,0,16,1
8,5008804,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,...,2,33,12,1,-8,C,Good,0,16,1
11,5008804,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,...,2,33,12,1,-11,C,Good,0,16,1
6,5008804,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,...,2,33,12,1,-6,C,Good,0,16,1


In [36]:
mergedID.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 777715 entries, 0 to 171517
Data columns (total 21 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   ID              777715 non-null  int64 
 1   gender          777715 non-null  object
 2   ownsCar         777715 non-null  object
 3   ownsRealty      777715 non-null  object
 4   numChildren     777715 non-null  int32 
 5   totalIncome     777715 non-null  int32 
 6   incomeType      777715 non-null  object
 7   eduLvl          777715 non-null  object
 8   famStatus       777715 non-null  object
 9   housingType     777715 non-null  object
 10  occupation      777715 non-null  object
 11  famSize         777715 non-null  int32 
 12  ageYrs          777715 non-null  int32 
 13  yrsEmpl         777715 non-null  int32 
 14  UNEMPLOYED      777715 non-null  int64 
 15  MONTHS_BALANCE  777715 non-null  int64 
 16  STATUS          777715 non-null  object
 17  GoodBad         777715 non-nu

In [37]:
# drop columns used to come up with ApprovalStatus and save it in a new variable
mergedID2 = mergedID[['ID', 'gender', 'ownsCar', 'ownsRealty', 'numChildren', 'totalIncome', 'incomeType', 'eduLvl', 'famStatus', 'housingType', 'occupation', 'famSize', 'ageYrs', 'yrsEmpl', 'UNEMPLOYED', 'ApprStatus']]

In [38]:
mergedID2.head()

Unnamed: 0,ID,gender,ownsCar,ownsRealty,numChildren,totalIncome,incomeType,eduLvl,famStatus,housingType,occupation,famSize,ageYrs,yrsEmpl,UNEMPLOYED,ApprStatus
0,5008804,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,Not identified,2,33,12,1,1
15,5008804,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,Not identified,2,33,12,1,1
14,5008804,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,Not identified,2,33,12,1,1
13,5008804,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,Not identified,2,33,12,1,1
12,5008804,M,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,Not identified,2,33,12,1,1


In [39]:
#save dataset as csv
mergedID2.to_csv("../Data/cleanMerge1.csv")

## Recode String Variables

In [49]:
# recode, drop original string columns, and ensure int data type
def genderRecode (series):
    if series == "F":
        return 0
    if series == "M":
        return 1
mergedID2['genderR'] = mergedID2['gender'].apply(genderRecode)
mergedID2.drop('gender', axis=1, inplace=True)

In [50]:
mergedID2.head()

Unnamed: 0,ID,ownsCar,ownsRealty,numChildren,totalIncome,incomeType,eduLvl,famStatus,housingType,occupation,famSize,ageYrs,yrsEmpl,UNEMPLOYED,ApprStatus,genderR
0,5008804,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,Not identified,2,33,12,1,1,1
15,5008804,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,Not identified,2,33,12,1,1,1
14,5008804,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,Not identified,2,33,12,1,1,1
13,5008804,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,Not identified,2,33,12,1,1,1
12,5008804,Y,Y,0,427500,Working,Higher education,Civil marriage,Rented apartment,Not identified,2,33,12,1,1,1


In [52]:
mergedID2.genderR = mergedID2.genderR.astype(int)

In [56]:
def carRecode (series):
    if series == "N":
        return 0
    if series == "Y":
        return 1
mergedID2['ownsCarR'] = mergedID2['ownsCar'].apply(carRecode)
mergedID2.drop('ownsCar', axis=1, inplace=True)

In [57]:
mergedID2.ownsCarR = mergedID2.ownsCarR.astype(int)

In [58]:
def realtyRecode (series):
    if series == "N":
        return 0
    if series == "Y":
        return 1
mergedID2['ownsRealtyR'] = mergedID2['ownsRealty'].apply(realtyRecode)
mergedID2.drop('ownsRealty', axis=1, inplace=True)

In [60]:
mergedID2.ownsRealtyR = mergedID2.ownsRealtyR.astype(int)

In [61]:
mergedID2['occupation'].unique()

array(['Not identified', 'Security staff', 'Sales staff', 'Accountants',
       'Laborers', 'Managers', 'Drivers', 'Core staff',
       'High skill tech staff', 'Cleaning staff', 'Private service staff',
       'Cooking staff', 'Low-skill Laborers', 'Medicine staff',
       'Secretaries', 'Waiters/barmen staff', 'HR staff', 'Realty agents',
       'IT staff'], dtype=object)

In [63]:
def occuR (series):
    if series == "Not identified":
        return 0
    if series == "Security staff":
        return 1
    if series == "Sales staff":
        return 2
    if series == "Accountants":
        return 3
    if series == "Laborers":
        return 4
    if series == "Managers":
        return 5
    if series == "Drivers":
        return 6
    if series == "Core staff":
        return 7
    if series == "High skill tech staff":
        return 8
    if series == "Cleaning staff":
        return 9
    if series == "Private service staff":
        return 10
    if series == "Cooking staff":
        return 11
    if series == "Low-skill Laborers":
        return 12
    if series == "Medicine staff":
        return 13
    if series == "Secretaries":
        return 14
    if series == "Waiters/barmen staff":
        return 15
    if series == "HR staff":
        return 16
    if series == "Realty agents":
        return 17
    if series == "IT staff":
        return 18
    
mergedID2['occupationR'] = mergedID2['occupation'].apply(occuR)
mergedID2.drop('occupation', axis=1, inplace=True)

In [64]:
mergedID2.occupationR = mergedID2.occupationR.astype(int)

In [65]:
mergedID2['eduLvl'].unique()

array(['Higher education', 'Secondary / secondary special',
       'Incomplete higher', 'Lower secondary', 'Academic degree'],
      dtype=object)

In [66]:
def eduR (series):
    if series == "Higher education":
        return 0
    if series == "Secondary / secondary special":
        return 1
    if series == "Incomplete higher":
        return 2
    if series == "Lower secondary":
        return 3
    if series == "Academic degree":
        return 4
    
mergedID2['eduLvlR'] = mergedID2['eduLvl'].apply(eduR)
mergedID2.drop('eduLvl', axis=1, inplace=True)

In [67]:
mergedID2.eduLvlR = mergedID2.eduLvlR.astype(int)

In [69]:
mergedID2['incomeType'].unique()

array(['Working', 'Commercial associate', 'Pensioner', 'State servant',
       'Student'], dtype=object)

In [70]:
def incR (series):
    if series == "Working":
        return 0
    if series == "Commercial associate":
        return 1
    if series == "Pensioner":
        return 2
    if series == "State servant":
        return 3
    if series == "Student":
        return 4
    
mergedID2['incomeTypeR'] = mergedID2['incomeType'].apply(incR)
mergedID2.drop('incomeType', axis=1, inplace=True)

In [71]:
mergedID2.incomeTypeR = mergedID2.incomeTypeR.astype(int)

In [72]:
mergedID2['housingType'].unique()

array(['Rented apartment', 'House / apartment', 'Municipal apartment',
       'With parents', 'Co-op apartment', 'Office apartment'],
      dtype=object)

In [74]:
def housR (series):
    if series == "Rented apartment":
        return 0
    if series == "House / apartment":
        return 1
    if series == "Municipal apartment":
        return 2
    if series == "With parents":
        return 3
    if series == "Co-op apartment":
        return 4
    if series == "Office apartment":
        return 5
    
mergedID2['housingTypeR'] = mergedID2['housingType'].apply(housR)
mergedID2.drop('housingType', axis=1, inplace=True)

In [75]:
mergedID2.housingTypeR = mergedID2.housingTypeR.astype(int)

In [76]:
mergedID2['famStatus'].unique()

array(['Civil marriage', 'Married', 'Single / not married', 'Separated',
       'Widow'], dtype=object)

In [77]:
def famR (series):
    if series == "Civil marriage":
        return 0
    if series == "Married":
        return 1
    if series == "Single / not married":
        return 2
    if series == "Separated":
        return 3
    if series == "Widow":
        return 4
    
mergedID2['famStatusR'] = mergedID2['famStatus'].apply(famR)
mergedID2.drop('famStatus', axis=1, inplace=True)

In [78]:
mergedID2.famStatusR = mergedID2.famStatusR.astype(int)

In [79]:
mergedID2.head()

Unnamed: 0,ID,numChildren,totalIncome,famSize,ageYrs,yrsEmpl,UNEMPLOYED,ApprStatus,genderR,ownsCarR,ownsRealtyR,occupationR,eduLvlR,incomeTypeR,housingTypeR,famStatusR
0,5008804,0,427500,2,33,12,1,1,1,1,1,0,0,0,0,0
15,5008804,0,427500,2,33,12,1,1,1,1,1,0,0,0,0,0
14,5008804,0,427500,2,33,12,1,1,1,1,1,0,0,0,0,0
13,5008804,0,427500,2,33,12,1,1,1,1,1,0,0,0,0,0
12,5008804,0,427500,2,33,12,1,1,1,1,1,0,0,0,0,0


# Next week, I will continue with data exploration on my categorical and continuous variables using mergedID data frame, balance out any variables that may need it, look for outliers and decide what to do with them, and make sure I know which analyses I'll run and ensure I have all of my data ready for it.