# How to use wrangle.py and preprocessing.py functions

In [1]:
import pandas as pd
import numpy as np
from wrangle import get_application_data
from wrangle import get_reports_data
import preprocessing as prep

### `get_reports_data(creditrecordcsv)`

Getting the data from `credit_record.csv` is simple:

In [2]:
expanded, score, full_history = get_reports_data('credit_record.csv')

What does `get_reports_data` do for us? 

Returns `expanded`, `score`, and `full_history`

1. Returns `expanded` - This dataframe contains unique observations representing the history of a single credit card.
    - `id`: Unique ID (may have a matching ID in the application data)
    - `0-29`: Number of months within a 60 month period that a balance was 0-29 days past due
    - `30-59`: Number of months within a 60 month period that a balance was 30-59 days past due
    - `60-89`: Number of months within a 60 month period that a balance was 60-89 days past due
    - `90-119`: Number of months within a 60 month period that a balance was 90-119 days past due
    - `120-149`: Number of months within a 60 month period that a balance was 120-149 days past due
    - `bad_debt`: Number of months within a 60 month period that a balance was 150+ days past due
    - `no_debt`: Number of months within a 60 month period that there was no debt on record
    - `paid_off`: Number of months within a 60 month period where the existing debt was paid off
    - `months_active`: Number of months that the credit line has been active, up to 60 months

In [3]:
expanded

Unnamed: 0,id,0-29,30-59,60-89,90-119,120-149,bad_debt,no_debt,paid_off,months_active
0,5001711,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,5001712,10.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,19
2,5001713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,22
3,5001714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,15
4,5001715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,60
...,...,...,...,...,...,...,...,...,...,...
45980,5150482,12.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,18
45981,5150483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,18
45982,5150484,12.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,13
45983,5150485,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [4]:
expanded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45985 entries, 0 to 45984
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             45985 non-null  int64  
 1   0-29           45985 non-null  float64
 2   30-59          45985 non-null  float64
 3   60-89          45985 non-null  float64
 4   90-119         45985 non-null  float64
 5   120-149        45985 non-null  float64
 6   bad_debt       45985 non-null  float64
 7   no_debt        45985 non-null  float64
 8   paid_off       45985 non-null  float64
 9   months_active  45985 non-null  int64  
dtypes: float64(8), int64(2)
memory usage: 3.9 MB


2. Returns `score`
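- This is similar to `expanded` except for the addition of the `score` column. Note that in the sample output below the `paid_off` column holds the weighted value (e.g. `-2.0` where `expanded` shows `1.0`), so the columns are not identical to those in `expanded`.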
- The score is equal to the following formula:

        Number of months 30-59 days past due * 2
        + 
        Number of months 60-89 days past due * 3
        +
        Number of months 90-119 days past due * 4
        + 
        Number of months 120-149 days past due * 5
        + 
        Number of months 150+ days past due (bad_debt) * 6
        -
        Number of months debt was paid off (paid_off) * 2
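- Note that the `id` column for score is an object (string) type, whereas `id` in `expanded` is an integer (`int64`).
- Caveat: in every row of the sample output below, `score` equals the sum of all the other columns in that row except `id` (including `0-29`, `no_debt`, and `months_active`), which are not part of the formula above. Confirm the intended formula against `wrangle.py`.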

In [5]:
score

Unnamed: 0,id,0-29,30-59,60-89,90-119,120-149,bad_debt,no_debt,paid_off,months_active,score
0,5001711,3.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,4,5.0
1,5001712,10.0,0.0,0.0,0.0,0.0,0.0,9.0,-0.0,19,38.0
2,5001713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-44.0,22,-22.0
3,5001714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-30.0,15,-15.0
4,5001715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-120.0,60,-60.0
...,...,...,...,...,...,...,...,...,...,...,...
45980,5150482,12.0,0.0,0.0,0.0,0.0,0.0,6.0,-0.0,18,36.0
45981,5150483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-36.0,18,-18.0
45982,5150484,12.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.0,13,26.0
45983,5150485,2.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,2,4.0


In [6]:
score.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45985 entries, 0 to 45984
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             45985 non-null  object 
 1   0-29           45985 non-null  float64
 2   30-59          45985 non-null  float64
 3   60-89          45985 non-null  float64
 4   90-119         45985 non-null  float64
 5   120-149        45985 non-null  float64
 6   bad_debt       45985 non-null  float64
 7   no_debt        45985 non-null  float64
 8   paid_off       45985 non-null  float64
 9   months_active  45985 non-null  int64  
 10  score          45985 non-null  float64
dtypes: float64(9), int64(1), object(1)
memory usage: 4.2+ MB


3. Returns `full_history`
- Like `expanded`, this dataframe shows the credit record for each unique ID.
- `account_months`: The length of time, in months, that the credit line has been active
- `status`: The most recent month's code:
    - 0: 1-29 days past due 
    - 1: 30-59 days past due 
    - 2: 60-89 days overdue 
    - 3: 90-119 days overdue 
    - 4: 120-149 days overdue 
    - 5: Overdue or bad debts, write-offs for more than 150 days 
    - C: paid off that month 
    - X: No loan for the month
- The remaining columns show the code for the preceding months in reverse chronological order.

In [7]:
full_history

Unnamed: 0,id,account_months,status,1month_ago,2month_ago,3month_ago,4month_ago,5month_ago,6month_ago,7month_ago,...,51month_ago,52month_ago,53month_ago,54month_ago,55month_ago,56month_ago,57month_ago,58month_ago,59month_ago,60month_ago
0,5001711,3,0,0,0,X,,,,,...,,,,,,,,,,
1,5001712,18,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2,5001713,21,X,X,X,X,X,X,X,X,...,,,,,,,,,,
3,5001714,14,X,X,X,X,X,X,X,X,...,,,,,,,,,,
4,5001715,59,X,X,X,X,X,X,X,X,...,X,X,X,X,X,X,X,X,X,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45980,5150482,28,0,0,0,0,0,0,0,0,...,,,,,,,,,,
45981,5150483,17,X,X,X,X,X,X,X,X,...,,,,,,,,,,
45982,5150484,12,0,0,0,0,0,0,0,0,...,,,,,,,,,,
45983,5150485,1,0,0,,,,,,,...,,,,,,,,,,


In [8]:
full_history.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45985 entries, 0 to 45984
Data columns (total 63 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              45985 non-null  int64 
 1   account_months  45985 non-null  int64 
 2   status          45985 non-null  object
 3   1month_ago      45586 non-null  object
 4   2month_ago      44498 non-null  object
 5   3month_ago      43335 non-null  object
 6   4month_ago      41996 non-null  object
 7   5month_ago      40769 non-null  object
 8   6month_ago      39330 non-null  object
 9   7month_ago      37873 non-null  object
 10  8month_ago      36342 non-null  object
 11  9month_ago      34921 non-null  object
 12  10month_ago     33621 non-null  object
 13  11month_ago     32274 non-null  object
 14  12month_ago     30918 non-null  object
 15  13month_ago     29581 non-null  object
 16  14month_ago     28409 non-null  object
 17  15month_ago     27277 non-null  object
 18  16mont

### `get_application_data(applicationrecordcsv)`

Getting the data from `application_record.csv` is simple:

In [9]:
apps = get_application_data('application_record.csv')

What does `get_application_data` do for us?
1. Reads the csv into a dataframe
1. Converts all column headers to lowercase
1. Converts 'id' to string type
1. Fills null values in occupation type with 'Other'
1. Adds a column for years employed
1. Adds a column for age in years (rounded down)
1. Replaces "Y" and "N" throughout the dataframe with 1s and 0s (except for gender)
1. Reverses the sign on days_birth and days_employed
1. Changes the employed_years for permanently retired pensioners to an estimate based on their age, gender, and occupation
1. Converts the days_employed for retired pensioners from a placeholder value to an estimate based on their age, gender, and occupation

Returns `apps`, the cleaned dataframe

In [10]:
apps

Unnamed: 0,id,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,name_income_type,name_education_type,name_family_status,name_housing_type,days_birth,days_employed,flag_mobil,flag_work_phone,flag_phone,flag_email,occupation_type,cnt_fam_members,employed_years,age
0,5008804,M,1,1,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,12005,4542,1,1,0,0,Other,2.0,12.0,32.0
1,5008805,M,1,1,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,12005,4542,1,1,0,0,Other,2.0,12.0,32.0
2,5008806,M,1,1,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,21474,1134,1,0,0,0,Security staff,2.0,3.0,58.0
3,5008808,F,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,19110,3051,1,0,1,1,Sales staff,1.0,8.0,52.0
4,5008809,F,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,19110,3051,1,0,1,1,Sales staff,1.0,8.0,52.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438552,6840104,M,0,1,0,135000.0,Pensioner,Secondary / secondary special,Separated,House / apartment,22717,15705,1,0,0,0,Other,1.0,43.0,62.0
438553,6840222,F,0,0,0,103500.0,Working,Secondary / secondary special,Single / not married,House / apartment,15939,3007,1,0,0,0,Laborers,1.0,8.0,43.0
438554,6841878,F,0,0,0,54000.0,Commercial associate,Higher education,Single / not married,With parents,8169,372,1,1,0,0,Sales staff,1.0,1.0,22.0
438555,6842765,F,0,1,0,72000.0,Pensioner,Secondary / secondary special,Married,House / apartment,21673,13879,1,0,0,0,Other,2.0,38.0,59.0


In [11]:
apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   438557 non-null  object 
 1   code_gender          438557 non-null  object 
 2   flag_own_car         438557 non-null  int64  
 3   flag_own_realty      438557 non-null  int64  
 4   cnt_children         438557 non-null  int64  
 5   amt_income_total     438557 non-null  float64
 6   name_income_type     438557 non-null  object 
 7   name_education_type  438557 non-null  object 
 8   name_family_status   438557 non-null  object 
 9   name_housing_type    438557 non-null  object 
 10  days_birth           438557 non-null  int64  
 11  days_employed        438557 non-null  int64  
 12  flag_mobil           438557 non-null  int64  
 13  flag_work_phone      438557 non-null  int64  
 14  flag_phone           438557 non-null  int64  
 15  flag_email       

### `prep.add_apps_dummies(apps)`
By passing the apps dataframe into this function, we add on dummy variables to encode our categorical variables. This function does not drop the original categorical variables, as certain exploration tasks are made easier by keeping the originals. 

Naturally, the original variables will need to be dropped before modeling. Alternatively, use the similar function `encode_dummies` which drops the categorical variables after dummy creation. `encode_dummies` also drops protected classes (age, marital status, and gender).

`add_apps_dummies()` returns the dataframe `apps_dummies`

In [12]:
apps_dummies = prep.add_apps_dummies(apps)

In [13]:
apps_dummies.columns

Index(['id', 'code_gender', 'flag_own_car', 'flag_own_realty', 'cnt_children',
       'amt_income_total', 'name_income_type', 'name_education_type',
       'name_family_status', 'name_housing_type', 'days_birth',
       'days_employed', 'flag_mobil', 'flag_work_phone', 'flag_phone',
       'flag_email', 'occupation_type', 'cnt_fam_members', 'employed_years',
       'age', 'name_income_type_commercial_associate',
       'name_income_type_pensioner', 'name_income_type_state_servant',
       'name_income_type_student', 'name_income_type_working',
       'name_education_type_academic_degree',
       'name_education_type_higher_education',
       'name_education_type_incomplete_higher',
       'name_education_type_lower_secondary',
       'name_education_type_secondary_/_secondary_special',
       'name_family_status_civil_marriage', 'name_family_status_married',
       'name_family_status_separated',
       'name_family_status_single_/_not_married', 'name_family_status_widow',
       'name

### `prep.encode_dummies(apps)`
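This function is similar to `add_apps_dummies` except that it prepares the dataframe for modeling. The original categorical variables, along with the protected-class columns `code_gender`, `name_family_status`, and `age`, are dropped from the dataframe. Although `days_birth` is also meant to be dropped, the output below still contains it; because it is a direct proxy for age, drop it manually before modeling if age must be excluded.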

In [14]:
apps_encoded = prep.encode_dummies(apps)

In [15]:
apps_encoded

Unnamed: 0,id,flag_own_car,flag_own_realty,cnt_children,amt_income_total,days_birth,days_employed,flag_mobil,flag_work_phone,flag_phone,...,occupation_type_low-skill_laborers,occupation_type_managers,occupation_type_medicine_staff,occupation_type_other,occupation_type_private_service_staff,occupation_type_realty_agents,occupation_type_sales_staff,occupation_type_secretaries,occupation_type_security_staff,occupation_type_waiters/barmen_staff
0,5008804,1,1,0,427500.0,12005,4542,1,1,0,...,0,0,0,1,0,0,0,0,0,0
1,5008805,1,1,0,427500.0,12005,4542,1,1,0,...,0,0,0,1,0,0,0,0,0,0
2,5008806,1,1,0,112500.0,21474,1134,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,5008808,0,1,0,270000.0,19110,3051,1,0,1,...,0,0,0,0,0,0,1,0,0,0
4,5008809,0,1,0,270000.0,19110,3051,1,0,1,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438552,6840104,0,1,0,135000.0,22717,15705,1,0,0,...,0,0,0,1,0,0,0,0,0,0
438553,6840222,0,0,0,103500.0,15939,3007,1,0,0,...,0,0,0,0,0,0,0,0,0,0
438554,6841878,0,0,0,54000.0,8169,372,1,1,0,...,0,0,0,0,0,0,1,0,0,0
438555,6842765,0,1,0,72000.0,21673,13879,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [16]:
apps_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 48 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   id                                                 438557 non-null  object 
 1   flag_own_car                                       438557 non-null  int64  
 2   flag_own_realty                                    438557 non-null  int64  
 3   cnt_children                                       438557 non-null  int64  
 4   amt_income_total                                   438557 non-null  float64
 5   days_birth                                         438557 non-null  int64  
 6   days_employed                                      438557 non-null  int64  
 7   flag_mobil                                         438557 non-null  int64  
 8   flag_work_phone                                    438557 non-null  int64 

### `prep.add_score_target(apps, score)`
The apps dataframe created from `get_application_data()` includes many rows that have no corresponding credit record.

`prep.add_score_target` joins the score column from the score dataframe onto the apps dataframe and returns two dataframes:
1. `apps_cred`: applications that have a corresponding credit record
2. `apps_none`: applications that DO NOT have a corresponding credit record

In [17]:
apps_cred, apps_none = prep.add_score_target(apps, score)

`apps_cred` **is what we can split into train, validate, and test to explore and model**

In [18]:
apps_cred

Unnamed: 0,id,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,name_income_type,name_education_type,name_family_status,name_housing_type,...,days_employed,flag_mobil,flag_work_phone,flag_phone,flag_email,occupation_type,cnt_fam_members,employed_years,age,score
0,5008804,M,1,1,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,4542,1,1,0,0,Other,2.0,12.0,32.0,33.0
1,5008805,M,1,1,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,...,4542,1,1,0,0,Other,2.0,12.0,32.0,31.0
2,5008806,M,1,1,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,...,1134,1,0,0,0,Security staff,2.0,3.0,58.0,12.0
3,5008808,F,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,3051,1,0,1,1,Sales staff,1.0,8.0,52.0,1.0
4,5008809,F,0,1,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,...,3051,1,0,1,1,Sales staff,1.0,8.0,52.0,-5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,5149828,M,1,1,0,315000.0,Working,Secondary / secondary special,Married,House / apartment,...,2420,1,0,0,0,Managers,2.0,7.0,47.0,18.0
36453,5149834,F,0,1,0,157500.0,Commercial associate,Higher education,Married,House / apartment,...,1325,1,0,1,1,Medicine staff,2.0,4.0,33.0,118.0
36454,5149838,F,0,1,0,157500.0,Pensioner,Higher education,Married,House / apartment,...,1325,1,0,1,1,Medicine staff,2.0,4.0,33.0,136.0
36455,5150049,F,0,1,0,283500.0,Working,Secondary / secondary special,Married,House / apartment,...,655,1,0,0,0,Sales staff,2.0,2.0,49.0,25.0


In [19]:
apps_cred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36457 entries, 0 to 36456
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   36457 non-null  object 
 1   code_gender          36457 non-null  object 
 2   flag_own_car         36457 non-null  int64  
 3   flag_own_realty      36457 non-null  int64  
 4   cnt_children         36457 non-null  int64  
 5   amt_income_total     36457 non-null  float64
 6   name_income_type     36457 non-null  object 
 7   name_education_type  36457 non-null  object 
 8   name_family_status   36457 non-null  object 
 9   name_housing_type    36457 non-null  object 
 10  days_birth           36457 non-null  int64  
 11  days_employed        36457 non-null  int64  
 12  flag_mobil           36457 non-null  int64  
 13  flag_work_phone      36457 non-null  int64  
 14  flag_phone           36457 non-null  int64  
 15  flag_email           36457 non-null 

`apps_none` can be explored, but it cannot be used as part of the train, validate, test split for modeling purposes. We can later apply the model to the apps_none dataframe to predict their scores, but there is no way to evaluate performance. 

In [20]:
apps_none

Unnamed: 0,id,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,name_income_type,name_education_type,name_family_status,name_housing_type,days_birth,days_employed,flag_mobil,flag_work_phone,flag_phone,flag_email,occupation_type,cnt_fam_members,employed_years,age
0,6153651,M,1,1,0,270000.0,Working,Higher education,Married,House / apartment,16872,769,1,1,1,1,Accountants,2.0,2.0,46.0
1,6153712,F,0,1,1,112500.0,Working,Secondary / secondary special,Single / not married,House / apartment,10968,1620,1,0,0,0,Other,2.0,4.0,30.0
2,6153733,M,1,1,0,112500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,20502,4450,1,0,1,0,Drivers,2.0,12.0,56.0
3,6153734,M,1,1,0,112500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,20502,4450,1,0,1,0,Drivers,2.0,12.0,56.0
4,6153735,M,1,1,0,112500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,20502,4450,1,0,1,0,Drivers,2.0,12.0,56.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402095,6840104,M,0,1,0,135000.0,Pensioner,Secondary / secondary special,Separated,House / apartment,22717,15705,1,0,0,0,Other,1.0,43.0,62.0
402096,6840222,F,0,0,0,103500.0,Working,Secondary / secondary special,Single / not married,House / apartment,15939,3007,1,0,0,0,Laborers,1.0,8.0,43.0
402097,6841878,F,0,0,0,54000.0,Commercial associate,Higher education,Single / not married,With parents,8169,372,1,1,0,0,Sales staff,1.0,1.0,22.0
402098,6842765,F,0,1,0,72000.0,Pensioner,Secondary / secondary special,Married,House / apartment,21673,13879,1,0,0,0,Other,2.0,38.0,59.0


In [21]:
apps_none.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402100 entries, 0 to 402099
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   402100 non-null  object 
 1   code_gender          402100 non-null  object 
 2   flag_own_car         402100 non-null  int64  
 3   flag_own_realty      402100 non-null  int64  
 4   cnt_children         402100 non-null  int64  
 5   amt_income_total     402100 non-null  float64
 6   name_income_type     402100 non-null  object 
 7   name_education_type  402100 non-null  object 
 8   name_family_status   402100 non-null  object 
 9   name_housing_type    402100 non-null  object 
 10  days_birth           402100 non-null  int64  
 11  days_employed        402100 non-null  int64  
 12  flag_mobil           402100 non-null  int64  
 13  flag_work_phone      402100 non-null  int64  
 14  flag_phone           402100 non-null  int64  
 15  flag_email       

### `prep.split_data(df, pct=0.10)`

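This simply splits the dataframe of your choice into train, validate, and test sets. The random_state is set to 123 for reproducibility. With the default `pct`, the shapes in the output below work out to roughly 72% train, 18% validate, and 10% test.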

In [22]:
train, validate, test = prep.split_data(apps_cred)

In [23]:
train.shape, validate.shape, test.shape

((26248, 21), (6563, 21), (3646, 21))

### `prep.split_stratify_data(df, stratify_target, pct=0.10)`
This is similar to `split_data`, but it allows us to stratify on a variable, which we may use if we eventually bin the target. The random_state is also set here to allow for reproducibility.

In [24]:
# NOTE: this rebinds train/validate/test, replacing the unstratified split from the earlier `split_data` call.
train, validate, test = prep.split_stratify_data(apps_cred, 'code_gender')

In [25]:
train.shape, validate.shape, test.shape

((26248, 21), (6563, 21), (3646, 21))

### `prep.create_scaled_x_y(train, validate, test, target)`
This creates scaled X and y sets from properly encoded train, validate, and test sets.

In [26]:
# First, let's get a properly encoded dataframe
apps_encoded = prep.encode_dummies(apps)

# Next, let's add the target variable (score) to that dataframe; this returns the
# applications with a matching credit record and the applications without one
apps_encoded_cred, apps_encoded_none = prep.add_score_target(apps_encoded, score)

# Finally, let's split the applications that have a score
# (this rebinds train/validate/test from the earlier split cells)
train, validate, test = prep.split_data(apps_encoded_cred)

Now we can call our function to scale our sets and separate them into scaled X and y versions ready for modeling.

In [27]:
# Unpack the six results in the order returned: scaled X and y for each of train, validate, and test.
(
    X_train_scaled,
    y_train,
    X_validate_scaled,
    y_validate,
    X_test_scaled,
    y_test,
) = prep.create_scaled_x_y(train, validate, test, 'score')

In [28]:
X_train_scaled.shape, y_train.shape

((26248, 47), (26248,))

In [29]:
X_validate_scaled.shape, y_validate.shape

((6563, 47), (6563,))

In [30]:
X_test_scaled.shape, y_test.shape

((3646, 47), (3646,))

In [31]:
X_train_scaled.head()

Unnamed: 0,flag_own_car,flag_own_realty,cnt_children,amt_income_total,days_birth,days_employed,flag_mobil,flag_work_phone,flag_phone,flag_email,...,occupation_type_low-skill_laborers,occupation_type_managers,occupation_type_medicine_staff,occupation_type_other,occupation_type_private_service_staff,occupation_type_realty_agents,occupation_type_sales_staff,occupation_type_secretaries,occupation_type_security_staff,occupation_type_waiters/barmen_staff
32073,-0.787113,0.69747,-0.57803,-0.956459,0.839074,-0.349186,0.0,-0.54077,-0.647571,-0.315193,...,-0.069174,-0.299527,-0.187887,-0.668616,-0.099439,-0.047866,-0.323092,-0.065755,-0.127674,-0.069452
31598,-0.787113,0.69747,2.104453,-0.064663,-0.618781,0.438324,0.0,1.849216,-0.647571,-0.315193,...,-0.069174,-0.299527,-0.187887,-0.668616,-0.099439,-0.047866,-0.323092,-0.065755,-0.127674,-0.069452
24158,-0.787113,-1.433752,-0.57803,-0.153843,1.367341,1.987221,0.0,-0.54077,-0.647571,-0.315193,...,-0.069174,-0.299527,-0.187887,1.495626,-0.099439,-0.047866,-0.323092,-0.065755,-0.127674,-0.069452
34581,-0.787113,0.69747,-0.57803,0.515004,1.717693,1.987221,0.0,-0.54077,-0.647571,-0.315193,...,-0.069174,-0.299527,-0.187887,1.495626,-0.099439,-0.047866,-0.323092,-0.065755,-0.127674,-0.069452
16771,-0.787113,0.69747,0.763212,-0.510561,-0.564478,-0.610203,0.0,-0.54077,-0.647571,-0.315193,...,-0.069174,-0.299527,-0.187887,-0.668616,-0.099439,-0.047866,-0.323092,-0.065755,-0.127674,14.398523


In [32]:
y_train.head()

32073    -30.0
31598     14.0
24158     84.0
34581    113.0
16771     48.0
Name: score, dtype: float64