### Importing Required Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, KBinsDiscretizer
import re
import pickle

### Importing the data

In [2]:
train_path = "D:/Code/Projects/Bank Default Hackathon/Training Data.csv"
test_path = "D:/Code/Projects/Bank Default Hackathon/Test Data.csv"
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_train.rename(columns={'Id':'id'},inplace=True)
# tof = len(df_train)-28000
# df_test = df_train.iloc[:28000,:]
# df_train = df_train.iloc[28000:,:]

print(df_train.shape, df_test.shape)
# Renaming the Id column to id in train set


(252000, 13) (28000, 12)


### Missing values

In [3]:
def print_missing(dataset, print_percent=True):
    missing = dataset.isnull().mean()
    missing = missing[missing>0]
    
    if len(missing)>0:
        missing.sort_values(inplace=True,ascending=False)
        missing.plot.bar()
        plt.ylabel("Fraction missing values")
        plt.title("Missing values")
        if print_percent:
            missing_features = missing.index.tolist()
            print("Percent missing values: ")
            for feature in missing_features:
                print(feature,'  \t:' ,np.round(dataset[feature].isnull().mean(),4), '%')
    else:
        print("No missing values!")

In [4]:
print_missing(df_train)

No missing values!


In [5]:
print_missing(df_test)

No missing values!


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252000 entries, 0 to 251999
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   id                   252000 non-null  int64 
 1   income               252000 non-null  int64 
 2   age                  252000 non-null  int64 
 3   experience           252000 non-null  int64 
 4   married              252000 non-null  object
 5   house_ownership      252000 non-null  object
 6   car_ownership        252000 non-null  object
 7   profession           252000 non-null  object
 8   city                 252000 non-null  object
 9   state                252000 non-null  object
 10  current_job_years    252000 non-null  int64 
 11  current_house_years  252000 non-null  int64 
 12  risk_flag            252000 non-null  int64 
dtypes: int64(7), object(6)
memory usage: 25.0+ MB


In [7]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28000 entries, 0 to 27999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   28000 non-null  int64 
 1   income               28000 non-null  int64 
 2   age                  28000 non-null  int64 
 3   experience           28000 non-null  int64 
 4   married              28000 non-null  object
 5   house_ownership      28000 non-null  object
 6   car_ownership        28000 non-null  object
 7   profession           28000 non-null  object
 8   city                 28000 non-null  object
 9   state                28000 non-null  object
 10  current_job_years    28000 non-null  int64 
 11  current_house_years  28000 non-null  int64 
dtypes: int64(6), object(6)
memory usage: 2.6+ MB


In [8]:
cols = df_train.columns.to_list()

### Numerical, Year and Categorical features

In [9]:
def get_fnames_by_type(dataset):
    # list of numerical variables
    numerical_features = [feature for feature in dataset.columns if dataset[feature].dtypes != 'O']
    print('Number of numerical variables: ', len(numerical_features))

    year_features = [feature for feature in numerical_features if 'yr'\
                     in feature.lower() or 'year' in feature.lower()]
    print(f"Year features: {year_features}")
    
    categorical_features = [feature for feature in dataset.columns if \
                           feature not in numerical_features]
    print('Number of categorical variables: ', len(categorical_features))
    
    return numerical_features, year_features, categorical_features

In [10]:
numerical_features, year_features, categorical_features = \
get_fnames_by_type(df_train)

Number of numerical variables:  7
Year features: ['current_job_years', 'current_house_years']
Number of categorical variables:  6


## Encoding Categorical Features

In [11]:
print(categorical_features)
print(numerical_features)

['married', 'house_ownership', 'car_ownership', 'profession', 'city', 'state']
['id', 'income', 'age', 'experience', 'current_job_years', 'current_house_years', 'risk_flag']


In [15]:
df_train[categorical_features].head()

Unnamed: 0,married,house_ownership,car_ownership,profession,city,state
0,single,rented,no,Mechanical engineer,Rewa,Madhya Pradesh
1,single,rented,no,Software Developer,Parbhani,Maharashtra
2,married,rented,no,Technical writer,Alappuzha,Kerala
3,single,rented,yes,Software Developer,Bhubaneswar,Odisha
4,single,rented,no,Civil servant,Tiruchirappalli,Tamil Nadu


In [16]:
df_test[categorical_features].head()

Unnamed: 0,married,house_ownership,car_ownership,profession,city,state
0,single,rented,no,Geologist,Malda,West Bengal
1,single,rented,no,Firefighter,Jalna,Maharashtra
2,single,rented,no,Lawyer,Thane,Maharashtra
3,married,rented,yes,Analyst,Latur,Maharashtra
4,single,rented,yes,Comedian,Berhampore,West Bengal


## Get number of categories

In [14]:
# First clean the text in the columns
# Remove all text except characters
for col in categorical_features:
    df_train[col] = df_train[col].apply(lambda x: re.sub(r'[^a-zA-Z]',' ',x).strip())
    df_test[col] = df_test[col].apply(lambda x: re.sub(r'[^a-zA-Z]',' ',x).strip())

In [17]:
def get_number_categories(dataset, categorical_features):
    print("Number of categories per categorical feature: ")
    for feature in categorical_features:
        print(f"{feature:20}: {len(dataset[feature].unique())}")

In [18]:
get_number_categories(df_train, categorical_features)

Number of categories per categorical feature: 
married             : 2
house_ownership     : 3
car_ownership       : 2
profession          : 51
city                : 316
state               : 28


In [18]:
with open('profession.pickle', 'rb') as f:
    profession = pickle.load(f)
    f.close()
with open('city.pickle', 'rb') as f:
    city = pickle.load(f)
    f.close()
with open('state.pickle', 'rb') as f:
    state = pickle.load(f)
    f.close()
    
ordinal_values = {
    'house_ownership' : {
        'owned': 2,
        'norent_noown': 0,
        'rented': 1
    },
    'profession':profession,
    'city': city,
    'state': state
}
nominal = ['married', 'car_ownership', 'profession', 'city', 'state']
low_cardinality_nom = [col for col in nominal if df_train[col].nunique()<10]
high_cardinality_nom = list(set(nominal)-set(low_cardinality_nom))

In [19]:
low_cardinality_nom +=  ['house_ownership']

In [20]:
high_cardinality_nom 

['city', 'state', 'profession']

### Label encoder for nominal features with high cardinality

In [21]:
label_encoder = LabelEncoder()
df_train_LE = df_train[high_cardinality_nom].copy()
df_test_LE = df_test[high_cardinality_nom].copy()

In [22]:
for col in high_cardinality_nom:
    df_train_LE[col] = label_encoder.fit_transform(df_train[col])
    # Get the dictionary to map the values
    le_dict = dict(zip(label_encoder.classes_, \
                       label_encoder.transform(label_encoder.classes_)))
    # We add -1 in case of unknown values
    df_test_LE[col] = df_test_LE[col].apply(lambda x: le_dict.get(x, -1))

In [23]:
df_train_LE

Unnamed: 0,city,state,profession
0,250,13,33
1,226,14,43
2,8,12,47
3,53,17,43
4,295,22,11
...,...,...,...
251995,161,27,45
251996,250,13,3
251997,143,14,17
251998,232,18,27


In [24]:
df_test_LE

Unnamed: 0,city,state,profession
0,180,27,26
1,130,14,24
2,289,14,30
3,170,14,1
4,38,27,12
...,...,...,...
27995,89,0,44
27996,248,13,47
27997,189,14,30
27998,46,4,7


### One hot encoding for nominal features with low cardinality

In [25]:
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype='int')
df_train_OHE = df_train[low_cardinality_nom].copy()
df_test_OHE = df_test[low_cardinality_nom].copy()

In [26]:
df_train_OHE = pd.DataFrame(OH_encoder.fit_transform(df_train[low_cardinality_nom]))
df_test_OHE = pd.DataFrame(OH_encoder.transform(df_test[low_cardinality_nom]))

df_train_OHE.index = df_train.index
df_test_OHE.index = df_test.index
df_train_OHE.columns = OH_encoder.get_feature_names(df_train[low_cardinality_nom].columns.tolist())
df_test_OHE.columns = OH_encoder.get_feature_names(df_test[low_cardinality_nom].columns.tolist())


In [27]:
df_train_OHE

Unnamed: 0,married_married,married_single,car_ownership_no,car_ownership_yes,house_ownership_norent noown,house_ownership_owned,house_ownership_rented
0,0,1,1,0,0,0,1
1,0,1,1,0,0,0,1
2,1,0,1,0,0,0,1
3,0,1,0,1,0,0,1
4,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...
251995,0,1,1,0,0,0,1
251996,0,1,1,0,0,0,1
251997,0,1,1,0,0,0,1
251998,0,1,1,0,0,0,1


In [28]:
df_test_OHE

Unnamed: 0,married_married,married_single,car_ownership_no,car_ownership_yes,house_ownership_norent noown,house_ownership_owned,house_ownership_rented
0,0,1,1,0,0,0,1
1,0,1,1,0,0,0,1
2,0,1,1,0,0,0,1
3,1,0,0,1,0,0,1
4,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...
27995,0,1,1,0,0,0,1
27996,0,1,1,0,0,0,1
27997,0,1,1,0,0,0,1
27998,0,1,0,1,0,0,1


### Encoding according to dictionary mapping for Ordinal features

In [34]:
df_train_OF = pd.DataFrame()
df_test_OF = pd.DataFrame()

In [35]:
# ordinal_values

In [36]:
for col, mapping in ordinal_values.items():
    df_train_OF[col+'_OF'] = df_train[col].apply(lambda x: mapping.get(x, -1))
    df_test_OF[col+'_OF'] = df_test[col].apply(lambda x: mapping.get(x, -1))

In [37]:
df_test_OF

Unnamed: 0,house_ownership_OF,profession_OF,city_OF,state_OF
0,1,0.144264,0.112277,0.128135
1,1,0.135789,0.108449,0.113254
2,1,0.129514,0.072746,0.113254
3,1,0.121465,0.039655,0.113254
4,1,0.119604,0.148101,0.128135
...,...,...,...,...
27995,1,0.115570,0.064309,0.116022
27996,1,0.134167,0.191964,0.154369
27997,1,0.129514,0.038627,0.113254
27998,1,0.153572,0.141583,0.133281


In [38]:
df_train_OF

Unnamed: 0,house_ownership_OF,profession_OF,city_OF,state_OF
0,1,0.111558,0.121554,0.154369
1,1,0.148427,0.122497,0.113254
2,1,0.134167,0.130814,0.167097
3,1,0.148427,0.326194,0.142550
4,1,0.115794,0.121137,0.103163
...,...,...,...,...
251995,1,0.115465,0.081317,0.128135
251996,1,0.152113,0.121554,0.154369
251997,1,0.106999,0.086370,0.113254
251998,1,0.115370,0.089186,0.116539


## Combining all to get final dataset

In [39]:
df_train_categorical = pd.concat([df_train_LE,df_train_OHE,df_train_OF],axis=1)
df_test_categorical = pd.concat([df_test_LE,df_test_OHE,df_test_OF],axis=1)

In [40]:
df_train_categorical

Unnamed: 0,city,state,profession,married_married,married_single,car_ownership_no,car_ownership_yes,house_ownership_norent noown,house_ownership_owned,house_ownership_rented,house_ownership_OF,profession_OF,city_OF,state_OF
0,250,13,33,0,1,1,0,0,0,1,1,0.111558,0.121554,0.154369
1,226,14,43,0,1,1,0,0,0,1,1,0.148427,0.122497,0.113254
2,8,12,47,1,0,1,0,0,0,1,1,0.134167,0.130814,0.167097
3,53,17,43,0,1,0,1,0,0,1,1,0.148427,0.326194,0.142550
4,295,22,11,0,1,1,0,0,0,1,1,0.115794,0.121137,0.103163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251995,161,27,45,0,1,1,0,0,0,1,1,0.115465,0.081317,0.128135
251996,250,13,3,0,1,1,0,0,0,1,1,0.152113,0.121554,0.154369
251997,143,14,17,0,1,1,0,0,0,1,1,0.106999,0.086370,0.113254
251998,232,18,27,0,1,1,0,0,0,1,1,0.115370,0.089186,0.116539


In [41]:
with open('original.pickle','wb')  as f:
    pickle.dump(cols, f)
    f.close()

## Feature Generation


In [42]:
df_train_numerical = df_train[numerical_features]
numerical_features_test = numerical_features.copy()
numerical_features_test.remove('risk_flag')
df_test_numerical = df_test[numerical_features_test]
numerical_features,df_test_numerical.columns.to_list()

(['id',
  'income',
  'age',
  'experience',
  'current_job_years',
  'current_house_years',
  'risk_flag'],
 ['id',
  'income',
  'age',
  'experience',
  'current_job_years',
  'current_house_years'])

In [43]:
numerical_features

['id',
 'income',
 'age',
 'experience',
 'current_job_years',
 'current_house_years',
 'risk_flag']

In [44]:
for feature1 in numerical_features:
    for feature2 in numerical_features:
        if feature1==feature2 or feature1=='risk_flag' or feature2=='risk_flag'\
        or feature1=='id' or feature2=='id':
#             print(feature1,feature2)
            continue
        df_train_numerical.loc[:,feature1+'_by_'+feature2] = \
        df_train_numerical.loc[:,feature1]/df_train_numerical.loc[:,feature2]
        df_test_numerical.loc[:,feature1+'_by_'+feature2] = \
        df_test_numerical.loc[:,feature1]/df_test_numerical.loc[:,feature2]
        
        df_train_numerical.loc[:,feature1+'_into_'+feature2] = \
        df_train_numerical.loc[:,feature1]*df_train_numerical.loc[:,feature2]
        df_test_numerical.loc[:,feature1+'_into_'+feature2] = \
        df_test_numerical.loc[:,feature1]*df_test_numerical.loc[:,feature2]
        

df_train_numerical.replace([np.inf, -np.inf], np.nan, inplace=True)
df_test_numerical.replace([np.inf, -np.inf], np.nan, inplace=True)
df_train_numerical.fillna(-1,inplace=True)
df_test_numerical.fillna(-1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [45]:
df_train_numerical['age_minus_experience'] = df_train_numerical['age']-\
df_train_numerical['experience']
df_train_numerical['experience_minus_current_job_years'] = df_train_numerical['experience']-\
df_train_numerical['current_job_years']

df_test_numerical['age_minus_experience'] = df_test_numerical['age']-\
df_test_numerical['experience']
df_test_numerical['experience_minus_current_job_years'] = df_test_numerical['experience']-\
df_test_numerical['current_job_years']

In [46]:
df_train_numerical.describe()

Unnamed: 0,id,income,age,experience,current_job_years,current_house_years,risk_flag,income_by_age,income_into_age,income_by_experience,...,current_house_years_by_income,current_house_years_into_income,current_house_years_by_age,current_house_years_into_age,current_house_years_by_experience,current_house_years_into_experience,current_house_years_by_current_job_years,current_house_years_into_current_job_years,age_minus_experience,experience_minus_current_job_years
count,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,...,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0,252000.0
mean,126000.5,4997117.0,49.954071,10.084437,6.333877,11.997794,0.123,115048.672833,249594300.0,838516.9,...,9e-06,59944720.0,0.276441,598.857976,1.974613,121.153139,2.566875,76.019956,39.869635,3.75056
std,72746.278255,2878311.0,17.063863,6.00259,3.647053,1.399037,0.328438,85055.178772,174817600.0,1285004.0,...,4.5e-05,35453080.0,0.118069,216.450029,2.591292,74.129495,2.504436,44.992425,18.095181,4.587336
min,1.0,10310.0,21.0,0.0,0.0,10.0,0.0,147.285714,245525.0,-1.0,...,1e-06,129789.0,0.126582,210.0,-1.0,0.0,-1.0,0.0,1.0,0.0
25%,63000.75,2503015.0,35.0,5.0,3.0,11.0,0.0,50378.155172,107610700.0,206967.1,...,2e-06,29479790.0,0.184615,416.0,0.736842,60.0,1.222222,40.0,25.0,0.0
50%,126000.5,5000694.0,50.0,10.0,6.0,12.0,0.0,100069.371563,216204800.0,452148.6,...,2e-06,59087710.0,0.23913,590.0,1.090909,120.0,1.857143,70.0,40.0,1.0
75%,189000.25,7477502.0,65.0,15.0,9.0,13.0,0.0,157110.862857,361975200.0,853405.7,...,5e-06,88354580.0,0.342105,768.0,2.0,180.0,3.25,108.0,55.0,7.0
max,252000.0,9999938.0,79.0,20.0,14.0,14.0,1.0,473038.380952,789474300.0,9993041.0,...,0.001358,139991600.0,0.666667,1106.0,14.0,280.0,14.0,196.0,79.0,17.0


In [47]:
df_test_numerical.describe()

Unnamed: 0,id,income,age,experience,current_job_years,current_house_years,income_by_age,income_into_age,income_by_experience,income_into_experience,...,current_house_years_by_income,current_house_years_into_income,current_house_years_by_age,current_house_years_into_age,current_house_years_by_experience,current_house_years_into_experience,current_house_years_by_current_job_years,current_house_years_into_current_job_years,age_minus_experience,experience_minus_current_job_years
count,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,...,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0
mean,14000.5,5029563.0,50.054679,10.120393,6.339286,11.991786,115740.825253,251664500.0,840006.5,50991490.0,...,8e-06,60287420.0,0.276086,599.603821,1.963228,121.511786,2.554242,76.055786,39.934286,3.781107
std,8083.048105,2864938.0,17.129879,6.027317,3.645222,1.397754,85252.925605,174787800.0,1284041.0,45524890.0,...,4.3e-05,35237140.0,0.11868,216.765737,2.578478,74.276377,2.492657,44.925823,18.118313,4.573153
min,1.0,10310.0,21.0,0.0,0.0,10.0,147.285714,245525.0,-1.0,0.0,...,1e-06,126621.0,0.126582,210.0,-1.0,0.0,-1.0,0.0,1.0,0.0
25%,7000.75,2545414.0,35.0,5.0,3.0,11.0,51378.973333,109930600.0,211085.5,13321190.0,...,2e-06,30061130.0,0.184211,416.0,0.736842,60.0,1.222222,40.0,25.0,0.0
50%,14000.5,5046744.0,50.0,10.0,6.0,12.0,100895.15,217911000.0,453001.0,37872740.0,...,2e-06,60199360.0,0.238095,594.0,1.083333,120.0,1.857143,70.0,40.0,2.0
75%,21000.25,7474639.0,65.0,15.0,9.0,13.0,156959.796296,365197800.0,849169.1,79307970.0,...,5e-06,88327990.0,0.342105,768.0,2.0,182.0,3.25,108.0,55.0,7.0
max,28000.0,9999814.0,79.0,20.0,14.0,14.0,469867.047619,787129700.0,9991569.0,199996300.0,...,0.001358,139861400.0,0.666667,1106.0,14.0,280.0,14.0,196.0,79.0,17.0


## Discretization

In [48]:
# TODO: binning

In [49]:
# df_train_numerical = df_train[numerical_features]
# numerical_features_test = numerical_features.copy()
# numerical_features_test.remove('risk_flag')
# df_test_numerical = df_test[numerical_features_test]
# numerical_features,df_test_numerical.columns.to_list()

In [50]:
df_train_numerical['income_bnd'] = round(df_train['income']/400000)
df_test_numerical['income_bnd'] = round(df_test['income']/400000)
df_train_numerical['income_bnd'] = df_train_numerical['income_bnd'].astype(int)
df_test_numerical['income_bnd'] = df_test_numerical['income_bnd'].astype(int)

In [51]:
df_train.age.min()

21

In [52]:
est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
cols_to_bin = ['age','experience', 'current_house_years', 'current_job_years']
df_train_numerical[[i+'_bnd' for i in cols_to_bin]] = est.fit_transform(df_train[cols_to_bin])
df_test_numerical[[i+'_bnd' for i in cols_to_bin]] = est.fit_transform(df_test[cols_to_bin])

## Combining numerical and categorical

In [53]:
df_train_numerical

Unnamed: 0,id,income,age,experience,current_job_years,current_house_years,risk_flag,income_by_age,income_into_age,income_by_experience,...,current_house_years_into_experience,current_house_years_by_current_job_years,current_house_years_into_current_job_years,age_minus_experience,experience_minus_current_job_years,income_bnd,age_bnd,experience_bnd,current_house_years_bnd,current_job_years_bnd
0,1,1303835,23,3,3,13,0,56688.478261,29988205,4.346117e+05,...,39,4.333333,39,20,0,3,0.0,1.0,7.0,2.0
1,2,7574516,40,10,9,13,0,189362.900000,302980640,7.574516e+05,...,130,1.444444,117,30,1,19,3.0,5.0,7.0,6.0
2,3,3991815,66,4,4,10,0,60482.045455,263459790,9.979538e+05,...,40,2.500000,40,62,0,10,7.0,2.0,0.0,2.0
3,4,6256451,41,2,2,12,1,152596.365854,256514491,3.128226e+06,...,24,6.000000,24,39,0,16,3.0,1.0,5.0,1.0
4,5,5768871,47,11,3,14,1,122741.936170,271136937,5.244428e+05,...,154,4.666667,42,36,8,14,4.0,5.0,9.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251995,251996,8154883,43,13,6,11,0,189648.441860,350659969,6.272987e+05,...,143,1.833333,66,30,7,20,3.0,6.0,2.0,4.0
251996,251997,2843572,26,10,6,11,0,109368.153846,73932872,2.843572e+05,...,110,1.833333,66,16,4,7,0.0,5.0,2.0,4.0
251997,251998,4522448,46,7,7,12,0,98314.086957,208032608,6.460640e+05,...,84,1.714286,84,39,0,11,4.0,3.0,5.0,5.0
251998,251999,6507128,45,0,0,10,0,144602.844444,292820760,-1.000000e+00,...,0,-1.000000,0,45,0,16,4.0,0.0,0.0,0.0


In [54]:
df_train_final = pd.concat([df_train_categorical, df_train_numerical],axis=1)
df_test_final = pd.concat([df_test_categorical, df_test_numerical],axis=1)

In [55]:
df_train_final

Unnamed: 0,city,state,profession,married_married,married_single,car_ownership_no,car_ownership_yes,house_ownership_norent noown,house_ownership_owned,house_ownership_rented,...,current_house_years_into_experience,current_house_years_by_current_job_years,current_house_years_into_current_job_years,age_minus_experience,experience_minus_current_job_years,income_bnd,age_bnd,experience_bnd,current_house_years_bnd,current_job_years_bnd
0,250,13,33,0,1,1,0,0,0,1,...,39,4.333333,39,20,0,3,0.0,1.0,7.0,2.0
1,226,14,43,0,1,1,0,0,0,1,...,130,1.444444,117,30,1,19,3.0,5.0,7.0,6.0
2,8,12,47,1,0,1,0,0,0,1,...,40,2.500000,40,62,0,10,7.0,2.0,0.0,2.0
3,53,17,43,0,1,0,1,0,0,1,...,24,6.000000,24,39,0,16,3.0,1.0,5.0,1.0
4,295,22,11,0,1,1,0,0,0,1,...,154,4.666667,42,36,8,14,4.0,5.0,9.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251995,161,27,45,0,1,1,0,0,0,1,...,143,1.833333,66,30,7,20,3.0,6.0,2.0,4.0
251996,250,13,3,0,1,1,0,0,0,1,...,110,1.833333,66,16,4,7,0.0,5.0,2.0,4.0
251997,143,14,17,0,1,1,0,0,0,1,...,84,1.714286,84,39,0,11,4.0,3.0,5.0,5.0
251998,232,18,27,0,1,1,0,0,0,1,...,0,-1.000000,0,45,0,16,4.0,0.0,0.0,0.0


In [56]:
df_test_final

Unnamed: 0,city,state,profession,married_married,married_single,car_ownership_no,car_ownership_yes,house_ownership_norent noown,house_ownership_owned,house_ownership_rented,...,current_house_years_into_experience,current_house_years_by_current_job_years,current_house_years_into_current_job_years,age_minus_experience,experience_minus_current_job_years,income_bnd,age_bnd,experience_bnd,current_house_years_bnd,current_job_years_bnd
0,180,27,26,0,1,1,0,0,0,1,...,247,3.250000,52,40,15,18,6.0,9.0,7.0,2.0
1,130,14,24,0,1,1,0,0,0,1,...,50,2.000000,50,20,0,3,0.0,2.0,0.0,3.0
2,289,14,30,0,1,1,0,0,0,1,...,168,1.555556,126,38,3,22,5.0,6.0,9.0,6.0
3,170,14,1,1,0,0,1,0,0,1,...,108,4.000000,36,40,6,5,4.0,4.0,5.0,2.0
4,38,27,12,0,1,0,1,0,0,1,...,198,0.846154,143,7,5,0,0.0,9.0,2.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27995,89,0,44,0,1,1,0,0,0,1,...,130,2.000000,50,44,8,25,6.0,6.0,0.0,3.0
27996,248,13,47,0,1,1,0,0,0,1,...,126,1.555556,126,38,0,7,4.0,4.0,9.0,6.0
27997,189,14,30,0,1,1,0,0,0,1,...,65,3.250000,52,19,1,20,0.0,2.0,7.0,2.0
27998,46,4,7,0,1,0,1,0,0,1,...,182,1.076923,182,38,0,24,5.0,6.0,9.0,9.0


In [57]:
df_train_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252000 entries, 0 to 251999
Data columns (total 68 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   city                                        252000 non-null  int32  
 1   state                                       252000 non-null  int32  
 2   profession                                  252000 non-null  int32  
 3   married_married                             252000 non-null  int32  
 4   married_single                              252000 non-null  int32  
 5   car_ownership_no                            252000 non-null  int32  
 6   car_ownership_yes                           252000 non-null  int32  
 7   house_ownership_norent noown                252000 non-null  int32  
 8   house_ownership_owned                       252000 non-null  int32  
 9   house_ownership_rented                      252000 non-null  int32  
 

In [58]:
df_test_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28000 entries, 0 to 27999
Data columns (total 67 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   city                                        28000 non-null  int64  
 1   state                                       28000 non-null  int64  
 2   profession                                  28000 non-null  int64  
 3   married_married                             28000 non-null  int32  
 4   married_single                              28000 non-null  int32  
 5   car_ownership_no                            28000 non-null  int32  
 6   car_ownership_yes                           28000 non-null  int32  
 7   house_ownership_norent noown                28000 non-null  int32  
 8   house_ownership_owned                       28000 non-null  int32  
 9   house_ownership_rented                      28000 non-null  int32  
 10  house_owne

In [59]:
df_train_final.to_csv("D:/Code/Projects/Bank Default Hackathon/train_preprocessed.csv",\
                     index=False)

In [60]:
df_test_final.to_csv("D:/Code/Projects/Bank Default Hackathon/test_preprocessed.csv",\
                    index=False)

In [214]:
df_test_preds = df_test['risk_flag']

In [215]:
df_test_preds.to_csv("D:/Code/Projects/Bank Default Hackathon/test_preds.csv",\
                     index=False)

In [216]:
df_test_preds

0        0
1        0
2        0
3        1
4        1
        ..
27995    0
27996    1
27997    0
27998    0
27999    0
Name: risk_flag, Length: 28000, dtype: int64