### First steps
- Split whole dataset into two parts: for modellers and for validators (70% and 30%)
- Split the first part into two parts: for training and for validation (70% and 30%)
- Split the validation part from the previous step into two parts: for validation and for testing (66% and 34%)
- Save all parts into separate files
- Remember to set the random seed for reproducibility

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Seed NumPy's global RNG. This has to be a call: the assignment
# `np.random.seed = 42` would replace the seeding function with an int,
# leave the RNG unseeded and break any later `np.random.seed(...)` call.
np.random.seed(42)

df = pd.read_csv('../../data/credit_score.csv')
# Features: every column except the first and the last one.
X = df.iloc[:, 1:-1]
# Target: the last column.
y = df.iloc[:, -1]

In [11]:
# Preview the feature frame selected above.
X

Unnamed: 0,INCOME,SAVINGS,DEBT,R_SAVINGS_INCOME,R_DEBT_INCOME,R_DEBT_SAVINGS,T_CLOTHING_12,T_CLOTHING_6,R_CLOTHING,R_CLOTHING_INCOME,...,R_EXPENDITURE_INCOME,R_EXPENDITURE_SAVINGS,R_EXPENDITURE_DEBT,CAT_GAMBLING,CAT_DEBT,CAT_CREDIT_CARD,CAT_MORTGAGE,CAT_SAVINGS_ACCOUNT,CAT_DEPENDENTS,CREDIT_SCORE
0,33269,0,532304,0.0000,16.0000,1.2000,1889,945,0.5003,0.0568,...,1.0000,0.0000,0.0625,High,1,0,0,0,0,444
1,77158,91187,315648,1.1818,4.0909,3.4615,5818,111,0.0191,0.0754,...,0.9091,0.7692,0.2222,No,1,0,0,1,0,625
2,30917,21642,534864,0.7000,17.3000,24.7142,1157,860,0.7433,0.0374,...,1.0000,1.4286,0.0578,High,1,0,0,1,0,469
3,80657,64526,629125,0.8000,7.8000,9.7499,6857,3686,0.5376,0.0850,...,1.0000,1.2500,0.1282,High,1,0,0,1,0,559
4,149971,1172498,2399531,7.8182,16.0000,2.0465,1978,322,0.1628,0.0132,...,0.9091,0.1163,0.0568,High,1,1,1,1,1,473
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,328892,1465066,5501471,4.4546,16.7273,3.7551,16701,10132,0.6067,0.0508,...,0.9091,0.2041,0.0543,High,1,1,1,1,1,418
996,81404,88805,680837,1.0909,8.3637,7.6667,5400,1936,0.3585,0.0663,...,0.9091,0.8333,0.1087,No,1,0,0,1,0,589
997,0,42428,30760,3.2379,8.1889,0.7250,0,0,0.8779,0.0047,...,1.0668,0.2500,0.3448,No,1,0,0,1,0,499
998,36011,8002,604181,0.2222,16.7777,75.5037,1993,1271,0.6377,0.0553,...,1.1111,5.0002,0.0662,No,1,1,0,1,0,507


In [12]:
# Preview the target series selected above.
y

0      1
1      0
2      1
3      0
4      0
      ..
995    0
996    1
997    0
998    0
999    0
Name: DEFAULT, Length: 1000, dtype: int64

In [13]:
# Hold out 30% of the whole dataset for the validators; the remaining 70%
# goes to the modellers. The split is stratified on y and uses a fixed
# random_state so it can be reproduced.
X_mod, X_val, y_mod, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
# Sanity check: shapes of the four resulting parts.
print(*(part.shape for part in (X_mod, X_val, y_mod, y_val)))

(700, 85) (300, 85) (700,) (300,)


In [15]:
# Put the target column back next to the validators' features and write the
# hold-out set to disk without the index column.
df_validators = pd.concat([X_val, y_val], axis="columns")
df_validators.to_csv(
    '../../data/for_validators/credit_score_validators.csv',
    index=False,
)

In [16]:
# Split the modellers' part again: 70% for training, 30% kept aside for
# validation/testing. Stratified on y_mod with a fixed random_state.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_mod,
    y_mod,
    test_size=0.3,
    stratify=y_mod,
    random_state=42,
)
# Sanity check: shapes of the four resulting parts.
print(*(part.shape for part in (X_train, X_valid, y_train, y_valid)))

(490, 85) (210, 85) (490,) (210,)


In [20]:
# Put the target column back next to the training features and write the
# training set to disk without the index column.
df_modellers = pd.concat([X_train, y_train], axis="columns")
df_modellers.to_csv(
    '../../data/for_modelling/credit_score_train.csv',
    index=False,
)

In [21]:
# Split the part kept aside from the modellers' data into validation (66%)
# and test (34%) sets. Stratified on y_valid with a fixed random_state.
X_validation, X_test, y_validation, y_test = train_test_split(
    X_valid,
    y_valid,
    test_size=0.34,
    stratify=y_valid,
    random_state=42,
)
# Sanity check: shapes of the four resulting parts.
print(*(part.shape for part in (X_validation, X_test, y_validation, y_test)))

(138, 85) (72, 85) (138,) (72,)


In [22]:
# Put the target column back next to the validation features and write the
# validation set to disk without the index column.
df_valid = pd.concat([X_validation, y_validation], axis="columns")
df_valid.to_csv(
    '../../data/for_modelling/credit_score_valid.csv',
    index=False,
)

In [23]:
# Put the target column back next to the test features and write the test
# set to disk without the index column.
df_test = pd.concat([X_test, y_test], axis="columns")
df_test.to_csv(
    '../../data/for_modelling/credit_score_test.csv',
    index=False,
)

### YEY! We have prepared the data for modellers and validators