In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the training data and the test data from CSV files under the relative data/ directory.
# The test frame is kept under its own name because its 'serial number' column is needed
# again later when the submission files are built.
df_train = pd.read_csv('data/Yes_Bank_Train.csv')
df_orginal_test = pd.read_csv('data/Yes_Bank_Test_int.csv')

In [3]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,serial number,account_info,duration_month,credit_history,purpose,credit_amount,savings_account,employment_st,poi,personal_status,...,resident_since,property_type,age,installment_type,housing_type,credits_no,job_type,liables,telephone,foreigner
0,1,A11,6,A34,A43,1169,A65,A75,4,A93,...,4,A121,67,A143,A152,2,A173,1,A192,A201
1,2,A12,48,A32,A43,5951,A61,A73,2,A92,...,2,A121,22,A143,A152,1,A173,1,A191,A201
2,3,A14,12,A34,A46,2096,A61,A74,2,A93,...,3,A121,49,A143,A152,1,A172,2,A191,A201
3,4,A11,42,A32,A42,7882,A61,A74,2,A93,...,4,A122,45,A143,A153,1,A173,2,A191,A201
4,5,A11,24,A33,A40,4870,A61,A73,3,A93,...,4,A124,53,A143,A153,2,A173,2,A191,A201


In [4]:
# Preview the first rows of the test frame.
df_orginal_test.head()

Unnamed: 0,serial number,account_info,duration_month,credit_history,purpose,savings_account,employment_st,poi,personal_status,gurantors,resident_since,property_type,age,installment_type,housing_type,credits_no,job_type,liables,telephone,foreigner
0,1,A14,24,A34,A46,A61,A75,4,A93,A101,4,A124,54,A143,A153,2,A173,2,A191,A201
1,2,A12,18,A34,A43,A61,A75,3,A92,A103,4,A121,48,A141,A151,2,A172,1,A192,A201
2,3,A11,20,A34,A42,A61,A75,1,A92,A101,4,A122,24,A143,A152,2,A173,1,A191,A201
3,4,A14,12,A34,A43,A65,A75,4,A93,A101,4,A123,35,A143,A152,2,A173,1,A191,A201
4,5,A12,12,A32,A40,A65,A71,1,A92,A101,2,A121,24,A143,A151,1,A171,1,A191,A201


In [7]:
# Count how often each `purpose` code occurs in the training data.
df_train.purpose.value_counts()

A43     223
A40     184
A42     144
A41      81
A49      77
A46      45
A45      19
A410     10
A44       9
A48       8
Name: purpose, dtype: int64

In [6]:
# Trial run: show the `purpose` counts as they would look with A48 folded into A46.
# replace() is not in-place here, so df_train itself is left unchanged.
df_train.purpose.replace({'A48':'A46'}).value_counts()

A43     223
A40     184
A42     144
A41      81
A49      77
A46      53
A45      19
A410     10
A44       9
Name: purpose, dtype: int64

In [8]:
def feature_engineering(dataframe):
    """Merge related categorical codes, modifying ``dataframe`` in place.

    The following codes are collapsed (groupings follow the original
    author's notes):

    * ``purpose``: A48 -> A46 (reskilling with education), A41 -> A40
      (old car with new car), A43 and A44 -> A42 (household items).
    * ``credit_history``: A33 -> A34 (delinquent cases).
    * ``employment_st``: A74 -> A75 (loyal cases).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the ``purpose``, ``credit_history`` and
        ``employment_st`` columns. It is updated in place.

    Returns
    -------
    None
    """
    merges = {
        'purpose': {'A48': 'A46', 'A41': 'A40', 'A43': 'A42', 'A44': 'A42'},
        'credit_history': {'A33': 'A34'},
        'employment_st': {'A74': 'A75'},
    }
    for column, mapping in merges.items():
        # Assign the result back to the column instead of calling
        # `dataframe.<col>.replace(..., inplace=True)`: the chained in-place
        # form acts on a temporary Series and does not update the parent
        # frame when pandas Copy-on-Write is active.
        dataframe[column] = dataframe[column].replace(mapping)

In [9]:
# Apply the same category merges to the training and the test frame.
# The function works in place, so both frames are modified by these calls.
feature_engineering(df_train)
feature_engineering(df_orginal_test)

In [10]:
# Preview the training frame again after the category merges.
df_train.head()

Unnamed: 0,serial number,account_info,duration_month,credit_history,purpose,credit_amount,savings_account,employment_st,poi,personal_status,...,resident_since,property_type,age,installment_type,housing_type,credits_no,job_type,liables,telephone,foreigner
0,1,A11,6,A34,A42,1169,A65,A75,4,A93,...,4,A121,67,A143,A152,2,A173,1,A192,A201
1,2,A12,48,A32,A42,5951,A61,A73,2,A92,...,2,A121,22,A143,A152,1,A173,1,A191,A201
2,3,A14,12,A34,A46,2096,A61,A75,2,A93,...,3,A121,49,A143,A152,1,A172,2,A191,A201
3,4,A11,42,A32,A42,7882,A61,A75,2,A93,...,4,A122,45,A143,A153,1,A173,2,A191,A201
4,5,A11,24,A34,A40,4870,A61,A73,3,A93,...,4,A124,53,A143,A153,2,A173,2,A191,A201


In [11]:
# Re-check the `purpose` distribution after the merges.
df_train.purpose.value_counts()

A42     376
A40     265
A49      77
A46      53
A45      19
A410     10
Name: purpose, dtype: int64

In [12]:
# Remove the 'serial number' column from the modelling frames.
# df_train is modified in place; df_test is a new frame, so df_orginal_test
# still holds the serial numbers.
df_train.drop(columns=['serial number'], inplace=True)
df_test = df_orginal_test.drop(columns=['serial number'])

In [13]:
# (rows, columns) of the training frame after dropping 'serial number'.
df_train.shape

(800, 20)

In [14]:
# (rows, columns) of the test frame after dropping 'serial number'.
df_test.shape

(200, 19)

In [15]:
# Number of missing values per column in the training frame.
df_train.isnull().sum()

account_info        0
duration_month      0
credit_history      0
purpose             0
credit_amount       0
savings_account     0
employment_st       0
poi                 0
personal_status     0
gurantors           0
resident_since      0
property_type       0
age                 0
installment_type    0
housing_type        0
credits_no          0
job_type            0
liables             0
telephone           0
foreigner           0
dtype: int64

In [16]:
# Take an independent copy of the object-dtype (string) columns of the training frame;
# these are the columns that get integer-encoded further down.
obj_df = df_train.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,account_info,credit_history,purpose,savings_account,employment_st,personal_status,gurantors,property_type,installment_type,housing_type,job_type,telephone,foreigner
0,A11,A34,A42,A65,A75,A93,A101,A121,A143,A152,A173,A192,A201
1,A12,A32,A42,A61,A73,A92,A101,A121,A143,A152,A173,A191,A201
2,A14,A34,A46,A61,A75,A93,A101,A121,A143,A152,A172,A191,A201
3,A11,A32,A42,A61,A75,A93,A103,A122,A143,A153,A173,A191,A201
4,A11,A34,A40,A61,A73,A93,A101,A124,A143,A153,A173,A191,A201


In [17]:
# (rows, columns) of the object-dtype subset.
obj_df.shape

(800, 13)

In [18]:
# Names of the object-dtype columns, as a plain list.
object_columns = list(obj_df.columns)

In [19]:
from sklearn.preprocessing import LabelEncoder
# One shared encoder instance; the encoding loop re-fits it for every column,
# so it only ever holds the mapping of the most recently processed column.
le = LabelEncoder()

In [20]:
# Integer-encode every categorical column. The mapping is learned from the
# training values and then applied unchanged to the test values, so both
# frames share the same codes.
for col in object_columns:
    encoded_name = "{}_code".format(col)
    df_train[encoded_name] = le.fit_transform(obj_df[col])
    df_test[encoded_name] = le.transform(df_test[col])

In [21]:
# The raw string columns are now redundant: keep only their *_code versions.
df_train.drop(columns=object_columns, inplace=True)
df_test.drop(columns=object_columns, inplace=True)

In [22]:
# Preview the fully numeric training frame.
df_train.head()

Unnamed: 0,duration_month,credit_amount,poi,resident_since,age,credits_no,liables,account_info_code,credit_history_code,purpose_code,savings_account_code,employment_st_code,personal_status_code,gurantors_code,property_type_code,installment_type_code,housing_type_code,job_type_code,telephone_code,foreigner_code
0,6,1169,4,4,67,2,1,0,3,2,4,3,2,0,0,2,1,2,1,0
1,48,5951,2,2,22,1,1,1,2,2,0,2,1,0,0,2,1,2,0,0
2,12,2096,2,3,49,1,2,3,3,4,0,3,2,0,0,2,1,1,0,0
3,42,7882,2,4,45,1,2,0,2,2,0,3,2,2,1,2,2,2,0,0
4,24,4870,3,4,53,2,2,0,3,0,0,2,2,0,3,2,2,2,0,0


In [23]:
# Regression target is credit_amount; every other column is a feature.
y = df_train['credit_amount']
X = df_train.drop(columns=['credit_amount'])

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [24]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor

from sklearn.model_selection import GridSearchCV

In [66]:
# Candidate regressors for the grid search. Each entry is a two-element list:
# [estimator instance, parameter grid]. An empty dict means "defaults only".
# NOTE(review): the loop that consumes this list indexes entry[1], so an entry
# without a grid (like the commented-out RandomForestRegressor line) would fail.
# NOTE(review): "mse", "ls" and "lad" look like parameter names from older
# scikit-learn releases -- confirm they are accepted by the installed version.
models = [
  [LinearRegression(), {"fit_intercept": [True, False]}], 
#   [SVR(), {"kernel": ["linear", "poly", "rbf", "sigmoid"]}], 
  [KNeighborsRegressor(), {"n_neighbors": [1,2], "weights": ["uniform", "distance"]}], 
  [DecisionTreeRegressor(), {"criterion": ["mse", "friedman_mse"], "splitter": ["best", "random"],
    "min_samples_split": [x for x in range(2,6)] # generates a list [2,3,4,5]
  }],
  [GradientBoostingRegressor(), {"loss": ["ls", "lad", "huber", "quantile"]}],
  [GaussianProcessRegressor(), {}],
#   [PLSRegression(), {}],
#     [RandomForestRegressor()],
  [AdaBoostRegressor(), {}],
    [Lasso(), {"alpha": [0.2, 0.3, 0.7, 0.75, 0.8]}],
    [Ridge(), {"alpha": [0.2, 0.3, 0.7, 0.75, 0.8]}]
]

In [67]:
def evaluation_metric(y_test, y_pred):
    """Return the score ``(1 - RMSE / 100000) * 100`` for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True target values (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, in the same order as ``y_test``.

    Returns
    -------
    float
        100 for a perfect prediction; lower as the RMSE grows.
    """
    # Compare values by position. Converting to arrays first prevents pandas
    # from aligning two Series on their index labels (which yields NaN when
    # the labels differ) and also lets plain Python lists be passed in.
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    rmse = np.sqrt(np.mean((actual - predicted) ** 2))
    em = (1 - rmse / 100000) * 100
    return em

In [68]:
# Tune every candidate with 10-fold cross-validated grid search on the training
# split, then report the custom metric and R^2 on the hold-out split.
for candidate in models:
    regressor = candidate[0]
    # An entry may list only the estimator; fall back to an empty grid
    # (defaults only) instead of raising IndexError on candidate[1].
    param_grid = candidate[1] if len(candidate) > 1 else {}
    # The loop variable is named `candidate` so that `model` is not rebound
    # while it is still the variable being iterated.
    model = GridSearchCV(regressor, param_grid, cv = 10)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(model.best_params_, model.best_estimator_)
    evaluation_metric_perf = evaluation_metric(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(evaluation_metric_perf, r2)

{'fit_intercept': False} LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)
98.08255697861927 0.5844215032684266
{'n_neighbors': 2, 'weights': 'uniform'} KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=2, p=2,
          weights='uniform')
97.65444258623883 0.3781290511009282
{'criterion': 'friedman_mse', 'min_samples_split': 5, 'splitter': 'random'} DecisionTreeRegressor(criterion='friedman_mse', max_depth=None,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=5,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='random')
97.76415305397282 0.43494298013679744
{'loss': 'huber'} GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='huber', max_depth=3,
             max_feat

IndexError: list index out of range

In [55]:
# Import make_scorer from the public sklearn.metrics namespace; the private
# sklearn.metrics.scorer module is not available in later scikit-learn releases.
from sklearn.metrics import make_scorer
# Scorer object wrapping R^2 (higher is better).
my_custom_scorer = make_scorer(r2_score, greater_is_better=True)

In [56]:
# from tpot import TPOTRegressor

# tot_generations = [10, 25, 50, 100]
# # tot_generations = [1, 2, 3]

# for generation in tot_generations:
#     tpot = TPOTRegressor(generations=generation, population_size=100, verbosity=2, scoring = my_custom_scorer)
#     tpot.fit(X_train, y_train)
#     print(tpot.score(X_test, y_test))
#     tpot.export('teapee_yesbank__r2_gen_{}.py'.format(generation))

In [57]:
# from tpot import TPOTRegressor

# tot_generations = [10, 25, 50, 100]
# # tot_generations = [1, 2, 3]

# my_custom_em_scorer = make_scorer(evaluation_metric, greater_is_better=True)
# for generation in tot_generations:
#     tpot = TPOTRegressor(generations=generation, population_size=100, verbosity=2, scoring = my_custom_em_scorer)
#     tpot.fit(X_train, y_train)
#     print(tpot.score(X_test, y_test))
#     tpot.export('teapee_yesbank_em_gen_{}.py'.format(generation))

# xgboost

In [28]:
import xgboost as xgb

# Gradient-boosted trees with a fixed, hand-picked configuration.
model = xgb.XGBRegressor(
    colsample_bytree=0.4,
    gamma=0,
    learning_rate=0.09,
    max_depth=3,
    min_child_weight=1.5,
    n_estimators=10000,
    reg_alpha=0.75,
    reg_lambda=0.45,
    subsample=0.6,
    seed=42,
)

# Fit on the training split and score on the hold-out split.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

evaluation_metric_perf = evaluation_metric(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(evaluation_metric_perf, r2)

98.09578326195513 0.5901349472615035


# lgbm

In [29]:
import lightgbm as lgb

In [30]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [31]:
# specify your configurations as a dict
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.19,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}


gbm = lgb.train(lgb_params,
                lgb_train,
                num_boost_round=100,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)

# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
evaluation_metric_perf = evaluation_metric(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(evaluation_metric_perf, r2)

[1]	valid_0's l1: 1822.65	valid_0's l2: 7.30978e+06
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l1: 1692.73	valid_0's l2: 6.48356e+06
[3]	valid_0's l1: 1597.53	valid_0's l2: 5.73786e+06
[4]	valid_0's l1: 1523.29	valid_0's l2: 5.16745e+06
[5]	valid_0's l1: 1472.87	valid_0's l2: 4.85897e+06
[6]	valid_0's l1: 1465.79	valid_0's l2: 4.64844e+06
[7]	valid_0's l1: 1436.02	valid_0's l2: 4.42165e+06
[8]	valid_0's l1: 1401.38	valid_0's l2: 4.18216e+06
[9]	valid_0's l1: 1357.76	valid_0's l2: 3.90366e+06
[10]	valid_0's l1: 1355.68	valid_0's l2: 3.88206e+06
[11]	valid_0's l1: 1345.63	valid_0's l2: 3.85658e+06
[12]	valid_0's l1: 1345	valid_0's l2: 3.8628e+06
[13]	valid_0's l1: 1321.7	valid_0's l2: 3.78722e+06
[14]	valid_0's l1: 1319.52	valid_0's l2: 3.76777e+06
[15]	valid_0's l1: 1288.09	valid_0's l2: 3.64154e+06
[16]	valid_0's l1: 1285.49	valid_0's l2: 3.65035e+06
[17]	valid_0's l1: 1275.31	valid_0's l2: 3.65082e+06
[18]	valid_0's l1: 1271.52	valid_0's l2: 3.62915e+06

# catboost

In [32]:
from catboost import Pool, CatBoostRegressor

In [33]:
# CatBoost configuration: trees are fitted against the MAE loss.
cat_params = {
    'depth': 11,
    'iterations': 250,
    'l2_leaf_reg': 9,
    'learning_rate': 0.15,
    'random_seed': 42,
    'loss_function': 'MAE',
}
# Column positions 6..18 of the feature matrix are declared categorical.
cat_features = list(range(6, 19))
model = CatBoostRegressor(**cat_params)

# Wrap both splits in Pools so the categorical column indices travel with the data.
train_data = Pool(X_train, y_train, cat_features=cat_features)
test_data = Pool(X_test, cat_features=cat_features)

model.fit(train_data, verbose=False)
y_pred = model.predict(test_data)

# Score on the hold-out split.
evaluation_metric_perf = evaluation_metric(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(evaluation_metric_perf, r2)

95.70882114005443 -1.0814278749159536


# ensemble

In [34]:
# Members of the ensemble. estimator2 is only the LightGBM parameter dict at
# this point; it becomes a trained booster in the fitting cell.
estimator1 = GradientBoostingRegressor(loss='huber')
estimator2 = lgb_params
estimator3 = xgb.XGBRegressor(
    colsample_bytree=0.4,
    gamma=0,
    learning_rate=0.09,
    max_depth=3,
    min_child_weight=1.5,
    n_estimators=10000,
    reg_alpha=0.75,
    reg_lambda=0.45,
    subsample=0.6,
    seed=42,
)
estimator4 = CatBoostRegressor(**cat_params)

In [43]:
# Fit all four ensemble members on the training split.
estimator1.fit(X_train, y_train)
# LightGBM: at most 25 rounds, early-stopped on lgb_eval with a patience of 10 rounds.
estimator2 = lgb.train(lgb_params,
                lgb_train,
                num_boost_round=25,
                valid_sets=lgb_eval,
                early_stopping_rounds=10)
estimator3.fit(X_train, y_train)
# CatBoost is trained from the Pool so the categorical feature indices are honoured.
estimator4.fit(train_data, verbose = False)

[1]	valid_0's l1: 1822.65	valid_0's l2: 7.30978e+06
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's l1: 1692.73	valid_0's l2: 6.48356e+06
[3]	valid_0's l1: 1597.53	valid_0's l2: 5.73786e+06
[4]	valid_0's l1: 1523.29	valid_0's l2: 5.16745e+06
[5]	valid_0's l1: 1472.87	valid_0's l2: 4.85897e+06
[6]	valid_0's l1: 1465.79	valid_0's l2: 4.64844e+06
[7]	valid_0's l1: 1436.02	valid_0's l2: 4.42165e+06
[8]	valid_0's l1: 1401.38	valid_0's l2: 4.18216e+06
[9]	valid_0's l1: 1357.76	valid_0's l2: 3.90366e+06
[10]	valid_0's l1: 1355.68	valid_0's l2: 3.88206e+06
[11]	valid_0's l1: 1345.63	valid_0's l2: 3.85658e+06
[12]	valid_0's l1: 1345	valid_0's l2: 3.8628e+06
[13]	valid_0's l1: 1321.7	valid_0's l2: 3.78722e+06
[14]	valid_0's l1: 1319.52	valid_0's l2: 3.76777e+06
[15]	valid_0's l1: 1288.09	valid_0's l2: 3.64154e+06
[16]	valid_0's l1: 1285.49	valid_0's l2: 3.65035e+06
[17]	valid_0's l1: 1275.31	valid_0's l2: 3.65082e+06
[18]	valid_0's l1: 1271.52	valid_0's l2: 3.62915e+0

<catboost.core.CatBoostRegressor at 0x2b0edcaa400>

In [36]:
# Hold-out predictions of each ensemble member.
y_pred1 = estimator1.predict(X_test)
# Use the iteration count recorded on estimator2 itself. The previous code read
# gbm.best_iteration, which belongs to a different booster trained with other
# round and patience settings.
y_pred2 = estimator2.predict(X_test, num_iteration=estimator2.best_iteration)
y_pred3 = estimator3.predict(X_test)
y_pred4 = estimator4.predict(test_data)

In [37]:
def ensemble_four(y_test, y_pred):
    """Print the custom evaluation metric and the R^2 score for one set of predictions.

    Both values are printed on a single line, metric first. Nothing is returned.
    """
    scores = (
        evaluation_metric(y_test, y_pred),
        r2_score(y_test, y_pred),
    )
    print(*scores)

In [38]:
# Hold-out scores of the first three members individually (y_pred4 is not scored here).
ensemble_four(y_test, y_pred1)
ensemble_four(y_test, y_pred2)
ensemble_four(y_test, y_pred3)

98.19483727172211 0.6316668145293662
98.09496835295222 0.5897840689366054
98.09578326195513 0.5901349472615035


In [39]:
# Equal-weight blend: element-wise mean of the three prediction vectors
# (the CatBoost predictions in y_pred4 are left out).
y_all_pred = np.mean([y_pred1, y_pred2, y_pred3], axis=0)

In [40]:
# Hold-out score of the blended prediction.
ensemble_four(y_test, y_all_pred)

98.2292086967286 0.64555986451922


In [41]:
# Alternative ensemble: gradient boosting blended with two regularised linear models.
estimator1 = GradientBoostingRegressor(loss='huber')
estimator2 = Lasso(alpha=0.8)
estimator3 = Ridge(alpha=0.8)

# Fit every member on the training split first ...
for member in (estimator1, estimator2, estimator3):
    member.fit(X_train, y_train)

# ... then collect their hold-out predictions in the same order.
y_pred1, y_pred2, y_pred3 = [
    member.predict(X_test) for member in (estimator1, estimator2, estimator3)
]

# Individual scores.
for member_pred in (y_pred1, y_pred2, y_pred3):
    ensemble_four(y_test, member_pred)

# Score of the equal-weight blend.
print('overall')
y_all_pred = np.mean(np.array([y_pred1, y_pred2, y_pred3]), axis = 0)
ensemble_four(y_test, y_all_pred)

98.19483727172211 0.6316668145293662
98.08788424139206 0.5867275122386397
98.08740212135983 0.5865190812759864
overall
98.16234622845978 0.6182882633451672


In [42]:
# Two-member ensemble: gradient boosting plus LightGBM, scored on the hold-out split.
estimator1 = GradientBoostingRegressor(loss='huber')
estimator1.fit(X_train, y_train)

# LightGBM is trained for a fixed 21 rounds with no validation set.
estimator2 = lgb.train(lgb_params,
                lgb.Dataset(X_train, y_train),
                num_boost_round=21)

y_pred1 = estimator1.predict(X_test)
# Predict with the booster's own trees. The previous code passed
# gbm.best_iteration, tying this cell to a different booster that must still
# be present in the kernel.
y_pred2 = estimator2.predict(X_test)

ensemble_four(y_test, y_pred1)
ensemble_four(y_test, y_pred2)

print('overall')

y_all_pred = np.mean(np.array([y_pred1, y_pred2]), axis = 0)
ensemble_four(y_test, y_all_pred)

98.1806945259835 0.6258727122038534
98.09496835295222 0.5897840689366054
overall
98.17635129730895 0.6240842715334673


# for actual test data

In [44]:
# Feature matrix of the real test set. This is an alias of df_test, not a copy.
X_valid = df_test

In [45]:
# Preview the encoded test features.
X_valid.head()

Unnamed: 0,duration_month,poi,resident_since,age,credits_no,liables,account_info_code,credit_history_code,purpose_code,savings_account_code,employment_st_code,personal_status_code,gurantors_code,property_type_code,installment_type_code,housing_type_code,job_type_code,telephone_code,foreigner_code
0,24,4,4,54,2,2,3,3,4,0,3,2,0,3,2,2,2,0,0
1,18,3,4,48,2,1,1,3,2,0,3,1,2,0,0,0,1,1,0
2,20,1,4,24,2,1,0,3,2,0,3,1,0,1,2,1,2,0,0
3,12,4,4,35,2,1,3,3,2,4,3,2,0,2,2,1,2,0,0
4,12,1,2,24,1,1,1,2,0,4,0,1,0,0,2,0,0,0,0


In [46]:
# including xgb
estimator1 = GradientBoostingRegressor(loss='huber')
estimator2 = lgb_params
estimator3 = xgb.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.09,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)

estimator1.fit(X, y)
estimator2 = lgb.train(lgb_params,
                lgb.Dataset(X, y),
                num_boost_round=21)
estimator3.fit(X, y)

y_pred1 = estimator1.predict(X_valid)
y_pred2 = estimator2.predict(X_valid, num_iteration=gbm.best_iteration)
y_pred3 = estimator3.predict(X_valid)

y_all_pred = np.mean(np.array([y_pred1, y_pred2, y_pred3]), axis = 0)

In [53]:
# Final two-member ensemble (gradient boosting + LightGBM), refitted on the
# full training data (X, y) and applied to the real test features.
estimator1 = GradientBoostingRegressor(loss='huber')
estimator1.fit(X, y)

# LightGBM is trained for a fixed 21 rounds with no validation set.
estimator2 = lgb.train(lgb_params,
                lgb.Dataset(X, y),
                num_boost_round=21)

y_pred1 = estimator1.predict(X_valid)
# Predict with the booster's own trees. The previous code passed
# gbm.best_iteration, which belongs to a different booster trained on the
# hold-out split only.
y_pred2 = estimator2.predict(X_valid)

# Equal-weight blend of the two members.
y_all_pred = np.mean(np.array([y_pred1, y_pred2]), axis = 0)

In [59]:
# Final ensemble of gradient boosting, Lasso and Ridge, refitted on the full
# training data (X, y) and applied to the real test features.
estimator1 = GradientBoostingRegressor(loss='huber')
estimator2 = Lasso(alpha=0.8)
estimator3 = Ridge(alpha=0.8)

# Fit every member first ...
for member in (estimator1, estimator2, estimator3):
    member.fit(X, y)

# ... then predict in the same order.
y_pred1, y_pred2, y_pred3 = [
    member.predict(X_valid) for member in (estimator1, estimator2, estimator3)
]

# Equal-weight blend; this is the vector the submission cells read.
y_all_pred = np.mean(np.array([y_pred1, y_pred2, y_pred3]), axis = 0)

In [60]:
# Start the submission frame from the serial numbers of the original test file.
df_submission = pd.DataFrame({'serial number': df_orginal_test['serial number'].values})

If rounded-off values are required:

In [61]:
# Round the blended predictions and store them as 64-bit integers.
df_submission['credit_amount'] = np.round(y_all_pred).astype('int64')

In [62]:
# Preview the submission frame.
df_submission.head()

Unnamed: 0,serial number,credit_amount
0,1,2685
1,2,2534
2,3,3990
3,4,1381
4,5,2515


In [63]:
# Show all predicted credit amounts as a NumPy array.
df_submission.credit_amount.values

array([ 2685,  2534,  3990,  1381,  2515,  6408,   438,  1407, 10133,
        1919,  1241,    81,  4599,  6651,  5907,  6043,  2177,  2727,
        7219,  1304,  1866,  1690,  5494,   144,  3608,  2737,  4329,
        3308,  5182,  6391,  3096,  2446,  7457,  4704,  1861,  1611,
        1147,  1467,  3103,  2056,  3595,  2274,  1195,  2777,  2411,
        4482,  3929,  2039,  1233,   441,  2378,  3970,  2418,  1867,
        6669,  2600,  1027,  2380,  2681,  2659,  2280,  2038,  2672,
        3250,  1344,  2562,  1823,  2105,  4770,   728,  6387,  2475,
        2580,  1239,  2055,  1433,  5099,  3674,  3205,  5110,  4541,
        5088,  3339,  1111,  3638,  1063,  3533,  7956,  5323,  4762,
        4921,  2061,   931,  5607,  2521,  6176,  3203,  1664,  2715,
        3468,  3640,  4105,  6569,  1740,  3128,  3263,  3302,  6031,
        3181,  2379,  3519,  3441,  5055,  3110,  2916,  9459,  2735,
        4309,  4728,  3679,  2050,  7020,  1229,  2326,  4623,  2228,
        1561,  6755,

In [64]:
# Write the regression submission; the DataFrame index is not written to the file.
df_submission.to_csv('submission_ensembles_fe_regr_rounded.csv', index=False)

# classification thing

In [None]:
# Cluster number 1 (Correct value is 1) : When the value of credit_amount is between 4000 and 20000 
# Cluster number 2 (Correct value is 2): When the value of credit_amount is between 1500 and 4000 
# Cluster number 3 (Correct value is 3) : When the value of credit_amount is less than 1500
# ["serial number", "cluster_number"] in df_submission

In [103]:
# Plain Python list of the blended predictions, used as input for the cluster assignment.
y_all_pred_cluster = list(y_all_pred)

In [109]:
# Map every predicted credit amount to a cluster number:
#   1 -> 4000 and above, 2 -> 1500 up to (but excluding) 4000, 3 -> below 1500.
# Every prediction receives exactly one label, so cluster_result always has the
# same length and order as the test rows. The previous chain had no fallback
# branch: a prediction above 20000 was silently skipped, which shifted all later
# labels against their serial numbers. Such values now fall into cluster 1, the
# top bucket; a NaN prediction falls through to cluster 3.
cluster_result = []
for c_value in y_all_pred_cluster:
    if c_value >= 4000:
        cluster_result.append(1)
    elif c_value >= 1500:
        cluster_result.append(2)
    else:
        cluster_result.append(3)

In [110]:
# Display the assigned cluster numbers as a NumPy array.
np.array(cluster_result)

array([2, 2, 2, 3, 1, 1, 3, 3, 1, 2, 3, 3, 1, 1, 1, 1, 2, 2, 1, 3, 3, 3,
       1, 3, 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 3, 3, 2, 2, 2, 2, 2, 3, 2,
       2, 1, 2, 2, 2, 3, 2, 2, 2, 3, 1, 2, 3, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 2, 1, 3, 1, 2, 2, 3, 2, 3, 1, 2, 2, 1, 1, 1, 2, 3, 2, 3, 2, 1,
       1, 1, 1, 3, 3, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 3, 2, 2, 2, 1, 2, 2,
       2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 3, 2, 2, 2, 2, 1, 2, 3, 2, 3,
       2, 3, 3, 1, 3, 3, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 3, 2, 2, 1, 2, 1,
       3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 2, 2, 2, 1, 2, 3, 1, 1, 1, 2, 2,
       3, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 3, 2, 1, 2, 2, 1, 3,
       1, 1])

In [111]:
# Pair each test serial number with its cluster label and write the result to
# CSV; the DataFrame index is not written to the file.
df_submission_cluster = pd.DataFrame({
    'serial number': df_orginal_test['serial number'].values,
    'cluster_number': np.array(cluster_result),
})
df_submission_cluster.to_csv('submission_ensembles_cluster.csv', index=False)