## Importing data and required libraries

### Importing necessary libraries

In [15]:
import numpy as np 
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

%matplotlib inline

### Checking the directories of the datasets available

In [16]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/santander-value-prediction-challenge/train.csv
/kaggle/input/santander-value-prediction-challenge/test.csv
/kaggle/input/santander-value-prediction-challenge/sample_submission.csv


### Importing the dataset

In [17]:
dataset = pd.read_csv("../input/santander-value-prediction-challenge/train.csv")
dataset.shape

(4459, 4993)

### Printing the first five records

In [18]:
dataset.head(5)

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


## Data Exploration

### Statistics of the data

In [19]:
dataset.describe()

Unnamed: 0,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
count,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,...,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0
mean,5944923.0,14654.93,1390.895,26722.45,4530.164,26409.96,30708.11,16865.22,4669.208,2569407.0,...,467605.7,444623.9,805621.9,781296.6,143.529939,121380.9,35734.51,312374.1,92199.6,227910.0
std,8234312.0,389329.8,64283.02,569965.2,235912.4,1514730.0,577059.0,751275.6,187944.9,9610183.0,...,4068038.0,4428889.0,4513246.0,6839451.0,9584.318507,4720709.0,1614622.0,4318501.0,1635993.0,1811139.0
min,30000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,600000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2260000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,600000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,40000000.0,20000000.0,4000000.0,20000000.0,14800000.0,100000000.0,20708000.0,40000000.0,10400000.0,319612000.0,...,76000000.0,123588000.0,130000000.0,144400000.0,640000.0,301312000.0,106420000.0,140000000.0,61768000.0,43200000.0


### Checking the null values in the data

In [None]:
dataset.isnull().sum()

### Checking the data type of the columns

In [None]:
dataset.dtypes

### Listing all the datatypes used in the dataset

In [None]:
dataset.dtypes.value_counts()

### Removing constant columns

In [20]:
columns = dataset.std() == 0
const_columns = columns.iloc[[i for i, x in enumerate(columns) if x]]
dataset.drop(const_columns.index, axis = 1, inplace = True)

### Removing sparse columns

In [21]:
def drop_sparse(train):
    sparse_columns = []
    flist = dataset.columns[2:]
    for f in flist:
        if len(np.unique(train[f]))<2:
            sparse_columns.append(f)
    return sparse_columns

sparse_columns = drop_sparse(dataset)
dataset.drop(sparse_columns, axis = 1, inplace = True)

### Removing duplicate columns

In [55]:
def duplicate_columns(dataset):
    groups = dataset.columns.to_series().groupby(dataset.dtypes).groups
    my_dict = {}
    duplicate_features = []
    
    for d_type, columns in groups.items():
        columns_group = dataset[columns]
        list_of_column_names = dataset[columns].columns
        length = len(columns)
        
        for i in range(length):
            a = tuple(columns_group.iloc[:, i])
            if a in my_dict:
                duplicate_features.append(list_of_column_names[i])
            else:
                my_dict[a] = list_of_column_names[i]
            
    return duplicate_features

duplicate_features = duplicate_columns(dataset)
dataset.drop(sparse_columns, axis = 1, inplace = True)

## Model Training

### XGBoost

In [60]:
dataset = dataset.drop(['ID'], axis = 1)
X = dataset.iloc[:,1:]       
Y = dataset.iloc[:,0]  

In [61]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.21, random_state=42)

In [62]:
xb_model = XGBRegressor(colsample_bytree=0.055, colsample_bylevel =0.5, 
                             gamma=1.5, learning_rate=0.01, max_depth=32, 
                             objective='reg:linear',booster='gbtree',
                             min_child_weight=57, n_estimators=800, reg_alpha=0, 
                             reg_lambda = 0,eval_metric = 'rmse', subsample=0.7, 
                             silent=1, n_jobs = -1, early_stopping_rounds = 14,
                             random_state =7, nthread = -1)
xb_model.fit(x_train, y_train,
             eval_set=[(x_test, y_test)], verbose=False)

  if getattr(data, 'base', None) is not None and \


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
             colsample_bynode=1, colsample_bytree=0.055,
             early_stopping_rounds=14, eval_metric='rmse', gamma=1.5,
             importance_type='gain', learning_rate=0.01, max_delta_step=0,
             max_depth=32, min_child_weight=57, missing=None, n_estimators=800,
             n_jobs=-1, nthread=-1, objective='reg:linear', random_state=7,
             reg_alpha=0, reg_lambda=0, scale_pos_weight=1, seed=None, silent=1,
             subsample=0.7, verbosity=1)

In [65]:
# Predicting the values form the test_set
y_pred = xb_model.predict(x_test)

# the root-mean squared error for predictions
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

6472258.270427615

In [66]:
# Comparing the predictions rmse with benchmark rmse
y_mean = [y_test.mean()] * y_test.shape[0]

rmse_benchmarch = np.sqrt(mean_squared_error(y_mean, y_pred))
print(rmse_benchmarch)

4147856.2977376375


In [68]:
# Loading the test dataset for predicting the target feature
test_dataset = pd.read_csv("../input/santander-value-prediction-challenge/test.csv")
test_id_s = test_dataset['ID']
test_dataset = test_dataset[dataset.columns[1:]]
test_pred_xgb = xb_model.predict(test_dataset)
test_pred_xgb = np.clip(test_pred_xgb, 1, float('inf'))

### CatBoost

In [69]:
cb_model = CatBoostRegressor(iterations=1000,
                             learning_rate=0.05,
                             depth=10,
                             eval_metric='RMSE',
                             random_seed = 42,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             metric_period = 50,
                             od_wait=20)

In [70]:
cb_model.fit(x_train, y_train,
             eval_set=(x_test, y_test),
             use_best_model=True,
             verbose=50)



0:	learn: 8311366.4128798	test: 7619986.4909407	best: 7619986.4909407 (0)	total: 2.08s	remaining: 34m 37s
50:	learn: 6995051.0155254	test: 6727565.4464549	best: 6727565.4464549 (50)	total: 1m 37s	remaining: 30m 20s
100:	learn: 6601671.3306085	test: 6616971.9021352	best: 6614182.5275583 (98)	total: 3m 15s	remaining: 28m 58s
150:	learn: 6359982.9780276	test: 6585429.5980834	best: 6585369.0309191 (146)	total: 4m 52s	remaining: 27m 26s
200:	learn: 6081030.4533234	test: 6576062.6011977	best: 6571619.9557888 (194)	total: 6m 29s	remaining: 25m 46s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 6564198.07
bestIteration = 223

Shrink model to first 224 iterations.


<catboost.core.CatBoostRegressor at 0x7f7272123b38>

In [71]:
test_pred_cb = cb_model.predict(test_dataset)
test_pred_cb = np.clip(test_pred_cb, 0, float('inf'))

In [72]:
final_preds = (test_pred_xgb * 0.5 + test_pred_cb * 0.3)
pred_df = pd.DataFrame({'ID':test_id_s, 'target': final_preds})
pred_df.to_csv("submission.csv", float_format="%.10g", index = False)