In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

%matplotlib inline

from sklearn.model_selection import cross_val_score, train_test_split

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Load the training table (features plus the AveMonthSpend target) from disk.
df = pd.read_csv(filepath_or_buffer="trainregressor.csv")

In [3]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,Age,AveMonthSpend
0,0,0,0,0,0,0,1,0,1,0,1,-1.320246,-0.65671,-0.006876,1.502657,-0.396685,89
1,0,0,0,0,0,0,1,0,1,1,0,-0.442232,1.32004,0.587186,0.576502,-0.307694,117
2,0,0,0,0,0,0,1,0,1,0,1,-0.442232,1.32004,0.587186,0.345102,-0.307694,123
3,0,0,0,0,0,0,1,0,0,1,0,-0.442232,-0.65671,-1.195002,0.212819,-0.574666,50
4,0,0,0,0,0,0,1,0,0,1,1,2.191812,2.637874,1.775311,0.365887,-0.574666,95


### Split data and start training

In [5]:
# Separate features from the regression target by column *name* rather than
# by hard-coded position, so the split keeps working if the CSV gains,
# loses, or reorders columns.
TARGET_COLUMN = "AveMonthSpend"

# Every column except the target is a model feature (column order is preserved).
X = df.drop(columns=[TARGET_COLUMN])
# Target: average monthly spend, the value the regressor learns to predict.
y = df[TARGET_COLUMN]

In [6]:
# Sanity check: the feature frame should no longer contain the target column.
X.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,Age
0,0,0,0,0,0,0,1,0,1,0,1,-1.320246,-0.65671,-0.006876,1.502657,-0.396685
1,0,0,0,0,0,0,1,0,1,1,0,-0.442232,1.32004,0.587186,0.576502,-0.307694
2,0,0,0,0,0,0,1,0,1,0,1,-0.442232,1.32004,0.587186,0.345102,-0.307694
3,0,0,0,0,0,0,1,0,0,1,0,-0.442232,-0.65671,-1.195002,0.212819,-0.574666
4,0,0,0,0,0,0,1,0,0,1,1,2.191812,2.637874,1.775311,0.365887,-0.574666


In [7]:
# First five values of the target series.
y.head(n=5)

0     89
1    117
2    123
3     50
4     95
Name: AveMonthSpend, dtype: int64

In [8]:
# Show the underlying NumPy arrays backing the feature frame and target series.
X.values, y.values

(array([[ 0.        ,  0.        ,  0.        , ..., -0.00687645,
          1.50265655, -0.39668469],
        [ 0.        ,  0.        ,  0.        , ...,  0.58718608,
          0.5765021 , -0.30769417],
        [ 0.        ,  0.        ,  0.        , ...,  0.58718608,
          0.34510189, -0.30769417],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  1.77531115,
          1.37950816,  1.91706882],
        [ 0.        ,  1.        ,  0.        , ...,  1.18124862,
         -1.16506383,  1.3831257 ],
        [ 0.        ,  1.        ,  0.        , ...,  1.18124862,
         -0.47428537,  1.47211622]]),
 array([ 89, 117, 123, ...,  79,  65,  68], dtype=int64))

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [11]:
# (rows, columns) of the training portion of the split.
X_train.shape

(13215, 16)

In [13]:
# (rows, columns) of the held-out portion of the split.
X_test.shape

(3304, 16)

In [15]:
# Gradient-boosting regressor; only the random seed is set here, every other
# hyperparameter is left at the library default.
model = GradientBoostingRegressor(random_state=1234)

### Perform cross-validation

In [22]:
# 5-fold cross-validation of the (still unfitted) model.
# With scoring='neg_mean_squared_error' the returned values are negated MSEs,
# so numbers closer to zero are better.
# NOTE(review): the folds are drawn from the full X / y, i.e. they also
# contain the rows that end up in X_test / y_test.
score = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)
score

array([-10.37905538, -10.13506761, -10.14967651,  -9.84431469,
       -10.59125289])

In [23]:
# Average of the per-fold (negated) MSE values.
np.mean(score)

-10.219873416016252

### Fit the model and predict on the hold-out split

In [24]:
# Train on the training portion only; fit() returns the estimator itself,
# which is why the cell displays the model's parameters.
model.fit(X=X_train, y=y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=1234, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [25]:
# Predict the target for the held-out rows.
y_pred = model.predict(X=X_test)

In [26]:
# Inspect the hold-out predictions.
y_pred

array([ 51.04952737, 139.07355186,  49.25587648, ...,  62.47957124,
        81.35402071,  80.38546144])

### Model evaluation

In [31]:
# Mean absolute error between the true hold-out targets and the predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mae

2.5154368023920197

In [32]:
# Mean squared error between the true hold-out targets and the predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

9.954290451168086

In [33]:
# Root mean squared error: square root of the MSE computed in the previous
# cell, which brings the error back to the units of the target.
rmse = np.sqrt(mse)
rmse

3.155042068050454

In [35]:
# Coefficient of determination (R^2) on the hold-out rows.
R2score = r2_score(y_true=y_test, y_pred=y_pred)
R2score

0.9873669315691596

### Load test data and predict

In [36]:
# Load the separate test file that predictions will be generated for.
test = pd.read_csv(filepath_or_buffer="testregressor.csv")

In [37]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,Age
0,0,0,0,0,1,0,0,0,0,1,0,0.435783,-0.65671,1.775311,0.218934,1.472116
1,0,0,0,0,0,0,0,1,1,0,1,0.435783,0.661124,1.181249,0.550936,-0.218704
2,1,0,0,0,1,0,0,0,0,0,1,0.435783,-0.65671,1.181249,0.648066,2.451012
3,1,0,0,0,0,0,0,1,1,0,1,-1.320246,-0.65671,1.181249,1.231247,0.315239
4,0,1,0,0,0,1,0,0,0,0,1,-0.442232,0.661124,-0.006876,-1.418054,-0.307694


In [38]:
# (rows, columns) of the test frame; the column count should match the
# training feature frame.
test.shape

(500, 16)

In [39]:
# Show the underlying NumPy array backing the test frame.
test.values

array([[ 0.        ,  0.        ,  0.        , ...,  1.77531115,
         0.21893391,  1.47211622],
       [ 0.        ,  0.        ,  0.        , ...,  1.18124862,
         0.55093635, -0.21870365],
       [ 1.        ,  0.        ,  0.        , ...,  1.18124862,
         0.64806607,  2.45101194],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.58718608,
         1.50643102,  2.18404038],
       [ 0.        ,  0.        ,  1.        , ..., -0.00687645,
         0.58465496,  0.40422998],
       [ 0.        ,  1.        ,  0.        , ...,  1.18124862,
        -1.48770559,  1.29413518]])

In [40]:
# Predict for the separate test file.
# The columns are selected by the training feature names so the model always
# receives the features in the order it was fitted on; a missing column
# raises a KeyError here instead of silently producing wrong predictions.
y_pred2 = model.predict(test[X_train.columns])

In [41]:
# Inspect the predictions for the test file.
y_pred2

array([ 46.94023614, 113.02756108,  46.04870674,  86.06545079,
        61.22953579,  47.70987803,  92.91790192, 141.40253358,
        99.53980721,  54.65060749,  57.73161022,  53.18442954,
        72.80075933,  49.48464757,  40.51214681,  53.80704489,
        83.30913614,  71.85273927,  98.45156269,  59.54148265,
        66.54446441,  74.42825635, 166.18683793,  83.47950839,
        54.79525098,  68.68912979,  86.38025338, 123.5840566 ,
        77.18492745,  60.99921741,  64.73272796,  79.44770998,
        45.06479134,  69.76494704,  96.24511265,  98.94201954,
       166.69035635,  93.7304523 ,  55.18450057,  86.38025338,
        49.80242945,  78.26868114,  78.88659935,  51.65543141,
        56.93775022,  74.42825635,  61.92703236,  80.35989323,
       121.70812739,  80.17549157,  77.07169097,  88.33779127,
        77.87782415,  63.12879767,  50.2882531 ,  76.08832325,
        56.77471527,  73.24008284,  60.65230766,  66.25531832,
        48.4910548 ,  63.32389832,  89.01761532,  79.28

### Save the results 

In [42]:
# Wrap the test-file predictions in a single-column frame named after the target.
regresult = pd.DataFrame({'AveMonthSpend': y_pred2})

In [43]:
# Display the prediction frame.
regresult

Unnamed: 0,AveMonthSpend
0,46.940236
1,113.027561
2,46.048707
3,86.065451
4,61.229536
...,...
495,50.158653
496,85.148046
497,72.262843
498,55.268800


In [44]:
# NOTE: the export is currently disabled, so running this cell writes nothing.
# Uncomment the line below to write the predictions to r2result.csv
# (index=False leaves the row index out of the file).
#regresult.to_csv("r2result.csv",index=False)