In [1]:
from sklearn.datasets import load_boston
boston = load_boston()

In [2]:
boston.data.shape

(506, 13)

In [3]:
boston.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [4]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [5]:
import pandas as pd

data = pd.DataFrame(boston.data)
data.columns = boston.feature_names

In [6]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [7]:
data['PRICE'] = boston.target

In [8]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null float64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
PRICE      506 non-null float64
dtypes: float64(14)
memory usage: 55.4 KB


In [10]:
!pip install xgboost

Collecting xgboost
  Downloading https://files.pythonhosted.org/packages/51/c1/198915b13e98b62a98f48309c41012638464651da755d941f4abe384c012/xgboost-0.82-py2.py3-none-win_amd64.whl (7.7MB)
Installing collected packages: xgboost
Successfully installed xgboost-0.82


In [29]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

In [30]:
X, y = data.iloc[:,:-1], data.iloc[:, -1]

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## XGBoost Regressor

In [32]:
xgbReg = xgb.XGBRegressor(colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [33]:
xgbReg.fit(X_train, y_train)

  if getattr(data, 'base', None) is not None and \


XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=10, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [17]:
preds = xgbReg.predict(X_test)

In [28]:
xgbReg.score(X_test,y_test)

-0.32851477624683945

In [18]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
rmse

10.40090786904915

In [19]:
preds

array([16.246052 , 13.55405  , 15.34978  ,  7.366374 , 16.223549 ,
       13.132597 , 13.11659  , 15.194435 , 13.059908 , 14.350715 ,
        6.755509 , 10.103982 , 10.282673 ,  7.1032615, 24.054825 ,
       20.078764 , 13.572018 , 20.992254 , 17.014915 , 15.061404 ,
       15.78944  , 14.8552885, 13.580473 , 18.07453  , 15.9728365,
       11.947342 , 12.481521 , 11.169417 , 24.674229 , 12.35595  ,
       11.384965 , 12.040651 , 14.848779 , 14.873972 , 15.990213 ,
       10.475328 ,  7.0943527,  9.535613 , 11.329449 , 11.346666 ,
       15.253375 , 15.9728365, 15.773533 , 11.428838 , 14.971526 ,
       17.176994 , 14.935408 , 12.583196 , 12.46     , 14.863089 ,
       11.169417 , 12.620752 , 15.380381 , 23.564884 , 11.341556 ,
       14.0057955, 14.873972 , 13.831604 ,  8.710036 , 13.769907 ,
       15.575518 , 13.318052 , 18.971651 , 18.345165 , 12.981849 ,
       19.471153 , 10.924454 , 13.654911 ,  9.791859 , 14.753401 ,
       16.223549 , 16.223549 , 17.594921 , 19.016098 , 14.6429

In [20]:
y_test

329    22.6
371    50.0
219    23.0
403     8.3
78     21.2
15     19.9
487    20.6
340    18.7
310    16.1
102    18.6
418     8.8
411    17.2
446    14.9
386    10.5
162    50.0
299    29.0
480    23.0
196    33.3
175    29.4
37     21.0
320    23.8
171    19.1
107    20.4
278    29.1
45     19.3
367    23.1
21     19.6
153    19.4
97     38.7
113    18.7
       ... 
65     23.5
344    31.2
481    23.7
387     7.4
233    48.3
206    24.4
90     22.6
497    18.3
239    23.3
137    17.1
407    27.9
224    44.8
225    50.0
326    23.0
96     21.4
426    10.2
159    23.3
391    23.2
54     18.9
435    13.4
254    21.9
300    24.8
505    11.9
246    24.3
374    13.8
56     24.7
455    14.1
60     18.7
213    28.1
108    19.8
Name: PRICE, Length: 102, dtype: float64

## XGBoost Classifier

In [21]:
from sklearn.datasets import load_iris
iris = load_iris()

In [22]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
import numpy as np
Xtr, Xte, ytr, yte = train_test_split(iris.data, iris.target, random_state = 0, stratify = iris.target)

In [23]:
xgbClf = xgb.XGBClassifier(colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [24]:
xgbClf.fit(Xtr, ytr)

XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=10,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [26]:
pred = xgbClf.predict(Xte)
pred

array([0, 0, 0, 0, 1, 1, 1, 0, 1, 2, 2, 2, 1, 2, 1, 0, 0, 2, 0, 2, 2, 1,
       1, 0, 1, 0, 0, 1, 2, 1, 0, 2, 2, 1, 0, 2, 2, 2])

In [27]:
xgbClf.score(Xte, yte)

0.868421052631579