In [1]:
import numpy as np
import pandas as pd

# Preparing the Boston Housing data
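Starting from the original raw data file, remove the descriptive header and convert the table to CSV, or download a ready-made CSV from [here](https://github.com/selva86/datasets/blob/master/BostonHousing.csv). Either way, save the result as `boston.csv` next to this notebook, since that is the path the loading cell below reads.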

## Description
 The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
 prices and the demand for clean air', J. Environ. Economics & Management,
 vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
 ...', Wiley, 1980.   N.B. Various transformations are used in the table on
 pages 244-261 of the latter.


## Features
Variables in order:
- CRIM:     per capita crime rate by town
- ZN:       proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS:    proportion of non-retail business acres per town
- CHAS:     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX:      nitric oxides concentration (parts per 10 million)
- RM:       average number of rooms per dwelling
- AGE:      proportion of owner-occupied units built prior to 1940
- DIS:      weighted distances to five Boston employment centres
- RAD:      index of accessibility to radial highways
- TAX:      full-value property-tax rate per \$10,000
- PTRATIO:  pupil-teacher ratio by town
- B:        $1000(Bk - 0.63)^2$ where Bk is the proportion of blacks by town
- LSTAT:    % lower status of the population
- MEDV:     median value of owner-occupied homes in \$1000's

In [4]:
# Open and read the Boston housing data from a CSV file located next to the notebook.
data = pd.read_csv("./boston.csv")
# Preview the first five rows to sanity-check that the file parsed as expected.
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [15]:
# pandas: useful for manipulating data with named columns
# `.columns` holds the column labels; these names are used for selection in the cells below.
data.columns

Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'b', 'lstat', 'medv'],
      dtype='object')

In [36]:
# Select a particular column by name.
display(data["indus"].head())  # a single label returns a Series object
display(data[["indus"]].head())# a list of labels returns a DataFrame object, even for one column

# A list with several labels selects multiple columns (again a DataFrame).
data[["indus", "chas"]].head()

0    2.31
1    7.07
2    7.07
3    2.18
4    2.18
Name: indus, dtype: float64

Unnamed: 0,indus
0,2.31
1,7.07
2,7.07
3,2.18
4,2.18


Unnamed: 0,indus,chas
0,2.31,0
1,7.07,0
2,7.07,0
3,2.18,0
4,2.18,0


In [27]:
# Selection by integer position.
# A plain slice inside [] selects ROWS by position (rows 5 through 9 here).
display(data[5:10])
# Plain [] does not accept a (rows, columns) pair, so this doesn't work: data[5:10, 2:5]

# .iloc takes positional (row, column) slices: rows 5-9 and columns 2-4.
data.iloc[5:10, 2:5]

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9


Unnamed: 0,indus,chas,nox
5,2.18,0,0.458
6,7.87,0,0.524
7,7.87,0,0.524
8,7.87,0,0.524
9,7.87,0,0.524


In [28]:
# Selection by label; note that this uses the index LABELS (not integer position).
# The index here is a RangeIndex starting at 0, so labels happen to coincide with positions.
display(data.index)
# Unlike a positional slice, a .loc label slice includes its end point: rows 40 through 43.
data.loc[40:43, ["crim", "zn"]]

RangeIndex(start=0, stop=506, step=1)

Unnamed: 0,crim,zn
40,0.03359,75.0
41,0.12744,0.0
42,0.1415,0.0
43,0.15936,0.0


In [37]:
# Indexing via boolean masks.
# Comparing the column labels against a name yields one boolean per column:
# True for every feature column, False for the target column "medv".
# (The frame is called `data`; the previously used name `df` is never defined
# in this notebook and fails on a fresh kernel.)
display(data.columns != "medv")

# Select all feature columns: keep every row, and only the columns where the mask is True.
features = data.loc[:, data.columns != "medv"]
display(features.head())

# Housing values; the double brackets keep the result a one-column DataFrame.
prices = data[["medv"]]
display(prices.head())

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False])

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33


Unnamed: 0,medv
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [40]:
# Converting dataframes/series to numpy arrays: `.values` exposes the
# underlying data of a DataFrame as a plain ndarray.
x, y = features.values, prices.values
# Show the feature matrix first, then the target column.
display(x, y)

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

array([[24. ],
       [21.6],
       [34.7],
       [33.4],
       [36.2],
       [28.7],
       [22.9],
       [27.1],
       [16.5],
       [18.9],
       [15. ],
       [18.9],
       [21.7],
       [20.4],
       [18.2],
       [19.9],
       [23.1],
       [17.5],
       [20.2],
       [18.2],
       [13.6],
       [19.6],
       [15.2],
       [14.5],
       [15.6],
       [13.9],
       [16.6],
       [14.8],
       [18.4],
       [21. ],
       [12.7],
       [14.5],
       [13.2],
       [13.1],
       [13.5],
       [18.9],
       [20. ],
       [21. ],
       [24.7],
       [30.8],
       [34.9],
       [26.6],
       [25.3],
       [24.7],
       [21.2],
       [19.3],
       [20. ],
       [16.6],
       [14.4],
       [19.4],
       [19.7],
       [20.5],
       [25. ],
       [23.4],
       [18.9],
       [35.4],
       [24.7],
       [31.6],
       [23.3],
       [19.6],
       [18.7],
       [16. ],
       [22.2],
       [25. ],
       [33. ],
       [23.5],
       [19

# numpy indexing

In [None]:
# simple indexing

In [42]:
# example: creating 1-hot arrays

# Say you have an array of integers, representing what category a sample belongs to.
# randint's upper bound is exclusive, so this draws 10 labels from 0..5.
# No random seed is set, so the labels differ from run to run.
labels = np.random.randint(0, 6, size=(10,))
display(labels)

array([1, 3, 4, 5, 5, 2, 2, 4, 2, 2])

In [46]:
# you want to convert this to a matrix with shape (num_samples, num_categories)
#     where the values are all 0 except for a 1 indicating which category a 
#     sample belongs to.

def onehot(indices, num_categories=None):
    """
    Convert category indices into a one-hot encoded matrix.

    Parameters
    ----------
    indices : array-like of int, shape (num_samples,)
        Category index of each sample. Plain lists and tuples are accepted
        as well as numpy arrays.
    num_categories : int, optional
        Number of columns in the result. If not specified, it is inferred
        as ``max(indices) + 1`` (or 0 when `indices` is empty).

    Returns
    -------
    numpy.ndarray of float, shape (num_samples, num_categories)
        All zeros except for a single 1. in each row, located in the
        column given by the corresponding entry of `indices`.

    Raises
    ------
    ValueError
        If `indices` contains a negative value (numpy would otherwise
        silently wrap it around and count from the last column).
    IndexError
        If an index is not smaller than `num_categories`.
    """
    # np.asarray lets plain Python sequences through; `.size` below is an
    # ndarray attribute and would fail on a list.
    indices = np.asarray(indices)

    if indices.size and indices.min() < 0:
        raise ValueError("indices must be non-negative")

    if num_categories is None:
        # An empty input has no maximum; fall back to zero categories.
        num_categories = int(indices.max()) + 1 if indices.size else 0

    a = np.zeros((indices.size, num_categories))
    if indices.size:
        # Pair row i with column indices[i] and set exactly those entries.
        # Skipped for empty input, whose default float dtype is not a valid index.
        a[np.arange(indices.size), indices] = 1.

    return a

# Encode the random labels drawn earlier; num_categories is left unset here.
display(onehot(labels))

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [48]:
# one difference between indexing with arrays and indexing with ranges
a = np.eye(8)
display(a)

array([[1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]])

In [53]:
# Two slices select the whole 3x3 block spanning rows 0-2 and columns 0-2.
display(a[0:3, 0:3]) 
# Two index arrays are paired element-wise: this picks (0,0), (1,1), (2,2) and gives a 1-D result.
display(a[[0, 1, 2], [0, 1, 2]]) 
# The same difference shows up with assignments (both modify `a` in place):
# the slice assignment fills the whole 3x3 block ...
a[0:3, 0:3] = 5.
# ... while the index-array assignment then overwrites only its three diagonal entries.
a[[0,1,2], [0,1,2]] = 10.
display(a)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

array([1., 1., 1.])

array([[10.,  5.,  5.,  0.,  0.,  0.,  0.,  0.],
       [ 5., 10.,  5.,  0.,  0.,  0.,  0.,  0.],
       [ 5.,  5., 10.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])

In [61]:
# pulling random mini-batches from an array of samples
a = np.eye(10)

# 1. shuffle rows in-place; modifies original array
np.random.shuffle(a)
display(a[0:4, :])

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]])

In [62]:
# Start again from a fresh, unshuffled identity matrix.
a = np.eye(10)

# 2. index via a permutation; doesn't modify original
# `perm` is a random ordering of the row numbers 0..9.
perm = np.random.permutation(10)
display(perm)
# Mini-batch: the rows named by the first 4 entries of the permutation.
display(a[perm[0:4], :])
# The original array is still in its initial order.
display(a)

array([9, 3, 4, 2, 0, 8, 1, 7, 6, 5])

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]])

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])