In [1]:
from sklearn import datasets, linear_model, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
import math, scipy, numpy as np
from scipy import linalg

In [2]:
data = datasets.load_diabetes()

In [3]:
feature_names=['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [4]:
trn,test,y_trn,y_test = train_test_split(data.data, data.target, test_size=0.2)

In [5]:
trn.shape, test.shape

((353, 10), (89, 10))

## Linear Regression in Scikit-Learn

In [6]:
regr = linear_model.LinearRegression()
%timeit regr.fit(trn, y_trn)

1.15 ms ± 314 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [7]:
pred = regr.predict(test)

In [8]:
def regr_metrics(act, pred):
    return (math.sqrt(metrics.mean_squared_error(act, pred)), 
     metrics.mean_absolute_error(act, pred))

In [9]:
regr_metrics(y_test, pred)

(51.71793371092478, 42.88440421452472)

## Polynomial Features

In [10]:
trn.shape

(353, 10)

In [11]:
poly = PolynomialFeatures(include_bias=False)

In [12]:
trn_feat = poly.fit_transform(trn)

In [13]:
', '.join(poly.get_feature_names(feature_names))

'age, sex, bmi, bp, s1, s2, s3, s4, s5, s6, age^2, age sex, age bmi, age bp, age s1, age s2, age s3, age s4, age s5, age s6, sex^2, sex bmi, sex bp, sex s1, sex s2, sex s3, sex s4, sex s5, sex s6, bmi^2, bmi bp, bmi s1, bmi s2, bmi s3, bmi s4, bmi s5, bmi s6, bp^2, bp s1, bp s2, bp s3, bp s4, bp s5, bp s6, s1^2, s1 s2, s1 s3, s1 s4, s1 s5, s1 s6, s2^2, s2 s3, s2 s4, s2 s5, s2 s6, s3^2, s3 s4, s3 s5, s3 s6, s4^2, s4 s5, s4 s6, s5^2, s5 s6, s6^2'

In [14]:
trn_feat.shape

(353, 65)

In [15]:
regr.fit(trn_feat, y_trn)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [16]:
poly_pred = regr.predict(poly.fit_transform(test))

In [17]:
regr_metrics(y_test, poly_pred)

(57.74273616571591, 46.21514728285226)

In [18]:
%timeit poly.fit_transform(trn)

606 µs ± 12.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## speeding up feature generation
#### here we use `Numba` python library;  a compiler.

In [19]:
%matplotlib inline

In [20]:
import math, numpy as np, matplotlib.pyplot as plt
from pandas_summary import DataFrameSummary
from scipy import ndimage

In [21]:
from numba import jit, vectorize, guvectorize, cuda, float32, void, float64

In [22]:
# Untype and Unvectorized
def proc_python(xx,yy):
    zz = np.zeros(nobs, dtype='float32')
    for j in range(nobs):   
        x, y = xx[j], yy[j] 
        x = x*2 - ( y * 55 )
        y = x + y*2         
        z = x + y + 99      
        z = z * ( z - .88 ) 
        zz[j] = z           
    return zz

In [23]:
nobs = 10000
x = np.random.randn(nobs).astype('float32')
y = np.random.randn(nobs).astype('float32')

In [24]:
%timeit proc_python(x,y)   # Untyped and unvectorized

227 ms ± 2.95 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Numpy

In [25]:
# Typed and Vectorized
def proc_numpy(x,y):
    z = np.zeros(nobs, dtype='float32')
    x = x*2 - ( y * 55 )
    y = x + y*2         
    z = x + y + 99      
    z = z * ( z - .88 ) 
    return z

In [26]:
np.allclose( proc_numpy(x,y), proc_python(x,y), atol=1e-4 )

True

In [27]:
%timeit proc_numpy(x,y)    # Typed and vectorized

91.2 µs ± 722 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


### Numba

In [28]:
@jit()
def proc_numba(xx,yy,zz):
    for j in range(nobs):   
        x, y = xx[j], yy[j] 
        x = x*2 - ( y * 55 )
        y = x + y*2         
        z = x + y + 99      
        z = z * ( z - .88 ) 
        zz[j] = z           
    return zz

In [29]:
z = np.zeros(nobs).astype('float32')
np.allclose( proc_numpy(x,y), proc_numba(x,y,z), atol=1e-4 )

True

In [30]:
%timeit proc_numba(x,y,z)

25.4 µs ± 133 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [31]:
@vectorize
def vec_numba(x,y):
    x = x*2 - ( y * 55 )
    y = x + y*2         
    z = x + y + 99      
    return z * ( z - .88 ) 

In [32]:
np.allclose(vec_numba(x,y), proc_numba(x,y,z), atol=1e-4 )

True

In [33]:
%timeit vec_numba(x,y)

25.1 µs ± 1.01 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


### Numba polynomial features

In [35]:
@jit(nopython=True)
def vec_poly(x, res):
    m,n=x.shape
    feat_idx=0
    for i in range(n):
        v1=x[:,i]
        for k in range(m): res[k,feat_idx] = v1[k]
        feat_idx+=1
        for j in range(i,n):
            for k in range(m): res[k,feat_idx] = v1[k]*x[k,j]
            feat_idx+=1

### Row-major vs column-major storage

In [36]:
trn = np.asfortranarray(trn)
test = np.asfortranarray(test)

In [37]:
m,n=trn.shape
n_feat = n*(n+1)//2 + n
trn_feat = np.zeros((m,n_feat), order='F')
test_feat = np.zeros((len(y_test), n_feat), order='F')

In [38]:
vec_poly(trn, trn_feat)
vec_poly(test, test_feat)

In [39]:
regr.fit(trn_feat, y_trn)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [40]:
regr_metrics(y_test, regr.predict(test_feat))

(57.74273616571612, 46.21514728285262)

In [41]:
%timeit vec_poly(trn, trn_feat)

18.5 µs ± 67.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [42]:
%timeit poly.fit_transform(trn)

615 µs ± 8.67 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [45]:
605/7.7

78.57142857142857

## Regularization

In [47]:
reg_regr = linear_model.LassoCV(n_alphas=10)

In [48]:
reg_regr.fit(trn_feat, y_trn)

  tol, rng, random, positive)


LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=10, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

In [49]:
reg_regr.alpha_

0.01043142423432635

In [50]:
regr_metrics(y_test, reg_regr.predict(test_feat))

(50.91772473747468, 41.99314468102378)

## Noise

In [53]:
idxs = np.random.randint(0, len(trn), 10)
idxs

array([245, 337, 281, 341, 114,  78, 119, 234, 208, 296])

In [54]:
y_trn2 = np.copy(y_trn)
y_trn2[idxs] *= 10  #label noise

In [55]:
regr = linear_model.LinearRegression()
regr.fit(trn, y_trn)
regr_metrics(y_test, regr.predict(test))

(51.71793371092478, 42.88440421452472)

In [56]:
regr.fit(trn, y_trn2)
regr_metrics(y_test, regr.predict(test))

(79.8415461700325, 62.918878583371644)

In [57]:
hregr = linear_model.HuberRegressor()
hregr.fit(trn, y_trn2)
regr_metrics(y_test, hregr.predict(test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


(52.119044069402214, 43.116421093204266)