In [4]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold

---
# Polynomial Features in Python
---

http://scikit-learn.org/dev/modules/preprocessing.html#preprocessing

---
## The `PolynomialFeatures` transformer creates the cross terms and the power terms for the features
---

In [49]:
# A single sample with four features: [[0 1 2 3]].
X = np.arange(4).reshape(1, 4)
print(X)

# TODO: see what happens when you start increasing the degree of the polynomial
# With degree 2 the sample is expanded into the bias term, the original
# features, and every pairwise product (squares and cross terms).
poly = PolynomialFeatures(degree=2)

X_poly = poly.fit_transform(X)
print(X_poly)

<type 'numpy.ndarray'>
[[0 1 2 3]]
[[1 0 1 2 3 0 0 0 0 1 2 3 4 6 9]]


In [6]:
# Number of columns produced by the polynomial expansion of the single sample.
print(len(X_poly[0]))

15


In [7]:
# Just so we can see how a model is fitted using make_pipeline.
# This is a meaningless target value: one entry for the one sample in X.
y = np.array([0.43])
print(y)

[ 0.43]


---
# Fitting an ordinary linear regression model
---

In [8]:
# Ordinary linear regression fitted directly on the raw (unexpanded) features.
clf = LinearRegression()
clf.fit(X=X, y=y)

LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

---
## Using the pipeline
---

http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline

In [9]:
# Chain the degree-2 feature expansion and the linear model so that a single
# fit() call runs both steps in order.
clf = make_pipeline(
    PolynomialFeatures(degree=2),
    LinearRegression(),
)
clf.fit(X=X, y=y)

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, normalize=False))])

In [10]:
# Show the pipeline's steps and their parameters.
print(clf)

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, normalize=False))])


---
## Features of $X$ have been transformed from $(X_{1}, X_{2})$ to $(1, X_{1}, X_{2}, X_{1}^{2}, X_{1}X_{2}, X_{2}^{2})$
---

---
# Train Test Split
---


http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.train_test_split.html#sklearn.cross_validation.train_test_split

---
## Let m = 10 (number of samples)
## N = 2 (number of features)
---

In [50]:
# Completely hypothetical training data and target data - to illustrate how
# the data gets split: 10 samples with 2 features each, and targets 0..9.
X = np.arange(20).reshape((10, 2))
y = np.arange(10)

print(X)
print(y)

[[ 0  1]
 [ 2  3]
 [ 4  5]
 [ 6  7]
 [ 8  9]
 [10 11]
 [12 13]
 [14 15]
 [16 17]
 [18 19]]
[0 1 2 3 4 5 6 7 8 9]


---
## Take 20% for a test set
---

In [51]:
# Hold out 20% of the samples as the test set; the fixed random_state keeps
# the split reproducible between runs.
dataset, X_test, dataset_y, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [52]:
# Remaining (non-test) samples and their shape.
print(dataset)
print(dataset.shape)
# Blank line, then the held-out test samples and their shape.
print("\n" + str(X_test))
print(X_test.shape)

[[10 11]
 [ 0  1]
 [14 15]
 [ 4  5]
 [18 19]
 [ 8  9]
 [ 6  7]
 [12 13]]
(8, 2)

[[16 17]
 [ 2  3]]
(2, 2)


In [53]:
# Targets matching the rows of `dataset` and `X_test` respectively.
print(dataset_y)
print(y_test)

[5 0 7 2 9 4 3 6]
[8 1]


---
## Of the remaining 80% of the data, split this into a training set proper and a validation set
## Overall the size of the validation set should also be 20% (i.e. 25% of the remaining 80%)
---

In [15]:
# 25% of the remaining 80% is 20% of the full data set, so the validation
# set ends up the same size as the test set.
X_train, X_val, y_train, y_val = train_test_split(
    dataset, dataset_y, test_size=0.25, random_state=21
)

In [16]:
# Training features, a blank line, then validation features. The literal
# " \n\n" reproduces the separator the Python 2 print statement emitted.
print(str(X_train) + " \n\n" + str(X_val))

[[ 4  5]
 [ 8  9]
 [ 6  7]
 [18 19]
 [10 11]
 [ 0  1]] 

[[14 15]
 [12 13]]


In [17]:
# Training targets, a blank line, then validation targets. The literal
# " \n\n" reproduces the separator the Python 2 print statement emitted.
print(str(y_train) + " \n\n" + str(y_val))

[2, 4, 3, 9, 5, 0] 

[7, 6]


---
# S-fold Cross Validation (K-fold)
---

http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.KFold.html

In [43]:
# Rebuild the same hypothetical data for the K-fold demonstration:
# 10 samples with 2 features each, and targets 0..9.
X = np.arange(20).reshape((10, 2))
y = np.arange(10)

print(X)
print(y)

[[ 0  1]
 [ 2  3]
 [ 4  5]
 [ 6  7]
 [ 8  9]
 [10 11]
 [12 13]
 [14 15]
 [16 17]
 [18 19]]
[0 1 2 3 4 5 6 7 8 9]


## Try varying the number of folds

In [44]:
# Split the indices of the 10 samples into 2 folds.
kf = KFold(n=10, n_folds=2)

In [45]:
# Show the K-fold configuration.
print(kf)

sklearn.cross_validation.KFold(n=10, n_folds=2, shuffle=False, random_state=None)


In [48]:
# Each iteration yields the row indices of one train/test partition.
for train_index, test_index in kf:
    # Format into one string: under Python 2, print("a", b, ...) prints the
    # repr of a tuple rather than the intended message.
    print("Train Index is: %s Test Index is: %s" % (train_index, test_index))
    print("-----------------------------")
    # Select the rows for this fold by index.
    X_train, X_test = X[train_index], X[test_index]
    print(str(X_train) + " \n" + str(X_test) + " \n")
    y_train, y_test = y[train_index], y[test_index]
    print(str(y_train) + " \n" + str(y_test) + " \n")

('Train Index is:', array([5, 6, 7, 8, 9]), 'Test Index is:', array([0, 1, 2, 3, 4]))
-----------------------------
[[10 11]
 [12 13]
 [14 15]
 [16 17]
 [18 19]] 
[[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]] 

[5 6 7 8 9] 
[0 1 2 3 4] 

('Train Index is:', array([0, 1, 2, 3, 4]), 'Test Index is:', array([5, 6, 7, 8, 9]))
-----------------------------
[[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]] 
[[10 11]
 [12 13]
 [14 15]
 [16 17]
 [18 19]] 

[0 1 2 3 4] 
[5 6 7 8 9] 

