In [7]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

try:
    # scikit-learn >= 0.18 ships train_test_split in model_selection
    from sklearn.model_selection import train_test_split
except ImportError:
    # older releases only provide the cross_validation module
    from sklearn.cross_validation import train_test_split

---
# Polynomial Features in Python
---

---
## PolynomialFeatures creates the power terms and the cross (interaction) terms of the input features
---

In [34]:
# A single sample with four features: [[0 1 2 3]].
X = np.arange(4).reshape(1, 4)
print(X)

# TODO: see what happens when you start increasing the degree of the polynomial
poly = PolynomialFeatures(degree=2)

# Expand X into the bias column, the original features, and every degree-2
# product of two features (squares and cross terms).
X_poly = poly.fit_transform(X)
print(X_poly)

[[0 1 2 3]]
[[1 0 1 2 3 0 0 0 0 1 2 3 4 6 9]]


In [35]:
# Number of columns produced by the polynomial expansion of the single sample.
print(X_poly.shape[1])

15


In [36]:
# Placeholder target so the estimators below have something to fit;
# the value itself is meaningless.
y = np.array([0.43])
print(y)

[ 0.43]


---
# Fitting an ordinary linear regression model
---

In [22]:
# Fit an ordinary linear regression directly on the raw features of X
# (no polynomial expansion) against the placeholder target y.
clf = LinearRegression()
# Kept as the last expression so the notebook displays the fitted estimator.
clf.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

---
## Fitting a pipeline built with make_pipeline
---

In [38]:
# Chain a degree-2 polynomial expansion and a linear regression into a single
# estimator; the steps run in the order they are passed to make_pipeline.
poly_step = PolynomialFeatures(2)
regression_step = LinearRegression()
clf = make_pipeline(poly_step, regression_step)
# Kept as the last expression so the notebook displays the fitted pipeline.
clf.fit(X, y)

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, normalize=False))])

In [39]:
# Show the pipeline's steps and their parameters.
print(clf)

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, normalize=False))])


---
## With two input features, a degree-2 expansion maps $(X_{1}, X_{2})$ to $(1, X_{1}, X_{2}, X_{1}^{2}, X_{1}X_{2}, X_{2}^{2})$; the four-feature $X$ above expands to 15 terms in the same way
---

---
# Train Test Split
---

---
## Let m = 10 (number of samples)
## N = 2 (number of features)
---

In [27]:
# Completely hypothetical training data and target data, used only to
# illustrate how the data gets split: 10 samples with 2 features each.
X = np.arange(20).reshape((10, 2))
# Materialise the targets as a real list so they print as [0, 1, ..., 9]
# on Python 3 as well (a bare range() object would print as "range(0, 10)").
y = list(range(10))

print(X)
print(y)

[[ 0  1]
 [ 2  3]
 [ 4  5]
 [ 6  7]
 [ 8  9]
 [10 11]
 [12 13]
 [14 15]
 [16 17]
 [18 19]]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


---
## Hold out 20% of the data as a test set
---

In [20]:
# Hold out 20% of the samples as the test set; `dataset` / `dataset_y` keep
# the remaining 80%. The fixed random_state makes the shuffle repeatable.
dataset, X_test, dataset_y, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Remaining (non-test) samples and their shape.
print(dataset)
print(dataset.shape)
# Blank line first, then the held-out test samples and their shape.
# A single formatted string keeps the printed text the same on Python 2 and 3.
print("\n%s" % (X_test,))
print(X_test.shape)

[[10 11]
 [ 0  1]
 [14 15]
 [ 4  5]
 [18 19]
 [ 8  9]
 [ 6  7]
 [12 13]]
(8, 2)

[[16 17]
 [ 2  3]]
(2, 2)


In [17]:
# Targets matching the remaining samples and the held-out test samples.
print(dataset_y)
print(y_test)

[5, 0, 7, 2, 9, 4, 3, 6]
[8, 1]


---
## Split the remaining 80% of the data into a training set proper and a validation set
## The validation set should also be 20% of the full data set, i.e. 25% of the remaining 80%
---

In [22]:
# Carve the validation set out of the non-test data. test_size is 0.25 here
# because 25% of the remaining 80% equals 20% of the full data set.
X_train, X_val, y_train, y_val = train_test_split(
    dataset,
    dataset_y,
    test_size=0.25,
    random_state=21,
)

In [24]:
# Training features, a blank line, then validation features. A single
# formatted string keeps the printed text the same on Python 2 and 3.
print("%s \n\n%s" % (X_train, X_val))

[[ 4  5]
 [ 8  9]
 [ 6  7]
 [18 19]
 [10 11]
 [ 0  1]] 

[[14 15]
 [12 13]]


In [25]:
# Training targets, a blank line, then validation targets. A single
# formatted string keeps the printed text the same on Python 2 and 3.
print("%s \n\n%s" % (y_train, y_val))

[2, 4, 3, 9, 5, 0] 

[7, 6]
