##### Preprocessing data using different techniques

In [1]:
import numpy as np
from sklearn import preprocessing

In [2]:
data = np.array([[3, -1.5,  2, -5.4], [0,  4,  -0.3, 2.1], [1,  3.3, -1.9, -4.3]])

In [3]:
### Mean Removal - Removes the mean from each feature so that it is cented around zero so as to minimize skewness

data_standardized = preprocessing.scale(data)

In [4]:
data_standardized

array([[ 1.33630621, -1.40451644,  1.29110641, -0.86687558],
       [-1.06904497,  0.84543708, -0.14577008,  1.40111286],
       [-0.26726124,  0.55907936, -1.14533633, -0.53423728]])

In [5]:
data_standardized.mean(axis=0)

array([  5.55111512e-17,  -1.11022302e-16,  -7.40148683e-17,
        -7.40148683e-17])

In [6]:
data_standardized.std(axis=0)

array([ 1.,  1.,  1.,  1.])

In [8]:
#### Scaling

In [10]:
data_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))

In [11]:
data_scaler.fit_transform(data)

array([[ 1.        ,  0.        ,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.41025641,  1.        ],
       [ 0.33333333,  0.87272727,  0.        ,  0.14666667]])

In [12]:
### Normalization
data_normalized = preprocessing.normalize(data, norm='l1')
data_normalized

array([[ 0.25210084, -0.12605042,  0.16806723, -0.45378151],
       [ 0.        ,  0.625     , -0.046875  ,  0.328125  ],
       [ 0.0952381 ,  0.31428571, -0.18095238, -0.40952381]])

In [14]:
### BINARIZATION
data_binarized = preprocessing.Binarizer(threshold=1.4).transform(data)
data_binarized

array([[ 1.,  0.,  1.,  0.],
       [ 0.,  1.,  0.,  1.],
       [ 0.,  1.,  0.,  0.]])

In [25]:
## TO REVSIT
### ONE HOT ENCODING
encoder = preprocessing.OneHotEncoder()
encoder.fit([[0, 2, 1, 12], [1, 3, 5, 3], [2, 3, 2, 12], [1, 2, 4, 3]])
encoded_vector = encoder.transform([[2, 3, 5, 3]]).toarray()
encoded_vector

array([[ 0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.]])

In [27]:
 data = encoder.transform([[2, 3, 5, 3]])

In [28]:
#1,5,2,4 - > [0,1,0,0]

### Label Encoding

In [30]:
label_encoder = preprocessing.LabelEncoder()

In [31]:
input_classes = ['audi', 'ford', 'audi', 'toyota', 'ford', 'bmw']

In [32]:
label_encoder.fit(input_classes)

LabelEncoder()

In [33]:
label_encoder.classes_

array(['audi', 'bmw', 'ford', 'toyota'], 
      dtype='<U6')

In [35]:
for i, item in enumerate(label_encoder.classes_):
    print (item, '-->', i)

audi --> 0
bmw --> 1
ford --> 2
toyota --> 3


In [36]:
from sklearn import linear_model

In [37]:
# Create linear regression object
linear_regressor = linear_model.LinearRegression()

In [38]:
X_train = np.arange(1,100,3)

In [39]:
Y_train = X_train * 2.5 + 10

In [46]:
#Training
linear_regressor.fit(X_train[:,np.newaxis],Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [47]:
X_test = np.arange(1000,2000,5)

In [53]:
Y_pred = linear_regressor.predict(X_test[:,np.newaxis])
Y_test_pred = X_test * 2.5 + 10

In [49]:
linear_regressor.coef_

array([ 2.5])

In [50]:
linear_regressor.intercept_

9.9999999999999574

In [51]:
import matplotlib.pyplot as plt
%matplotlib inline

* Mean absolute error: This is the average of absolute errors of all the datapoints in the given dataset.
* Mean squared error: This is the average of the squares of the errors of all the datapoints in the given dataset. It is one of the most popular metrics out there!
* Median absolute error: This is the median of all the errors in the given dataset. The main advantage of this metric is that it's robust to outliers. A single bad point in the test dataset wouldn't skew the entire error metric, as opposed to a mean error metric.
* Explained variance score: This score measures how well our model can account for the variation in our dataset. A score of 1.0 indicates that our model is perfect.
* R2 score: This is pronounced as R-squared, and this score refers to the coefficient of determination. This tells us how well the unknown samples will be predicted by our model. The best possible score is 1.0, and the values can be negative as well.

In [52]:
import sklearn.metrics as sm

In [55]:
sm.mean_absolute_error(Y_pred, Y_test_pred)

1.3960743672214448e-12

In [56]:
sm.explained_variance_score(Y_pred, Y_test_pred)

1.0

### Model Persistence

In [58]:
#import cPickle as pickle
output_model = 'save_model.pkl'

with open(output_model, 'w') as f:
    pickle.dump(linear_regressor,f)

In [59]:
#Fetching the model & doing prediction
with open(output_model, 'r') as f:
    model_linregr = pickle.load(f)
    
y_test_pred_new = model_linregr.predict(X_test)

NameError: name 'output_model' is not defined

<img src="https://www.safaribooksonline.com/library/view/python-real-world/9781787123212/graphics/B05485_01_04.jpg">

In [61]:
ridge_regressor = linear_model.Ridge(alpha=0.01, fit_intercept=True,max_iter=10000)
#alpha - controls complexity. alpha gets closer to 0, ridge regressor becomes closer to linear regressor

In [64]:
#ridge_regressor.fit(X_train,Y_train)
#y_pred = ridge_regressor.predict(X_test)

In [65]:
### Polynomial regressor
from sklearn.preprocessing import PolynomialFeatures

polynomial = PolynomialFeatures(degree=3)

In [66]:
X_train

array([ 1,  4,  7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43, 46, 49,
       52, 55, 58, 61, 64, 67, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97])

In [68]:
X_train_transformed = polynomial.fit_transform(X_train[:,np.newaxis])

In [69]:
X_train_transformed

array([[  1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
          1.00000000e+00],
       [  1.00000000e+00,   4.00000000e+00,   1.60000000e+01,
          6.40000000e+01],
       [  1.00000000e+00,   7.00000000e+00,   4.90000000e+01,
          3.43000000e+02],
       [  1.00000000e+00,   1.00000000e+01,   1.00000000e+02,
          1.00000000e+03],
       [  1.00000000e+00,   1.30000000e+01,   1.69000000e+02,
          2.19700000e+03],
       [  1.00000000e+00,   1.60000000e+01,   2.56000000e+02,
          4.09600000e+03],
       [  1.00000000e+00,   1.90000000e+01,   3.61000000e+02,
          6.85900000e+03],
       [  1.00000000e+00,   2.20000000e+01,   4.84000000e+02,
          1.06480000e+04],
       [  1.00000000e+00,   2.50000000e+01,   6.25000000e+02,
          1.56250000e+04],
       [  1.00000000e+00,   2.80000000e+01,   7.84000000e+02,
          2.19520000e+04],
       [  1.00000000e+00,   3.10000000e+01,   9.61000000e+02,
          2.97910000e+04],
       [  1.00000000e

In [70]:
datapoint = [0.39,2.78,7.11]

In [83]:
poly_datapoint = polynomial.fit_transform(np.array(datapoint)[:,np.newaxis])

In [76]:
poly_linear_model = linear_model.LinearRegression()

In [78]:
poly_linear_model.fit(X_train_transformed, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [82]:
np.array(datapoint)[:,np.newaxis]

array([[ 0.39],
       [ 2.78],
       [ 7.11]])

In [84]:
linear_regressor.predict(np.array(datapoint)[:,np.newaxis])

array([ 10.975,  16.95 ,  27.775])

In [87]:
poly_linear_model.predict(poly_datapoint)

array([ 10.975,  16.95 ,  27.775])