# Predicting bike usage: Template for part 3 activity
This is the template for the part 3 activity.  Please read the activity overview before proceeding!
Use this template to build a model, then answer the questions in the quiz.

## Import the libraries we need

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Sklearn processing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

# Perform feature selection using a variance threshold
from sklearn.feature_selection import VarianceThreshold

# Feature selection using Recursive Feature Elimimation
from sklearn.feature_selection import RFE

# Sklearn regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

# Sklearn regression model evaluation functions
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

Define some useful functions 

In [12]:
def linearRegressionSummary(model, column_names):
    '''Show a summary of the trained linear regression model'''

    # Plot the coeffients as bars
    fig = plt.figure(figsize=(8,len(column_names)/3))
    fig.suptitle('Linear Regression Coefficients', fontsize=16)
    rects = plt.barh(column_names, model.coef_,color="lightblue")

    # Annotate the bars with the coefficient values
    for rect in rects:
        width = round(rect.get_width(),4)
        plt.gca().annotate('  {}  '.format(width),
                    xy=(0, rect.get_y()),
                    xytext=(0,2),  
                    textcoords="offset points",  
                    ha='left' if width<0 else 'right', va='bottom')        
    plt.show()
    
def score(model, X, y):
    """Get the model prediction scores using the provided input and target features"""
    
    predictions = model.predict(X)
    print("    R2", r2_score(y, predictions))     
    
def correlatedFeatures(dataset, threshold):
    """Function to list features that are correlated
       Adds the first of the correlated pair only (not both)"""
    
    correlated_columns = set()
    correlations = dataset.corr()
    for i in range(len(correlations)):
        for j in range(i):
            if abs(correlations.iloc[i,j]) > threshold:
                correlated_columns.add(correlations.columns[i])
    return correlated_columns    


def kFoldCV(X, y, silent=False):
    """Evaluate linear regression with k-fold cross validation"""
    
    model = LinearRegression()
    
    # Create folds
    seed = 2
    kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
    
    # Perform kfold cross validation
    results = cross_val_score(model, X, y, cv=kfold, scoring='r2')
    
    # Show results
    if not silent:
        print(type(model).__name__)
        print("kFoldCV:")
        print("    Fold R2 scores:", results)
        print("    Mean R2 score:", results.mean())
        print("    Std R2 score:", results.std())
    
    # Build a model using all the data and return it
    model.fit(X, y)

    return model



## Load and understand the data


In [13]:
# Load data and inspect the first few rows
dataset = pd.read_csv("bike.csv")
dataset.head()

Unnamed: 0,id,season,year,holiday,weekday,workingday,weather,temp,feel_temp,hum,windspeed,promotion_level,promotion_type,promotion_level_external,promotion_type_external,casual,registered,cnt
0,12765,winter,2016,0,6,0,cloud,0.344167,0.363625,0.805833,0.160446,7,1,2,2,226,654,880
1,12766,winter,2016,0,0,0,cloud,0.363478,0.353739,0.696087,0.248539,8,1,8,1,125,670,795
2,12767,winter,2016,0,1,1,fair,0.196364,0.189405,0.437273,0.248309,3,1,10,2,75,1229,1304
3,12768,winter,2016,0,2,1,fair,0.2,0.212122,0.590435,0.160296,0,1,8,3,67,1454,1521
4,12769,winter,2016,0,3,1,fair,0.226957,0.22927,0.436957,0.1869,2,0,5,1,58,1518,1576


In [14]:
# Confirm the data types
dataset.dtypes

id                            int64
season                       object
year                          int64
holiday                       int64
weekday                       int64
workingday                    int64
weather                      object
temp                        float64
feel_temp                   float64
hum                         float64
windspeed                   float64
promotion_level               int64
promotion_type                int64
promotion_level_external      int64
promotion_type_external       int64
casual                        int64
registered                    int64
cnt                           int64
dtype: object

In [15]:
# Check for nulls
dataset.isnull().mean()

id                          0.0
season                      0.0
year                        0.0
holiday                     0.0
weekday                     0.0
workingday                  0.0
weather                     0.0
temp                        0.0
feel_temp                   0.0
hum                         0.0
windspeed                   0.0
promotion_level             0.0
promotion_type              0.0
promotion_level_external    0.0
promotion_type_external     0.0
casual                      0.0
registered                  0.0
cnt                         0.0
dtype: float64

In [16]:
# Plot a scatter matrix
# TODO

## Thoughts?
TODO: Write your thoughts about the data here

## Feature engineering

In [17]:
# TODO: Initial feature engineering

In [31]:
# One-hot-encode the categoricals
categoricals = ['season','weekday','weather']
dataset = pd.get_dummies(dataset, columns=categoricals) #, drop_first=True)
dataset.head()

KeyError: "None of [Index(['season', 'weekday', 'weather'], dtype='object')] are in the [columns]"

## Split into X and y

In [21]:
# Our target feature
y = dataset.casual

# Our input features
X = dataset.drop(['casual'], axis=1)

## Rescale

In [22]:
# Rescale the input features
scaler = MinMaxScaler(feature_range=(0,1))
X_ = scaler.fit_transform(X)
X = pd.DataFrame(X_, columns=X.columns)

  return self.partial_fit(X, y)


## Simple linear regression
Build a linear regression model as a baseline, so we can see the effect of any improvements we make later.

In [23]:
# Split into train (2/3) and test (1/3) sets
test_size = 0.33
seed = 7
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)


In [24]:
# Build and evaluate a linear regression model 
# TODO

## Resampling with k-fold cross-validation

Evaluate linear regression with 10-fold cross validation

In [25]:
# Evaluate linear regression with k-fold cross validation
# TODO

## Feature selection

### Remove low variance features
Identify some low variance features, those with variance less than 0.1 and remove them.

In [26]:
# Remove low variance features
# TODO

### Correlated features
Remove highly correlated features

In [27]:
# Remove highly correlated features
# TODO

## Apply RFE

Systematically determine number of features to keep

In [28]:
# Build models using from 1 to n-1 features using RFE

for i in range(1,X_train.shape[1]-1):
    print("\nRFE ", i, end="\t")
    
    # Create a model
    model = LinearRegression()

    # Select the best features according to RFE
    # TODO

    # Transform (remove features not selected)
    #TODO

    # Evaluate using k-fold cross-validation
    #TODO


RFE  1	
RFE  2	
RFE  3	
RFE  4	
RFE  5	
RFE  6	
RFE  7	
RFE  8	
RFE  9	
RFE  10	
RFE  11	
RFE  12	
RFE  13	
RFE  14	
RFE  15	
RFE  16	
RFE  17	
RFE  18	
RFE  19	
RFE  20	
RFE  21	
RFE  22	
RFE  23	
RFE  24	
RFE  25	
RFE  26	

In [None]:
# Based on the above, remove the suggested number of features

## Evaluate a model with the features removed
Now that we have removed a number of features using low-variance removal, highly correlated feature removal and RFE, build and evaluate a linear regression model with this reduced feature set.

In [None]:
# Build and evaluate a linear regression model with the reduced feature set
## TODO

## Regularization


Build a model using all the original features.  Let's see if regularisation can do some automatic feature selection!

In [30]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV


# Split into train (2/3) and test (1/3) sets
test_size = 0.33
seed = 7
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

# Create 5 folds
seed = 13
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

# Create model
# TODO

# Fit model
# TODO

# Evaluate
# TODO

# Print alpha
# TODO