In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import train_test_split
from itertools import combinations
from sklearn.linear_model import Lasso

---
# 4. Building a Predictive Model
---

In [2]:
# NOTE(review): absolute, machine-specific directory -- point this at your own
# copy of the monthly CSV files before running the notebook.
location = "/Users/mrgholt/GADS-22-NYC/Citibike_Data/"

# One CSV per calendar month.
dfJanComplete = pd.read_csv(location + "dfJanComplete.csv")
dfFebComplete = pd.read_csv(location + "dfFebComplete.csv")
dfMarComplete = pd.read_csv(location + "dfMarComplete.csv")
dfAprComplete = pd.read_csv(location + "dfAprComplete.csv")
dfMayComplete = pd.read_csv(location + "dfMayComplete.csv")
dfJunComplete = pd.read_csv(location + "dfJunComplete.csv")
dfJulComplete = pd.read_csv(location + "dfJulComplete.csv")
dfAugComplete = pd.read_csv(location + "dfAugComplete.csv")
dfSepComplete = pd.read_csv(location + "dfSepComplete.csv")
dfOctComplete = pd.read_csv(location + "dfOctComplete.csv")
dfNovComplete = pd.read_csv(location + "dfNovComplete.csv")
dfDecComplete = pd.read_csv(location + "dfDecComplete.csv")

# Keep only trips with a strictly positive distance.  The boolean filter
# returns a slice of the loaded frame; .copy() makes each result an independent
# frame so the column assignments performed by later cells do not raise
# SettingWithCopyWarning or write into a temporary.
# NOTE(review): only Feb, Mar, May, Jun and Sep are filtered here -- confirm
# that the remaining months contain no zero-distance trips.
dfFebComplete = dfFebComplete[dfFebComplete.distance > 0.0].copy()
dfMarComplete = dfMarComplete[dfMarComplete.distance > 0.0].copy()
dfMayComplete = dfMayComplete[dfMayComplete.distance > 0.0].copy()
dfJunComplete = dfJunComplete[dfJunComplete.distance > 0.0].copy()
dfSepComplete = dfSepComplete[dfSepComplete.distance > 0.0].copy()

In [3]:
def calculate_speed(df, num):
    """Return the speeds of the first ``num`` rows of ``df`` as a list.

    Each speed is ``distance / (tripduration / 3600.0)``, i.e. distance per
    hour when ``tripduration`` is expressed in seconds (presumably -- confirm
    against the data source).

    Parameters
    ----------
    df : DataFrame with 'tripduration' and 'distance' columns.
    num : number of leading rows to process.

    Returns
    -------
    list of float, one entry per processed row.  NaN inputs yield NaN.

    The computation is done on whole columns instead of looking rows up one at
    a time with ``iloc``, and no longer relies on the Python-2-only ``xrange``.
    If ``num`` exceeds ``len(df)`` the result simply covers every row.
    """
    # A non-positive count means "no rows"; guard so a negative value is not
    # interpreted as a from-the-end slice.
    if num <= 0:
        return []
    hours = df['tripduration'].values[:num] / 3600.0
    speeds = df['distance'].values[:num] / hours
    return speeds.tolist()

def calc_all_speeds(df):
    """Attach a 'speed' column covering every row of ``df`` and return ``df``.

    The frame is modified in place; the return value is the same object.
    """
    row_count = len(df)
    df['speed'] = calculate_speed(df, row_count)
    return df

def get_outlier_thresholds(x):
    """Return ``(low, high)`` cut-offs for ``x`` based on its interquartile range.

    ``low`` is the 25th percentile minus 1.5 times the interquartile range and
    ``high`` is the 75th percentile plus 1.5 times the interquartile range.
    ``x`` must provide a ``quantile`` method.
    """
    q1, q3 = x.quantile(0.25), x.quantile(0.75)
    spread = q3 - q1
    return (q1 - 1.5 * spread, q3 + 1.5 * spread)

def remove_outliers(x, low, high):
    """Return NaN when ``x`` is above ``high`` or below ``low``; otherwise ``x``.

    Values equal to either bound are kept.
    """
    is_outlier = x > high or x < low
    return np.nan if is_outlier else x
    
def handle_outliers(df, column_name):
    """Return a copy of ``df[column_name]`` with outliers replaced by NaN.

    The cut-offs come from ``get_outlier_thresholds`` applied to the column.
    Values above the high cut-off or below the low cut-off become NaN; values
    on a cut-off, and values that are already NaN, are left as they are.
    ``df`` itself is not modified.

    The replacement is done with one vectorised ``Series.where`` call rather
    than calling a Python function once per element.
    """
    column = df[column_name]
    low_threshold, high_threshold = get_outlier_thresholds(column)
    # Comparisons against NaN are False, so NaN entries count as "not outliers"
    # and pass through unchanged, exactly as an element-wise check would do.
    is_outlier = (column > high_threshold) | (column < low_threshold)
    return column.where(~is_outlier)

In [4]:
# Monthly frames in calendar order; each one is edited in place by the loop.
file_list = [
    dfJanComplete, dfFebComplete, dfMarComplete, dfAprComplete,
    dfMayComplete, dfJunComplete, dfJulComplete, dfAugComplete,
    dfSepComplete, dfOctComplete, dfNovComplete, dfDecComplete,
]

# Columns whose outliers are blanked (set to NaN), month by month.
cols_f = ['tripduration', 'distance']

for monthly in file_list:
    for column in cols_f:
        monthly[column] = handle_outliers(monthly, column)

In [5]:
# Monthly frames in calendar order.
file_list = [
    dfJanComplete, dfFebComplete, dfMarComplete, dfAprComplete,
    dfMayComplete, dfJunComplete, dfJulComplete, dfAugComplete,
    dfSepComplete, dfOctComplete, dfNovComplete, dfDecComplete,
]

# calc_all_speeds writes the 'speed' column into the frame it is given, so the
# returned reference does not need to be kept.
for monthly in file_list:
    calc_all_speeds(monthly)

In [6]:
# Monthly frames in calendar order; each one is edited in place by the loop.
file_list = [
    dfJanComplete, dfFebComplete, dfMarComplete, dfAprComplete,
    dfMayComplete, dfJunComplete, dfJulComplete, dfAugComplete,
    dfSepComplete, dfOctComplete, dfNovComplete, dfDecComplete,
]

# Blank outliers of the derived speed, month by month.
cols_f = ['speed']

for monthly in file_list:
    for column in cols_f:
        monthly[column] = handle_outliers(monthly, column)

In [7]:
# Monthly frames in calendar order; each one is edited in place by the loop.
file_list = [
    dfJanComplete, dfFebComplete, dfMarComplete, dfAprComplete,
    dfMayComplete, dfJunComplete, dfJulComplete, dfAugComplete,
    dfSepComplete, dfOctComplete, dfNovComplete, dfDecComplete,
]

# Blank outliers of the 'elev' column, month by month.
cols_f = ['elev']

for monthly in file_list:
    for column in cols_f:
        monthly[column] = handle_outliers(monthly, column)

In [8]:
# Stack the twelve monthly frames, in calendar order, into one table with a
# fresh 0..n-1 index.  A single pd.concat builds the result in one pass; the
# previous chain of DataFrame.append calls re-copied the growing table eleven
# times and DataFrame.append no longer exists in pandas 2.x.  The result is a
# new frame, so none of the monthly frames is modified.
dfAll = pd.concat(
    [
        dfJanComplete, dfFebComplete, dfMarComplete, dfAprComplete,
        dfMayComplete, dfJunComplete, dfJulComplete, dfAugComplete,
        dfSepComplete, dfOctComplete, dfNovComplete, dfDecComplete,
    ],
    ignore_index=True,
)

In [9]:
# Discard every row that holds a NaN in any column, then renumber the rows.
# reset_index() without drop=True keeps the previous labels in a new 'index'
# column.
dfAll = dfAll.dropna().reset_index()

In [10]:
# Print the row count plus each column's non-null count and dtype for dfAll.
dfAll.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7663 entries, 0 to 7662
Data columns (total 36 columns):
index                      7663 non-null int64
Unnamed: 0                 7663 non-null int64
day                        7663 non-null object
tripduration               7663 non-null float64
starttime                  7663 non-null object
stoptime                   7663 non-null object
start.station.id           7663 non-null int64
start.station.name         7663 non-null object
start.station.latitude     7663 non-null float64
start.station.longitude    7663 non-null float64
end.station.id             7663 non-null int64
end.station.name           7663 non-null object
end.station.latitude       7663 non-null float64
end.station.longitude      7663 non-null float64
bikeid                     7663 non-null int64
usertype                   7663 non-null object
birth.year                 7663 non-null int64
gender                     7663 non-null int64
distance                   7663

In [11]:
# Cast the speed column and the six candidate predictor columns to float.
for numeric_col in ['speed', 'temp', 'elev', 'windspeed', 'windchill', 'hum', 'precip']:
    dfAll[numeric_col] = dfAll[numeric_col].astype(float)

In [12]:
# Base-10 logarithm of speed, stored next to the raw speed column.
dfAll['logspeed'] = np.log10(dfAll['speed'].values)

In [13]:
# Narrow the table to the two speed columns followed by the six candidate
# predictor columns, in this exact order.
dfres = dfAll.loc[:, ['logspeed', 'speed', 'temp', 'elev', 'windspeed', 'windchill', 'hum', 'precip']]

In [14]:
# Every column of dfres except the two speed columns is a candidate predictor.
# Index.drop raises if either label is missing, so a changed column layout is
# noticed here rather than further down.
predictors = list(dfres.columns.drop(['speed', 'logspeed']))
# The parenthesised single-argument form prints the same text on Python 2 and 3.
print(predictors)

['temp', 'elev', 'windspeed', 'windchill', 'hum', 'precip']


In [15]:
# Standardise every column of dfres with one scaler (zero mean, unit variance).
# The same fitted scaler object is reused later to convert individual inputs.
# NOTE(review): the scaler is fitted on all rows, before brute_force splits the
# data into train and test sets -- confirm this is acceptable.
scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
scaler.fit(dfres)
dfres_scaled = pd.DataFrame(data=scaler.transform(dfres), columns=dfres.columns)
dfres_scaled.head()

Unnamed: 0,logspeed,speed,temp,elev,windspeed,windchill,hum,precip
0,0.336404,0.234902,-0.611591,0.054319,-3.009389,-0.59203,-0.416958,-0.237757
1,1.100526,1.077496,-0.611591,1.201926,-3.009389,-0.59203,1.295829,-0.237757
2,1.397968,1.441758,-0.611591,0.054319,-3.009389,-0.59203,0.7249,-0.237757
3,-0.758,-0.76901,-0.489687,0.729382,0.331595,-0.59203,1.695479,4.205975
4,1.226725,1.229414,-0.706993,-0.013188,-3.009389,-0.59203,1.581294,-0.237757


In [23]:
def brute_force(data, target_variable, predictors, model, alpha_list = [1.0], degree_list = [3],
                max_predictors = 4):
    ''' Exhaustively search predictor subsets, polynomial degrees and
    regularization strengths, and return the configuration with the lowest
    test-set mean squared error.

    data             table indexable by a list of column names and by
                     target_variable (a DataFrame in this notebook)
    target_variable  name of the column to predict
    predictors       candidate predictor column names
    model            estimator class; it is instantiated as
                     model(alpha = a, max_iter = 5000)
    alpha_list       regularization values to try (never modified here)
    degree_list      polynomial degrees to try (never modified here)
    max_predictors   largest subset size to try.  The default of 4 keeps the
                     historical search space; pass None to try every subset
                     size up to len(predictors).

    Returns the tuple
        (mse, fitted pipeline, predictors used, degree, alpha,
         X_train, y_train, X_test, y_test)
    for the best model.

    Raises ValueError when no model could be scored (for example when
    predictors, alpha_list or degree_list is empty, or every score is NaN).

    Note: the winner is chosen on the same test split whose MSE is reported,
    so the returned MSE is an optimistic estimate.
     '''
    test_size_split = 0.5

    if max_predictors is None:
        max_predictors = len(predictors)

    best_mse = float('inf')
    best = None

    # Walk through every subset size, then every subset of that size.
    for subset_size in range(1, max_predictors + 1):
        for subset in combinations(predictors, subset_size):
            test_predictors = list(subset)

            # The fixed random_state gives every subset the same row split.
            X_train, X_test, y_train, y_test = train_test_split(
                data[test_predictors], data[target_variable],
                test_size=test_size_split, random_state=42)

            for degree in degree_list:
                for a in alpha_list:
                    # Polynomial expansion followed by the regularized model.
                    clf = make_pipeline(PolynomialFeatures(degree), model(alpha = a, max_iter = 5000))
                    clf.fit(X_train, y_train)

                    # Score on the held-out half (y_true first, y_pred second).
                    y_hat = clf.predict(X_test)
                    mse = mean_squared_error(y_test, y_hat)

                    # Remember everything about the best model seen so far,
                    # including the exact train/test data it was built from.
                    if mse < best_mse:
                        best_mse = mse
                        best = (mse, clf, test_predictors, degree, a,
                                X_train, y_train, X_test, y_test)

    if best is None:
        raise ValueError("brute_force could not score any model: check that predictors, "
                         "alpha_list and degree_list are non-empty")

    return best

def print_essential_results(results):
    """Print the MSE, predictors, degree and alpha held in a brute_force result.

    ``results`` is indexed positionally: [0] MSE, [2] predictor names,
    [3] polynomial degree, [4] regularization value.

    Each line is built as one string and printed with the parenthesised
    single-argument form, which produces identical output on Python 2 and 3
    (the two spaces after '=' reproduce the spacing of the comma-separated
    print statements used previously).
    """
    print("MSE = {:5.7f}".format(results[0]))
    print("Best predictors =  {}".format(results[2]))
    print("Optimal degree polynomial =  {}".format(results[3]))
    print("Optimal regularization value =  {}".format(results[4]))

In [27]:
# Search Lasso models on the standardised 'speed' column: degree-2 polynomial
# features and five alphas spaced logarithmically from 1e-5 to 10.
lasso_results = brute_force(
    data=dfres_scaled,
    target_variable='speed',
    predictors=predictors,
    model=Lasso,
    alpha_list=np.logspace(-5, 1, 5),
    degree_list=[2],
)

In [28]:
# Report the best configuration held in lasso_results (at this point in the
# notebook, the search on the 'speed' target).
print_essential_results(lasso_results)

MSE = 0.9628376
Best predictors =  ['temp', 'elev']
Optimal degree polynomial =  2
Optimal regularization value =  1e-05


In [19]:
# Repeat the Lasso search on the standardised 'logspeed' column with the same
# grid (degree 2, five alphas from 1e-5 to 10).  This rebinds lasso_results,
# so the earlier 'speed' result is no longer available under that name.
lasso_results = brute_force(
    data=dfres_scaled,
    target_variable='logspeed',
    predictors=predictors,
    model=Lasso,
    alpha_list=np.logspace(-5, 1, 5),
    degree_list=[2],
)

In [20]:
# Report the best configuration held in lasso_results (at this point in the
# notebook, the search on the 'logspeed' target).
print_essential_results(lasso_results)

MSE = 0.9535393
Best predictors =  ['temp', 'elev']
Optimal degree polynomial =  2
Optimal regularization value =  1e-05


---
# Using the Model to Predict
---

In [21]:
def calc_speed(temp, elev, scaler, dfres, results):
    """Predict a speed for the given temperature and elevation.

    Parameters
    ----------
    temp, elev : raw (unscaled) predictor values.
    scaler : fitted scaler offering ``transform`` and ``inverse_transform`` on
        8-column rows.  Assumes the column order
        [logspeed, speed, temp, elev, windspeed, windchill, hum, precip]
        -- confirm it matches the frame the scaler was fitted on.
    dfres : unused; kept so existing calls keep working.
    results : tuple whose element [1] is a fitted model with ``predict``.
        The model must have been trained on exactly (temp, elev), in that
        order, with scaled logspeed as its target; other predictor sets are
        not supported by this helper.

    Returns
    -------
    ``10 ** logspeed`` for the predicted, un-scaled logspeed.

    The scaler and the model are always handed 2-D arrays of shape
    (1, n_columns); 1-D input is rejected by current scikit-learn, and mixing
    an array with scalars in ``np.array([...])`` is rejected by current numpy.
    """
    n_columns = 8
    temp_col, elev_col = 2, 3
    target_col = 0

    # Scale the inputs in the context of a full row; the other columns are
    # placeholders and their scaled values are discarded.
    raw_row = np.zeros((1, n_columns))
    raw_row[0, temp_col] = temp
    raw_row[0, elev_col] = elev
    scaled_row = scaler.transform(raw_row)
    X1 = scaled_row[:, temp_col:elev_col + 1]

    clf = results[1]
    # predict returns one value per sample; take the single prediction as a scalar.
    y = np.ravel(clf.predict(X1))[0]

    # Undo the scaling of the prediction by placing it in the target column of
    # an otherwise empty row.
    scaled_target_row = np.zeros((1, n_columns))
    scaled_target_row[0, target_col] = y
    logy = scaler.inverse_transform(scaled_target_row)[0, target_col]
    return np.power(10.0, logy)

In [22]:
# Two scenarios that differ only in temperature (deg F per the messages below).
temp_start = 35.0
elev_start = 50.0

temp_next = 75.0
elev_next = 50.0

# Journey length, reported as miles in the messages below.
distance_to_travel = 3.92

# calc_speed reads the model from lasso_results, i.e. whichever search was
# assigned to that name most recently.
yhat_1 = calc_speed(temp_start, elev_start, scaler, dfres, lasso_results)
yhat_2 = calc_speed(temp_next, elev_next, scaler, dfres, lasso_results)

# distance / speed gives the travel time in hours.
time_start = distance_to_travel/yhat_1
time_next = distance_to_travel/yhat_2

# Each message is a single formatted string printed with the parenthesised
# form, which produces identical output on Python 2 and 3.
print("Your initial speed at temp {:2.0f} deg F and elevation {:5.2f} is {:5.2f} mph".format(temp_start, elev_start, yhat_1))
print("Your subsequent speed at temp {:2.0f} deg F and elevation {:5.2f} is {:5.2f} mph".format(temp_next, elev_next, yhat_2))

print("Your initial journey of {:4.2f} miles will take {:5.2f} minutes".format(distance_to_travel, time_start*60.0))
print("Your subsequent journey of {:4.2f} miles will take {:5.2f} minutes".format(distance_to_travel, time_next*60.0))

# hours -> minutes -> seconds; the absolute value makes the sign irrelevant.
print("Your journey differential is {:5.2f} seconds".format(np.fabs(((time_start * 60.0)-(time_next * 60.0)) * 60.0)))

Your initial speed at temp 35 deg F and elevation 50.00 is 10.29 mph
Your subsequent speed at temp 75 deg F and elevation 50.00 is 10.88 mph
Your initial journey of 3.92 miles will take 22.86 minutes
Your subsequent journey of 3.92 miles will take 21.61 minutes
Your journey differential is 74.60 seconds
