In [20]:
%matplotlib inline
import matplotlib.pyplot as plt
import sys
sys.path.insert(0,'../')
%load_ext autoreload
%autoreload 2
from utils import citibike_helpers,nyctaxi_helpers
import numpy as np
import pandas as pd  #requirement comes with anaconda
import datetime 
from datetime import datetime as dt
from geopy.distance import vincenty # requires separate install - pip install geopy
import warnings
warnings.filterwarnings('ignore')

#Imports for Classification
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB #Naive Bayes Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

#Split Training and Testing Set.
from sklearn.model_selection import train_test_split

#Generate Classification Performance Results
from sklearn.metrics import classification_report


#Imports for Regression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Helpers to Read In Dataset

In [21]:
# Location of the January 2017 Citi Bike trip archive, relative to this notebook.
inputfile = "../datasets/citibike/201701-citibike-tripdata.csv.gz"

# Both helpers live in utils/citibike_helpers.py (not shown here): the loader's
# result is handed straight to the duration helper.
# NOTE(review): presumably the second helper is what adds the
# 'Trip Duration Minutes' column used by later cells -- confirm in the helper.
df = citibike_helpers.calculate_trip_durations_citibike(
    citibike_helpers.load_citibike_data(inputfile)
)

# Helpers to Preprocess Dataset

In [22]:
# Parse the start timestamp once, then derive calendar features from it with
# the vectorised `.dt` accessor.
df['Start Time'] = pd.to_datetime(df['Start Time'], format='%Y-%m-%d %H:%M:%S')
df['Start Time Hour'] = df['Start Time'].dt.hour
df['Start Time Minute'] = df['Start Time'].dt.minute
df['Start Time Day of Week'] = df['Start Time'].dt.dayofweek


def f(x):
    """Return the distance in miles between a trip's start and end stations.

    `x` is one row of `df`; the four station coordinate columns are read from it.
    NOTE(review): `vincenty` was removed in geopy 2.0 in favour of
    `geopy.distance.geodesic`; this cell needs geopy < 2.0 as written.
    """
    start = (x['Start Station Latitude'], x['Start Station Longitude'])
    end = (x['End Station Latitude'], x['End Station Longitude'])
    return vincenty(start, end).miles


# Row-wise apply: one distance computation per trip.
df['distance'] = df.apply(f, axis=1)

#Isolate Trips between 5 and 120 minutes.
# The filter and the column drop are chained so that `subset_trips` is a fresh
# frame; the previous in-place drop mutated a boolean-filtered slice of `df`,
# which is the pattern pandas flags with SettingWithCopyWarning.
subset_trips = df[
    (df['Trip Duration Minutes'] > 5) & (df['Trip Duration Minutes'] < 120)
].drop(
    ['Trip Duration', 'Start Time', 'Stop Time', 'Start Station ID','Start Station Name','End Station Name','End Station ID', 'Bike ID'],
    axis=1,
)
subset_trips.head()

Unnamed: 0,Start Station Latitude,Start Station Longitude,End Station Latitude,End Station Longitude,User Type,Birth Year,Gender,Trip Duration Minutes,Start Time Hour,Start Time Minute,Start Time Day of Week,distance
0,40.78275,-73.97137,40.775794,-73.976206,Subscriber,1965.0,2,11.333333,0,0,6,0.542899
1,40.729236,-73.990868,40.748549,-73.988084,Subscriber,1987.0,2,21.383333,0,0,6,1.340572
2,40.776829,-73.963888,40.768737,-73.961199,Customer,,0,10.816667,0,0,6,0.575889
3,40.776829,-73.963888,40.768737,-73.961199,Customer,,0,10.533333,0,1,6,0.575889
4,40.776829,-73.963888,40.768737,-73.961199,Customer,,0,10.366667,0,1,6,0.575889


In [23]:
# One-hot encode the remaining non-numeric columns (the output below shows
# `User Type` expanded into two indicator columns), then discard every row that
# still has a missing value.
subset_trips = pd.get_dummies(subset_trips).dropna()
subset_trips.head(2)

Unnamed: 0,Start Station Latitude,Start Station Longitude,End Station Latitude,End Station Longitude,Birth Year,Gender,Trip Duration Minutes,Start Time Hour,Start Time Minute,Start Time Day of Week,distance,User Type_Customer,User Type_Subscriber
0,40.78275,-73.97137,40.775794,-73.976206,1965.0,2,11.333333,0,0,6,0.542899,0,1
1,40.729236,-73.990868,40.748549,-73.988084,1987.0,2,21.383333,0,0,6,1.340572,0,1


# Helper to Create Two Separate Dataframes for Classification and Regression

In [24]:
# Give each task its own independent deep copy of the preprocessed trips so
# columns added for one task never leak into the other.
subset_trips_classification, subset_trips_regression = (
    subset_trips.copy(deep=True) for _ in range(2)
)

In [25]:
#Helper to categorize triptime into discrete classes
def categorize_pickup_times(time_min):
    """Bucket a trip duration (in minutes) into a discrete class label.

    Returns:
        1 for durations under 20 minutes (very short trips),
        2 for durations from 20 up to (not including) 45 minutes (medium trips),
        3 for durations of 45 minutes or more (long trips),
        None when no comparison holds (e.g. a NaN duration).
    """
    # Checked from the longest bucket down; each guard returns immediately.
    if time_min >= 45:
        return 3
    if time_min >= 20:
        return 2
    if time_min < 20:
        return 1
    # Reached only when every comparison is False, such as for NaN.
    return None

In [26]:
# Derive the classification target by mapping every trip duration to its class.
subset_trips_classification['Trip Duration Class'] = (
    subset_trips_classification['Trip Duration Minutes'].apply(categorize_pickup_times)
)

In [27]:
# Class Distribution
# Class 1 466540
# Class 2 79467
# Class 3 2937

# Classification

## > Use the `subset_trips_classification` dataframe for this part.

## > Classification Task: Classify  `Trip Duration Class` using all features in the `subset_trips_classification` dataframe except:  

### `Trip Duration Minutes` and  `Trip Duration Class`.

## Run Naive Bayes (Gaussian Naive Bayes), Multi-Layer Perceptron, Decision Tree and K-Neighbors classifiers using the scikit-learn package.

## Note: For each model you will have to split the data into training and testing sets using the train_test_split function in scikit-learn and use a test_size of 0.33 (33%).


## Report classification results (precision, recall and f1-score) using the `classification_report` function from `sklearn.metrics`.


In [28]:
# Candidate classifiers. The decision tree and the multilayer perceptron are
# stochastic, so their random_state is pinned (to the same seed the notebook
# passes to train_test_split) to make the reported metrics reproducible.
dtree_clf = DecisionTreeClassifier(random_state=42)
nb_clf = GaussianNB()
mlp_clf = MLPClassifier(random_state=42)
knn_clf = KNeighborsClassifier()

# Display name -> unfitted estimator; the evaluation cell iterates over this.
models = {
    'Decision Tree': dtree_clf,
    'Naive Bayes': nb_clf,
    'Multilayer Perceptron': mlp_clf,
    'K-Nearest Neighbors': knn_clf,
}

In [29]:
# Target: the discrete duration class.
y_cls = subset_trips_classification['Trip Duration Class']

# Predictors: every other column except 'Trip Duration Minutes', which is
# removed as well so the raw duration is not available as a feature.
XX_cls = subset_trips_classification.drop(['Trip Duration Class','Trip Duration Minutes'], axis = 1)

# The split is seeded, so it is identical for every model: compute it once
# instead of once per loop iteration.
x_train, x_test, y_train, y_test = train_test_split(XX_cls, y_cls, test_size=0.33, random_state=42)

# `models` maps a display name to a classifier:
# Decision Tree, Naive Bayes, Multilayer Perceptron, K-Nearest Neighbors.
for model_name, model in models.items():
    print("Running the {} Classification...".format(model_name))
    # nyctaxi_helpers.run is defined outside this notebook.
    # NOTE(review): presumably it fits `model` on the training data and returns
    # predictions for x_test without mutating its inputs -- confirm in utils.
    y_pred = nyctaxi_helpers.run(model,x_train, y_train, x_test)
    clf_report = classification_report(y_test, y_pred)
    print(clf_report)

Running the Naive Bayes Classification...
             precision    recall  f1-score   support

          1       0.92      0.98      0.95    153889
          2       0.83      0.49      0.62     26303
          3       0.05      0.06      0.06       960

avg / total       0.90      0.91      0.90    181152

Running the Multilayer Perceptron Classification...
             precision    recall  f1-score   support

          1       0.89      1.00      0.94    153889
          2       0.91      0.31      0.46     26303
          3       0.00      0.00      0.00       960

avg / total       0.89      0.89      0.87    181152

Running the K-Nearest Neighbors Classification...
             precision    recall  f1-score   support

          1       0.90      0.99      0.94    153889
          2       0.80      0.39      0.53     26303
          3       0.40      0.00      0.00       960

avg / total       0.88      0.89      0.88    181152

Running the Decision Tree Classification...
        

# Regression

## Use the `subset_trips_regression` dataframe for this part.

## Run Linear Regression, K-Neighbors Regressor using the scikit-learn package.

## > Regression Task: Predict `Trip Duration Minutes` using all features in the `subset_trips_regression` dataframe except:  

### `Trip Duration Minutes`.

## Note: For each model you will have to split the data into training and testing sets using the train_test_split function in scikit-learn and use a test_size of 0.33 (33%).
 
## Report regression performance using the `r2_score` function in the `sklearn.metrics` module.

In [30]:
# Regressors to compare, both with scikit-learn's default hyper-parameters.
lnrmodel = LinearRegression()
knnmodel = KNeighborsRegressor()

# Display name -> unfitted estimator. This rebinds `models`, replacing the
# classifier registry built earlier in the notebook.
models = {
    'Linear': lnrmodel,
    'KNN': knnmodel,
}

In [31]:
# Target: the continuous trip duration in minutes.
# NOTE: the `_cls` suffix is a leftover from the classification cell; the names
# are kept unchanged so other cells relying on these globals keep working.
y_cls = subset_trips_regression['Trip Duration Minutes']
# Predictors: every remaining column of the regression frame.
XX_cls = subset_trips_regression.drop(['Trip Duration Minutes'], axis = 1)

# The split is seeded, so it is identical for every model: compute it once
# instead of once per loop iteration.
x_train, x_test, y_train, y_test = train_test_split(XX_cls, y_cls, test_size=0.33, random_state=42)

# `models` maps a display name to a regressor: Linear Regression, KNN Regression.
for model_name, model in models.items():
    print("Running the {} Regression...".format(model_name))
    # nyctaxi_helpers.run is defined outside this notebook.
    # NOTE(review): presumably it fits `model` on the training data and returns
    # predictions for x_test without mutating its inputs -- confirm in utils.
    y_pred = nyctaxi_helpers.run(model,x_train, y_train, x_test)
    r2 = r2_score(y_test, y_pred)
    print('R2 score={}'.format(r2),"\n")

Running the Linear Regression...
R2 score=0.5192549210155281 

Running the KNN Regression...
R2 score=0.40007019712257696 

