# System Setup

In [0]:
!pip install --upgrade -q gspread

In [0]:
# Sign in from Colab and build an authorised gspread client.
import gspread
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the current Colab user before asking for credentials.
auth.authenticate_user()

# NOTE: `gc` is the gspread client here, not the standard-library `gc` module.
credentials = GoogleCredentials.get_application_default()
gc = gspread.authorize(credentials)

# Data Setup

In [0]:
# Read the raw rows from the first worksheet of the shared Google Sheet
spreadsheet = gc.open_by_url(
    'https://docs.google.com/spreadsheets/d/1IpPsg0TLELx_f32W8Fgj8cZVeVicwr9pIYIZweCleAA/edit?usp=sharing'
)
wks = spreadsheet.get_worksheet(0)

# All cell values of that worksheet; the first row carries the column names
# (visible as row 0 in the head() output further down).
recipients = wks.get_all_values()

In [0]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [0]:
# Wrap the raw sheet rows in a DataFrame (one row per sheet row, columns labelled 0..n-1)
dataframe = pd.DataFrame(recipients)

In [6]:
# Display beginning of Dataframe.
# At this point the columns are still labelled by position and row 0 holds the
# column names read from the sheet (see the output below).
dataframe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
0,tripduration,starttime,stoptime,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bikeid,usertype,birth_year,gender,distance,minutes,age,season,dayofweek,hour,over30min
1,2188,2016-09-23 19:09:47 UTC,2016-09-23 19:46:16 UTC,2008,Little West St & 1 Pl,40.70569254,-74.01677685,347,Greenwich St & W Houston St,40.728846,-74.008591,22233,Customer,1956,1,2665.383572,36.46666667,64,3,6,19,yes
2,1526,2016-09-05 22:26:35 UTC,2016-09-05 22:52:01 UTC,296,Division St & Bowery,40.71413089,-73.9970468,236,St Marks Pl & 2 Ave,40.7284186,-73.98713956,16939,Customer,1956,1,1794.752389,25.43333333,64,3,2,22,no
3,1382,2016-09-17 17:52:26 UTC,2016-09-17 18:15:28 UTC,252,MacDougal St & Washington Sq,40.73226398,-73.99852205,484,W 44 St & 5 Ave,40.75500254,-73.98014437,23844,Customer,1955,0,2964.78189,23.03333333,65,3,7,17,no
4,1551,2016-09-04 09:52:06 UTC,2016-09-04 10:17:57 UTC,3236,W 42 St & Dyer Ave,40.75898481,-73.99379969,3231,E 67 St & Park Ave,40.7678008,-73.96592081,25714,Customer,1953,1,2544.398151,25.85,67,3,1,9,no


In [7]:
# (rows, columns) of the raw frame; the row count still includes the header row held in row 0
print(dataframe.shape)

(16001, 22)


In [8]:
# Promote the first sheet row to column headers.
# The frame is rebuilt from the raw `recipients` rows instead of being sliced
# in place, so re-running this cell always yields the same result (slicing
# `dataframe` again would promote a data row to headers and drop one more row).
header, *rows = recipients
dataframe = pd.DataFrame(
    rows,
    columns=header,
    # Keep the 1-based row labels that the data rows had in the raw frame.
    index=pd.RangeIndex(1, len(rows) + 1),
)

# Validate Dataframe columns
list(dataframe.columns.values)

['tripduration',
 'starttime',
 'stoptime',
 'start_station_id',
 'start_station_name',
 'start_station_latitude',
 'start_station_longitude',
 'end_station_id',
 'end_station_name',
 'end_station_latitude',
 'end_station_longitude',
 'bikeid',
 'usertype',
 'birth_year',
 'gender',
 'distance',
 'minutes',
 'age',
 'season',
 'dayofweek',
 'hour',
 'over30min']

In [9]:
# Descriptive statistics of the DataFrame.
# Every column holds strings at this point, so describe() reports
# count / unique / top / freq rather than numeric summaries (see output below).
dataframe.describe()

Unnamed: 0,tripduration,starttime,stoptime,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bikeid,usertype,birth_year,gender,distance,minutes,age,season,dayofweek,hour,over30min
count,16000,16000,16000,16000,16000,16000.0,16000.0,16000,16000,16000.0,16000.0,16000,16000,16000,16000,16000,16000.0,16000,16000,16000,16000,16000
unique,3066,15718,15717,598,598,598.0,598.0,572,573,572.0,572.0,7142,1,36,2,9223,3066.0,36,3,7,24,2
top,1067,2016-08-23 22:10:43 UTC,2016-09-04 15:05:52 UTC,387,Centre St & Chambers St,40.71273266,-74.0046073,309,Murray St & West St,40.7149787,-74.013012,19334,Customer,1989,1,0,19.78333333,31,3,7,14,no
freq,21,3,3,250,250,250.0,250.0,110,110,110.0,110.0,13,16000,1176,9374,588,21.0,1176,15990,4064,1538,13530


# Create Model

In [0]:
# create train df
# NOTE: `train` is another name for `dataframe` (no copy is made), and no rows
# are held out: the cells below fit and score the model on all of them.
train = dataframe

In [0]:
# Predictor columns fed to the model, and the label column it should predict
features = [
    'start_station_id',
    'end_station_id',
    'gender',
    'dayofweek',
]
target = 'over30min'

In [0]:
# X matrix, y vector
# Every value read from the sheet is a string, so the predictors are cast to
# float explicitly; a non-numeric cell then fails here, at the feature step,
# instead of relying on implicit coercion inside the estimator.
X = train[features].astype(float)

# The labels stay as the 'yes' / 'no' strings found in the target column.
y = train[target]

In [13]:
# Fit a logistic regression with default settings on the full feature matrix
model = LogisticRegression().fit(X, y)

# NOTE(review): this accuracy is measured on the same rows the model was fitted
# on, so it is a training score rather than a held-out estimate. The value
# shown below (0.845625) equals 13530 / 16000, the share of 'no' labels
# reported by describe() above - worth confirming the model beats that baseline.
model.score(X, y)

0.845625

In [14]:
# First rows of the training frame (`train` refers to the same object as `dataframe`)
train.head()

Unnamed: 0,tripduration,starttime,stoptime,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bikeid,usertype,birth_year,gender,distance,minutes,age,season,dayofweek,hour,over30min
1,2188,2016-09-23 19:09:47 UTC,2016-09-23 19:46:16 UTC,2008,Little West St & 1 Pl,40.70569254,-74.01677685,347,Greenwich St & W Houston St,40.728846,-74.008591,22233,Customer,1956,1,2665.383572,36.46666667,64,3,6,19,yes
2,1526,2016-09-05 22:26:35 UTC,2016-09-05 22:52:01 UTC,296,Division St & Bowery,40.71413089,-73.9970468,236,St Marks Pl & 2 Ave,40.7284186,-73.98713956,16939,Customer,1956,1,1794.752389,25.43333333,64,3,2,22,no
3,1382,2016-09-17 17:52:26 UTC,2016-09-17 18:15:28 UTC,252,MacDougal St & Washington Sq,40.73226398,-73.99852205,484,W 44 St & 5 Ave,40.75500254,-73.98014437,23844,Customer,1955,0,2964.78189,23.03333333,65,3,7,17,no
4,1551,2016-09-04 09:52:06 UTC,2016-09-04 10:17:57 UTC,3236,W 42 St & Dyer Ave,40.75898481,-73.99379969,3231,E 67 St & Park Ave,40.7678008,-73.96592081,25714,Customer,1953,1,2544.398151,25.85,67,3,1,9,no
5,3073,2016-09-10 10:38:18 UTC,2016-09-10 11:29:32 UTC,525,W 34 St & 11 Ave,40.75594159,-74.0021163,410,Suffolk St & Stanton St,40.72066442,-73.98517977,21257,Customer,1949,1,4174.126832,51.21666667,71,3,7,10,yes


# Pickle Model

In [0]:
import pickle

# Serialise the fitted model to model.pkl in the current working directory.
# The context manager guarantees the file is flushed and closed as soon as the
# dump finishes, even if pickling raises.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)