### Import library

In [1]:
from __future__ import division

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%pylab inline 
import datetime as dt
import calendar
from haversine import haversine

Populating the interactive namespace from numpy and matplotlib


### Import data

In [2]:
# Read the trip records from the CSV file into a DataFrame and preview the first five rows.
DATA_FILE = "201508-citibike-tripdata.csv"
df = pd.read_csv(DATA_FILE)
df.head(5)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,1202,8/1/2015 00:00:04,8/1/2015 00:20:07,168,W 18 St & 6 Ave,40.739713,-73.994564,385,E 55 St & 2 Ave,40.757973,-73.966033,23253,Subscriber,1987.0,1
1,301,8/1/2015 00:00:05,8/1/2015 00:05:06,450,W 49 St & 8 Ave,40.762272,-73.987882,479,9 Ave & W 45 St,40.760193,-73.991255,22675,Subscriber,1951.0,2
2,431,8/1/2015 00:00:06,8/1/2015 00:07:18,312,Allen St & E Houston St,40.722055,-73.989111,296,Division St & Bowery,40.714131,-73.997047,19831,Subscriber,1985.0,1
3,273,8/1/2015 00:00:09,8/1/2015 00:04:43,382,University Pl & E 14 St,40.734927,-73.992005,229,Great Jones St,40.727434,-73.99379,22765,Subscriber,1975.0,1
4,1256,8/1/2015 00:00:17,8/1/2015 00:21:13,352,W 56 St & 6 Ave,40.763406,-73.977225,432,E 7 St & Avenue A,40.726218,-73.983799,22127,Subscriber,1978.0,1


### Answer

In [3]:
# Size of the loaded table as (number of rows, number of columns).
df.shape

(1179044, 15)

In [4]:
# Look at five randomly drawn rows of the raw data.
df.sample(5)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
244277,1290,8/7/2015 12:18:06,8/7/2015 12:39:36,280,E 10 St & 5 Ave,40.73332,-73.995101,417,Barclay St & Church St,40.712912,-74.010202,22262,Subscriber,1970.0,2
193993,797,8/6/2015 08:33:50,8/6/2015 08:47:08,519,Pershing Square North,40.751873,-73.977706,116,W 17 St & 8 Ave,40.741776,-74.001497,14976,Subscriber,1968.0,1
672662,1179,8/18/2015 20:02:42,8/18/2015 20:22:22,446,W 24 St & 7 Ave,40.744876,-73.995299,410,Suffolk St & Stanton St,40.720664,-73.98518,22446,Subscriber,1983.0,2
715620,1184,8/19/2015 21:11:55,8/19/2015 21:31:39,279,Peck Slip & Front St,40.707873,-74.00167,3002,South End Ave & Liberty St,40.711512,-74.015756,14672,Customer,,0
10350,1948,8/1/2015 12:00:04,8/1/2015 12:32:33,545,E 23 St & 1 Ave,40.736502,-73.978095,331,Pike St & Monroe St,40.711731,-73.99193,14567,Customer,,0


In [5]:
# Inspect the column dtypes as loaded from the CSV.
df.dtypes

tripduration                 int64
starttime                   object
stoptime                    object
start station id             int64
start station name          object
start station latitude     float64
start station longitude    float64
end station id               int64
end station name            object
end station latitude       float64
end station longitude      float64
bikeid                       int64
usertype                    object
birth year                 float64
gender                       int64
dtype: object

#### Data Prep

In [6]:
# Convert the date column from text to a datetime type
# (the explicit format is month/day/year hour:minute:second).

df['starttime'] =  pd.to_datetime(df['starttime'], format="%m/%d/%Y %H:%M:%S")

In [7]:
# Convert the stop timestamp from text to a datetime type, using the same
# month/day/year hour:minute:second format as for starttime.
df['stoptime'] =  pd.to_datetime(df['stoptime'], format="%m/%d/%Y %H:%M:%S")

In [8]:
# Nulls are present in the birth year column.
# However, birth year is not used in this modelling and will be dropped,
# so no value replacement (imputation) is performed.

df.isnull().sum()

tripduration                    0
starttime                       0
stoptime                        0
start station id                0
start station name              0
start station latitude          0
start station longitude         0
end station id                  0
end station name                0
end station latitude            0
end station longitude           0
bikeid                          0
usertype                        0
birth year                 221001
gender                          0
dtype: int64

#### Feature engineering: memproses data tanggal

In [9]:
# Name of the weekday on which each trip started (e.g. 'Monday').
# The vectorized .dt accessor replaces a per-row Python lambda and returns
# English day names by default, which is what the later 'Saturday'/'Sunday'
# comparison and the hard-coded dummy column names rely on.
df['start_day'] = df['starttime'].dt.day_name()

In [10]:
# Spot-check the new start_day column on five random rows.
df.sample(5)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,start_day
376421,936,2015-08-10 19:39:22,2015-08-10 19:54:59,435,W 21 St & 6 Ave,40.74174,-73.994156,426,West St & Chambers St,40.717548,-74.013221,23577,Customer,,0,Monday
689370,235,2015-08-19 10:19:30,2015-08-19 10:23:26,475,E 16 St & Irving Pl,40.735243,-73.987586,475,E 16 St & Irving Pl,40.735243,-73.987586,19065,Customer,,0,Wednesday
675857,403,2015-08-18 21:44:28,2015-08-18 21:51:11,504,1 Ave & E 15 St,40.732219,-73.981656,411,E 6 St & Avenue D,40.722281,-73.976687,23583,Subscriber,1991.0,1,Tuesday
320181,388,2015-08-09 13:18:00,2015-08-09 13:24:29,355,Bayard St & Baxter St,40.716021,-73.999744,276,Duane St & Greenwich St,40.717488,-74.010455,22702,Subscriber,1952.0,1,Sunday
588285,462,2015-08-16 16:32:47,2015-08-16 16:40:29,468,Broadway & W 55 St,40.765265,-73.981923,520,W 52 St & 5 Ave,40.759923,-73.976485,16653,Subscriber,1972.0,2,Sunday


In [11]:
# Flag trips that start on a Saturday or Sunday with 1, all other days with 0.
# isin() evaluates the whole column at once instead of calling a Python
# lambda for every row.
df['is_weekend'] = df['start_day'].isin(['Saturday', 'Sunday']).astype(int)

In [12]:
# Spot-check the new is_weekend flag on five random rows.
df.sample(5)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,start_day,is_weekend
354897,222,2015-08-10 11:39:55,2015-08-10 11:43:37,379,W 31 St & 7 Ave,40.749156,-73.9916,474,5 Ave & E 29 St,40.745168,-73.986831,20942,Subscriber,1982.0,1,Monday,0
1017569,769,2015-08-27 18:14:31,2015-08-27 18:27:21,332,Cherry St,40.712199,-73.979481,415,Pearl St & Hanover Square,40.704718,-74.00926,23324,Subscriber,1971.0,1,Thursday,0
634696,884,2015-08-17 20:44:58,2015-08-17 20:59:42,327,Vesey Pl & River Terrace,40.715338,-74.016584,458,11 Ave & W 27 St,40.751396,-74.005226,16243,Subscriber,1984.0,2,Monday,0
472743,378,2015-08-13 16:08:50,2015-08-13 16:15:08,519,Pershing Square North,40.751873,-73.977706,501,FDR Drive & E 35 St,40.744219,-73.971212,16489,Subscriber,1974.0,1,Thursday,0
1117354,1512,2015-08-30 13:18:44,2015-08-30 13:43:57,281,Grand Army Plaza & Central Park S,40.764397,-73.973715,294,Washington Square E,40.730494,-73.995721,20886,Subscriber,1991.0,2,Sunday,1


In [13]:
def time_of_day(x):
    """Classify a timestamp into a part of the day based on its hour.

    Parameters
    ----------
    x : object with an ``hour`` attribute (e.g. a datetime / pandas Timestamp)

    Returns
    -------
    str
        'night'     for hours before 6 or from 22 onwards,
        'evening'   for hours after 18 and before 22,
        'afternoon' for hours 12 through 18 inclusive,
        'morning'   otherwise.
    """
    hour = x.hour
    if hour >= 22 or hour < 6:
        return 'night'
    if 18 < hour < 22:
        return 'evening'
    if 12 <= hour <= 18:
        return 'afternoon'
    return 'morning'

In [14]:
# Label every trip with the part of day in which it started, then
# spot-check the result next to the original timestamp.
df['start_moment'] = df['starttime'].map(time_of_day)
col = ['starttime', 'start_moment']
df[col].sample(n=5)

Unnamed: 0,starttime,start_moment
1172205,2015-08-31 19:24:06,evening
418160,2015-08-12 09:49:24,morning
757440,2015-08-20 23:17:58,night
991460,2015-08-27 08:08:24,morning
883590,2015-08-24 16:27:22,afternoon


#### Feature engineering: circle trip

In [15]:
# A "circle trip" starts and ends at the same station: 1 if the two station
# ids are equal, 0 otherwise. Comparing the columns directly avoids a
# row-wise apply over the whole frame.
df['is_circle_trip'] = (df['start station id'] == df['end station id']).astype(int)
df.sample(3)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,start_day,is_weekend,start_moment,is_circle_trip
70627,399,2015-08-03 07:56:38,2015-08-03 08:03:18,168,W 18 St & 6 Ave,40.739713,-73.994564,526,E 33 St & 5 Ave,40.747659,-73.984907,22205,Subscriber,1992.0,1,Monday,0,morning,0
498353,457,2015-08-14 08:40:19,2015-08-14 08:47:56,335,Washington Pl & Broadway,40.729039,-73.994046,537,Lexington Ave & E 24 St,40.740259,-73.984092,15142,Subscriber,1986.0,1,Friday,0,morning,0
1145946,668,2015-08-31 08:52:54,2015-08-31 09:04:03,297,E 15 St & 3 Ave,40.734232,-73.986923,519,Pershing Square North,40.751873,-73.977706,23544,Subscriber,1983.0,1,Monday,0,morning,0


#### Feature engineering: distance & trip duration

In [16]:
from haversine import haversine

In [17]:
def distance_stations(x):
    """Return the haversine distance between a trip's start and end stations.

    Parameters
    ----------
    x : mapping / DataFrame row
        Must provide 'start station latitude', 'start station longitude',
        'end station latitude' and 'end station longitude'.

    Returns
    -------
    Whatever ``haversine`` returns for the two (latitude, longitude) pairs.
    NOTE(review): presumably kilometres, the haversine package default — confirm.
    """
    origin = (x['start station latitude'], x['start station longitude'])
    destination = (x['end station latitude'], x['end station longitude'])
    return haversine(origin, destination)

In [18]:
# Compute the start-to-end station distance for every trip by calling
# distance_stations once per row (axis = 1 passes each row to the function).
df['traveled_distance'] = df.apply(distance_stations, axis = 1)

In [19]:
# Spot-check the new traveled_distance column on three random rows.
df.sample(3)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,start_day,is_weekend,start_moment,is_circle_trip,traveled_distance
666343,1105,2015-08-18 18:19:36,2015-08-18 18:38:02,153,E 40 St & 5 Ave,40.752062,-73.981632,236,St Marks Pl & 2 Ave,40.728419,-73.98714,15098,Subscriber,1993.0,2,Tuesday,0,afternoon,0,2.669691
773807,1704,2015-08-21 13:54:15,2015-08-21 14:22:39,2008,Little West St & 1 Pl,40.705693,-74.016777,295,Pike St & E Broadway,40.714067,-73.992939,17839,Customer,,0,Friday,0,afternoon,0,2.214525
716484,424,2015-08-19 21:46:19,2015-08-19 21:53:24,3111,Norman Ave & Leonard St,40.725848,-73.950649,3094,Graham Ave & Withers St,40.716981,-73.944859,16844,Subscriber,1985.0,2,Wednesday,0,evening,0,1.100111


In [20]:
# Average speed = distance / (duration / 3600).
# NOTE(review): dividing tripduration by 3600 presumably converts seconds to
# hours, giving distance units per hour — confirm the source units.
# Column arithmetic replaces a row-wise apply; the operations per row are the same.
df['average_speed'] = df['traveled_distance'] / (df['tripduration'] / 3600)

In [21]:
# Spot-check distance, duration and the derived speed on five random rows.
col = ['traveled_distance', 'tripduration', 'average_speed']
df[col].sample(5)

Unnamed: 0,traveled_distance,tripduration,average_speed
16724,1.322098,1054,4.515704
693927,6.359528,2054,11.146203
398846,1.387524,418,11.949965
1001693,0.652791,1646,1.427733
556419,1.083463,2819,1.383635


#### Variable encoding

In [22]:
# One-hot encode each categorical / flag feature and append the indicator
# columns to df. Columns are named '<variable>_<level>', e.g. 'start_day_Friday'.
for variable_name in ['start_day', 'is_weekend',
                      'start_moment', 'is_circle_trip']:
    print('Dummifying the {} variable ...'.format(variable_name))
    # prefix= produces the '<variable>_<level>' names directly.
    # astype(int) keeps the indicators as 0/1 integers on every pandas
    # version; pandas 2.0+ would otherwise return booleans, which changes the
    # displayed values and makes the later feature matrix object-typed.
    dummies = pd.get_dummies(df[variable_name], prefix=variable_name).astype(int)
    df = pd.concat([df, dummies], axis=1)

Dummifying the start_day variable ...
Dummifying the is_weekend variable ...
Dummifying the start_moment variable ...
Dummifying the is_circle_trip variable ...


In [23]:
# Names of all indicator columns created by the encoding step, grouped by the
# variable they were derived from; used here to preview only those columns.
dummy = [
    # day of week
    'start_day_Friday', 'start_day_Monday', 'start_day_Saturday',
    'start_day_Sunday', 'start_day_Thursday', 'start_day_Tuesday',
    'start_day_Wednesday',
    # weekend flag
    'is_weekend_0', 'is_weekend_1',
    # part of day
    'start_moment_afternoon', 'start_moment_evening',
    'start_moment_morning', 'start_moment_night',
    # circle-trip flag
    'is_circle_trip_0', 'is_circle_trip_1',
]
df[dummy].sample(n=5)

Unnamed: 0,start_day_Friday,start_day_Monday,start_day_Saturday,start_day_Sunday,start_day_Thursday,start_day_Tuesday,start_day_Wednesday,is_weekend_0,is_weekend_1,start_moment_afternoon,start_moment_evening,start_moment_morning,start_moment_night,is_circle_trip_0,is_circle_trip_1
254332,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0
364660,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0
166541,0,0,0,0,0,0,1,1,0,1,0,0,0,1,0
184289,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0
404015,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0


In [24]:
# Preview five random rows of the frame with the indicator columns appended.
df.sample(5)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,...,start_day_Tuesday,start_day_Wednesday,is_weekend_0,is_weekend_1,start_moment_afternoon,start_moment_evening,start_moment_morning,start_moment_night,is_circle_trip_0,is_circle_trip_1
684,1482,2015-08-01 01:17:34,2015-08-01 01:42:16,512,W 29 St & 9 Ave,40.750073,-73.998393,116,W 17 St & 8 Ave,40.741776,...,0,0,0,1,0,0,0,1,1,0
812808,1366,2015-08-22 15:08:37,2015-08-22 15:31:24,2002,Wythe Ave & Metropolitan Ave,40.716887,-73.963198,2002,Wythe Ave & Metropolitan Ave,40.716887,...,0,0,0,1,1,0,0,0,0,1
272344,1019,2015-08-08 06:32:19,2015-08-08 06:49:18,2001,Sands St & Navy St,40.699773,-73.979927,263,Elizabeth St & Hester St,40.71729,...,0,0,0,1,0,0,1,0,1,0
845556,2189,2015-08-23 14:55:09,2015-08-23 15:31:39,315,South St & Gouverneur Ln,40.703554,-74.006702,237,E 11 St & 2 Ave,40.730473,...,0,0,0,1,1,0,0,0,1,0
524553,1383,2015-08-14 18:44:22,2015-08-14 19:07:25,331,Pike St & Monroe St,40.711731,-73.99193,311,Norfolk St & Broome St,40.717227,...,0,0,1,0,1,0,0,0,1,0


In [25]:
# Remove the original categorical / flag columns; their information is now
# carried by the indicator columns.
for variable_name in ('start_day', 'is_weekend', 'start_moment', 'is_circle_trip'):
    print('Deleting the {} variable ...'.format(variable_name))
    df = df.drop(columns=variable_name)

Deleting the start_day variable ...
Deleting the is_weekend variable ...
Deleting the start_moment variable ...
Deleting the is_circle_trip variable ...


In [26]:
# Drop the columns that are not used as model features: raw timestamps,
# station names and ids, rider demographics and the bike id.
unused_columns = [
    'starttime', 'stoptime', 'start station name', 'end station name',
    'gender', 'birth year',
    'bikeid',
    'start station id', 'end station id',
]
df = df.drop(columns=unused_columns)

In [27]:
# Preview five random rows of the final modelling table.
df.sample(5)

Unnamed: 0,tripduration,start station latitude,start station longitude,end station latitude,end station longitude,usertype,traveled_distance,average_speed,start_day_Friday,start_day_Monday,...,start_day_Tuesday,start_day_Wednesday,is_weekend_0,is_weekend_1,start_moment_afternoon,start_moment_evening,start_moment_morning,start_moment_night,is_circle_trip_0,is_circle_trip_1
545328,2304,40.757148,-73.972078,40.73705,-73.990093,Customer,2.701336,4.220838,0,0,...,0,0,0,1,1,0,0,0,1,0
36865,1338,40.735877,-73.98205,40.764397,-73.973715,Subscriber,3.248124,8.739347,0,0,...,0,0,0,1,0,0,1,0,1,0
1066493,1069,40.716059,-73.991908,40.732241,-74.000264,Subscriber,1.932302,6.507284,1,0,...,0,0,1,0,0,1,0,0,1,0
150566,435,40.739126,-73.979738,40.740343,-73.989551,Subscriber,0.837783,6.933376,0,0,...,0,1,1,0,0,0,1,0,1,0
1043716,265,40.745712,-73.981948,40.751581,-73.97791,Subscriber,0.735936,9.997618,1,0,...,0,0,1,0,0,0,1,0,1,0


#### Split training & testing

In [28]:
# Target vector: the usertype column. Feature matrix: every other column.
labels = np.array(df['usertype'])
features = np.array(df.drop('usertype', axis=1))

In [29]:
import sklearn
from sklearn.preprocessing import label_binarize
# Turn the two user-type strings into a binary target and flatten the result
# to a 1-D vector with ravel().
# NOTE(review): with this class order 'Customer' should map to 0 and
# 'Subscriber' to 1 — confirm against the label_binarize documentation.
binarized_labels = label_binarize(labels, classes=['Customer', 'Subscriber']).ravel() 

In [30]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing (test_size=0.3); random_state is fixed
# so that the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(features, binarized_labels, test_size=0.3, random_state = 27)

### Modelling

#### Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Logistic regression with default hyper-parameters, evaluated on the training
# split with 3-fold cross-validation and ROC AUC as the metric.
lr = LogisticRegression()
score = cross_val_score(estimator=lr, X=X_train, y=y_train, cv=3, scoring='roc_auc')
print(score)
print('Logistic Regression Average Score: ', score.mean())

[0.8301164  0.82825768 0.82711757]
Logistic Regression Average Score:  0.8284972155501801


#### KNN

In [32]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with default hyper-parameters, evaluated on the training
# split with 3-fold cross-validation and ROC AUC as the metric.
knn = KNeighborsClassifier()
score = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=3, scoring='roc_auc')
print(score)
print('KNN Average Score: ', score.mean())

[0.77188003 0.77189958 0.77062373]
KNN Average Score:  0.771467779440581


#### Naive Bayes

In [33]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default hyper-parameters, evaluated on the training
# split with 3-fold cross-validation and ROC AUC as the metric.
nb = GaussianNB()
score = cross_val_score(estimator=nb, X=X_train, y=y_train, cv=3, scoring='roc_auc')
print(score)
print('Naive Bayes Average Score: ', score.mean())

[0.77955741 0.7792533  0.77592388]
Naive Bayes Average Score:  0.7782448621033221


#### Random Forest Classifier

In [34]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 10 trees, evaluated on the training split with 3-fold
# cross-validation and ROC AUC as the metric.
# random_state is fixed (same seed as the train/test split) so the forest's
# bootstrap samples and feature draws, and therefore the reported score, are
# reproducible from run to run.
rf = RandomForestClassifier(n_estimators = 10, random_state = 27)
score = cross_val_score(rf, X_train, y_train, scoring='roc_auc', cv=3)
print(score)
print('Random Forest Average Score: ', score.mean())

[0.85495982 0.85236648 0.8531569 ]
Random Forest Average Score:  0.8534943995550917


Nilai ROC AUC rata-rata terbesar dimiliki oleh Random Forest Classifier (≈ 0,85), diikuti Logistic Regression (≈ 0,83), Naive Bayes (≈ 0,78), dan KNN (≈ 0,77).