## Imports

In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import json

## Data Extraction

In [2]:
def getDataByYear(year):
    name=""
    df=[]
    
    for k in range(1,13):
        if k<10:
            name = str(year)+"0"+str(k)
        else:
            name = str(year)+""+str(k)
        name +="-citibike-tripdata.csv"
        
        df.append(pd.read_csv(name))
        #set columns to lower case
        df[k-1].columns = map(str.lower, df[k-1].columns)
        #
        df[k-1].columns = [x.replace(" ","") for x in list(df[k-1].columns)]
        df[k-1]["starttime"] = pd.to_datetime(df[k-1]["starttime"], infer_datetime_format=True)
        df[k-1]["stoptime"] = pd.to_datetime(df[k-1]["stoptime"], infer_datetime_format=True)
        print(name)

    return pd.concat(df,ignore_index=True)

df2016 = getDataByYear(2016)

201601-citibike-tripdata.csv
201602-citibike-tripdata.csv
201603-citibike-tripdata.csv
201604-citibike-tripdata.csv
201605-citibike-tripdata.csv
201606-citibike-tripdata.csv
201607-citibike-tripdata.csv
201608-citibike-tripdata.csv
201609-citibike-tripdata.csv
201610-citibike-tripdata.csv
201611-citibike-tripdata.csv
201612-citibike-tripdata.csv


In [3]:
# We extract the hour of started trips, hour of ended trips and the day of the week

list_of_starthours, list_of_stophours, list_of_weekdays = np.zeros(len(df2016)), np.zeros(len(df2016)), np.zeros(len(df2016))

for i in tqdm(range(len(df2016))):
    list_of_starthours[i] = df2016.iloc[i].starttime.hour
    list_of_stophours[i] = df2016.iloc[i].stoptime.hour
    list_of_weekdays[i] = df2016.iloc[i].starttime.weekday()
    
    
# We insert the extra columns in the dataset
df2016['starthour'] = list_of_starthours.astype(int)
df2016['endhour'] = list_of_stophours.astype(int)
df2016['weekday'] = list_of_weekdays.astype(int)

100%|██████████| 13845655/13845655 [3:15:17<00:00, 1181.61it/s]  


In [4]:
#df2016 = pd.read_csv("df2016WithAddedColumns.csv")

In [5]:
df2016.head()

Unnamed: 0.1,Unnamed: 0,tripduration,starttime,stoptime,startstationid,startstationname,startstationlatitude,startstationlongitude,endstationid,endstationname,endstationlatitude,endstationlongitude,bikeid,usertype,birthyear,gender,starthour,endhour,weekday
0,0,923,2016-01-01 00:00:41,2016-01-01 00:16:04,268,Howard St & Centre St,40.719105,-73.999733,3002,South End Ave & Liberty St,40.711512,-74.015756,22285,Subscriber,1958.0,1,0,0,4
1,1,379,2016-01-01 00:00:45,2016-01-01 00:07:04,476,E 31 St & 3 Ave,40.743943,-73.979661,498,Broadway & W 32 St,40.748549,-73.988084,17827,Subscriber,1969.0,1,0,0,4
2,2,589,2016-01-01 00:00:48,2016-01-01 00:10:37,489,10 Ave & W 28 St,40.750664,-74.001768,284,Greenwich Ave & 8 Ave,40.739017,-74.002638,21997,Subscriber,1982.0,2,0,0,4
3,3,889,2016-01-01 00:01:06,2016-01-01 00:15:56,268,Howard St & Centre St,40.719105,-73.999733,3002,South End Ave & Liberty St,40.711512,-74.015756,22794,Subscriber,1961.0,2,0,0,4
4,4,1480,2016-01-01 00:01:12,2016-01-01 00:25:52,2006,Central Park S & 6 Ave,40.765909,-73.976342,2006,Central Park S & 6 Ave,40.765909,-73.976342,14562,Subscriber,1952.0,1,0,0,4


In [6]:
# Makes a dataframe with the count of started trips, sorted by station first, then weekday, then hour
d1 = df2016.groupby([df2016['startstationid'], df2016['weekday'], df2016['starthour']])['startstationname'].count().reset_index(name = "startcount")
d1 = d1.rename(columns={'starthour':'hour','startstationid':'stationid'})



In [7]:
# Makes a dataframe with the count of ended trips, sorted by station first, then weekday, then hour
d2 = df2016.groupby([df2016['endstationid'], df2016['weekday'], df2016['endhour']])['endstationname'].count().reset_index(name='stopcount')
d2 = d2.rename(columns={'endhour':'hour','endstationid':'stationid'})



In [8]:
# We merge the two dataframes containing startcount and stopcount, and insert a new colum "taken" in the dataframe
d3 = d1.merge(d2, on = ['stationid', 'hour', 'weekday'])
d3['taken'] = d3['stopcount'] - d3['startcount']

In [9]:
d3.head()

Unnamed: 0,stationid,weekday,hour,startcount,stopcount,taken
0,72,0,0,9,23,14
1,72,0,1,11,16,5
2,72,0,2,6,9,3
3,72,0,4,12,12,0
4,72,0,5,12,19,7


## Data Prep

In [10]:
# Defines input features and target values
X = d3[d3.columns[0:3]].values
y1 = d3[d3.columns[5]].values

# Data Prep
y = np.array(["high demand" if n<0 else "low demand" for n in y1])
class_idx = {'high demand':1, 'low demand':0}

y = np.array([class_idx[v] for v in y])
y = np.eye(2)[y]


# Shuffles the data
random_idx = np.arange(X.shape[0])
np.random.seed(0)
np.random.shuffle(random_idx)

X = X[random_idx]
y = y[random_idx]

In [11]:
# Splits the data up into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Training
DT = DecisionTreeClassifier(min_samples_split=13, criterion = "entropy")
DT.fit(X_train, y_train)
predictions = DT.predict(X_test)

In [13]:
# Accuracy 
true = y_test.argmax(axis=1)
correct_preds = np.equal(true, predictions.argmax(axis=1))
sum(correct_preds) / len(true)

0.75860844870429534

In [16]:
# Error
MSE(X_test, y_test)

0.24286221410822051

In [25]:
# Creates a JSON file with the decision tree

feature_names = ['StationID', 'Weekday', 'Hour']
target_names = ["Unbusy", "Busy"]

r = rules(DT, feature_names, target_names)

with open('rules6.json', 'w') as f:
    f.write(json.dumps(r))

In [19]:
def rules(clf, features, labels, node_index=0):
    """Structure of rules in a fit decision tree classifier

    Parameters
    ----------
    clf : DecisionTreeClassifier
        A tree that has already been fit.

    features, labels : lists of str
        The names of the features and labels, respectively.

    """
    node = {}
    if clf.tree_.children_left[node_index] == -1:  # indicates leaf
        count_labels = zip(clf.tree_.value[node_index, 0], labels)
        node['name'] = ', '.join(('{} of {}'.format(int(count), label)
                                  for count, label in count_labels))
    else:
        feature = features[clf.tree_.feature[node_index]]
        threshold = clf.tree_.threshold[node_index]
        node['name'] = '{} > {}'.format(feature, threshold)
        left_index = clf.tree_.children_left[node_index]
        right_index = clf.tree_.children_right[node_index]
        node['children'] = [rules(clf, features, labels, right_index),
                            rules(clf, features, labels, left_index)]
    return node

In [15]:
# Calculates the Mean Squared Error

def MSE(X, y):
        
        # Predict
        yhat = DT.predict(X)
    
        # Calculate MSE
        return np.mean((y-yhat)**2) # * 0.5