In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from datetime import datetime, timedelta

# Load Data

In [4]:
# Load 2017 trips csv
df = pd.read_csv('metro-bike-share-trips-2017-q4-v2.csv', usecols=[2, 3, 4, 7])
df

Unnamed: 0,start_time,end_time,start_station,end_station
0,2017-12-30 10:20:00,2017-12-30 10:22:00,3000,3000
1,2017-12-29 08:03:00,2018-01-01 09:27:00,3022,3000
2,2017-12-30 23:44:00,2018-01-01 18:24:00,3007,3000
3,2017-12-31 12:43:00,2018-01-01 19:26:00,3047,3000
4,2017-12-30 22:13:00,2018-01-02 12:20:00,3023,3000
5,2017-12-31 20:40:00,2018-01-03 12:39:00,3005,3000
6,2017-12-29 11:29:00,2017-12-29 14:50:00,4211,3000
7,2017-12-30 10:24:00,2017-12-30 10:25:00,3000,3000
8,2017-12-30 10:20:00,2017-12-30 10:22:00,3000,3000
9,2017-12-30 00:18:00,2017-12-30 11:48:00,3024,3000


In [5]:
# Load bike id mapping csv
bike_ids = pd.read_csv('../metro-bike-share-stations-2019-01-07.csv')

# Feature Extraction

In [68]:
from datetime import datetime

def convert_to_seconds(time):
    """
    Converts a YYYY-MM-DD HH:MM:SS timestamp to Unix time (in seconds).
    """
    utc_time = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
    epoch_time = (utc_time - datetime(1970, 1, 1)).total_seconds()
    return epoch_time

# Convert into Desired Format
- Sort by timestamp
- Reformat to deliver information regarding each bike station

In [7]:
# Run this once to apply timestamp conversion
df['start_time'] = df['start_time'].apply(convert_to_seconds)
df['end_time'] = df['end_time'].apply(convert_to_seconds)
df.sort_values(by=['start_time'])

Unnamed: 0,start_time,end_time,start_station,end_station
65619,1.506816e+09,1.506818e+09,4216,4214
58714,1.506816e+09,1.506818e+09,4202,4204
50896,1.506817e+09,1.506818e+09,4162,4150
20800,1.506818e+09,1.506820e+09,3034,3034
20799,1.506818e+09,1.506820e+09,3034,3034
34110,1.506819e+09,1.506820e+09,3027,3062
34109,1.506819e+09,1.506820e+09,3027,3062
59940,1.506819e+09,1.506820e+09,4212,4208
59939,1.506819e+09,1.506820e+09,4212,4208
59938,1.506819e+09,1.506820e+09,4212,4208


In [None]:
col_names = ['time', 'is_arrival', 'station_number']
trip_df = pd.DataFrame(columns=col_names)

for index, row in df.iterrows():
    depart_row = {'time': row['start_time'], 'is_arrival': False, 'station_number': row['start_station']}
    arrive_row = {'time': row['end_time'], 'is_arrival': True, 'station_number': row['end_station']}
    trip_df = trip_df.append(depart_row, ignore_index=True)
    trip_df = trip_df.append(arrive_row, ignore_index=True)

trip_df.head()
trip_df['station_number'].nunique()


In [15]:
# write to csv
# trip_df.to_csv('trip.csv')
trip_df.head()

Unnamed: 0.1,Unnamed: 0,time,is_arrival,station_number
0,0,1514629000.0,False,3000.0
1,1,1514629000.0,True,3000.0
2,2,1514535000.0,False,3022.0
3,3,1514799000.0,True,3000.0
4,4,1514677000.0,False,3007.0


In [17]:
trip_df = pd.read_csv('trip.csv')
def convert_to_datetime(unix):
    return datetime.fromtimestamp(unix)

trip_df['time'] = trip_df['time'].apply(convert_to_datetime)
trip_df['time'] = pd.to_datetime(trip_df.time)
trip_df.sort_values('time', inplace=True)
trip_df.head()

Unnamed: 0.1,Unnamed: 0,time,is_arrival,station_number
131238,131238,2017-09-30 20:04:00,False,4216.0
117428,117428,2017-09-30 20:06:00,False,4202.0
101792,101792,2017-09-30 20:21:00,False,4162.0
41600,41600,2017-09-30 20:26:00,False,3034.0
131239,131239,2017-09-30 20:27:00,True,4214.0


In [66]:
station_cols = trip_df.station_number.unique()
# print(station_cols)
station_df = pd.DataFrame(columns=station_cols)

initial = {value: 0 for _, value in enumerate(station_cols)}

station_df = station_df.append(initial, ignore_index=True)

initial_arr = list(initial.values())
initial_arr.append(trip_df.loc[0]["time"])
station_to_idx = {value: i for i, value in enumerate(list(station_df))}
n = 0
test_dict = {n: initial_arr}

# for _, row in trip_df.iterrows():
#     entry = test_dict[n].copy()
#     idx = station_to_idx[row['station_number']]
#     if row['is_arrival']:
#         entry[idx] += 1
#     else:
#         entry[idx] -=1
#     n += 1
#     entry[-1] = row['time']
#     test_dict[n] = entry

In [24]:
columns = list(station_to_idx.keys())
columns.append('time')
realtime_df = pd.DataFrame.from_dict(test_dict, orient='index', columns=columns)
realtime_df.head()

Unnamed: 0,4216.0,4202.0,4162.0,3034.0,4214.0,4150.0,4204.0,3027.0,4212.0,3062.0,...,4206.0,3010.0,4174.0,3080.0,4207.0,3060.0,3013.0,4220.0,4227.0,time
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2017-12-30 05:20:00
1,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2017-09-30 20:04:00
2,-1,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2017-09-30 20:06:00
3,-1,-1,-1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2017-09-30 20:21:00
4,-1,-1,-1,-1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2017-09-30 20:26:00


In [25]:
realtime_df.to_csv('realtime.csv')

In [11]:
df = pd.read_csv('realtime.csv')
df['time'] = pd.to_datetime(df.time)
time = df.loc[0]['time']
n = 1
last_n = 0
d = {}
rows, cols = df.shape
initial = [0 for _ in range(cols-1)]

i = 0
for _, row in df.iterrows():
    if n >= rows:
        break
    row_time = df.loc[n]['time']
    diff = row_time - df.loc[n-1]['time']
    if row_time - df.loc[n-1]['time'] < timedelta(minutes=10):
        n += 1
        time = row_time
        continue
    reps = diff // timedelta(minutes=10)
    if reps > 1:
        for j in range(reps):
            new_time = row_time + j * timedelta(minutes=10)
            new_row = initial.copy() + [new_time]
            d[i] = new_row
            i += 1
        n += 1
        time = row_time
    else:
        diff = df.loc[n] - df.loc[n-1]
        diff['time'] = row_time
        d[i] = list(diff)
        last_n = n
        n += 1
        i += 1
        time = row_time

In [13]:
# delta_df = pd.DataFrame.from_dict(d, orient='index', columns=df.columns)
# delta_df.sort_values('time', inplace=True)
# delta_df
delta_df.to_csv('delta.csv')

In [53]:
# x: data points + time
# y: station with lowest
df = pd.read_csv('delta.csv')
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
stations = df.drop(['time'], axis=1)
stations = stations.idxmin(axis=1)
stations = stations.drop(stations.tail(0).index)
stations = stations.drop(stations.head(0).index)

df = df.drop(stations.tail(1).index)
df = df.drop(stations.head(1).index)
df['label'] = stations
df

Unnamed: 0,4216.0,4202.0,4162.0,3034.0,4214.0,4150.0,4204.0,3027.0,4212.0,3062.0,...,3010.0,4174.0,3080.0,4207.0,3060.0,3013.0,4220.0,4227.0,time,label
1,0,0,0,0,0,0,0,-1,0,0,...,0,0,0,0,0,0,0,0,2017-09-30 20:45:00,3027.0
2,0,0,0,0,0,0,0,0,-1,0,...,0,0,0,0,0,0,0,0,2017-09-30 20:56:00,4212.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2017-09-30 22:04:00,4216.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2017-09-30 22:14:00,4216.0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2017-09-30 22:23:00,3064.0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2017-09-30 23:06:00,4216.0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2017-09-30 23:16:00,4216.0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2017-09-30 23:26:00,4216.0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2017-09-30 23:23:00,3006.0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2017-09-30 23:40:00,4216.0


# Random Forest
- feed in time, delta
- output: counts at each station
- transfer user to station with lowest count

In [14]:
def grid_search(X_train, y_train):
    """
    Perform grid search for hyperparameters.
    """
    reg = RandomForestClassifier()
    param_grid = {
            "n_estimators"      : [10,50,100,500],
            "max_features"      : ["auto", "sqrt", "log2"],
            "min_samples_leaf" : [1,5,10,20]
            }
    grid = GridSearchCV(reg, param_grid, verbose=0)

    grid.fit(X_train, y_train)

    return grid.best_score_ , grid.best_params_

In [64]:
def create_rand_forest(X_train, y_train):
    
    # determine parameters
#     best_score, best_params = grid_search(X_train, y_train)
#     mf = best_params['max_features']
#     msl = best_params['min_samples_leaf']
#     ne = best_params['n_estimators']
    
    rfr = RandomForestRegressor(n_estimators=1000)
    rfr.fit(X_train, y_train)
    
    return rfr

In [69]:
x_cols = list(df.columns)
del x_cols[-1]
x = df[x_cols]
x['time'] = df['time'].apply(convert_to_seconds)
x.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,4216.0,4202.0,4162.0,3034.0,4214.0,4150.0,4204.0,3027.0,4212.0,3062.0,...,4206.0,3010.0,4174.0,3080.0,4207.0,3060.0,3013.0,4220.0,4227.0,time
1,0,0,0,0,0,0,0,-1,0,0,...,0,0,0,0,0,0,0,0,0,1506804000.0
2,0,0,0,0,0,0,0,0,-1,0,...,0,0,0,0,0,0,0,0,0,1506805000.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1506809000.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1506810000.0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1506810000.0


In [70]:
y = df['label']
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.20, random_state=42)

In [71]:
rfr = create_rand_forest(x_train, y_train)

In [91]:
# Get predictions
train_pred = rfr.predict(x_train)
val_pred = rfr.predict(x_val)
# n, d = x_val.shape
# data = np.random.rand(n, 1)
# f = lambda x: 1 if x > 0.5 else -1
base_pred = np.array([4216 for _ in range(val_pred.shape[0])])

# calculate error

train_err = np.asarray(y_train, dtype=float) - np.asarray(train_pred, dtype=float)
val_err = np.asarray(y_val, dtype=float) - np.asarray(val_pred, dtype=float)
base_err = (base_pred - val_pred)

print('Training Error:', np.count_nonzero(train_err)/len(train_err))
print('Validation Error:', np.count_nonzero(val_err)/len(val_err))
print('Baseline Error:', np.count_nonzero(base_err)/len(base_err))

# train_err

Training Error: 0.08645655877342419
Validation Error: 0.07482993197278912
Baseline Error: 0.12074829931972789
