In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

  from numpy.core.umath_tests import inner1d


# Load Data

In [4]:
# Load 2017 trips csv
df = pd.read_csv('../metro-bike-share-trips-2017-q4-v2.csv', usecols=[2, 3, 4, 7])
df

Unnamed: 0,start_time,end_time,start_station,end_station
0,2017-12-30 10:20:00,2017-12-30 10:22:00,3000,3000
1,2017-12-29 08:03:00,2018-01-01 09:27:00,3022,3000
2,2017-12-30 23:44:00,2018-01-01 18:24:00,3007,3000
3,2017-12-31 12:43:00,2018-01-01 19:26:00,3047,3000
4,2017-12-30 22:13:00,2018-01-02 12:20:00,3023,3000
5,2017-12-31 20:40:00,2018-01-03 12:39:00,3005,3000
6,2017-12-29 11:29:00,2017-12-29 14:50:00,4211,3000
7,2017-12-30 10:24:00,2017-12-30 10:25:00,3000,3000
8,2017-12-30 10:20:00,2017-12-30 10:22:00,3000,3000
9,2017-12-30 00:18:00,2017-12-30 11:48:00,3024,3000


In [5]:
# Load bike id mapping csv
bike_ids = pd.read_csv('../metro-bike-share-stations-2019-01-07.csv')

# Feature Extraction

In [6]:
from datetime import datetime

def convert_to_seconds(time):
    """
    Converts a YYYY-MM-DD HH:MM:SS timestamp to Unix time (in seconds).
    """
    utc_time = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
    epoch_time = (utc_time - datetime(1970, 1, 1)).total_seconds()
    return epoch_time

# Convert into Desired Format
- Sort by timestamp
- Reformat to deliver information regarding each bike station

In [7]:
# Run this once to apply timestamp conversion
df['start_time'] = df['start_time'].apply(convert_to_seconds)
df['end_time'] = df['end_time'].apply(convert_to_seconds)
df.sort_values(by=['start_time'])

Unnamed: 0,start_time,end_time,start_station,end_station
65619,1.506816e+09,1.506818e+09,4216,4214
58714,1.506816e+09,1.506818e+09,4202,4204
50896,1.506817e+09,1.506818e+09,4162,4150
20800,1.506818e+09,1.506820e+09,3034,3034
20799,1.506818e+09,1.506820e+09,3034,3034
34110,1.506819e+09,1.506820e+09,3027,3062
34109,1.506819e+09,1.506820e+09,3027,3062
59940,1.506819e+09,1.506820e+09,4212,4208
59939,1.506819e+09,1.506820e+09,4212,4208
59938,1.506819e+09,1.506820e+09,4212,4208


In [29]:
col_names = ['time', 'is_arrival', 'station_number']
trip_df = pd.DataFrame(columns=col_names)

for index, row in df.iterrows():
    depart_row = {'time': row['start_time'], 'is_arrival': False, 'station_number': row['start_station']}
    arrive_row = {'time': row['end_time'], 'is_arrival': True, 'station_number': row['end_station']}
    trip_df = trip_df.append(depart_row, ignore_index=True)
    trip_df = trip_df.append(arrive_row, ignore_index=True)

trip_df.head()
trip_df['station_number'].nunique()


121

In [19]:
# write to csv
trip_df.to_csv('trip.csv')

In [30]:
station_cols = trip_df.station_number.unique()
# print(station_cols)
station_df = pd.DataFrame(columns=station_cols)

initial = [20]*len(station_cols)
# print(initial[0:])

station_df = station_df.append(initial, ignore_index=True)

# for index, row in trip_df.iterrows():
#     arr = initial[len(initial)-1:]
#     if row['is_arrival']:
#         arr[row['station_number']] +=1 
#     else:
#         arr[row['station_number']] -=1
#     initial.append(arr, ignore_index=True)
    
station_df.head()




[3000. 3022. 3007. 3047. 3023. 3005. 4211. 3024. 3030. 3067. 4160. 4210.
 3082. 3036. 3033. 4162. 3014. 3010. 3055. 4177. 3026. 4132. 3054. 3064.
 3049. 3069. 4212. 4167. 4214. 3019. 4180. 3046. 4209. 4148. 3045. 3042.
 3034. 4215. 3035. 3032. 3066. 3063. 3040. 3027. 3062. 3031. 3081. 3013.
 3077. 3008. 3037. 4142. 4181. 4151. 3079. 3052. 3006. 3038. 3025. 3075.
 4129. 4205. 4216. 4202. 4220. 4165. 3048. 4157. 3018. 3060. 4204. 4155.
 3051. 4158. 3029. 3068. 4170. 4153. 3074. 4126. 3078. 3076. 3065. 4207.
 4227. 3058. 4127. 4208. 4174. 4147. 4213. 3057. 4134. 4163. 3028. 4166.
 4144. 4138. 3020. 4183. 4169. 3011. 4206. 4135. 3016. 4131. 4152. 4130.
 4146. 4149. 3056. 4159. 4156. 4176. 4150. 4108. 3080. 4125. 4133. 4136.
 4154.]
[20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 2

Unnamed: 0,0.0,3000.0,3005.0,3006.0,3007.0,3008.0,3010.0,3011.0,3013.0,3014.0,...,4209.0,4210.0,4211.0,4212.0,4213.0,4214.0,4215.0,4216.0,4220.0,4227.0
0,20.0,,,,,,,,,,...,,,,,,,,,,
1,20.0,,,,,,,,,,...,,,,,,,,,,
2,20.0,,,,,,,,,,...,,,,,,,,,,
3,20.0,,,,,,,,,,...,,,,,,,,,,
4,20.0,,,,,,,,,,...,,,,,,,,,,


# Random Forest

In [14]:
def grid_search(X_train, y_train):
    """
    Perform grid search for hyperparameters.
    """
    reg = RandomForestClassifier()
    param_grid = {
            "n_estimators"      : [10,50,100,500],
            "max_features"      : ["auto", "sqrt", "log2"],
            "min_samples_leaf" : [1,5,10,20]
            }
    grid = GridSearchCV(reg, param_grid, verbose=0)

    grid.fit(X_train, y_train)

    return grid.best_score_ , grid.best_params_

In [34]:
def create_rand_forest(X_train, y_train):
    
    # determine parameters
#     best_score, best_params = grid_search(X_train, y_train)
#     mf = best_params['max_features']
#     msl = best_params['min_samples_leaf']
#     ne = best_params['n_estimators']
    
    rfr = RandomForestRegressor(n_estimators=1000)
    rfr.fit(X_train, y_train)
    
    return rfr

In [28]:
x_cols = ['start_time', 'end_time', 'start_station']
x = df[x_cols]
x['start_time'] = df['start_time'].apply(convert_to_seconds)
x['end_time'] = df['end_time'].apply(convert_to_seconds)

x.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,start_time,end_time,start_station
0,1514629000.0,1514629000.0,3000
1,1514535000.0,1514799000.0,3022
2,1514677000.0,1514831000.0,3007
3,1514724000.0,1514835000.0,3047
4,1514672000.0,1514896000.0,3023


In [29]:
y = df['end_station']
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.20, random_state=42)

In [35]:
rfr = create_rand_forest(x_train, y_train)

In [37]:
# Get predictions
train_pred = rfr.predict(x_train)
# val_pred = rfr.predict(x_val)
# n, d = x_val.shape
# data = np.random.rand(n, 1)
# f = lambda x: 1 if x > 0.5 else -1
# base_pred = np.array(list(map(f, data)))

# calculate error

# train_err = (y_train - train_pred)
# val_err = (y_val - val_pred)
# base_err = (base_pred - val_pred)

# print('Training Error:', np.count_nonzero(train_err)/len(train_err))
# print('Validation Error:', np.count_nonzero(val_err)/len(val_err))
# print('Baseline Error:', np.count_nonzero(base_err)/len(base_err))

train_pred

array([4213.958, 3047.217, 3071.808, ..., 4164.231, 3004.509, 3024.383])