In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from get_features import *

In [2]:
def unique_stations(df):
    '''
    Identify the unique start stations that appear in a trip DataFrame.

    INPUT: DataFrame with a `start_station_id` column
    OUTPUT: 1 array of unique start station ids, in order of first appearance

    Only start stations are considered; end stations are not part of the
    result.
    '''
    # The per-station trip counts and the end-station ids that used to be
    # computed here were never used, so only the returned value is built.
    return df.start_station_id.unique()
    

In [3]:
# Load the March 2018 trip export and list its start-station ids in sorted order.
# NOTE(review): the path is relative to the working directory; the output
# recorded for this cell is a FileNotFoundError for this file.
df = pd.read_csv('data/201803-fordgobike-tripdata.csv')
# model_city is not defined in this notebook -- presumably it comes from the
# wildcard import of get_features; TODO confirm what it filters.
mar_18_sf = model_city(df)
mar_18= unique_stations(mar_18_sf)
np.sort(mar_18)

FileNotFoundError: [Errno 2] File b'data/201803-fordgobike-tripdata.csv' does not exist: b'data/201803-fordgobike-tripdata.csv'

In [None]:
# Same steps for the April 2018 trip export; note that this rebinds `df`.
df = pd.read_csv('data/201804-fordgobike-tripdata.csv')
# model_city is not defined in this notebook -- presumably it comes from the
# wildcard import of get_features; TODO confirm what it filters.
apr_18_sf = model_city(df)
apr_18= unique_stations(apr_18_sf)
np.sort(apr_18)

In [None]:
def new_stn_ids(old_stn, new_stn):
    '''
    Find the station ids that appear in new_stn but not in old_stn.

    INPUT: 2 lists (or arrays), in this order:
                    old_stn: station ids from the earlier period
                    new_stn: station ids from the later period
    OUTPUT: 1 sorted list of the ids that are only in new_stn
    '''
    added = set(new_stn) - set(old_stn)
    # Sort so the result does not depend on set iteration order.
    return sorted(added)



In [None]:
# Station ids found in the April 2018 array (apr_18) but not in the
# March 2018 array (mar_18).
proposed_stn = new_stn_ids(mar_18, apr_18)
proposed_stn

In [None]:
# Load the 2017 trips and pass them through feature_addition.
# feature_addition is not defined in this notebook -- presumably it comes from
# the wildcard import of get_features; later cells read .month and .day from
# its result. TODO confirm which columns it adds.
df_2017 = pd.read_csv('data/2017-fordgobike-tripdata.csv')
df_2017 = feature_addition(df_2017)


In [None]:
# Rebinds `df` (previously a single-month export) to the 2018-2019 file.
df = pd.read_csv('data/2018-2019-fordgobike-tripdata.csv')

In [None]:
# NOTE(review): this cell overwrites its own input, so running it twice
# applies feature_addition twice -- confirm that is harmless.
df = feature_addition(df)

In [None]:
# Despite the name, df_2018 holds only the April 2018 export (201804).
df_2018 = pd.read_csv('data/201804-fordgobike-tripdata.csv')
df_2018 = feature_addition(df_2018)

In [None]:
# Separate two consecutive months out of the 2017 data.
# cdf is the current month (called with 2017, 11).
# ndf is the next month (called with 2017, 12).
# month_sep is not defined in this notebook -- presumably it filters rows by
# (year, month); TODO confirm.
cdf = month_sep(df_2017, 2017, 11)
ndf = month_sep(df_2017, 2017, 12)

In [None]:
# Unique start-station ids for each month
# (cus = current month, nus = next month).
cus = unique_stations(cdf)
nus = unique_stations(ndf)

In [None]:
# List of proposed stations:
# ids present in the next month (nus) but not in the current month (cus).
ps = new_stn_ids(cus, nus)

In [None]:
ps

In [None]:
def stn_id_coords(df):
    '''
    Map each start station id to a (longitude, latitude) pair.

    INPUT: DataFrame with start_station_id, start_station_longitude and
           start_station_latitude columns
    OUTPUT: dict with station id as key and a 2-element numpy array
            [longitude, latitude] as value

    Each distinct coordinate pair is attributed to the station id of the first
    row that carries it. Pairs are visited in ascending (longitude, latitude)
    order, so if one id owns several pairs the last one in that order wins.
    Rows with a missing longitude or latitude are ignored.
    '''
    lon_lat = ['start_station_longitude', 'start_station_latitude']
    first_seen = (
        df[['start_station_id'] + lon_lat]
        # A NaN coordinate never compares equal to itself, so such rows could
        # not be matched back to a station id.
        .dropna(subset=lon_lat)
        # keep the first row per coordinate pair -> its id labels the pair
        .drop_duplicates(subset=lon_lat)
        .sort_values(lon_lat)
    )
    station_ids = first_seen['start_station_id'].to_numpy()
    coords = first_seen[lon_lat].to_numpy()
    return dict(zip(station_ids, coords))




In [None]:
def plot_new_station(df, new_stn, title="Map of Ford GoBike stations at the end of April 2018"):
    '''
    Scatter-plot every start station in df and mark the new stations.

    INPUT: df: DataFrame with start_station_id, start_station_longitude and
               start_station_latitude columns
           new_stn: iterable of station ids to highlight
           title: figure title (defaults to the original hard-coded text)
    OUTPUT: None; draws on a new 10x10 matplotlib figure

    All stations are drawn as small red dots and each new station as a blue
    'x'. A new station id that has no coordinates in df is reported and
    skipped instead of raising.
    '''
    id_coord = stn_id_coords(df)
    plt.figure(figsize = (10,10))
    # Hard-coded longitude/latitude window; the latitude span is made equal to
    # the longitude span so the view is square in degrees.
    ll,rr = -122.50, -122.36
    bb = 37.73
    plt.xlim(ll,rr)
    plt.ylim(bb, bb+(rr-ll))
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.title(title)
    plt.scatter(df.start_station_longitude, df.start_station_latitude, s=1, c=('r'))
    for n in new_stn:
        coord = id_coord.get(n)
        if coord is None:
            # .get returns None for an unknown id; indexing it would raise.
            print("Station {} has no coordinates in df; skipped.".format(n))
            continue
        plt.scatter(coord[0], coord[1], s=20, marker='x', c=('b'))

In [None]:
# Plot the stations of the next month (ndf) and mark the proposed ones (ps).
# NOTE(review): ndf was built with (2017, 12), while the plot title used by
# plot_new_station by default mentions April 2018.
plot_new_station(ndf, ps)

In [None]:
ps

### Use k-nearest neighbors (kNN) to pick the current-month stations closest to each proposed station

In [None]:
def euclidean_distance(x, y):
    '''
    Euclidean distance between x and y, computed per row.

    The squared differences are summed along axis 1, so x - y must be
    2-dimensional; one distance is returned for each row.
    '''
    delta = x - y
    squared = delta * delta
    return np.sqrt(squared.sum(axis=1))

In [None]:
def knn_proposed_stn(df, df2, proposed_stn, num_neighbors = 3):
    '''
    For each proposed station, find the ids of the closest stations in df.

    INPUT: df: DataFrame whose start stations are the candidate neighbors
           df2: DataFrame that contains the proposed stations; their
                coordinates are looked up here via stn_id_coords
           proposed_stn: iterable of station ids to find neighbors for
           num_neighbors: maximum number of neighbors per proposed station
    OUTPUT: dict mapping each proposed station id to a list of up to
            num_neighbors station ids taken from df, nearest first

    Distance is the plain Euclidean distance on (longitude, latitude).
    The nearest candidate is included; a candidate is skipped only when its
    id equals the proposed station's own id. If df holds fewer candidates
    than num_neighbors, the list is simply shorter.
    A proposed id with no coordinates in df2 raises TypeError (the lookup
    yields None, which cannot be subtracted from the coordinate array).
    '''
    coordinates = np.array(df[['start_station_longitude', 'start_station_latitude']])
    unique_coords = np.unique(coordinates, axis = 0)
    # Rows with a missing coordinate can neither be ranked nor matched back to
    # a station id, so they are not candidates.
    unique_coords = unique_coords[~np.isnan(unique_coords).any(axis = 1)]
    # coordinates of the proposed stations come from df2
    id_coord_df2 = stn_id_coords(df2)
    knn_dict = {}
    for p in proposed_stn:
        dist = euclidean_distance(id_coord_df2.get(p), unique_coords)
        ranked = unique_coords[np.argsort(dist)]
        v = []
        for lon, lat in ranked:
            if len(v) == num_neighbors:
                break
            # id of the first row in df that sits at this coordinate
            knn_id = df.start_station_id[(df.start_station_longitude == lon) &(df.start_station_latitude == lat)].iloc[0]
            if knn_id == p:
                # a station is not its own neighbor
                continue
            v.append(knn_id)
        knn_dict[p] = v
    return knn_dict

In [None]:
# For each proposed station in ps (coordinates taken from ndf), look up
# neighboring stations among the current-month stations in cdf.
knn_proposed_stn(cdf, ndf, ps)

### Time series by day

In [None]:
# 3-month history: the current month and the 2 previous months.
# cm is the first (expected to be the only) month value found in cdf.
cm = cdf.month.unique()[0]
# .copy() makes tsdf an independent frame, so columns added to it later do not
# write into (or warn about) a slice of df_2017.
# NOTE(review): the window is cm-2..cm on the month number only, so it does not
# wrap across a year boundary.
tsdf = df_2017[(df_2017.month <=cm) & (df_2017.month>cm-3)].copy()


In [None]:
# Peek at the first two rows of the 3-month window.
tsdf.head(2)

In [None]:
# Distinct month values present in the window, ascending.
months = np.sort(tsdf.month.unique())

In [None]:
months

In [None]:
# NOTE(review): scratch cell -- the `days` variable is not read by any other
# cell visible in this notebook.
days = np.zeros(1)
days

In [None]:
# NOTE(review): scratch loop -- it only assigns `mult` (1-based position of the
# month) and has no other effect; the same loop is used inside days_for_ts.
for idx, mon in enumerate(months):
    mult = idx+1
    

In [None]:
# Add a 'days' column initialised to a placeholder value of 1.
tsdf['days'] = 1

In [None]:
# Experiment: for the month == 11 rows only, set days to day * 2.
# A single .loc assignment is used because the chained form
# tsdf['days'][mask] = ... may write to a temporary copy and leave tsdf unchanged.
nov_rows = tsdf.month == 11
tsdf.loc[nov_rows, 'days'] = tsdf.loc[nov_rows, 'day'] * 2

In [None]:
def days_for_ts(df, cdf):
    '''
    Build the 3-month window ending at cdf's month and add a 'days' column.

    INPUT: df: DataFrame with `month` and `day` columns (the full history)
           cdf: DataFrame for the current month; its first `month` value
                defines the end of the window
    OUTPUT: a new DataFrame with the rows of df whose month is in
            (cm - 3, cm], plus a 'days' column; df itself is not modified

    'days' is day multiplied by the 1-based position of the row's month within
    the window (1 for the earliest month present, 2 for the next, ...).
    NOTE(review): day * position is the original formula and is kept as is; if
    'days' is meant to be a running day index across the window, different
    months can collide (e.g. 2 * 2 == 4 * 1) -- confirm the intent.
    '''
    cm = cdf.month.unique()[0]
    # Use the df argument (not a notebook global) and copy the slice so the new
    # column is written to an independent frame.
    tsdf = df[(df.month <= cm) & (df.month > cm - 3)].copy()
    months = np.sort(tsdf.month.unique())
    # month value -> 1-based position among the months present in the window
    position = {mon: idx + 1 for idx, mon in enumerate(months)}
    tsdf['days'] = tsdf.day * tsdf.month.map(position)
    return tsdf

In [None]:
# Build the 3-month time-series frame for the current month held in cdf.
ts = days_for_ts(df_2017, cdf)

In [None]:
# 'days' values of the trips that ended at station 246.
ts['days'][ts.end_station_id == 246]

In [None]:
def plt_stn(df, station_id):
    '''
    Line-plot the number of trips per 'days' value that ended at station_id.

    INPUT: df: DataFrame with `days` and `end_station_id` columns
           station_id: id of the end station to plot
    OUTPUT: None; draws on the current matplotlib axes
    '''
    arrivals = df['days'][df.end_station_id == station_id]
    # two columns: the 'days' value, then how many trips share it
    table = np.array(arrivals.value_counts().reset_index())
    order = np.argsort(table[:, 0])
    by_day = table[order]
    plt.plot(by_day[:, 0], by_day[:, 1])


In [None]:
# Trips per 'days' value for end station 248.
plt_stn(ts, 248)

In [None]:
# The cells below repeat plt_stn's steps by hand for end station 240:
# count the trips per 'days' value ...
ok = ts['days'][ts.end_station_id == 240].value_counts().reset_index()

In [None]:
# ... convert the two-column table to a numpy array ...
ok = np.array(ok)
ok

In [None]:
# ... and order the rows by the first column (the 'days' value).
ok[np.argsort(ok[:,0])]