In [43]:

import numpy as np
import csv
import os

import matplotlib.pyplot as plt

# Import data

def explore_train_split_index(explore_indices, boundary=115597):
    """Count how many leading entries of ``explore_indices`` are below ``boundary``.

    Iteration stops at the first index that is ``>= boundary``; indices after
    that point are not inspected, so the result is only a meaningful split
    position when the qualifying indices come first.

    Args:
        explore_indices: iterable of integer row indices.
        boundary: first index value that no longer belongs to the leading
            group. Defaults to 115597, the value previously hard-coded here.

    Returns:
        int: the number of leading indices smaller than ``boundary``. This is
        also returned when the iterable is exhausted without reaching the
        boundary (the earlier version returned ``None`` in that case).
    """
    result = 0
    for index in explore_indices:
        if index >= boundary:
            break
        result += 1
    return result
            
            
def main():
    """Load the trip CSVs, keep the explored rows, and parse their columns.

    Reads every ``*.csv`` file in ``./data``, concatenates their data rows,
    selects the rows listed in ``./explore_indices.csv`` and converts the
    relevant columns to Python values.

    Returns:
        tuple: ``(start_times, end_times, durations, start_stations,
        end_stations, bike_numbers, train_split_index)`` where the first six
        items are parallel lists (one entry per selected row) and
        ``train_split_index`` is the value computed by
        ``explore_train_split_index`` from the explore indices.
    """
    filenames_path = "./data"
    # os.listdir returns entries in arbitrary order. The explore indices are
    # positions into the concatenation of all files, so the file order must be
    # deterministic for those positions to refer to the same rows on every run.
    filenames = sorted(
        os.path.join(filenames_path, filename)
        for filename in os.listdir(filenames_path)
        if filename.endswith(".csv")
    )

    # These are 1-indexed like in R, hence the -1 to get Python positions.
    explore_indices = [int(row[0])-1 for row in file_rows("./explore_indices.csv")]
    rows = [row for filename in filenames for row in file_rows(filename)]
    train_split_index = explore_train_split_index(explore_indices)
    rows = [rows[i] for i in explore_indices]

    durations = [int(row[0]) for row in rows]
    start_times = [parse_timestr(row[1]) for row in rows]
    end_times = [parse_timestr(row[2]) for row in rows]
    #count_multiday_trips(start_times, end_times)
    # In fact, many people check out bikes late at night one day
    # and bike until the next day begins

    start_stations = [int(row[3]) for row in rows]
    end_stations = [int(row[5]) for row in rows]
    # note that stations get added to the program over time
    # columns 4,6 contain text representation of the numbers in columns 3,5

    bike_numbers = [bike_num(row[7]) for row in rows]
    # Some are None, the data is messy
    #assert_bike_numbers(bike_numbers)

    # Every membertype is "member"
    member_types = [row[8] for row in rows]
    assert_member_types(member_types)

    return start_times, end_times, durations, start_stations, end_stations, bike_numbers, train_split_index


def bike_num(s):
    """Parse a bike identifier such as ``"W01234"`` into its integer part.

    Args:
        s: raw bike-number field as a string.

    Returns:
        int | None: the integer following a leading ``"W"``/``"w"``, or
        ``None`` when the field does not start with that letter. An empty
        field also yields ``None`` instead of raising ``IndexError``.

    Raises:
        ValueError: if the text after the leading letter is not an integer.
    """
    # s[:1] is "" for an empty string, so blank fields fall through to None.
    if s[:1] in ("W", "w"):
        return int(s[1:])
    return None

def count_multiday_trips(start_times, end_times):
    """Print the fraction of trips whose start and end fall on different days.

    Both arguments are parallel sequences whose items carry the day in
    position 0; a trip counts as multiday when those two values differ.
    The denominator is the length of ``start_times``.
    """
    multiday = sum(
        1
        for begin, finish in zip(start_times, end_times)
        if begin[0] != finish[0]
    )
    proportion = multiday / len(start_times)
    print("The proportion of trips which are multiday is", proportion)

def assert_member_types(member_types):
    """Print how many member-type entries were supplied.

    Despite the name, nothing is asserted at present: the per-entry check
    (``entry == "Member"``) is disabled, so this only counts the entries.
    """
    # The count is taken by iterating, so any iterable is accepted.
    total = sum(1 for _ in member_types)
    print(total)

def assert_bike_numbers(bike_numbers):
    """Assert that every bike number survives a str -> int round trip.

    Each value is converted to text and parsed back as an integer; the
    result must equal the original value. Values whose text form is not an
    integer literal (for example ``None``) make ``int`` raise ``ValueError``.
    """
    # all() stops at the first mismatch, just like asserting inside a loop.
    assert all(int(str(value)) == value for value in bike_numbers)

def parse_timestr(instr):
    """Split a ``"<date> <H>:<M>:<S>"`` string into a date and a time of day.

    Returns:
        tuple: ``(date, seconds)`` where ``date`` is the text before the first
        space (unchanged) and ``seconds`` is the number of seconds since
        midnight as an int.
    """
    fields = instr.split(' ')
    date_part = fields[0]
    hours, minutes, seconds = fields[1].split(':')
    seconds_since_midnight = 3600 * int(hours) + 60 * int(minutes) + int(seconds)
    return date_part, seconds_since_midnight

def file_rows(filename):
    """Read a CSV file and return every row after the first one.

    The first row is skipped; it is presumably a header line (confirm against
    the data files).

    Args:
        filename: path of the CSV file to read.

    Returns:
        list[list[str]]: the remaining rows, each a list of field strings.
        An empty file yields an empty list.
    """
    # newline='' is what the csv module requires: it lets the reader do its
    # own line-ending handling so quoted fields containing line breaks are
    # returned intact instead of being altered by universal-newline mode.
    with open(filename, newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # drop the first row; tolerate an empty file
        return list(reader)




In [44]:
# Load and parse the data once; the cells that follow read these names.
(
    start_times,
    end_times,
    durations,
    start_stations,
    end_stations,
    bike_numbers,
    train_split_index,
) = main()


134237


In [45]:

def train_split(ls, split_index=None):
    """Return the training portion of ``ls``: everything before the split.

    Args:
        ls: sliceable sequence to split.
        split_index: position of the first test element. When omitted, the
            notebook-level ``train_split_index`` is used, which is what this
            function always did before the parameter existed.

    Returns:
        ``ls[:split_index]``.
    """
    if split_index is None:
        # Fall back to the value produced by main() in an earlier cell.
        split_index = train_split_index
    return ls[:split_index]

def test_split(ls):
    #train_split_index = 115597
    return ls[train_split_index:]

# Training portion of every parsed column, in the order main() returned them.
(
    train_start_times,
    train_end_times,
    train_durations,
    train_start_stations,
    train_end_stations,
    train_bike_numbers,
) = (
    train_split(column)
    for column in (start_times, end_times, durations, start_stations, end_stations, bike_numbers)
)

# Test portion of the same columns.
(
    test_start_times,
    test_end_times,
    test_durations,
    test_start_stations,
    test_end_stations,
    test_bike_numbers,
) = (
    test_split(column)
    for column in (start_times, end_times, durations, start_stations, end_stations, bike_numbers)
)


In [50]:
from copy import deepcopy

def plot_day_ranges(start_times):
    """Plot the distribution of the number of trips per day.

    Consecutive entries of ``start_times`` that share the same day (item 0 of
    each entry) are treated as one day. The function prints the number of
    days and the mean number of trips per day, then draws the per-day counts
    as histograms on a two-panel figure.

    The previous grouping loop recorded an empty group before the first day
    and never recorded the last day, so both printed figures were off; every
    day is now counted exactly once.

    Args:
        start_times: sequence of entries whose first item identifies the day,
            ordered so that entries for the same day are adjacent.
    """
    day_range_lens = []
    current_day = None
    for time in start_times:
        # Open a new bucket on the very first entry and whenever the day changes.
        if not day_range_lens or time[0] != current_day:
            current_day = time[0]
            day_range_lens.append(0)
        day_range_lens[-1] += 1

    ndays = len(day_range_lens)
    if ndays == 0:
        # Nothing to average or plot; avoids dividing by zero below.
        print(0, 0)
        return
    print(ndays, sum(day_range_lens)/ndays)

    # https://matplotlib.org/gallery/statistics/hist.html
    fig, axs = plt.subplots(1, 2, sharey=True, tight_layout=True)

    # We can set the number of bins with the `bins` kwarg
    # NOTE(review): both panels draw the same histogram; the second was
    # probably meant to use a different bin count - confirm.
    axs[0].hist(day_range_lens, bins=ndays)
    axs[1].hist(day_range_lens, bins=ndays)


def obtain_best_edgetrip_keys(edge_trips, limit=50):
    """Return the keys of the edges with the most recorded trips.

    Args:
        edge_trips: mapping from an edge key to a sized collection of trips.
        limit: maximum number of keys to return. Defaults to 50, the value
            previously hard-coded here.

    Returns:
        list: up to ``limit`` keys ordered by descending trip count; edges
        with equal counts are ordered by descending key.
    """
    # Sorting (count, key) pairs in reverse gives the count-then-key ordering.
    ranked = sorted(
        ((len(trips), key) for key, trips in edge_trips.items()),
        reverse=True,
    )
    return [key for _, key in ranked[:limit]]

def obtain_edge_trips(durations, start_stations, end_stations):
    """Group trip durations by their (start_station, end_station) pair.

    The three arguments are parallel sequences. The result maps each pair
    that occurs to the list of its durations, in input order.
    """
    grouped = {}
    for length, origin, destination in zip(durations, start_stations, end_stations):
        # setdefault creates the list the first time an edge is seen.
        grouped.setdefault((origin, destination), []).append(length)
    return grouped

    
    
    #print([edge_trips[k] for (_,k) in edge_numtrips[:50]])
    #print(edge_trips)
    #print( [len(edge_trips[k]) for (_,k) in edge_numtrips[:50]])
    #for k,v in edge_trips.items():
        #print(k,v)
        #print(sum(v)/len(v))


In [55]:
#plot_day_ranges(start_times)

def avg(ls):
    """Return the arithmetic mean of ``ls`` (sum divided by length)."""
    total = sum(ls)
    count = len(ls)
    return total / count

# Durations of the training trips, grouped by (start_station, end_station).
train_edge_trips = obtain_edge_trips(train_durations, train_start_stations, train_end_stations)
# The busiest training edges; the same keys are used for both splits so the
# two sets of means can be compared edge by edge.
trip_keys = obtain_best_edgetrip_keys(train_edge_trips)
train_mean_durations = {k : avg(train_edge_trips[k]) for k in trip_keys}

test_edge_trips = obtain_edge_trips(test_durations, test_start_stations, test_end_stations)
# An edge that is busy in the training split may not occur in the test split
# at all; record NaN for it instead of failing with KeyError.
test_mean_durations = {
    k : avg(test_edge_trips[k]) if k in test_edge_trips else float("nan")
    for k in trip_keys
}



In [58]:
#print("train",train_mean_durations)
#print("test",test_mean_durations)

# One line per edge: mean training duration, then mean test duration.
for edge in trip_keys:
    train_mean = train_mean_durations[edge]
    test_mean = test_mean_durations[edge]
    print(train_mean, test_mean)

450.4791666666667 280.97715736040607
4226.2444444444445 4912.534653465346
417.0243902439024 639.4172661870504
241.14634146341464 296.60508083140877
1011.5641025641025 678.2608695652174
500.6666666666667 461.3497267759563
348.48571428571427 485.2962962962963
398.8529411764706 691.5923913043479
409.5151515151515 403.0146750524109
688.78125 518.5978835978837
340.06451612903226 383.8838709677419
182.90322580645162 191.24892703862662
530.4666666666667 559.5870445344129
377.51724137931035 374.5705128205128
4206.964285714285 4844.0901639344265
4002.25 4766.974921630094
349.85714285714283 412.53469387755104
284.48148148148147 818.072072072072
473.44444444444446 589.611374407583
354.46153846153845 357.38170347003154
1132.8461538461538 393.9263157894737
810.4 417.10169491525426
533.76 380.07100591715977
409.25 507.58490566037733
381.75 393.82722513089004
518.75 546.7470588235294
740.5833333333334 988.9166666666666
1834.3478260869565 1762.8354430379748
459.1818181818182 511.9328063241107
345.5454