In [3]:
import numpy as np
import csv
import os
import matplotlib.pyplot as plt


In [8]:

def obtain_data(train_bool=True):
    """Load the station volume table and the raw trip records from CSV.

    All paths are relative to the current working directory.

    Parameters
    ----------
    train_bool : bool, default True
        When true, read ``volume_train.csv``; otherwise read
        ``volume_test.csv``.

    Returns
    -------
    rows : list of dict
        One dict per data row of the selected volume file, keyed by the
        column headers. Empty when the file has a header but no data.
    raw_rows : list of dict
        Rows of ``raw_2010.csv`` followed by the rows of ``raw_2011.csv``.

    Side effects
    ------------
    Prints the set of column names found in the volume file.
    """
    volume_filename = "volume_train.csv" if train_bool else "volume_test.csv"

    # newline="" lets the csv module do its own newline handling, so quoted
    # fields that contain line breaks are parsed correctly.
    with open(volume_filename, newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        rows = list(reader)
        header = reader.fieldnames or []

    # Fall back to the header when there are no data rows; indexing rows[0]
    # unconditionally raised IndexError on an empty table.
    fields = set(rows[0]) if rows else set(header)
    print(fields)

    raw_rows = []
    for raw_filename in ("raw_2010.csv", "raw_2011.csv"):
        with open(raw_filename, newline="") as csvfile:
            raw_rows.extend(csv.DictReader(csvfile))

    return rows, raw_rows
    


In [None]:
# pvalues = {}
# key: station #
# value: p value
def obtain_pvalues(exp_num=2, train_bool=True):
    """Read the per-station p-values of one experiment from its CSV file.

    The file name is built from the experiment name selected by ``exp_num``
    plus ``_Train.csv`` or ``_Test.csv``, and is resolved relative to the
    current working directory.

    Parameters
    ----------
    exp_num : int, default 2
        Index into ``["ExtremeTempTraffic", "TempTraffic", "WeatherTraffic",
        "WeekendWeekday"]``. An index outside the list raises IndexError.
    train_bool : bool, default True
        Selects the ``_Train.csv`` file when true, ``_Test.csv`` otherwise.

    Returns
    -------
    dict
        Maps each value of the ``stations`` column to the matching value of
        the ``station.p.vals`` column. Values are returned exactly as read
        from the CSV, i.e. as strings; convert with ``float`` before doing
        numeric comparisons.
    """
    pv_fnames = ["ExtremeTempTraffic", "TempTraffic", "WeatherTraffic", "WeekendWeekday"]
    suffix = "_Train.csv" if train_bool else "_Test.csv"
    fname = pv_fnames[exp_num] + suffix

    with open(fname, newline="") as csvfile:
        pvalues = {row["stations"]: row["station.p.vals"] for row in csv.DictReader(csvfile)}

    return pvalues



In [None]:
# Load the notebook-wide inputs with the default arguments:
#   rows     - volume_train.csv rows
#   raw_rows - raw_2010.csv rows followed by raw_2011.csv rows
rows, raw_rows = obtain_data()
# Default experiment index 2 with the training split, i.e. WeatherTraffic_Train.csv
pvalues = obtain_pvalues()

In [9]:
# For myself and my project partners

# SDR: Station Day Rides

# key: (station, date)
# value: list of indices of raw data 
def obtain_sdr():
    """Index the global ``raw_rows`` trip records by (station, date).

    The date is the part of ``"Start date"`` before the first space, and it
    is used for both ends of the trip.

    Returns
    -------
    dict
        Maps ``(station, date)`` to the list of positions in ``raw_rows`` of
        trips touching that station on that date. Each trip is listed under
        its start station and again under its end station, so a trip that
        starts and ends at the same station appears twice in that list.
    """
    index_by_station_day = {}
    for position, trip in enumerate(raw_rows):
        day = trip["Start date"].split(" ")[0]
        # Origin first, then destination, so list order matches file order.
        for column in ("Start station number", "End station number"):
            index_by_station_day.setdefault((trip[column], day), []).append(position)
    return index_by_station_day

# Output to CSV with 1-indexing, for correct use in R
def output_sdr_csv():
    """Write the (station, date) -> trips index to ``station_date_trips.csv``.

    The index comes from ``obtain_sdr()``. The output has a header row
    ``Station, Date, Trips`` and one row per (station, date). ``Trips`` is a
    single space-separated string of raw-data row numbers, shifted to
    1-based so they can be used directly as R indices. Every field is quoted.

    The file is created (or overwritten) in the current working directory.
    """
    station_date_rides = obtain_sdr()

    # newline="" stops the platform from translating the csv writer's own
    # line terminator (avoids blank lines between rows on Windows).
    with open("station_date_trips.csv", "w", newline="") as outfile:
        wrt = csv.writer(outfile, quoting=csv.QUOTE_ALL)
        wrt.writerow(["Station", "Date", "Trips"])
        for (station, date), indices in station_date_rides.items():
            # Apply the +1 at write time. The earlier separate loop only
            # rebound its loop variable, so the shift never reached the file.
            trips = " ".join(str(i + 1) for i in indices)
            wrt.writerow([station, date, trips])
    

In [5]:

# key: station
# value: dict of daily traffic values with: 
#   (key: date, val: traffic level value)
def obtain_station_days(rows):
    """Pivot volume rows into a nested ``station -> date -> traffic`` mapping.

    Parameters
    ----------
    rows : iterable of mapping
        Each item must provide the keys ``"Station"``, ``"Date"`` and ``"X"``.

    Returns
    -------
    dict
        ``{station: {date: row["X"]}}``. If a (station, date) pair occurs
        more than once, the last row wins.
    """
    per_station = {}
    for record in rows:
        name = record["Station"]
        daily = per_station.setdefault(name, {})
        day = record["Date"]
        daily[day] = record["X"]
    return per_station
    
# Return indices of those discoveries made
def bh(pvs, alpha):
    """Benjamini-Hochberg step-up procedure.

    With the p-values sorted ascending as p_(1) <= ... <= p_(n), find the
    largest rank k (1-based) with ``p_(k) <= alpha * k / n`` and reject the k
    smallest p-values.

    Parameters
    ----------
    pvs : sequence of float
        P-values; infinite entries are allowed and are never rejected.
    alpha : float
        Target false discovery rate level.

    Returns
    -------
    list of int
        Positions in ``pvs`` of the rejected hypotheses (the discoveries),
        ordered by ascending p-value. Empty when nothing is rejected or when
        ``pvs`` is empty.
    """
    n = float(len(pvs))
    if n == 0:
        return []

    # Pair each p-value with its original position so the positions survive
    # the sort; ties are ordered by position.
    ranked = sorted((p, i) for i, p in enumerate(pvs))

    # Step-up rule: scan every rank and keep the LARGEST one that satisfies
    # the bound. Ranks are 1-based; a 0-based rank makes the first threshold
    # 0 and shifts every later one down a step.
    k = 0
    for rank, (p, _) in enumerate(ranked, start=1):
        if p <= (alpha * rank) / n:
            k = rank

    return [i for _, i in ranked[:k]]

#def storey_bh(pvs, alpha, gamma=0.5):
def storey_pihat(group, gamma=0.5):
    """Estimate the proportion of true nulls in a group of p-values.

    Computes ``min(#{p > gamma} / (len(group) * (1 - gamma)), 1)``.

    Parameters
    ----------
    group : sequence of float
        P-values of one group.
    gamma : float, default 0.5
        Cut-off above which a p-value is counted. Expected to lie in
        ``[0, 1)``; ``gamma == 1`` makes the denominator zero and raises
        ZeroDivisionError.

    Returns
    -------
    float
        The estimate, capped at 1. An empty group returns 1.0, the most
        conservative value, instead of dividing by zero.
    """
    if len(group) == 0:
        return 1.0

    num = float(sum(1 for pv in group if pv > gamma))
    denom = len(group) * (1. - gamma)
    # The estimate used to be computed and then dropped; return it.
    return min(num / denom, 1.0)
    
from copy import deepcopy
#  function
def group_adaptive_bh(p_vector, groups_indices, alpha, gamma=0.5):
    """Run BH on p-values re-weighted by a per-group null-proportion estimate.

    For each group the estimate is
    ``min(#{p > gamma} / (group size * (1 - gamma)), 1)``, computed from the
    unweighted p-values. Every p-value is multiplied by its group's estimate,
    any weighted value above ``gamma`` is replaced by infinity, and the
    result is passed to ``bh``.

    Parameters
    ----------
    p_vector : sequence of float (or numeric strings)
        P-values. Entries are converted with ``float``. The argument itself
        is not modified.
    groups_indices : iterable of iterable of int
        One collection of positions in ``p_vector`` per group. Positions that
        belong to no group keep their original p-value. Empty groups are
        ignored.
    alpha : float
        Level handed to ``bh``.
    gamma : float, default 0.5
        Cut-off used both for the per-group estimate and for the exclusion
        step. Expected in ``[0, 1)``.

    Returns
    -------
    list of int
        Whatever ``bh`` returns for the weighted p-values: the positions in
        ``p_vector`` of the discoveries.
    """
    original = [float(p) for p in p_vector]
    # Materialise so the groups can be walked twice even if a generator was given.
    groups_indices = [list(gi) for gi in groups_indices]

    # Estimate for each group, always from the unweighted p-values.
    group_pi_hats = []
    for members in groups_indices:
        if not members:
            group_pi_hats.append(1.0)  # nothing to weight; avoids 0/0
            continue
        num = float(sum(1 for i in members if original[i] > gamma))
        denom = len(members) * (1. - gamma)
        group_pi_hats.append(min(num / denom, 1.0))

    # Weight a copy so the caller's list keeps its original values.
    mod_p_vector = list(original)
    for members, pi_hat in zip(groups_indices, group_pi_hats):
        for i in members:
            mod_p_vector[i] *= pi_hat

    # NOTE(review): as in the earlier code, the gamma cut-off is applied to
    # the weighted values, not the raw p-values - confirm this is intended.
    never_reject = float("inf")
    for i, p in enumerate(mod_p_vector):
        if p > gamma:
            mod_p_vector[i] = never_reject

    # Ordinary BH on the modified p-values.
    return bh(mod_p_vector, alpha)
    
    
    

KeyError: 'Station'

In [None]:
# key: station
# value: list of dates on which the station is valid within the train/test/validate split
# Collect, per station, every date that appears for it in `rows`
# (duplicates are kept, in row order).
valid_station_days = {}
for row in rows:
    station, date = row["Station"], row["Date"]
    valid_station_days.setdefault(station, []).append(date)


def obtain_duration_groups(raw_rows, valid_station_days, station_date_rides=None, ngroups=5):
    """Split stations into groups ordered by their average trip duration.

    For every station the durations of all trips indexed under its valid
    (station, date) pairs are summed and divided by the number of such trips.
    Stations are then sorted by that average (ties broken by station key) and
    cut into ``ngroups`` consecutive groups of equal size; leftover stations
    go into the last group.

    Parameters
    ----------
    raw_rows : sequence of mapping
        Raw trip records. ``"Duration"`` is converted with ``float``.
        NOTE(review): assumes the column is numeric (e.g. seconds) - confirm
        against the raw files. ``"Start date"``, ``"Start station number"``
        and ``"End station number"`` are read only when
        ``station_date_rides`` is not supplied.
    valid_station_days : mapping
        ``station -> iterable of dates`` to include for that station. Dates
        must use the same text form as the keys of ``station_date_rides``.
    station_date_rides : mapping, optional
        ``(station, date) -> list of positions in raw_rows``, as produced by
        ``obtain_sdr``. Built from ``raw_rows`` when omitted.
    ngroups : int, default 5
        Number of groups to return; must be at least 1.

    Returns
    -------
    list of list
        ``ngroups`` lists of station keys, from shortest to longest average
        duration. A station with no indexed trips gets an average of 0.0 and
        therefore sorts first.

    Raises
    ------
    ValueError
        If ``ngroups`` is smaller than 1.
    """
    if ngroups < 1:
        raise ValueError("ngroups must be at least 1")

    if station_date_rides is None:
        # Same index obtain_sdr() builds, but from the raw_rows argument
        # instead of a notebook global. A trip is listed under its start
        # station and again under its end station.
        station_date_rides = {}
        for i, trip in enumerate(raw_rows):
            day = trip["Start date"].split(" ")[0]
            for column in ("Start station number", "End station number"):
                station_date_rides.setdefault((trip[column], day), []).append(i)

    avg_station_durations = {}
    for station, dates in valid_station_days.items():
        total_duration = 0.0
        num_trips = 0
        for date in dates:
            # A valid day with no recorded trips contributes nothing.
            trip_indices = station_date_rides.get((station, date), [])
            for i in trip_indices:
                total_duration += float(raw_rows[i]["Duration"])
            num_trips += len(trip_indices)
        avg_station_durations[station] = total_duration / num_trips if num_trips else 0.0

    # Order by the average itself, not by station key.
    ordered_stations = sorted(
        avg_station_durations, key=lambda s: (avg_station_durations[s], s)
    )

    ratio = len(ordered_stations) // ngroups
    groups = [ordered_stations[ratio * i:ratio * (i + 1)] for i in range(ngroups)]
    # Stations left over after the even split join the last group.
    groups[-1] += ordered_stations[ratio * ngroups:]
    return groups
        

    