In [2]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import datetime

import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib.ticker import MaxNLocator

import feather

from tqdm import tqdm_notebook as tqdm

from tqdm import tqdm_pandas

from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

from multiprocessing import Pool

## Loading and setting up the data

In [8]:
# Load the stop-events table from Feather. Going by the file name it holds both
# train and test rows together with the historical averages and prev/next dwell
# columns -- TODO confirm against the notebook that wrote it.
se = feather.read_dataframe("../data_files/B/once/75days/stop_events_with_geo_train_test_averages_prev_next_dwell.feather")
# se = feather.read_dataframe("../data_files/B/once/75days/se_prev_next.feather")
# Use the first column as the index (presumably the original index that was
# written out as a regular column when the frame was saved -- confirm).
se = se.set_index(se.columns[0])

In [9]:
 # And now for just segments:
# Absolute deviation of the observed segment duration from each historical mean.
se["diff_segment_and_mean_by_segment_code"] = se["segment_duration"].sub(
    se["mean_durations_by_segment_code"]
)
se["diff_segment_and_mean_by_segment_code_and_hour_and_day"] = se["segment_duration"].sub(
    se["mean_durations_by_segment_code_and_hour_and_day"]
)

# The same deviations expressed as a percentage of the corresponding mean.
# A mean of zero yields inf/NaN here, exactly as with the plain operators.
se["diff_percent_segment_and_mean_by_segment_code"] = (
    se["diff_segment_and_mean_by_segment_code"]
    .mul(100)
    .div(se["mean_durations_by_segment_code"])
)
se["diff_percent_segment_and_mean_by_segment_code_and_hour_and_day"] = (
    se["diff_segment_and_mean_by_segment_code_and_hour_and_day"]
    .mul(100)
    .div(se["mean_durations_by_segment_code_and_hour_and_day"])
)

In [12]:
def include_columns_containing(se, to_include):
    """Return the columns of ``se`` whose name contains any of the given fragments.

    Parameters
    ----------
    se : pandas.DataFrame
        Frame to select columns from.
    to_include : iterable of str
        Substrings to look for; a column is kept when at least one of them
        occurs anywhere in its name.

    Returns
    -------
    pandas.DataFrame
        ``se`` restricted to the matching columns, in their original order.
        An empty ``to_include`` selects no columns.
    """

    def _matches(column):
        for fragment in to_include:
            if fragment in column:
                return True
        return False

    return se[[column for column in se.columns if _matches(column)]]

In [18]:
def exclude_columns_containing(se, to_remove):
    """Return ``se`` without the columns whose name contains any given fragment.

    Parameters
    ----------
    se : pandas.DataFrame
        Frame to select columns from.
    to_remove : iterable of str
        Substrings to look for; a column is dropped when at least one of them
        occurs anywhere in its name.

    Returns
    -------
    pandas.DataFrame
        ``se`` restricted to the non-matching columns, in their original order.
        An empty ``to_remove`` keeps every column.
    """
    kept = []
    for column in se.columns:
        if all(fragment not in column for fragment in to_remove):
            kept.append(column)
    return se[kept]

In [19]:
# Drop every column whose name contains one of these prev/next prefixes; the
# prev/next segment-code columns are merged back in from `se_prev_next` below.
se = exclude_columns_containing(
    se,
    [
        "prev_segment_code_",
        "next_segment_code_",
        "prev_event_index_",
        "next_event_index_",
    ],
)

In [13]:
# `se_prev_next` is not assigned anywhere in the visible part of this notebook,
# so on a fresh kernel this cell failed with a NameError. Load it here when it
# is not already in memory.
# NOTE(review): the path is the one from the commented-out read in the loading
# cell -- confirm that this is the file the prev/next columns should come from.
if "se_prev_next" not in globals():
    se_prev_next = feather.read_dataframe("../data_files/B/once/75days/se_prev_next.feather")

# Keep the merge keys plus the prev/next segment-code columns. Matching is by
# substring, so any column whose name merely contains e.g. 'segment_code' or
# 'date' is kept as well.
se_prev_next = include_columns_containing(se_prev_next, ['workid', 'date', 'segment_code', 'prev_segment_code_', 'next_segment_code_'])

In [20]:
# Left join: every row of `se` is kept, and the prev/next segment-code columns
# are attached wherever (workid, date, segment_code) matches.
se = pd.merge(
    se,
    se_prev_next,
    how="left",
    on=['workid', 'date', 'segment_code'],
)

In [None]:
# We need to generate this from scratch as we need both test and train data.
# ts_5 = se.pivot_table(
#     index="actualArrival",
#     columns="segment_code",
#     values="diff_percent_segment_and_mean_by_segment_code_and_hour_and_day",
#     aggfunc=np.median,
# )

In [21]:
# From: https://stackoverflow.com/questions/31661604/efficiently-create-sparse-pivot-tables-in-pandas
#
# Sparse equivalent of the commented-out pivot_table: one row per distinct
# arrival time, one column per distinct segment code, holding the percentage
# deviation from the segment/hour/day mean.

# Ordered categoricals over the sorted unique values give each arrival time and
# each segment code a stable integer position.
arrival_c = CategoricalDtype(sorted(se.actualArrival.unique()), ordered=True)
segment_code_c = CategoricalDtype(sorted(se.segment_code.unique()), ordered=True)

# Integer row/column coordinates for every stop event.
row = se.actualArrival.astype(arrival_c).cat.codes
col = se.segment_code.astype(segment_code_c).cat.codes
# NOTE(review): per the scipy docs, entries sharing the same (row, col) are summed
# when a CSR matrix is built from coordinates, whereas the pivot_table above used
# the median -- confirm duplicates cannot occur or that summing is acceptable.
sparse_matrix = csr_matrix((se["diff_percent_segment_and_mean_by_segment_code_and_hour_and_day"], (row, col)), \
                           shape=(arrival_c.categories.size, segment_code_c.categories.size))



In [22]:
# ts.loc['2018-12-16 14:00:00':'2018-12-16 14:08:01', '1200BOB20146_1200DOY38562_0'].dropna().mean()

In [23]:
num_partitions = 20 #number of partitions to split dataframe
num_cores = 5 #number of cores on your machine

def parallelize_dataframe_to_numpy(df, func, args):
    """Apply ``func`` to row-wise chunks of ``df`` in parallel and stack the results.

    Parameters
    ----------
    df : pandas.DataFrame (or numpy array)
        Data to split into ``num_partitions`` consecutive row chunks.
    func : callable
        Called as ``func(chunk, *args)`` in a worker process; it must be
        picklable and return something ``np.concatenate`` can stack.
    args : tuple
        Extra positional arguments passed to every call of ``func``.

    Returns
    -------
    numpy.ndarray
        The per-chunk results concatenated along the first axis, in chunk order.
    """
    if hasattr(df, "iloc"):
        # Split by position rather than handing the DataFrame to np.array_split,
        # which goes through the deprecated DataFrame.swapaxes. The chunk
        # boundaries are the same ones np.array_split would produce.
        positions = np.array_split(np.arange(len(df)), num_partitions)
        df_split = [df.iloc[chunk] for chunk in positions]
    else:
        df_split = np.array_split(df, num_partitions)

    # This line is fiddly: we make a list where each item is a tuple of
    # a bit of the dataframe and whatever is passed in as args.
    # Then starmap unpacks that tuple so each copy of func gets its
    # little bit of the dataframe and the right args to do its job.
    # All this to avoid globals!
    all_args = [(split,) + args for split in df_split]

    # The context manager shuts the worker processes down even when func raises.
    with Pool(num_cores) as pool:
        results = pool.starmap(func, all_args)

    return np.concatenate(results)

In [24]:
# Lookup indexes matching the rows / columns of `sparse_matrix`.
arrival_index = pd.to_datetime(arrival_c.categories)
segment_code_index = segment_code_c.categories

def calc_segment(se, segment, window_count, window_size = 5):
    """Summarise recent observations on a segment for every row of ``se``.

    For each row, the segment code in column ``segment`` is looked up in
    ``sparse_matrix`` and the stored values are summarised over ``window_count``
    consecutive time windows reaching back from the row's ``actualArrival``.

    Parameters
    ----------
    se : pandas.DataFrame
        Must contain the columns ``'actualArrival'`` and ``segment``.
    segment : str
        Name of the column holding the segment code to look up.
    window_count : int
        Number of windows to look back.
    window_size : int, default 5
        Length of each window in minutes.

    Returns
    -------
    numpy.ndarray
        Shape ``(len(se), 1 + 2 * window_count)``: column 0 is ``se.index``,
        followed by the per-window means (NaN where a window holds nothing)
        and then the per-window observation counts.
    """
    row_count = se.shape[0]
    last_time_slots_mean = np.full((row_count, window_count), np.nan)
    last_time_slots_count = np.zeros((row_count, window_count), dtype=int)

    # Loop-invariant: build the Timedeltas once instead of once per row and window.
    window = pd.Timedelta(f"{window_size} min")
    one_second = pd.Timedelta("1 sec")

    for idx, row in enumerate(se[['actualArrival', segment]].itertuples()):
        # itertuples puts the index at position 0, then the selected columns.
        arrival, code = row[1], row[2]

        # Rows without a usable segment code keep their NaN means / zero counts.
        if code == "":
            continue
        try:
            column_index = segment_code_index.get_loc(code)
        except KeyError:
            continue

        for i in range(window_count):
            # Window i covers [arrival - (i+1)*window, arrival - i*window + 1s].
            # NOTE(review): window 0 therefore extends past the row's own arrival,
            # so when `segment` is the row's own segment its own observation
            # appears to be included -- confirm that this is intended.
            slice_obj = arrival_index.slice_indexer(arrival - (i + 1) * window,
                                                    arrival - i * window + one_second)

            journeys = sparse_matrix[slice_obj, column_index].data

            if journeys.shape[0] == 0:
                continue

            last_time_slots_mean[idx, i] = journeys.mean()
            last_time_slots_count[idx, i] = journeys.shape[0]

    return np.concatenate((se.index.values.reshape(-1, 1), last_time_slots_mean, last_time_slots_count), axis=1)



In [None]:
# History features for each stop event's own segment: the extra args are
# calc_segment's (segment, window_count, window_size), i.e. 12 windows of 15 minutes.
matrix = parallelize_dataframe_to_numpy(se[['actualArrival', 'segment_code']], calc_segment, ('segment_code', 12, 15))


In [None]:
# With getting the entire column: 16.6 s ± 5.72 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
# With indexing into the column: 1.16 s ± 50.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# As above + only making stuff once & no exceptions: 1.09 s ± 68.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Keep the result under a more descriptive name (same array object, not a copy).
matrix_segment_code = matrix

In [None]:
# Persist to the working directory; np.save adds the ".npy" extension itself.
np.save("matrix_segment_code_last_12_15", matrix_segment_code)

In [None]:
# Columns to build history features for: the event's own segment plus its
# next/previous neighbours, processed in exactly this order.
segment_code_columns = [
    'segment_code',
    'next_segment_code_1', 'next_segment_code_2', 'next_segment_code_3',
    'prev_segment_code_1', 'prev_segment_code_2', 'prev_segment_code_3',
    'next_segment_code_4', 'prev_segment_code_4',
    'next_segment_code_5', 'prev_segment_code_5',
    'next_segment_code_6', 'prev_segment_code_6',
    'next_segment_code_7', 'prev_segment_code_7',
    'next_segment_code_8', 'prev_segment_code_8',
    'next_segment_code_9', 'prev_segment_code_9',
    'next_segment_code_10', 'prev_segment_code_10',
    'next_segment_code_11', 'prev_segment_code_11',
]

for code in segment_code_columns:
    # 25 windows of 10 minutes each; every column gets its own .npy file.
    mtx_last_25_10 = parallelize_dataframe_to_numpy(
        se[['actualArrival', code]],
        calc_segment,
        (code, 25, 10),
    )
    np.save(f"mtx_{code}_last_25_10", mtx_last_25_10)
    print(f"done: {code}", flush=True)



done: segment_code
done: next_segment_code_1
done: next_segment_code_2
done: next_segment_code_3
done: prev_segment_code_1
done: prev_segment_code_2


In [None]:
# Observation counts and matching means for the four 15-minute windows of the
# last hour, on the segment itself and on its previous / next segment.
# The two lists must stay in the same order: counts are used as weights for means.
all_window_count_columns = [
    'last_15_mins_count', 'last_15_30_mins_count',
    'last_30_45_mins_count', 'last_45_60_mins_count',
    'prev_segm_last_15_mins_count', 'prev_segm_last_15_30_mins_count',
    'prev_segm_last_30_45_mins_count', 'prev_segm_last_45_60_mins_count',
    'next_segm_last_15_mins_count', 'next_segm_last_15_30_mins_count',
    'next_segm_last_30_45_mins_count', 'next_segm_last_45_60_mins_count',
]
all_window_mean_columns = [
    'last_15_mins_mean', 'last_15_30_mins_mean',
    'last_30_45_mins_mean', 'last_45_60_mins_mean',
    'prev_segm_last_15_mins_mean', 'prev_segm_last_15_30_mins_mean',
    'prev_segm_last_30_45_mins_mean', 'prev_segm_last_45_60_mins_mean',
    'next_segm_last_15_mins_mean', 'next_segm_last_15_30_mins_mean',
    'next_segm_last_30_45_mins_mean', 'next_segm_last_45_60_mins_mean',
]

# Rows without any recent observation keep an offset of 0.
se_min['mean_offsets_enhanced_all'] = 0

# Only rows with at least one observation in any window get a weighted mean.
mask = np.sum(se_min[all_window_count_columns], axis=1) > 0

data = se_min.loc[mask, all_window_mean_columns]
weights = se_min.loc[mask, all_window_count_columns]

# NaN means are masked out so they do not take part in the average.
masked_data = np.ma.masked_array(data, np.isnan(data))

se_min.loc[mask, 'mean_offsets_enhanced_all'] = np.ma.average(
    masked_data, axis=1, weights=weights
)



In [None]:
# Same weighted mean as the "all" variant, but leaving out the most recent
# 15-minute window on each of the three segments.
# The two lists must stay in the same order: counts are used as weights for means.
older_window_count_columns = [
    'last_15_30_mins_count', 'last_30_45_mins_count', 'last_45_60_mins_count',
    'prev_segm_last_15_30_mins_count', 'prev_segm_last_30_45_mins_count',
    'prev_segm_last_45_60_mins_count',
    'next_segm_last_15_30_mins_count', 'next_segm_last_30_45_mins_count',
    'next_segm_last_45_60_mins_count',
]
older_window_mean_columns = [
    'last_15_30_mins_mean', 'last_30_45_mins_mean', 'last_45_60_mins_mean',
    'prev_segm_last_15_30_mins_mean', 'prev_segm_last_30_45_mins_mean',
    'prev_segm_last_45_60_mins_mean',
    'next_segm_last_15_30_mins_mean', 'next_segm_last_30_45_mins_mean',
    'next_segm_last_45_60_mins_mean',
]

# Rows without any observation in these windows keep an offset of 0.
se_min['mean_offsets_enhanced_15plus'] = 0

mask = np.sum(se_min[older_window_count_columns], axis=1) > 0

data = se_min.loc[mask, older_window_mean_columns]
weights = se_min.loc[mask, older_window_count_columns]

# NaN means are masked out so they do not take part in the average.
masked_data = np.ma.masked_array(data, np.isnan(data))

se_min.loc[mask, 'mean_offsets_enhanced_15plus'] = np.ma.average(
    masked_data, axis=1, weights=weights
)



In [None]:
# Same as the "15plus" variant, but only applied to rows backed by more than
# 10 observations in total; all other rows keep an offset of 0.
# The two lists must stay in the same order: counts are used as weights for means.
filtered_window_count_columns = [
    'last_15_30_mins_count', 'last_30_45_mins_count', 'last_45_60_mins_count',
    'prev_segm_last_15_30_mins_count', 'prev_segm_last_30_45_mins_count',
    'prev_segm_last_45_60_mins_count',
    'next_segm_last_15_30_mins_count', 'next_segm_last_30_45_mins_count',
    'next_segm_last_45_60_mins_count',
]
filtered_window_mean_columns = [
    'last_15_30_mins_mean', 'last_30_45_mins_mean', 'last_45_60_mins_mean',
    'prev_segm_last_15_30_mins_mean', 'prev_segm_last_30_45_mins_mean',
    'prev_segm_last_45_60_mins_mean',
    'next_segm_last_15_30_mins_mean', 'next_segm_last_30_45_mins_mean',
    'next_segm_last_45_60_mins_mean',
]

se_min['mean_offsets_enhanced_15plus_filtered'] = 0

mask = np.sum(se_min[filtered_window_count_columns], axis=1) > 10

data = se_min.loc[mask, filtered_window_mean_columns]
weights = se_min.loc[mask, filtered_window_count_columns]

# NaN means are masked out so they do not take part in the average.
masked_data = np.ma.masked_array(data, np.isnan(data))

se_min.loc[mask, 'mean_offsets_enhanced_15plus_filtered'] = np.ma.average(
    masked_data, axis=1, weights=weights
)



In [None]:
# Total number of observations per stop event over the last hour, counting the
# segment itself and its previous / next segment.
last_hour_count_columns = [
    'last_15_mins_count', 'last_15_30_mins_count',
    'last_30_45_mins_count', 'last_45_60_mins_count',
    'prev_segm_last_15_mins_count', 'prev_segm_last_15_30_mins_count',
    'prev_segm_last_30_45_mins_count', 'prev_segm_last_45_60_mins_count',
    'next_segm_last_15_mins_count', 'next_segm_last_15_30_mins_count',
    'next_segm_last_30_45_mins_count', 'next_segm_last_45_60_mins_count',
]
bus_counts = np.sum(se_min[last_hour_count_columns], axis=1)

In [None]:
# Distribution of the per-event observation totals, in 50 bins over 0-100
# (values outside that range are not drawn).
plt.hist(bus_counts, bins=50, range=(0,100));
plt.title("Number of data points for last hour incl prev & next segments")

In [None]:
# Display the computed offsets (in percent) as a quick visual check.
se_min['mean_offsets_enhanced_all']

In [None]:
# Adjust the historical mean by the recent percentage offset; an offset of 0
# leaves the mean unchanged.
se_min['mean_durations_by_segment_code_and_hour_and_day_enhanced_all'] = (
    se_min['mean_durations_by_segment_code_and_hour_and_day']
    * (1 + (se_min['mean_offsets_enhanced_all'] / 100))
)


In [None]:

# Adjust the historical mean by the offset computed without the most recent
# 15-minute window; an offset of 0 leaves the mean unchanged.
se_min['mean_durations_by_segment_code_and_hour_and_day_enhanced_15plus'] = (
    se_min['mean_durations_by_segment_code_and_hour_and_day']
    * (1 + (se_min['mean_offsets_enhanced_15plus'] / 100))
)



In [None]:
# Adjust the historical mean by the filtered offset (non-zero only for rows with
# more than 10 supporting observations); an offset of 0 leaves the mean unchanged.
se_min['mean_durations_by_segment_code_and_hour_and_day_enhanced_15plus_filtered'] = (
    se_min['mean_durations_by_segment_code_and_hour_and_day']
    * (1 + (se_min['mean_offsets_enhanced_15plus_filtered'] / 100))
)

In [None]:
# Start from the plain historical mean ...
se_min['mean_durations_by_segment_code_and_hour_and_day_enhanced_7'] = se_min['mean_durations_by_segment_code_and_hour_and_day']

# ... and only where the last 30 minutes hold at least 7 observations, scale it
# by the mean percentage offset seen in that window.
has_seven_recent = se_min['last_30_mins_count'] >= 7
se_min.loc[has_seven_recent, 'mean_durations_by_segment_code_and_hour_and_day_enhanced_7'] = (
    se_min.loc[has_seven_recent, 'mean_durations_by_segment_code_and_hour_and_day']
    * (1 + (se_min.loc[has_seven_recent, 'last_30_mins_mean'] / 100))
)



In [None]:
# For each minimum count i in 0..9, print the share of rows (in percent) that
# have at least i observations in the last 30 minutes.
for i in range(10):
    print(f"{i} -> {np.count_nonzero(se_min['last_30_mins_count'] >= i)/len(se_min)*100}%")

In [None]:
# Distribution of the mean percentage offset over the last 30 minutes,
# restricted to rows backed by at least 7 observations.
plt.hist(se_min.loc[se_min['last_30_mins_count'] >= 7, 'last_30_mins_mean'], bins=100);

In [None]:
# One row per stop event and 140 columns: the fill loop writes (segment, dwell)
# pairs into columns i*2 and i*2+1 for up to 70 steps. Every array starts out
# as all-NaN float64 so that unfilled cells are recognisable later.
prediction_shape = (se_min.shape[0], 140)

predict_array_median_chd = np.full(prediction_shape, np.nan)
predict_array_mean_chd = np.full(prediction_shape, np.nan)

predict_array_mean_enh = np.full(prediction_shape, np.nan)
predict_array_mean_enh7 = np.full(prediction_shape, np.nan)
predict_array_mean_enh_all = np.full(prediction_shape, np.nan)
predict_array_mean_enh_15plus = np.full(prediction_shape, np.nan)
predict_array_mean_enh_15plus_filtered = np.full(prediction_shape, np.nan)

predict_array_median_rules = np.full(prediction_shape, np.nan)
predict_array_mean_rules = np.full(prediction_shape, np.nan)

actual_array = np.full(prediction_shape, np.nan)

In [None]:
# Reset the index to 0..n-1 so that index labels can be used directly as row
# positions into the numpy arrays. The old index is discarded (drop=True) and
# the frame is modified in place.

se_min.reset_index(drop=True, inplace=True)

In [None]:
# Group the stop events by (date, workid); each group is treated as one run
# (presumably one vehicle working on one day -- confirm).
runs = se_min.groupby(['date','workid'])

In [None]:
# Each target array is filled from a (segment column, dwell column) pair.
# The enh, enh7 and rules variants are currently disabled and stay all-NaN.
fill_targets = [
    (predict_array_median_chd,
     ['median_durations_by_segment_code_and_hour_and_day', 'median_dwell_prev_by_stop_code_and_hour_and_day']),
    (predict_array_mean_chd,
     ['mean_durations_by_segment_code_and_hour_and_day', 'mean_dwell_prev_by_stop_code_and_hour_and_day']),
    (predict_array_mean_enh_all,
     ['mean_durations_by_segment_code_and_hour_and_day_enhanced_all', 'mean_dwell_prev_durations_by_stop_code']),
    (predict_array_mean_enh_15plus,
     ['mean_durations_by_segment_code_and_hour_and_day_enhanced_15plus', 'mean_dwell_prev_durations_by_stop_code']),
    (predict_array_mean_enh_15plus_filtered,
     ['mean_durations_by_segment_code_and_hour_and_day_enhanced_15plus_filtered', 'mean_dwell_prev_durations_by_stop_code']),
    (actual_array,
     ['segment_duration', 'dwell_duration_prev']),
]

for name, run in tqdm(runs):
    run = run.sort_values("actualArrival")

    run_length = len(run)

    # Step i pairs every event from position i onwards with the event i
    # positions earlier in the same run, and stores that earlier event's
    # (segment, dwell) values in columns 2*i and 2*i+1 of the later event's row.
    for i in range(min([run_length, 70])):
        destination_rows = run.iloc[i:].index
        earlier_events = run.iloc[:run_length-i]

        for target, value_columns in fill_targets:
            target[destination_rows, i*2:i*2+2] = earlier_events[value_columns]
        

In [None]:
# Despite its name, `nans` holds the number of filled (non-NaN) rows for each
# of the 140 columns: total rows minus the NaN count.
nans = [
    predict_array_median_chd.shape[0] - np.count_nonzero(np.isnan(predict_array_median_chd[:, i]))
    for i in range(140)
]

plt.plot(nans)

In [None]:
def percent_in_x_percent(predict, actual, threshold):
    """Score predictions by how many fall within ``threshold`` percent of the actual value.

    Only positions where both ``predict`` and ``actual`` are non-NaN are scored.

    Parameters
    ----------
    predict, actual : numpy.ndarray
        Arrays of the same shape.
    threshold : number
        Tolerance in percent (e.g. ``10`` for +/-10%).

    Returns
    -------
    (pass_percent, drift)
        ``pass_percent`` is the percentage of scored predictions lying strictly
        between ``actual * (1 - t)`` and ``actual * (1 + t)``.
        ``drift`` is the fraction of misses that are over-predictions
        (``0.5`` when there are no misses).
        When there is nothing to score the result is ``(0, 0.5)``.
    """
    mask = ~np.isnan(predict) & ~np.isnan(actual)
    scored_count = np.count_nonzero(mask)

    # Always return a pair: callers unpack two values, so the bare ``0`` that was
    # returned here before crashed them. Checking the combined mask also covers
    # the case where only the predictions are missing, which used to end in a
    # division by zero.
    if scored_count == 0:
        return 0, 0.5

    threshold = threshold/100

    predicted = predict[mask]
    upper = actual[mask] * (1 + threshold)
    lower = actual[mask] * (1 - threshold)

    # Strict comparisons throughout: a prediction exactly on a bound counts as
    # neither a pass nor a miss.
    pass_count = np.count_nonzero((predicted < upper) & (predicted > lower))
    over_count = np.count_nonzero(predicted > upper)
    under_count = np.count_nonzero(predicted < lower)

    pass_percent = pass_count/scored_count * 100

    if over_count + under_count == 0:
        drift = 0.5
    else:
        drift = over_count / (over_count + under_count)

    return pass_percent, drift

In [None]:
def make_accuracy_matrix_minutes(predict, actual, max_threshold = 50):
    """Tabulate prediction accuracy per whole minute of actual duration.

    The actual values (in seconds) are bucketed by whole minute. For every
    bucket and every threshold 10%, 20%, ... up to ``max_threshold`` the
    predictions in that bucket are scored with ``percent_in_x_percent``.

    Parameters
    ----------
    predict, actual : numpy.ndarray
        Arrays of the same shape; NaN marks missing values.
    max_threshold : int, default 50
        Largest threshold in percent; one table row per step of 10.

    Returns
    -------
    (accuracies_table, frequency, drift_table)
        ``accuracies_table`` and ``drift_table`` have shape
        ``(max_threshold // 10, minutes)``; ``frequency`` has length ``minutes``
        and holds the number of actual values per bucket. ``minutes`` is the
        largest actual value in whole minutes.
    """
    # Bucket on floats: casting NaN to int is undefined behaviour in numpy (it
    # warns, and the resulting value is platform dependent, so missing values
    # could land in a real bucket). With np.trunc NaN stays NaN and never
    # equals a bucket number, while real values truncate exactly as before.
    actual_minutes = np.trunc(actual/60)

    rows = int(max_threshold/10)

    minute_count = int(np.nanmax(actual)/60)

    accuracies_table = np.empty((rows, minute_count))
    drift_table = np.empty((rows, minute_count))
    frequency = np.empty(minute_count)

    for i in range(minute_count):
        # Progress indicator: one dot per minute bucket.
        print(".", end="", flush=True)
        mask = (actual_minutes == i)

        frequency[i] = np.count_nonzero(mask)

        for j in range(1, rows+1):
            accuracy, drift = percent_in_x_percent(predict[mask], actual[mask], j * 10)
            accuracies_table[j-1,i] = accuracy
            drift_table[j-1, i] = drift

    return accuracies_table, frequency, drift_table

In [None]:
def show_accuracy_minutes(predict, actual, title):
    """Plot accuracy against minutes for each threshold, plus the over-prediction fraction.

    Calls ``make_accuracy_matrix_minutes(predict, actual)`` with its default
    thresholds and draws one line per threshold row on the left axis (0-100),
    and the drift of the first (10%) row as a dotted line on a second y-axis
    (0-1). The x-axis is limited to the first 20 minutes. Shows the figure and
    returns nothing.
    """
    results, frequency, drift = make_accuracy_matrix_minutes(predict, actual)
    
    # One line per threshold row: row i corresponds to (i+1)*10 percent.
    for i in range(results.shape[0]):
        plt.plot(results[i,:], label=f"{(i+1)*10}%")
        
        
    plt.xlabel("minutes ahead")
    plt.ylabel("percentage within threshold")
    plt.legend()
    plt.title(title)
    plt.xlim(0,20)
    plt.ylim(0,100)
    plt.gca().yaxis.grid(True, linewidth="0.2")
    
    ax = plt.gca()
    
    # Second y-axis sharing the same x-axis, used for the drift of the 10% row.
    # NOTE(review): the legend is created before this line is drawn and on the
    # first axes, so the "fraction over" label is probably never shown -- confirm.
    ax2 = ax.twinx()
    ax2.plot(drift[0,:], label="fraction over", linestyle=":")
    ax2.set_ylim(0,1)
   
    plt.show()

In [None]:
# Running totals along each row over the interleaved (segment, dwell) columns.
# NaN propagates through a cumulative sum, so each row is NaN from its first
# unfilled column onwards.
predict_array_median_chd_cum = predict_array_median_chd.cumsum(axis=1)
predict_array_mean_chd_cum = predict_array_mean_chd.cumsum(axis=1)
# predict_array_mean_enh_cum = predict_array_mean_enh.cumsum(axis=1)
# predict_array_mean_enh7_cum = predict_array_mean_enh7.cumsum(axis=1)
predict_array_mean_enh_all_cum = predict_array_mean_enh_all.cumsum(axis=1)
predict_array_mean_enh_15plus_cum = predict_array_mean_enh_15plus.cumsum(axis=1)
predict_array_mean_enh_15plus_filtered_cum = predict_array_mean_enh_15plus_filtered.cumsum(axis=1)
# predict_array_median_rules_cum = predict_array_median_rules.cumsum(axis=1)
# predict_array_mean_rules_cum = predict_array_mean_rules.cumsum(axis=1)


actual_array_cum = actual_array.cumsum(axis=1)



In [None]:
# Even columns hold the segment values: running totals of travel time only.
predict_journey_median_chd_cum = predict_array_median_chd[:, ::2].cumsum(axis=1)
predict_journey_mean_chd_cum = predict_array_mean_chd[:, ::2].cumsum(axis=1)
# predict_journey_mean_enh_cum = predict_array_mean_enh[:, ::2].cumsum(axis=1)
# predict_journey_mean_enh7_cum = predict_array_mean_enh7[:, ::2].cumsum(axis=1)
predict_journey_mean_enh_all_cum = predict_array_mean_enh_all[:, ::2].cumsum(axis=1)
predict_journey_mean_enh_15plus_cum = predict_array_mean_enh_15plus[:, ::2].cumsum(axis=1)
predict_journey_mean_enh_15plus_filtered_cum = predict_array_mean_enh_15plus_filtered[:, ::2].cumsum(axis=1)
# Actual totals are limited to the range 0 .. 2*60*60.
actual_journey_cum = actual_array[:, ::2].cumsum(axis=1).clip(0, 2*60*60)

# Odd columns hold the dwell values: running totals of dwell time only.
predict_dwell_median_chd_cum = predict_array_median_chd[:, 1::2].cumsum(axis=1)
predict_dwell_mean_chd_cum = predict_array_mean_chd[:, 1::2].cumsum(axis=1)
actual_dwell_cum = actual_array[:, 1::2].cumsum(axis=1).clip(0, 2*60*60)

In [None]:
# predict_dwell_median_rules_cum = np.cumsum(predict_array_median_rules[:, 1::2], axis=1)
# predict_dwell_mean_rules_cum = np.cumsum(predict_array_mean_rules[:, 1::2], axis=1)

In [None]:
# Limit the combined actual totals to the range 0 .. 2*60*60; NaN stays NaN.
actual_array_cum = actual_array_cum.clip(0, 2*60*60)

In [None]:
# Segment + dwell totals, predicted from the per segment/hour/day means.
show_accuracy_minutes(predict_array_mean_chd_cum, actual_array_cum, "full journey accuracies means (chd)")

In [None]:
# Segment + dwell totals, predicted from the per segment/hour/day medians.
show_accuracy_minutes(predict_array_median_chd_cum, actual_array_cum, "full journey accuracies median (chd)")

In [None]:
# Segment + dwell totals, predicted from the means adjusted by the "all" offset.
show_accuracy_minutes(predict_array_mean_enh_all_cum, actual_array_cum, "full journey accuracies mean enhanced all")


In [None]:
# Disabled: `predict_array_mean_enh7_cum` is never created (both its cumulative
# sum and the loop line that would fill `predict_array_mean_enh7` are commented
# out), so this call raised a NameError on a fresh run. Re-enable it together
# with those lines.
# show_accuracy_minutes(predict_array_mean_enh7_cum, actual_array_cum, "full journey accuracies mean enhanced 7")

In [None]:
# Travel time only (segment columns), predicted from the per segment/hour/day means.
show_accuracy_minutes(predict_journey_mean_chd_cum, actual_journey_cum, "just journey accuracies means (chd)")


In [None]:
# Travel time only (segment columns), predicted from the per segment/hour/day medians.
show_accuracy_minutes(predict_journey_median_chd_cum, actual_journey_cum, "just journey accuracies medians (chd)")

In [None]:
# Disabled: `predict_journey_mean_enh7_cum` is never created (both its cumulative
# sum and the loop line that would fill `predict_array_mean_enh7` are commented
# out), so this call raised a NameError on a fresh run. Re-enable it together
# with those lines.
# show_accuracy_minutes(predict_journey_mean_enh7_cum, actual_journey_cum, "just journey accuracies means (enh7)")

In [None]:
# Accuracy tables limited to the 10% threshold (max_threshold=10 gives a single
# row) for the travel-time-only predictions of each variant.
results, _, _= make_accuracy_matrix_minutes(predict_journey_mean_chd_cum, actual_journey_cum, 10)
# Disabled: `predict_journey_mean_enh7_cum` is never created (its cumulative sum
# is commented out), so this line raised a NameError; the matching plot line
# below is commented out as well.
# results_enh7, _, _ = make_accuracy_matrix_minutes(predict_journey_mean_enh7_cum, actual_journey_cum, 10)
results_enh_all, _, _ = make_accuracy_matrix_minutes(predict_journey_mean_enh_all_cum, actual_journey_cum, 10)
results_enh_15plus, _, _ = make_accuracy_matrix_minutes(predict_journey_mean_enh_15plus_cum, actual_journey_cum, 10)
results_enh_15plus_filtered, _, _ = make_accuracy_matrix_minutes(predict_journey_mean_enh_15plus_filtered_cum, actual_journey_cum, 10)


In [None]:
# Compare the variants on the share of predictions within 10% (row 0 of each
# table) over the first 20 minutes; the y-axis is zoomed to 20-60 percent.
plt.plot(results[0,:], label="mean")
# plt.plot(results_enh7[0,:], label="mean_enh7")
plt.plot(results_enh_all[0,:], label="mean_enh_all")
plt.plot(results_enh_15plus[0,:], label="mean_enh_15plus")
plt.plot(results_enh_15plus_filtered[0,:], label="mean_enh_15plus_filtered")
plt.legend()
plt.xlim(0,20)
plt.ylim(20,60)
plt.title("Percentage of predictions within 10%")
plt.xlabel("minutes ahead")
plt.ylabel("percent within 10%")