In [25]:
import json
import copy
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from sklearn.metrics import mean_absolute_error, mean_squared_error

from coordinate_converter import utm_to_ssb_grid_id

In [26]:
def create_empty_counts():
    """Build a zero-initialised incident table for every base station.

    Reads ``data/base_stations.csv`` and uses the first column of each row
    (cast to ``int``) as the station key.

    Returns:
        dict: ``{station_id: {8: {day: {hour: 0}}}}`` with ``day`` in 1..7 and
        ``hour`` in 0..23. Only the month key ``8`` is created. Every station
        owns an independent deep copy of the nested structure.
    """
    stations = pd.read_csv("data/base_stations.csv")

    # Template for one station: a single month (8) holding 7 days x 24 hourly slots.
    blank_month = {
        8: {day: {hour: 0 for hour in range(24)} for day in range(1, 8)}
    }

    # deepcopy so that incrementing one station's counters never touches another's.
    return {int(row[0]): copy.deepcopy(blank_month) for row in stations.values}

In [27]:
def create_num_weekdays():
    """Return a fresh tally mapping each weekday number (1..7) to zero."""
    return dict.fromkeys(range(1, 8), 0)

In [28]:
def count_incidents(df, counts, num_weekdays):
    """Tally incidents per station, weekday and hour for the evaluation window.

    Both ``counts`` and ``num_weekdays`` are updated in place; nothing is returned.

    Args:
        df: DataFrame whose rows are read positionally: column 0 is a timestamp
            string in ``'%Y-%m-%d %H:%M:%S'`` format, columns 1 and 2 are the
            coordinates handed to ``utm_to_ssb_grid_id``.
        counts: nested dict ``counts[station][8][weekday][hour]`` as produced by
            ``create_empty_counts``; the matching slot is incremented per incident.
        num_weekdays: dict keyed by weekday number (1 = Monday .. 7 = Sunday);
            incremented each time the weekday of a counted incident differs from
            the weekday of the previously counted one.

    Raises:
        KeyError: if ``data/grid_zones.csv`` lacks the ``SSBID1000M`` or
            ``base_station`` column, or a resolved station is missing from ``counts``.
    """
    # Only timestamps strictly between these two instants are counted.
    # NOTE(review): this window covers almost all of Sunday 2017-08-06 as well as
    # Sunday 2017-08-13, so weekday 7 can be seen twice - confirm that is intended.
    window_start = datetime(2017, 8, 6)
    window_end = datetime(2017, 8, 14)

    prev_weekday = 0

    grid_zones = pd.read_csv("data/grid_zones.csv")

    for incident in tqdm(df.values, desc="Count incidents per hour"):
        date = datetime.strptime(incident[0], '%Y-%m-%d %H:%M:%S')
        if not (date > window_start and date < window_end):
            continue

        weekday = date.weekday() + 1
        # Convert coordinates only for rows inside the window; rows outside it
        # are never used, so their coordinates need not even be valid.
        incident_grid = utm_to_ssb_grid_id(int(incident[1]), int(incident[2]))

        # Explicit "no match" check instead of a bare except, so unrelated errors
        # (missing column, KeyboardInterrupt, ...) are not silently mislabelled.
        matching_stations = grid_zones.loc[grid_zones["SSBID1000M"] == incident_grid, "base_station"]
        if matching_stations.empty:
            print(f"grid {incident_grid} was not in grid_zones.csv")
            continue
        incident_station = matching_stations.iloc[0]

        counts[incident_station][8][weekday][date.hour] += 1

        if weekday != prev_weekday:
            # A step other than +1 (or the Sunday -> Monday wrap of -6) means at
            # least one weekday had no counted incident since the previous one.
            if weekday - prev_weekday != 1 and weekday - prev_weekday != -6 and prev_weekday != 0:
                print("Skipped one or more days")
            num_weekdays[weekday] += 1
            prev_weekday = weekday

In [41]:
def average_count(incident_distribution, num_weekdays):
    """Turn raw hourly totals into per-weekday averages, in place.

    Every positive entry ``incident_distribution[station][8][weekday][hour]`` is
    replaced by that total divided by ``num_weekdays[weekday]``, rounded to four
    decimals. Entries that are not greater than zero are left untouched, so no
    division happens for them. Nothing is returned.
    """
    for station_table in tqdm(incident_distribution.values(), desc="Change to average per day"):
        for weekday, hourly_totals in station_table[8].items():
            for hour, total in hourly_totals.items():
                if total > 0:
                    # Overwriting an existing key is safe while iterating the dict.
                    hourly_totals[hour] = round(total / num_weekdays[weekday], 4)

In [29]:
def prediction_accuracy(counts, incident_distribution):
    """Pair each observed value with the corresponding predicted value.

    Walks ``counts[station][8][weekday][hour]`` in dictionary order and looks up
    the same slot in ``incident_distribution``, whose keys at every level are
    the string forms of the keys used in ``counts``.

    Returns:
        tuple[list, list]: ``(truths, predictions)``, two equally long lists
        aligned element by element.
    """
    truths, predictions = [], []

    for station_id, station_table in counts.items():
        for weekday, hourly in station_table[8].items():
            for hour, observed in hourly.items():
                predicted = incident_distribution[str(station_id)]["8"][str(weekday)][str(hour)]
                truths.append(observed)
                predictions.append(predicted)

    return truths, predictions
                    


In [30]:
INCIDENTS_FILE = "data/incidents_all_processed.csv"
DISTRIBUTION_FILE = "data/incidents_distribution_station_avg.json"

# Load the stored distribution that serves as the prediction. The context
# manager closes the handle as soon as the JSON has been read; the name `file`
# stays bound (to the now-closed handle) as it was before.
with open(DISTRIBUTION_FILE, 'r') as file:
    distributions = json.load(file)

# Raw incident rows; count_incidents parses the timestamp strings itself.
incidents = pd.read_csv(INCIDENTS_FILE, encoding='utf-8', escapechar='\\', parse_dates=True)

In [45]:
# Build the zeroed per-station table and the weekday tally, then let
# count_incidents fill both of them in place from the loaded incidents.
counts = create_empty_counts()
num_weekdays = create_num_weekdays()
count_incidents(incidents, counts, num_weekdays)

Count incidents per hour: 100%|██████████| 368148/368148 [00:07<00:00, 48678.59it/s]


In [46]:
# Rewrites `counts` in place from totals to per-weekday averages. Not idempotent:
# running this cell a second time divides the already-averaged values again.
average_count(counts, num_weekdays)

Change to average per day: 100%|██████████| 19/19 [00:00<00:00, 9550.79it/s]


In [47]:
# Align observed values (`counts`) with the stored distribution loaded from JSON
# (`distributions`); both lists come back in the same station/weekday/hour order.
truths, predictions = prediction_accuracy(counts, distributions)

In [48]:
# Aggregate error between the observed values and the stored distribution.
print("mean absolute error:", mean_absolute_error(truths, predictions))
print("mean square error:", mean_squared_error(truths, predictions))

# Print every (prediction, truth) pair, each rounded to three decimals.
h = [(round(p, 3), round(t, 3)) for p, t in zip(predictions, truths)]
for i in h:
    print(i)

mean absolute error: 0.5114391290726817
mean square error: 0.5183729956171679
(0.167, 0)
(0.111, 0)
(0.222, 0)
(0.056, 0)
(0.056, 0)
(0.167, 0)
(0.111, 0)
(0.111, 1.0)
(0.556, 0)
(0.333, 1.0)
(0.222, 0)
(0.333, 1.0)
(0.167, 0)
(0.333, 0)
(0.333, 1.0)
(0.278, 1.0)
(0.389, 1.0)
(0.222, 0)
(0.333, 0)
(0.556, 0)
(0.5, 0)
(0.389, 0)
(0.111, 0)
(0, 0)
(0.111, 1.0)
(0.167, 0)
(0.056, 0)
(0, 1.0)
(0.111, 0)
(0.056, 0)
(0.222, 0)
(0.278, 0)
(0.5, 1.0)
(0.5, 0)
(0.5, 1.0)
(0.444, 0)
(0.444, 0)
(0.111, 0)
(0.5, 1.0)
(0.389, 0)
(0.278, 0)
(0.389, 1.0)
(0.333, 0)
(0.5, 1.0)
(0.389, 1.0)
(0.444, 0)
(0.5, 0)
(0.222, 0)
(0.158, 0)
(0.21, 0)
(0.105, 0)
(0.21, 0)
(0, 0)
(0.053, 0)
(0.263, 0)
(0.158, 1.0)
(0.684, 0)
(0.316, 0)
(0.316, 0)
(0.684, 0)
(0.421, 0)
(0.579, 0)
(0.263, 0)
(0.316, 0)
(0.158, 0)
(0.21, 0)
(0.263, 1.0)
(0.421, 0)
(0.263, 1.0)
(0.158, 0)
(0, 0)
(0.316, 0)
(0.222, 1.0)
(0.167, 0)
(0.167, 0)
(0, 0)
(0.056, 0)
(0.222, 0)
(0.056, 0)
(0.111, 0)
(0.278, 0)
(0.278, 0)
(0.389, 1.0)
(0.167, 