# Create Heatmaps for Event Dataset
This notebook creates several data series for use in the heatmap visualizer, which is used to analyze the event density and distribution in New York City.

In [1]:
import pandas as pd
import os
import json

## Load Event Dataset
The path to the event dataset must be stored in the ```DATA_PATH``` variable. The dataset can be generated using the ```merge_events_venues.py``` script.

In [2]:
# Input: CSV file with the merged event dataset that this notebook reads.
DATA_PATH = "../scripts/events/data/merged_events.csv"
# Output: directory below which the generated heatmap JSON files are written.
OUTPUT_PATH = "../heatmap-visualizer/maps"

In [3]:
# Read the event dataset: the two date columns are parsed as datetimes and the
# "id" column becomes the row index.
data = pd.read_csv(DATA_PATH,
                   parse_dates=["start_date", "end_date"],
                   index_col="id")
# Coerce BOTH coordinates to numbers. Unparseable values become NaN instead of
# breaking the rounding step that is applied to longitude and latitude alike.
data["longitude"] = pd.to_numeric(data["longitude"], errors="coerce")
data["latitude"] = pd.to_numeric(data["latitude"], errors="coerce")
# A score of -1 is replaced by 0 so it does not reduce the summed scores
# (presumably -1 marks "no score available" -- confirm with the data source).
data["score"] = data["score"].replace(-1, 0)

IOError: File ../scripts/events/data/merged_events.csv does not exist

## Discretize Latitudes & Longitudes

In [3]:
# Number of decimal places the coordinates are rounded to.
ROUND_PARAM = 3
# Names of the rounded coordinate columns, longitude first. The spelling
# "descretized" is kept exactly as it is.
DISCRETIZED_COLUMN_NAMES = ["longitude_descretized", "latitude_discretized"]

# Round each coordinate so that nearby events share the same grid cell.
longitude_column, latitude_column = DISCRETIZED_COLUMN_NAMES
data[longitude_column] = data["longitude"].round(ROUND_PARAM)
data[latitude_column] = data["latitude"].round(ROUND_PARAM)

## Helper methods

In [21]:
def make_json(scores, title, normalize_value):
    """Build the dictionary describing one heatmap data series.

    Parameters
    ----------
    scores : pandas.DataFrame or empty sequence
        Frame with a "score" column whose index entries unpack into
        ``(longitude, latitude)`` pairs. An empty container such as ``[]``
        produces a series without any points.
    title : str
        Title stored in the result.
    normalize_value : number
        Divisor applied to every score before the offset of 1 is added.
        If it is 0, every point gets the weight 1.0.

    Returns
    -------
    dict
        ``{"title": title, "pointRadius": 10, "data": [...]}`` where each
        entry of ``data`` has the keys "lat", "lon" and "weight".
    """
    # Callers pass a bare `[]` for groups without events; such a placeholder
    # has neither an index nor a "score" column.
    if len(scores) == 0:
        return {
            "title": title,
            "pointRadius": 10,
            "data": []
        }

    points = []
    # Walk the index and the raw score values side by side so every row is
    # read by position. Integer lookups via `series[i]` would be interpreted
    # as labels on a numeric index.
    for (lon, lat), score in zip(scores.index, scores["score"].values):
        if normalize_value:
            weight = float(score / normalize_value) + 1
        else:
            # Dividing by zero would yield inf/NaN, which is not valid JSON.
            weight = 1.0
        points.append({"lat": lat, "lon": lon, "weight": weight})

    return {
        "title": title,
        "pointRadius": 10,
        "data": points
    }

## Heatmaps

### Average event density
The heatmap series contains the summed scores of the events happening in the discretized districts in the whole time period. 

In [28]:
# Add up the scores of all events that fall into the same discretized
# coordinate pair; the result keeps a single "score" column.
summed_scores = data.groupby(DISCRETIZED_COLUMN_NAMES)[["score"]].sum()

In [29]:
# Make sure the output directory exists before the first file is written;
# opening a file inside a missing directory would fail.
if not os.path.exists(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)

# Scores are normalized by the largest summed score of the series.
with open(os.path.join(OUTPUT_PATH, "event_density.json"), "w") as outfile:
    json.dump(make_json(summed_scores, "Average event density", summed_scores["score"].max()), outfile)

### Average events by weekday
The heatmap series contains the summed scores of the events happening in the discretized districts grouped by weekday. 

In [24]:
# Day of the week of each event start (Monday = 0 ... Sunday = 6), taken from
# the vectorised datetime accessor instead of a per-row Python call.
data["weekday"] = data["start_date"].dt.weekday
# Sum the scores per weekday and discretized coordinate pair; "weekday" is the
# outermost index level so one weekday can be selected with `.loc`.
summed_scores = data[DISCRETIZED_COLUMN_NAMES + ["score", "weekday"]].groupby(["weekday"] + DISCRETIZED_COLUMN_NAMES).sum()

In [1]:
# One JSON file per weekday is written into this directory.
output_dir = os.path.join(OUTPUT_PATH, "events_by_weekday")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

weekday_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# All seven maps share one normalization constant so they stay comparable;
# it does not change inside the loop, so compute it once.
max_score = summed_scores["score"].max()

for weekday in range(7):
    # A weekday without any event has no entry in the index and `.loc` would
    # raise a KeyError; write an empty series for it instead.
    if weekday in summed_scores.index:
        weekday_summed_scores = summed_scores.loc[weekday]
    else:
        weekday_summed_scores = []
    with open(os.path.join(output_dir, str(weekday) + ".json"), "w") as outfile:
        json.dump(make_json(weekday_summed_scores, weekday_names[weekday], max_score), outfile)

NameError: name 'os' is not defined

### Average events by hour
The heatmap series contains the summed scores of the events happening in the discretized districts grouped by hour. 

In [34]:
# Hour of day (0-23) of each event start, taken from the vectorised datetime
# accessor instead of a per-row Python call.
data["hour"] = data["start_date"].dt.hour
# Sum the scores per hour and discretized coordinate pair; "hour" is the
# outermost index level so one hour can be selected with `.loc`.
summed_scores = data[DISCRETIZED_COLUMN_NAMES + ["score", "hour"]].groupby(["hour"] + DISCRETIZED_COLUMN_NAMES).sum()

In [39]:
# One JSON file per hour of the day is written into this directory.
output_dir = os.path.join(OUTPUT_PATH, "events_by_hour")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# All 24 maps share one normalization constant so they stay comparable;
# it does not change inside the loop, so compute it once.
max_score = summed_scores["score"].max()

for hour in range(24):
    # Hours without any event are missing from the index; they still get a
    # file, containing an empty series.
    if hour in summed_scores.index:
        hour_summed_scores = summed_scores.loc[hour]
    else:
        hour_summed_scores = []
    with open(os.path.join(output_dir, "%02d.json" % hour), "w") as outfile:
        json.dump(make_json(hour_summed_scores, "%02d:00" % hour, max_score), outfile)