# Data Preparation for Streamlit WW2 Implementation
Our WW2 data contains attacks on individual coordinates.

In this session we want to create a nice-looking web page that lets us easily compare statistics between individual countries.

In order to do this we have to aggregate our individual coordinate-based data to per-country data.

## Imports

In [None]:
from typing import Iterable

import geopandas as gpd
import pandas as pd

## Load Raw Data

In [None]:
# country data for Europe, reprojected to EPSG:4326
europe = pd.read_pickle("../data/europe.p")
europe = europe.to_crs("EPSG:4326")
# holds every country that intersects the polygon used in the WW2 analysis,
# so not exclusively European countries
# the columns we care about are: name and geometry
europe.head()


In [None]:
# WW2 attack data for Europe, reprojected to EPSG:4326
# its geometry column holds coordinate POINTS
europe_data = pd.read_pickle("../data/gdf_europe.p")
europe_data = europe_data.to_crs("EPSG:4326")
europe_data.head()

## Match target coordinates to target-country polygons

In [None]:
# Intersect every geometry in "europe_data" (target coordinates) with the
# geometries in "europe" (countries); matching rows of both frames are merged,
# so every attack row also carries the columns of the country it falls in.
# NOTE(review): for a point/polygon intersection the resulting geometry is
# presumably the target point rather than the country polygon -- confirm.
res_intersect_europe = gpd.overlay(df1=europe_data, df2=europe, how='intersection')
res_intersect_europe.head()

In [None]:
# Use the country name as index (the "name" column itself is kept) so that
# results grouped by country name can be assigned to the frame directly
europe.set_index(keys="name", drop=False, inplace=True)

In [None]:
# per-year attack counts and summed explosive weight for every target country;
# the grouped results are indexed by country name and are written into the
# matching rows of `europe`
hits = res_intersect_europe
for year in range(1939, 1945 + 1):
    per_country = hits[hits["year"] == year].groupby("name")
    europe[f"attacks_{year}"] = per_country["Mission ID"].count()
    europe[f"explosives_weight_{year}"] = per_country["High Explosives Weight (Tons)"].sum()
# totals over all rows of the overlay result
overall = hits.groupby("name")
europe["attacks_all"] = overall["Mission ID"].count()
europe["explosives_weight_all"] = overall["High Explosives Weight (Tons)"].sum()
# countries without matching rows are NaN (not a number) at this point -> use 0
europe.fillna(0, inplace=True)

In [None]:
# since we will be doing this more often, let's build a generic function that implements the steps above
def aggregate_per_year(
    df: pd.DataFrame,
    df_data: pd.DataFrame,
    target_column: str = "Mission ID",
    aggr_column_name: str = "attacks",
    aggr_method: str = "count",
    years: Iterable[int] = range(1939, 1945 + 1),
    group_column: str = "name",
    year_column: str = "year",
) -> pd.DataFrame:
    """Add per-year and overall aggregates of ``df_data`` to ``df`` in place.

    ``df_data`` is grouped by ``group_column`` and ``target_column`` is
    aggregated with ``aggr_method``: once for every entry of ``years`` (using
    only the rows whose ``year_column`` equals that year) and once over all
    rows. The results are written to ``df`` as the columns
    ``"{aggr_column_name}_{year}"`` and ``"{aggr_column_name}_all"``.

    Rows of ``df`` are matched to the groups through ``df[group_column]`` if
    that column exists, otherwise through the index of ``df``. Rows without a
    matching group get 0. Only the columns created here are zero-filled;
    missing values in other columns of ``df`` are left untouched.

    Args:
        df: Frame with one row per region; receives the new columns.
        df_data: Frame with one row per event; must contain ``group_column``,
            ``year_column`` and ``target_column``.
        target_column: Column of ``df_data`` to aggregate.
        aggr_column_name: Prefix of the columns created in ``df``.
        aggr_method: Aggregation passed to ``GroupBy.agg`` (e.g. ``"count"``
            or ``"sum"``).
        years: Years for which a separate column is created.
        group_column: Column of ``df_data`` (and, if present, of ``df``)
            holding the region key.
        year_column: Column of ``df_data`` holding the year.

    Returns:
        pd.DataFrame: ``df`` itself, with the new columns added.
    """
    # Match by key value instead of relying on index alignment: assigning the
    # grouped Series directly would silently produce only NaN (and then 0)
    # for a frame that is not indexed by the group key.
    if group_column in df.columns:
        keys = df[group_column]
    else:
        keys = df.index.to_series()

    for year in years:
        subset = df_data.loc[df_data[year_column] == year]
        per_group = subset.groupby(group_column)[target_column].agg(aggr_method)
        df[f"{aggr_column_name}_{year}"] = keys.map(per_group).fillna(0)

    overall = df_data.groupby(group_column)[target_column].agg(aggr_method)
    df[f"{aggr_column_name}_all"] = keys.map(overall).fillna(0)
    return df

In [None]:
# add the attack counts (function defaults) per country to `europe`
aggregate_per_year(df=europe, df_data=res_intersect_europe)

In [None]:
# our data now contains, for every country, its name, its borders,
# the number of attacks and the total weight of explosives dropped per year
# show the row for Germany as a single-column frame
europe.loc["Germany"].to_frame()

In [None]:
# inspect five randomly chosen rows of the aggregated country data
europe.sample(5)

# German Bundesländer

In [None]:
# German Bundesländer: like the europe data set, this one contains polygons
# mapping the individual Bundesländer
bl = gpd.read_file('../data/vg2500_geo84.zip')
bl = bl.to_crs("EPSG:4326")
# the name column is called "GEN" here; rename it so it matches the other data
bl.rename(columns={"GEN": "name"}, inplace=True)

In [None]:
# preview the first rows of the Bundesländer data
bl.head()

In [None]:
# same overlay as for the countries, this time against the individual Bundesländer
res_intersect_bl = gpd.overlay(df1=europe_data, df2=bl, how='intersection')

In [None]:
# inspect five randomly chosen rows of the Bundesländer overlay result
res_intersect_bl.sample(5)

In [None]:
# get attack counts and total weight of explosives per year
# and apply it to a given target Bundesland
# NOTE: unlike `europe`, `bl` is not indexed by name. Assigning the grouped
# Series (indexed by Bundesland name) directly would align against the frame's
# own index, match nothing, and leave every aggregate at 0 after fillna.
# The values are therefore looked up through the "name" column.
bl_names = bl["name"]
for year in range(1939, 1946):
    subset = res_intersect_bl.loc[res_intersect_bl['year'] == year]
    bl[f"attacks_{year}"] = bl_names.map(subset.groupby("name")["Mission ID"].count())
    bl[f"explosives_weight_{year}"] = bl_names.map(
        subset.groupby("name")["High Explosives Weight (Tons)"].sum()
    )
# totals over all rows of the overlay result
bl["attacks_all"] = bl_names.map(res_intersect_bl.groupby("name")["Mission ID"].count())
bl["explosives_weight_all"] = bl_names.map(
    res_intersect_bl.groupby("name")["High Explosives Weight (Tons)"].sum()
)
# Bundesländer without matching rows are NaN at this point -> use 0
bl.fillna(0, inplace=True)

In [None]:
# inspect five randomly chosen rows of the aggregated Bundesländer data
bl.sample(5)

## German Landkreise
Let's go even lower than Bundesländer

In [None]:
# German Landkreise, reprojected to EPSG:4326
kreise = gpd.read_file('../data/vg2500_krs.zip')
kreise = kreise.to_crs("EPSG:4326")
# rename the "GEN" column to "name" so it matches the other data
kreise.rename(columns={"GEN": "name"}, inplace=True)
kreise.sample(5)

Repeat the steps

In [None]:
# overlay the target coordinates with the individual Landkreise
res_intersect_kreise = gpd.overlay(df1=europe_data, df2=kreise, how='intersection')

In [None]:
# get attack counts and total weight of explosives per year
# and apply it to a given target Landkreis
# NOTE: `kreise` is not indexed by name. Assigning the grouped Series (indexed
# by Landkreis name) directly would align against the frame's own index, match
# nothing, and leave every aggregate at 0 after fillna. The values are
# therefore looked up through the "name" column.
# NOTE(review): if several Kreise share the same name, they are grouped
# together and each of them receives the combined value -- confirm whether a
# unique key should be used instead.
kreise_names = kreise["name"]
for year in range(1939, 1946):
    subset = res_intersect_kreise.loc[res_intersect_kreise['year'] == year]
    kreise[f"attacks_{year}"] = kreise_names.map(subset.groupby("name")["Mission ID"].count())
    kreise[f"explosives_weight_{year}"] = kreise_names.map(
        subset.groupby("name")["High Explosives Weight (Tons)"].sum()
    )
# totals over all rows of the overlay result
kreise["attacks_all"] = kreise_names.map(
    res_intersect_kreise.groupby("name")["Mission ID"].count()
)
kreise["explosives_weight_all"] = kreise_names.map(
    res_intersect_kreise.groupby("name")["High Explosives Weight (Tons)"].sum()
)
# Landkreise without matching rows are NaN at this point -> use 0
kreise.fillna(0, inplace=True)

In [None]:
# persist the aggregated frames for countries, Bundesländer and Landkreise
for frame, target in (
    (europe, "../data/europe_attacks.p"),
    (bl, "../data/bl_attacks.p"),
    (kreise, "../data/kreise_attacks.p"),
):
    frame.to_pickle(target)

In [None]:
# also persist the complete (non-aggregated) overlay result for the Landkreise
res_intersect_kreise.to_pickle("../data/kreise_full.p")

In [None]:
# display the Landkreise overlay result
res_intersect_kreise