In [None]:
import geopandas as gpd
import pandas as pd

In [None]:
# Zipped geodata layers (presumably Natural Earth, going by the `ne_10m_*`
# file names); each one is read with gpd.read_file further down.
FP_NE_ADMIN0 = "data/ne/ne_10m_admin_0_countries.zip"
FP_NE_ADMIN1 = "data/ne/ne_10m_admin_1_states_provinces.zip"
FP_NE_CITIES = "data/ne/ne_10m_populated_places_simple.zip"
# Local CSV copy of the Google Sheet; written and read by load_sheet().
FP_OLIVIA = "data/olivia_cities.csv"

In [None]:
# pd.options.display.max_columns = 999

## Preparing Olivia data

### Read from Google Sheets

In [None]:
def load_sheet(
    refresh=False,
    sheet_id="12I5GR8v8H1LVXtjYX5S-v6zuO0up_aY3D3BqplzSu4s",
    sheet_num="0",
    save_path=FP_OLIVIA,
):
    """
    Return the sheet contents as a DataFrame, read from the CSV at `save_path`.

    With `refresh=True` the Google Sheet's CSV export is downloaded first and
    written over `save_path`; either way the returned frame is read back from
    that file.

    Parameters
    ----------
    refresh : bool
        Download a fresh copy before reading. Defaults to False, i.e. use the
        file that is already on disk.
    sheet_id : str
        Spreadsheet id placed in the export URL.
    sheet_num : str
        Value of the `gid` query parameter in the export URL.
    save_path : str
        CSV path that is (optionally) rewritten and then read.

    Returns
    -------
    pandas.DataFrame
    """
    if refresh:
        export_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?gid={sheet_num}&format=csv"
        # overwrite the local copy with the downloaded sheet
        pd.read_csv(export_url).to_csv(save_path, index=False)

    # both branches end by reading the file on disk
    return pd.read_csv(save_path)

In [None]:
# df = load_sheet(refresh=True)

In [None]:
def sort_by_date(df):
    """
    Return `df` ordered by its 'date' column.

    The column is cast to datetime64[ns] for the sort and cast back to `str`
    afterwards, so the result holds strings again rather than timestamps.
    """
    return (
        df
        .astype({'date': 'datetime64[ns]'})
        .sort_values('date')
        .astype({'date': str})
    )

In [None]:
def get_data(refresh=False):
    """
    Load the sheet and return it sorted by date.

    `refresh` is passed straight through to `load_sheet`.
    """
    return sort_by_date(load_sheet(refresh=refresh))

In [None]:
# Load the saved sheet, sorted by date, and show (rows, columns) as a quick check.
df = get_data()
df.shape

### Preparing my dataset

In [None]:
def prepare_city_dataset(df):
    """
    Collapse the visit log to one row per city.

    A city is identified by (name, adm1name, adm0name) rather than by name
    alone, so same-named cities in different states or countries stay
    separate rows instead of being merged into one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns name, adm1name, adm0name, biked and date.
        The date kept per city is the first non-missing one in row order, so
        pass the frame already sorted by date (as `get_data` returns it) to
        get the earliest visit.

    Returns
    -------
    pandas.DataFrame
        Columns name, adm1name, adm0name, biked, date. `biked` is True when
        any row of the city has a truthy `biked`.
    """
    key_cols = ["name", "adm1name", "adm0name"]
    use_cols = key_cols + ["biked", "date"]

    # Rows without a city name cannot be identified; leave them out.
    visits = df.loc[:, use_cols].dropna(subset=["name"])

    # dropna=False keeps cities whose adm1name/adm0name is missing; the
    # default would silently drop every row with a missing key.
    return (
        visits
        .groupby(key_cols, dropna=False)
        .agg({'biked': 'any', 'date': 'first'})
        .reset_index()
    )

In [None]:
def prepare_state_dataset(df):
    """
    Collapse the visit log to one row per state/province.

    A region is identified by (adm1name, adm0name) rather than by adm1name
    alone, so same-named regions in different countries stay separate rows.
    These are also the two columns the result shares with `regions_simple`
    when the two are merged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns adm1name, adm0name, biked and date.
        The date kept per region is the first non-missing one in row order,
        so pass the frame already sorted by date (as `get_data` returns it)
        to get the earliest visit.

    Returns
    -------
    pandas.DataFrame
        Columns adm1name, adm0name, biked, date. `biked` is True when any
        row of the region has a truthy `biked`.
    """
    key_cols = ["adm1name", "adm0name"]
    use_cols = key_cols + ["biked", "date"]

    # Rows without a region name cannot be identified; leave them out.
    visits = df.loc[:, use_cols].dropna(subset=["adm1name"])

    # dropna=False keeps regions whose adm0name is missing; the default
    # would silently drop every row with a missing key.
    return (
        visits
        .groupby(key_cols, dropna=False)
        .agg({'biked': 'any', 'date': 'first'})
        .reset_index()
    )

In [None]:
def prepare_country_dataset(df):
    """
    Collapse the visit log to one row per country (`adm0name`).

    Per country, `biked` is True when any row has a truthy `biked`, and
    `date` is the first non-missing date in row order (the earliest one
    when `df` is already sorted by date).

    Returns a DataFrame with the columns adm0name, biked, date.
    """
    visits = df[["adm0name", "biked", "date"]]
    per_country = visits.groupby('adm0name').agg(
        biked=('biked', 'any'),
        date=('date', 'first'),
    )
    return per_country.reset_index()

## Get US States, non-US countries

In [None]:
# Keyword arguments unpacked into the .explore() call that draws the map.
map_args = dict(tiles="CartoDB positron")

In [None]:
# Reload the date-sorted visit log and split it on country: US rows feed the
# per-state layer, all other rows feed the per-country layer.
df = get_data()

in_usa = df["adm0name"] == "United States of America"
visited_usa = df.loc[in_usa]
visited_abroad = df.loc[~in_usa]

### US States (admin1)

In [None]:
# --- export ---

# readable_list = [ "name", "name_alt", "admin", "adm0_a3", "adm1_code", "region", "type", "type_en", "postal"]
# regions[readable_list].to_csv("data/ne_out/ne_admin1.csv")

In [None]:
# Load the admin-1 (states/provinces) layer.
regions = gpd.read_file(FP_NE_ADMIN1)

# Keep the row index, the two name columns and the geometry, renamed to the
# column names used by the visit data so the two frames can be merged.
regions_simple = (
    regions
    .reset_index()
    .loc[:, ["index", "name", "admin", "geometry"]]
    .rename(columns={"index": "adm1index", "name": "adm1name", "admin": "adm0name"})
)

In [None]:
# Visited US rows collapsed to one row per state (biked flag and first date).
v = prepare_state_dataset(visited_usa)
# v.head()

In [None]:
# Inner join on the two columns both frames share. This attaches a geometry
# to every visited state; states with no match in `regions_simple` drop out.
merged_states = pd.merge(
    regions_simple,
    v,
    on=["adm1name", "adm0name"],
    how="inner",
)
merged_states.head()

In [None]:
# Every visited state must have found a geometry: the merge that built
# `merged_states` keeps only matching rows, so a name mismatch shows up as a
# shorter frame. The message (built only on failure) lists the names in `v`
# that are absent from `merged_states`.
assert len(v) == len(merged_states), (
    "visited states without a match in the admin-1 layer: "
    f"{set(v['adm1name']) - set(merged_states['adm1name'])}"
)

In [None]:
# m = merged_states.explore(
#     column="biked",
#     tooltip=["adm1name", "biked", "date"],
#     **map_args,
# )

### Countries (admin0)

In [None]:
# Load the admin-0 (countries) layer.
countries = gpd.read_file(FP_NE_ADMIN0)

# Keep the row index, the country name and the geometry, renamed to the
# column names used by the visit data so the two frames can be merged.
countries_simple = (
    countries
    .reset_index()
    .loc[:, ["index", "ADMIN", "geometry"]]
    .rename(columns={"index": "adm0index", "ADMIN": "adm0name"})
)

In [None]:
# Inner join on the one shared column attaches a geometry to every visited
# country outside the US.
# NOTE(review): unlike the states merge, nothing here checks that every
# visited country found a match; unmatched names drop out silently.
merged_countries = pd.merge(
    countries_simple,
    prepare_country_dataset(visited_abroad),
    on="adm0name",
    how="inner",
)

merged_countries.head()

In [None]:
# merged_countries.explore(
#     m=m,
#     tooltip=["adm0name", "biked", "date"],
#     **map_args
# )

### Combined

In [None]:
# Stack both layers into one frame with a shared `name` column: the country
# name for the abroad rows, the state name for the US rows.
country_layer = merged_countries.rename(columns={'adm0name': 'name'})
state_layer = merged_states.rename(columns={'adm1name': 'name'})
merged = pd.concat([country_layer, state_layer])

In [None]:
# Interactive map of the combined layer, using `biked` as the value column and
# showing name + biked in the tooltip; `map_args` supplies the tile set.
# The trailing `m` displays the map as the cell output.
m = merged.explore(
    column="biked",
    tooltip=["name", "biked"],
    **map_args
)
m

In [None]:
# Write the map to map.html (relative path, so it lands in the working directory).
m.save("map.html")

## Other

### Find visited places with no match in the cities layer

In [None]:
# `gdf` is not assigned until the "Cities" section further down, so load the
# cities layer here to keep this cell runnable on a top-to-bottom run.
gdf = gpd.read_file(FP_NE_CITIES)

# Right join keeps every row of `df`; rows whose gdf-side columns come back
# NaN are visits with no (name, adm0name) match in the cities layer.
# NOTE(review): this reuses the name `merged`, replacing the combined map
# layer built in the "Combined" section.
merged = gdf.merge(df, on=["name", "adm0name"], how='right')

In [None]:
# merged[merged['index_x'].isna()]

In [None]:
# Join on name + country with merge's default `how`: only rows of `df` that
# match a row of `gdf` on both columns are kept.
# NOTE(review): needs `gdf` to exist already; in this notebook it is assigned
# in the "Cities" section further down. Overwrites `merged`.
merged = gdf.merge(df, on=["name", "adm0name"])

In [None]:
# Same join, but additionally keyed on adm1name, so a row only matches when
# name, state/province and country all agree.
# NOTE(review): needs `gdf` to exist already; in this notebook it is assigned
# in the "Cities" section further down. Overwrites `merged`.
merged = gdf.merge(df, on=["name", "adm1name", "adm0name"])

## Cities

In [None]:
# pd.set_option("display.max.columns", None)

In [None]:
# get olivia data
df = load_sheet(refresh=False)

# prepare it
df = df.reset_index()
df["date"] = pd.to_datetime(df["date"])

In [None]:


# get cities data
gdf_full = gpd.read_file(FP_NE_CITIES)

# prepare it
# gdf = gdf_full.reset_index().loc[:, ["index", "name", "adm1name", "adm0name"]]
# No column selection is active (the line above is commented out), so `gdf`
# is just a second name for `gdf_full`, not a copy.
gdf = gdf_full

In [None]:
# gdf_full[gdf_full["NAME"] == "Helsinki"]