In [None]:
import pandas as pd
import numpy as np
import geopandas as gp
import matplotlib.pyplot as plt 
import sys
sys.path.append("..") # src exists in the parent directory
import src.config
# File name of the GeoPackage that load_speed_data() reads.
output_name = "LastFourQuartersOrBestEstimate_On_DissolvedSmallerCitiesHexes.gpkg"
# Directory containing that file, located under the project's configured data directory.
output_dir = src.config.DATA_DIRECTORY / "processed" / "statistical_geometries"

In [None]:
def convert_kbps_to_mbps(table, copy=False):
    """Divide every column whose name contains "kbps" by 1000 and rename it to "Mbps".

    Parameters
    ----------
    table : pandas.DataFrame (or subclass)
        Table whose "kbps" columns should be converted.
    copy : bool, default False
        When True the conversion is applied to a copy and the caller's table
        is left untouched; when False ``table`` itself is modified.

    Returns
    -------
    The converted table (``table`` itself unless ``copy`` is True).
    """
    if copy:
        table = table.copy()

    renames = {}
    for col in table.columns:
        # Column labels are not guaranteed to be strings; a substring test on
        # a non-string label would raise TypeError.
        if not isinstance(col, str) or "kbps" not in col:
            continue
        # Replace the whole column rather than writing into it in place, so an
        # integer column can become float without a dtype-mismatch complaint.
        table[col] = table[col] / 1000
        renames[col] = col.replace("kbps", "Mbps")

    if renames:
        # One rename after the loop instead of one per converted column.
        table.rename(columns=renames, inplace=True)
    return table

In [None]:
def load_speed_data():
    """Read the speed GeoPackage and attach the derived columns used by this notebook.

    Reads ``output_dir / output_name``, adds ``Ookla_Pop_at_50_10``, converts
    kbps columns to Mbps, adds the boolean ``is_rural`` column and replaces
    missing ``PCCLASS`` values with an empty string.

    Returns
    -------
    The table returned by ``gp.read_file`` with the changes above applied.
    """
    print("Loading speed data...")

    hexes = gp.read_file(output_dir / output_name, driver="GPKG")

    # Population scaled by the Ookla percentile column (presumably a 0-100
    # percentage, hence the division by 100 -- confirm against the data).
    ookla_percent = hexes["ookla_50_10_percentile"]
    hexes["Ookla_Pop_at_50_10"] = hexes["Pop2016"] * ookla_percent / 100

    hexes = convert_kbps_to_mbps(hexes)

    # Anything whose class is not "2", "3" or "4" (including missing values)
    # is flagged as rural.
    in_population_centre = hexes["PCCLASS"].isin(["2", "3", "4"])
    hexes["is_rural"] = ~in_population_centre

    # speed_data["PRUID"] = speed_data["PRCODE"].replace(PRCODE_MAP)
    hexes["PCCLASS"] = hexes["PCCLASS"].fillna("")
    return hexes

In [None]:
def pcclass_percentage_breakdown(sample_data):
    """Summarise population with 50/10 coverage per population-centre class.

    Parameters
    ----------
    sample_data : pandas.DataFrame (or subclass)
        Must contain ``PCCLASS``, ``Pop2016``, ``Pop2016_at_50_10_Combined``
        and ``Ookla_Pop_at_50_10``.

    Returns
    -------
    pandas.DataFrame
        One row per ``PCCLASS`` value (relabelled "Small"/"Medium"/"Large"/
        "Rural" for "2"/"3"/"4"/"") plus a final "Total" row computed over all
        rows of ``sample_data``. Columns: ``Pop2016``,
        ``StatCan_Pop_at_50_10``, ``Ookla_Pop_at_50_10``,
        ``Percentage_StatCan`` and ``Percentage_Ookla``. The index is named
        "Population Center Type".
    """
    value_cols = ["Pop2016", "Pop2016_at_50_10_Combined", "Ookla_Pop_at_50_10"]
    index_name = "Population Center Type"
    class_labels = {"2": "Small", "3": "Medium", "4": "Large", "": "Rural"}

    stats_table = sample_data.groupby("PCCLASS")[value_cols].sum()
    stats_table.index = pd.Index(
        pd.Series(stats_table.index).replace(class_labels).values
    ).rename(index_name)

    totals = sample_data[value_cols].sum().rename("Total")
    totals = pd.DataFrame(totals).T
    # Concatenating with the unnamed "Total" index drops the index name, so it
    # is re-applied to the combined table.
    stats_table = pd.concat([stats_table, totals], axis=0).rename_axis(index_name)

    stats_table["Percentage_StatCan"] = (
        stats_table["Pop2016_at_50_10_Combined"] / stats_table["Pop2016"] * 100
    )
    stats_table["Percentage_Ookla"] = (
        stats_table["Ookla_Pop_at_50_10"] / stats_table["Pop2016"] * 100
    )

    return stats_table.rename(
        columns={"Pop2016_at_50_10_Combined": "StatCan_Pop_at_50_10"}
    )

In [None]:
# Reading the file takes about half a minute.
speed_data = load_speed_data()
# Per-row difference between the Pop_Avail_50_10 column and the Ookla
# percentile column; positive where Pop_Avail_50_10 is the larger of the two.
speed_data["Discrepancy"] = speed_data["Pop_Avail_50_10"].sub(
    speed_data["ookla_50_10_percentile"]
)
print(speed_data.columns)
print(speed_data.shape)

In [None]:
# Coverage breakdown per population-centre type. Only the final expression of
# a cell is rendered, so the table is the single trailing expression here.
processed_data = pcclass_percentage_breakdown(speed_data)
processed_data

In [None]:
# Discrepancy against Pop_Avail_50_10, with a black reference line at zero.
speed_data.plot.scatter(
    x="Pop_Avail_50_10",
    y="Discrepancy",
    s=4,
    alpha=0.5,
).axhline(y=0, color="k")

* Are all the large discrepancies above caused by low population/low tests entries?
* Sum along axis to quantify the overestimations and underestimations.

In [None]:
# Discrepancy against population, with a black reference line at zero.
# Note: points close to zero discrepancy are also overestimated.
speed_data.plot.scatter(
    x="Pop2016",
    y="Discrepancy",
    s=4,
    alpha=0.5,
).axhline(y=0, color="k")

* Large discrepancies relate to low population entries.
* Large population entries have lower discrepancy but consistently overestimated speed by StatCan.

In [None]:
# Number of tests against population. Many regions have a very low
# population; re-enable the limits below to zoom in on that corner.
speed_data.plot.scatter(
    x="Pop2016",
    y="tests",
    s=4,
    alpha=0.5,
)
# plt.xlim(-1,100)
# plt.ylim(-1,100)

* Shows a lot of points in low tests/low population region
* How many points? Should they be discarded in the ML model?

In [None]:
# Running population total over rows ordered from least to most populated,
# drawn on a logarithmic y-axis.
plt.plot(
    speed_data["Pop2016"]
    .sort_values()
    .cumsum()
    .values
)
plt.yscale("log")
plt.ylabel("Cumulative Population")
plt.xlabel("Number of sorted rows")

* About 8k entries have close to zero population.

In [None]:
# Number of rows with a missing "tests" value. Summing the NaN mask directly
# avoids the sort and running total, and gives 0 instead of an IndexError
# when the column is empty.
speed_data["tests"].isna().sum()

* Over 13k entries have nan tests.

In [None]:
# Rows without a "tests" value, ordered by population and limited to the
# columns that identify each row.
(
    speed_data.loc[
        speed_data["tests"].isna(),
        [
            "Pop2016",
            "PRCODE",
            "PCNAME",
            "PCCLASS",
            "PCPUID",
            "HEXUID_PCPUID",
            "Pop_Avail_50_10",
            "is_rural",
        ],
    ]
    .sort_values("Pop2016")
)

* Most nan tests are from low population entries.
* Some nan tests occur for moderate population too. What gives?

* Why are population numbers fractional? Unit conversion? or, population distributed over multiple hexes?