In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict

from helpers import bendford_comparison_figure, plot_oom_dist

### Convenience Functions for Retrieving Datasets

In [2]:
# Root directory that holds every dataset group (relative to the working directory).
data_root = Path("data")

# Dataset group name -> directory containing that group's CSV files.
data_group_location_map = {
    group: data_root / folder
    for group, folder in (
        ("covid", "covid"),
        ("ukraine-war", "war_in_ukraine"),
        ("world-population", "world_population"),
    )
}
def get_csv_file_path(group: str, filename: str) -> Path:
    """Returns the path to a CSV dataset inside its group directory.

    The path is the group's directory from ``data_group_location_map``
    joined with ``filename``. It is NOT resolved to an absolute path (the
    group directories are built from a relative data root), and the file's
    existence is not checked.

    Args:
        group (str): dataset group, i.e. 'covid'
        filename (str): the name of the file within the group.

    Returns:
        Path: path to the specified file.

    Raises:
        KeyError: if ``group`` is not a key of ``data_group_location_map``.
    """
    try:
        group_dir = data_group_location_map[group]
    except KeyError:
        # Same exception type as a plain lookup, but tell the caller which
        # group names are actually available.
        known_groups = ", ".join(sorted(data_group_location_map))
        raise KeyError(
            f"Unknown dataset group {group!r}; expected one of: {known_groups}"
        ) from None

    return group_dir / filename


def get_dataframe(group: str, filename: str) -> pd.DataFrame:
    """Loads one CSV dataset into a Pandas DataFrame.

    The file location is looked up with ``get_csv_file_path`` and the file
    is parsed by ``pd.read_csv`` using its default options.

    Args:
        group (str): dataset group, i.e. 'covid'
        filename (str): the name of the file within the group.

    Returns:
        pd.DataFrame: dataframe with loaded data
    """
    return pd.read_csv(get_csv_file_path(group, filename))

### Prepare the Datasets

In [3]:
# Registry filled by the cells below:
# dataset name -> {"sequence": flat integer array, "title": human readable title}
dataset_sequences: Dict[str, Dict] = dict()

#### Samples from Random Uniform Distribution
Synthetic data, used as a baseline for comparison with the real-world datasets below.

In [4]:
name = "random_uniform"
title = "Random Uniform"

# Seed a dedicated generator so that "Restart & Run All" reproduces the same
# synthetic sample (and therefore the same figures) on every run.
random_uniform_rng = np.random.default_rng(42)

# 1000 draws from the uniform distribution on [0, 1e10). The draws are already
# non-negative and one-dimensional, so no abs() / ravel() is required.
sequence = random_uniform_rng.uniform(0, 1e10, 1000)
dataset_sequences[name] = dict(
    # Truncate to whole numbers, like every other dataset in the registry.
    sequence=sequence.astype(np.int64),
    title=title
)

#### World Population
See https://www.kaggle.com/datasets/tanuprabhu/population-by-country-2020

In [5]:
name = "world_population_2020"
title = "World Population (2020)"

# Keep only the three columns that are analysed; their values are pooled
# into a single flat integer sequence below.
world_population_data = get_dataframe(
    "world-population", "world_population_2020.csv"
)[["Population (2020)", "Density (P/Km²)", "Land Area (Km²)"]]

dataset_sequences[name] = {
    "sequence": world_population_data.values.ravel().astype(np.int64),
    "title": title,
}

#### World Covid Cases
See https://www.kaggle.com/datasets/imdevskp/corona-virus-report?select=covid_19_clean_complete.csv

In [6]:
name = "world_covid_cases"
title = "World Covid Cases"

# Load through the shared get_dataframe helper instead of calling
# pd.read_csv(get_csv_file_path(...)) by hand, then drop the label columns
# and the derived ratio / weekly-change columns before pooling the rest.
world_covid_data = get_dataframe("covid", "covid_cases_country.csv").drop(
    columns=[
        "Country/Region",
        "WHO Region",
        "1 week change",
        "1 week % increase",
        "Deaths / 100 Recovered",
        "Recovered / 100 Cases",
        "Deaths / 100 Cases",
    ]
)
dataset_sequences[name] = dict(
    # All remaining columns are flattened into one integer sequence.
    sequence=world_covid_data.values.ravel().astype(np.int64),
    title=title
)

#### US County Covid Cases
See https://www.kaggle.com/datasets/sudalairajkumar/covid19-in-usa

In [7]:
name = "usa_covid_cases"
title = "USA Covid Cases"

# Load through the shared get_dataframe helper instead of calling
# pd.read_csv(get_csv_file_path(...)) by hand. Only the two count columns are
# kept, and rows with a missing value in either are removed before the
# integer cast below.
covid_data_usa = (
    get_dataframe("covid", "covid_cases_us_counties.csv")[["cases", "deaths"]]
    .dropna()
)
dataset_sequences[name] = dict(
    # Both columns are flattened into one integer sequence.
    sequence=covid_data_usa.values.ravel().astype(np.int64),
    title=title
)

#### Russia Covid Cases
See https://data.humdata.org/dataset/covid-19-cases-data-in-russia

In [8]:
name = "russia_covid_cases"
title = "Covid (Russia)"

# Load through the shared get_dataframe helper instead of calling
# pd.read_csv(get_csv_file_path(...)) by hand, keeping only the three
# columns that are analysed.
covid_data_russia = get_dataframe("covid", "covid_cases_russia.csv")[
    ["infected", "recovered", "died"]
]
dataset_sequences[name] = dict(
    # All three columns are flattened into one integer sequence.
    sequence=covid_data_russia.values.ravel().astype(np.int64),
    title=title
)


#### Russia Equipment Losses During War in Ukraine
See https://www.kaggle.com/datasets/piterfm/2022-ukraine-russian-war

In [9]:
name = "russia_losses_equipment"
title = "Russia Losses (Equipment)"

# Load through the shared get_dataframe helper instead of calling
# pd.read_csv(get_csv_file_path(...)) by hand.
# "date" and "day" are dropped along with two equipment columns.
# NOTE(review): the two equipment columns are presumably excluded because they
# contain missing values that would break the int64 cast — confirm against the CSV.
russia_losses_equipment = get_dataframe(
    "ukraine-war", "russia_losses_equipment.csv"
).drop(columns=["date", "day", "special equipment", "mobile SRBM system"])

dataset_sequences[name] = dict(
    # All remaining columns are flattened into one integer sequence.
    sequence=russia_losses_equipment.values.ravel().astype(np.int64),
    title=title
)

### Plot Comparison Figures and Order of Magnitude Distributions

In [10]:
# Building every figure takes roughly 20 seconds.
dataset_figures = {}
dataset_count = len(dataset_sequences)
for index, dataset_name in enumerate(dataset_sequences):
    dataset = dataset_sequences[dataset_name]
    # The trailing "\r" lets each progress message overwrite the previous one.
    print(f"Plotting {index + 1} / {dataset_count}.", end="\r")

    # One digit-distribution comparison figure and one order-of-magnitude
    # figure per dataset, both labelled with the dataset's title.
    bendford_figure = bendford_comparison_figure(
        sequence=dataset["sequence"], legend=dataset["title"]
    )
    oom_dist_figure = plot_oom_dist(dataset["sequence"], legend=dataset["title"])

    # Optional static export, left disabled:
    # bendford_figure.write_image(f"../figures/{dataset_name.lower().replace(' ', '-')}.png")
    # oom_dist_figure.write_image(f"../figures/{dataset_name.lower().replace(' ', '-')}-oom.png")
    dataset_figures[dataset_name] = {
        "bendford": bendford_figure,
        "oom": oom_dist_figure,
    }
print("Finished plotting.")

Finished plotting.


In [13]:
# Bare expression: the notebook displays the repr of the whole figure dictionary
# (every dataset name with its "bendford" and "oom" figures).
dataset_figures

{'random_uniform': {'bendford': Figure({
      'data': [{'line': {'color': 'rgb(231,41,138)'},
                'name': "Bendford's Law.",
                'type': 'scatter',
                'x': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                'y': array([0.30103   , 0.17609126, 0.12493874, 0.09691001, 0.07918125, 0.06694679,
                            0.05799195, 0.05115252, 0.04575749])},
               {'line': {'color': 'MediumPurple'},
                'name': 'Random Uniform',
                'type': 'scatter',
                'x': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                'y': array([0.127, 0.11 , 0.115, 0.108, 0.108, 0.109, 0.101, 0.11 , 0.112])}],
      'layout': {'height': 700,
                 'legend': {'x': 0.85, 'xanchor': 'left', 'y': 0.99, 'yanchor': 'top'},
                 'template': '...',
                 'title': {'text': ('<b>Digit Occurrence Distributi' ... '-size: 14px;">N = 1000.</span>')},
                 'width': 900,
                 'xaxis': {

In [14]:
# Dataset names in insertion order, so the figures can be addressed by position.
key_list = list(dataset_figures)

In [15]:
index = 0
# Fetch the figure pair of the dataset at this position, then render both.
bendf_plot, oom_plot = (
    dataset_figures[key_list[index]]["bendford"],
    dataset_figures[key_list[index]]["oom"],
)

bendf_plot.show()
oom_plot.show()

In [16]:
index = 1
# Fetch the figure pair of the dataset at this position, then render both.
bendf_plot, oom_plot = (
    dataset_figures[key_list[index]]["bendford"],
    dataset_figures[key_list[index]]["oom"],
)

bendf_plot.show()
oom_plot.show()

In [17]:
index = 2
# Fetch the figure pair of the dataset at this position, then render both.
bendf_plot, oom_plot = (
    dataset_figures[key_list[index]]["bendford"],
    dataset_figures[key_list[index]]["oom"],
)

bendf_plot.show()
oom_plot.show()

In [20]:
index = 3
# Fetch the figure pair of the dataset at this position; only the first is rendered.
bendf_plot, oom_plot = (
    dataset_figures[key_list[index]]["bendford"],
    dataset_figures[key_list[index]]["oom"],
)

bendf_plot.show()
# oom_plot.show() # output too large for deepnote

In [19]:
index = 4
# Fetch the figure pair of the dataset at this position, then render both.
bendf_plot, oom_plot = (
    dataset_figures[key_list[index]]["bendford"],
    dataset_figures[key_list[index]]["oom"],
)

bendf_plot.show()
oom_plot.show()

In [21]:
index = 5
# Fetch the figure pair of the dataset at this position, then render both.
bendf_plot, oom_plot = (
    dataset_figures[key_list[index]]["bendford"],
    dataset_figures[key_list[index]]["oom"],
)

bendf_plot.show()
oom_plot.show()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e23ee56b-a103-4c1f-a760-e62f238ef4cc' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>