<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [44]</a>'.</span>

In [None]:
# Star import: every helper used below (fetch/plot/api/dirs/git, pd, plt, sns, ...) comes from here.
from yugiquery import *

# NOTE(review): presumably the itables hook that renders DataFrames interactively — confirm.
init_notebook_mode(all_interactive=True)

# NOTE(review): presumably renders the report header for the "Sets" report — confirm.
header("Sets")

---

Table of Contents <a class="jp-toc-ignore"></a>
=================
* [1 Data acquisition](#data-acquisition)
  * [1.1 Fetch set lists](#fetch-set-lists)
  * [1.2 Fetch set properties](#fetch-set-properties)
  * [1.3 Merge data](#merge-data)
* [2 Check changes](#check-changes)
  * [2.1 Load previous data](#load-previous-data)
  * [2.2 Generate changelog](#generate-changelog)
  * [2.3 Save data](#save-data)
* [3 Data visualization](#data-visualization)
  * [3.1 Full data](#full-data)
  * [3.2 Set](#set)
  * [3.3 Card number](#card-number)
  * [3.4 Name](#name)
  * [3.5 Rarity](#rarity)
    * [3.5.1 By unique](#by-unique)
    * [3.5.2 By combination](#by-combination)
  * [3.6 Print](#print)
  * [3.7 Quantity](#quantity)
  * [3.8 Region](#region)
  * [3.9 Cover card](#cover-card)
  * [3.10 Series](#series)
  * [3.11 Set type](#set-type)
  * [3.12 Release](#release)
    * [3.12.1 By year](#by-year)
    * [3.12.2 By month](#by-month)
    * [3.12.3 By day](#by-day)
    * [3.12.4 By region](#by-region)
    * [3.12.5 By series](#by-series)
    * [3.12.6 By set type](#by-set-type)
* [4 Debug](#debug)
  * [4.1 Has nan name/number](#has-nan-name/number)
  * [4.2 Has nan rarity](#has-nan-rarity)
  * [4.3 Has nan release](#has-nan-release)
  * [4.4 HTML export](#html-export)
  <!-- * [4.5 Git](#git) -->

# Data acquisition

## Fetch set lists

In [None]:
# Timestamp (UTC), taken before fetching; reused below for output file names,
# the benchmark call and the commit message.
timestamp = arrow.utcnow()

# Fetch every set list into one DataFrame (columns used below include "Set",
# "Card number", "Name", "Rarity", "Print", "Quantity" and "Region").
all_set_lists_df = fetch_all_set_lists()

## Fetch set properties

In [None]:
# Distinct set names found in the fetched lists; these are the sets to query.
sets = all_set_lists_df["Set"].unique()
# Fetch the per-set properties for those sets.
set_info_df = api.fetch_set_info(sets)

## Merge data

In [None]:
# Add properties to set lists.
# The merged frame replaces all_set_lists_df and is what every later cell reads.
all_set_lists_df = merge_set_info(all_set_lists_df, set_info_df)

# Check changes

## Load previous data

In [None]:
# Load the most recent saved "sets" data, if any exists.
tuple_cols = ["Rarity", "Cover card"]
previous_df, previous_ts = load_latest_data("sets", tuple_cols)

if previous_df is not None:
    # Cast the columns shared with the fresh data to the fresh data's dtypes.
    shared_columns = previous_df.columns.intersection(all_set_lists_df.columns)
    previous_df = previous_df.astype(all_set_lists_df[shared_columns].dtypes.to_dict())

## Generate changelog

In [None]:
if previous_df is not None:
    # Compare the previous snapshot with the fresh data, keyed on "Card number".
    changelog = generate_changelog(previous_df, all_set_lists_df, col="Card number")
    if not changelog.empty:
        display(changelog)
        changelog_file = make_filename(report="sets", timestamp=timestamp, previous_timestamp=previous_ts)
        changelog.to_csv(dirs.DATA / changelog_file, index=True)
        print("Changelog saved")
else:
    # No previous snapshot to compare against.
    changelog = None
    print("Skipped")

## Save data

In [None]:
# Save when there was nothing to compare against, or when the changelog has entries.
if changelog is None or not changelog.empty:
    data_file = make_filename(report="sets", timestamp=timestamp)
    all_set_lists_df.to_csv(dirs.DATA / data_file, index=False)
    print("Data saved")
else:
    print("No changes. New data not saved")

# Data visualization

## Full data

In [None]:
# Display the merged set-list table.
all_set_lists_df

The full data is available [here](../data).

## Set

In [None]:
# Number of distinct values in the "Set" column (missing values are not counted).
print("Total number of sets:", all_set_lists_df["Set"].nunique())

In [None]:
# Per set: distinct-value count of every other column (rows with a missing set keep their own group).
all_set_lists_df.groupby("Set", dropna=False).nunique()

In [None]:
# Summary statistics of the per-set distinct-value counts.
all_set_lists_df.groupby("Set", dropna=False).nunique().describe()

## Card number

In [None]:
# Number of distinct card numbers (missing values are not counted).
print("Total number of card numbers:", all_set_lists_df["Card number"].nunique())

In [None]:
# Per card number: distinct-value count of every other column (missing card numbers keep their own group).
all_set_lists_df.groupby("Card number", dropna=False).nunique()

## Name

In [None]:
# Number of distinct card names (missing values are not counted).
print("Total number of card names:", all_set_lists_df["Name"].nunique())

In [None]:
# Per card name: distinct-value count of every other column (missing names keep their own group).
all_set_lists_df.groupby("Name", dropna=False).nunique()

In [None]:
# Summary statistics of the per-name counts; unlike the cell above, rows with a missing name are dropped here.
all_set_lists_df.groupby("Name", dropna=True).nunique().describe()

## Rarity

In [None]:
# "Rarity" is loaded as a tuple column, so explode it to one rarity per row before counting distinct values.
print("Total number of rarities:", all_set_lists_df["Rarity"].explode().nunique())

### By unique

In [None]:
# One row per individual rarity, then the distinct-value count of every other column per rarity.
all_set_lists_df.explode("Rarity").groupby("Rarity", dropna=False).nunique()

In [None]:
# Occurrences of each individual rarity, as horizontal bars on a log-scaled x axis.
rarity_counts = all_set_lists_df["Rarity"].explode().value_counts()
rarity_counts.plot.barh(figsize=(10, 20), grid=True)
plt.xscale("log")
plt.show()

### By combination

In [None]:
# Group on the whole "Rarity" cell (not exploded), i.e. on each combination of rarities.
all_set_lists_df.groupby("Rarity", dropna=False).nunique()

In [None]:
# Occurrences of each rarity combination, as horizontal bars on a log-scaled x axis.
rarity_combination_counts = all_set_lists_df["Rarity"].value_counts()
rarity_combination_counts.plot.barh(figsize=(10, 80), grid=True)
plt.xscale("log")
plt.show()

## Print

In [None]:
# Number of distinct values in the "Print" column (missing values are not counted).
print("Total number of prints:", all_set_lists_df["Print"].nunique())

In [None]:
# Per print value: distinct-value count of every other column (missing prints keep their own group).
all_set_lists_df.groupby("Print", dropna=False).nunique()

In [None]:
# Occurrences of each print value, as vertical bars on a log-scaled y axis.
print_counts = all_set_lists_df["Print"].value_counts()
print_counts.plot.bar(figsize=(18, 6), grid=True, rot=45)
plt.yscale("log")
plt.show()

## Quantity

In [None]:
# Number of distinct values in the "Quantity" column (missing values are not counted).
print("Total number of quantities:", all_set_lists_df["Quantity"].nunique())

In [None]:
# Per quantity: distinct-value count of every other column (missing quantities keep their own group).
all_set_lists_df.groupby("Quantity", dropna=False).nunique()

## Region

In [None]:
# Number of distinct values in the "Region" column (missing values are not counted).
print("Total number of regions:", all_set_lists_df["Region"].nunique())

In [None]:
# Per (set, region) pair: distinct-value count of every other column; missing keys keep their own group.
all_set_lists_df.groupby(["Set", "Region"], dropna=False).nunique()

In [None]:
# Summary statistics of the per-region distinct-value counts.
all_set_lists_df.groupby("Region", dropna=False).nunique().describe()

In [None]:
# Number of rows per region, as a vertical bar chart.
rows_per_region = all_set_lists_df["Region"].value_counts()
rows_per_region.plot.bar(figsize=(18, 6), grid=True, rot=0)
plt.show()

## Cover card

In [None]:
# "Cover card" is loaded as a tuple column, so explode it to one card per row
# before counting distinct values. Series.explode() has no column parameter
# (its only parameter is ignore_index), so it is called without arguments.
print(
    "Total number of cover cards:",
    all_set_lists_df["Cover card"].explode().nunique(),
)

In [None]:
# One row per individual cover card, then the distinct-value count of every other column per card
# (rows without a cover card are dropped by the default dropna=True).
all_set_lists_df.explode("Cover card").groupby(["Cover card"]).nunique()

In [None]:
# Number of distinct sets each cover card appears on, largest first.
sets_per_cover_card = all_set_lists_df.explode(["Cover card"]).groupby("Cover card")["Set"].nunique()
sets_per_cover_card.sort_values(ascending=False).plot.barh(figsize=(10, 250), grid=True)
plt.show()

## Series

In [None]:
# Number of distinct values in the "Series" column (missing values are not counted).
print("Total number of series:", all_set_lists_df["Series"].nunique())

In [None]:
# Per series: distinct-value count of every other column (rows without a series are dropped).
all_set_lists_df.groupby("Series").nunique()

In [None]:
# Number of distinct sets per series, largest first.
sets_per_series = all_set_lists_df.explode(["Series"]).groupby("Series")["Set"].nunique()
sets_per_series.sort_values(ascending=False).plot.barh(figsize=(10, 15), grid=True)
plt.show()

## Set type

In [None]:
# Number of distinct values in the "Set type" column (missing values are not counted).
print("Total number of set types:", all_set_lists_df["Set type"].nunique())

In [None]:
# Per set type: distinct-value count of every other column (rows without a set type are dropped).
all_set_lists_df.groupby("Set type").nunique()

In [None]:
# Counts come from set_info_df (the fetched set properties), not from the merged card-level table.
set_type_counts = set_info_df["Set type"].value_counts()
set_type_counts.plot.barh(figsize=(10, 10), grid=True)
plt.show()

## Release

In [None]:
# Number of distinct values in the "Release" column (missing values are not counted).
print("Total number of release dates:", all_set_lists_df["Release"].nunique())

In [None]:
# Per release date: distinct-value count of every other column (missing release dates keep their own group).
all_set_lists_df.groupby("Release", dropna=False).nunique()

In [None]:
# Distinct sets per release date, as a one-column DataFrame indexed by "Release".
# dropna=False keeps a group for rows with no release date, so the index may hold a missing entry.
# Reused by the year/month/day breakdowns below.
set_by_release = pd.DataFrame(all_set_lists_df.groupby("Release", dropna=False)["Set"].nunique())

### By year

In [None]:
# Distinct-value counts per release date, summed up by release year.
release_year = set_by_release.index.strftime("%Y")
distinct_per_release = all_set_lists_df.drop(columns="Modification date").groupby("Release", dropna=False).nunique()
distinct_per_release.groupby(release_year).sum()

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [None]:
# NOTE(review): the papermill banner above marks this cell as the one that raised.
# The cause is not visible here; set_by_release is built with dropna=False, so its
# index may contain a missing release date — confirm whether plot.rate handles that.
plot.rate(set_by_release)

### By month

In [None]:
# Sets released per calendar month, summed over all years.
month_names = set_by_release.index.strftime("%B")
sets_per_month = set_by_release.groupby(month_names).sum()
# Order the bars by month number rather than alphabetically by month name.
sets_per_month = sets_per_month.sort_index(key=lambda names: pd.to_datetime(names, format="%B").month)
sets_per_month.plot.bar(figsize=(16, 8), grid=True, xlabel="Release month", rot=0)
plt.show()

In [None]:
# Sets released per day of the month ("01".."31"), summed over all months and years.
day_of_month = set_by_release.index.strftime("%d")
sets_per_day_of_month = set_by_release.groupby(day_of_month).sum()
sets_per_day_of_month.plot.bar(figsize=(16, 8), grid=True, xlabel="Release day of the month", rot=0)
plt.show()

### By day

In [None]:
# Sets released per day of the year, summed over all years, drawn as a line plot.
sets_per_day_of_year = set_by_release.groupby(set_by_release.index.dayofyear).sum()
sets_per_day_of_year.plot(figsize=(16, 8), grid=True, xlabel="Release day of the year")
plt.show()

In [None]:
# Sets released per day of the week, summed over all dates.
set_by_release_wd = set_by_release.groupby(set_by_release.index.weekday).sum()
# Swap the numeric weekday labels for day names.
weekday_numbers = set_by_release_wd.index.astype(int)
set_by_release_wd.index = [calendar.day_name[number] for number in weekday_numbers]
set_by_release_wd.plot.bar(figsize=(16, 8), grid=True, xlabel="Release day of the week", rot=0)
plt.show()

### By region

In [None]:
# Per (region, release date) pair: distinct-value count of every other column.
# Rows with a missing region or release date are dropped (default dropna=True).
by_release_region = all_set_lists_df.groupby(["Region", "Release"]).nunique()
by_release_region

In [None]:
# Pivot to release dates (rows) x regions (columns); absent pairs count as 0 sets.
region_pivot = by_release_region["Set"].unstack(0).sort_index()
set_by_release_region = region_pivot.fillna(0).astype(int)
# Yearly totals, transposed to regions (rows) x years (columns).
region_release_year = set_by_release_region.index.strftime("%Y")
set_by_release_region_y = set_by_release_region.groupby(region_release_year).sum().transpose()
set_by_release_region_y

In [None]:
plt.figure(figsize=(20, 10))
# Counts that are not positive become NaN before plotting.
positive_region_counts = set_by_release_region_y.where(set_by_release_region_y > 0)
sns.heatmap(positive_region_counts, annot=True, fmt="g", cmap="viridis", square=True)
plt.show()

In [None]:
# NOTE(review): plot.rate_subplots is a yugiquery helper; presumably one release-rate panel per region column — confirm.
plot.rate_subplots(set_by_release_region, title="Set releases")

### By series

In [None]:
# Explode "Series" to one value per row, then count distinct values of every other column
# per (series, release date) pair. Rows with a missing key are dropped (default dropna=True).
by_release_series = all_set_lists_df.explode("Series").groupby(["Series", "Release"]).nunique()
by_release_series

In [None]:
# Pivot to release dates (rows) x series (columns); absent pairs count as 0 sets.
series_pivot = by_release_series["Set"].unstack(0).sort_index()
set_by_release_series = series_pivot.fillna(0).astype(int)
# Yearly totals, transposed to series (rows) x years (columns).
series_release_year = set_by_release_series.index.strftime("%Y")
set_by_release_series_y = set_by_release_series.groupby(series_release_year).sum().transpose()
set_by_release_series_y

In [None]:
plt.figure(figsize=(20, 20))
# Counts that are not positive become NaN before plotting.
positive_series_counts = set_by_release_series_y.where(set_by_release_series_y > 0)
sns.heatmap(positive_series_counts, annot=True, fmt="g", cmap="viridis", square=True)
plt.show()

In [None]:
# NOTE(review): plot.rate_subplots is a yugiquery helper; presumably one release-rate panel per series column — confirm.
plot.rate_subplots(set_by_release_series, title="Set releases")

### By set type

In [None]:
# Explode "Set type" to one value per row, then count distinct values of every other column
# per (set type, release date) pair. Rows with a missing key are dropped (default dropna=True).
by_release_type = all_set_lists_df.explode("Set type").groupby(["Set type", "Release"]).nunique()
by_release_type

In [None]:
# Pivot to release dates (rows) x set types (columns); absent pairs count as 0 sets.
type_pivot = by_release_type["Set"].unstack(0).sort_index()
set_by_release_type = type_pivot.fillna(0).astype(int)
# Yearly totals, transposed to set types (rows) x years (columns).
type_release_year = set_by_release_type.index.strftime("%Y")
set_by_release_type_y = set_by_release_type.groupby(type_release_year).sum().transpose()
set_by_release_type_y

In [None]:
plt.figure(figsize=(20, 16))
# Counts that are not positive become NaN before plotting.
positive_type_counts = set_by_release_type_y.where(set_by_release_type_y > 0)
sns.heatmap(positive_type_counts, annot=True, fmt="g", cmap="viridis", square=True)
plt.show()

In [None]:
# NOTE(review): plot.rate_subplots is a yugiquery helper; presumably one release-rate panel per set-type column — confirm.
plot.rate_subplots(set_by_release_type, title="Set releases")

# Debug

This section is used for debugging the extraction of data from MediaWiki query responses.

## Has nan name/number

In [None]:
# Rows where the card name or the card number (or both) is missing.
missing_name_or_number = all_set_lists_df[["Name", "Card number"]].isna().any(axis=1)
all_set_lists_df[missing_name_or_number]

## Has nan rarity

In [None]:
# Sets that contain at least one row with a missing rarity.
# The frame is exploded once and filtered through a callable, instead of being
# exploded a second time just to build the mask.
all_set_lists_df.explode("Rarity").loc[lambda exploded: exploded["Rarity"].isna(), "Set"].unique()

## Has nan release

In [None]:
# Sets that contain at least one row with a missing release date.
release_missing = all_set_lists_df["Release"].isna()
all_set_lists_df.loc[release_missing, "Set"].unique()

# Epilogue

In [None]:
# NOTE(review): yugiquery helper; presumably records the run time of this "sets" report against the start timestamp — confirm.
benchmark(report="sets", timestamp=timestamp)

In [None]:
# NOTE(review): yugiquery helper; presumably renders the report footer — confirm.
footer()

## HTML export

In [None]:
# Save the notebook before it is exported in the following cell.
# May need to sleep for a few seconds after saving
save_notebook()

In [None]:
# Export the "Sets.ipynb" notebook located under dirs.NOTEBOOKS.user (the section heading says HTML).
export_notebook(dirs.NOTEBOOKS.user / "Sets.ipynb")

## Git

In [None]:
# NOTE(review): presumably commits files matching the glob (e.g. "Sets.*", "sets_*") with a
# message carrying the run timestamp — confirm against yugiquery's git helper.
git.commit("*[Ss]ets[._]*", f"Sets update - {timestamp.isoformat()}")