In [None]:
# Star import: no other import is visible in this notebook, so the helpers used
# below (fetch_*, merge_set_info, generate_changelog, rate_plot, ...) and,
# presumably, the pd / plt / sns / glob / os / calendar / literal_eval names
# all come from here -- TODO confirm against the yugiquery module.
from yugiquery import *

# yugiquery helper called with the notebook name; presumably renders the
# title block -- verify against its definition.
header("Sets")

---

Table of Contents
=================

*   [1  Data acquisition](#Data-acquisition)
    *   [1.1  Fetch set lists](#Fetch-set-lists)
    *   [1.2  Fetch set properties](#Fetch-set-properties)
    *   [1.3  Merge data](#Merge-data)
    *   [1.4  Save data](#Save-data)
*   [3  Check changes](#Check-changes)
    *   [3.1  Load previous data](#Load-previous-data)
    *   [3.2  Generate changelog](#Generate-changelog)
*   [4  Data visualization](#Data-visualization)
    *   [4.1  Full data](#Full-data)
    *   [4.2  Set](#Set)
    *   [4.3  Card number](#Card-number)
    *   [4.4  Name](#Name)
    *   [4.5  Rarity](#Rarity)
        *   [4.5.1  By unique](#By-unique)
        *   [4.5.2  By combination](#By-combination)
    *   [4.6  Print](#Print)
    *   [4.7  Quantity](#Quantity)
    *   [4.8  Region](#Region)
    *   [4.9  Cover card](#Cover-card)
    *   [4.10  Series](#Series)
    *   [4.11  Set type](#Set-type)
    *   [4.12  Release](#Release)
        *   [4.12.1  By year](#By-year)
        *   [4.12.2  By month](#By-month)
        *   [4.12.3  By day](#By-day)
        *   [4.12.4  By region](#By-region)
        *   [4.12.5  By series](#By-series)
        *   [4.12.6  By set type](#By-set-type)
*   [5  Debug](#Debug)
    *   [5.1  Has nan name/number](#Has-nan-name/number)
    *   [5.2  Has nan rarity](#Has-nan-rarity)
    *   [5.3  Has quantity as print](#Has-quantity-as-print)
    *   [5.4  Has nan release](#Has-nan-release)
    *   [5.5  Has print as rarity](#Has-print-as-rarity)
    *   [5.6  Merge failed](#Merge-failed)
*   [6  Epilogue](#Epilogue)
    *   [6.1  HTML export](#HTML-export)
<!-- *   [6.2  Git](#Git) -->

# Data acquisition

## Fetch set lists

In [None]:
# Timestamp
# Taken before any fetching starts: it is later embedded in the saved data and
# changelog file names, in the git commit message, and passed to benchmark().
timestamp = pd.Timestamp.now()

# Card lists of all sets, obtained through the yugiquery helper; the later
# cells read the "Set", "Card number", "Name", "Rarity", "Print", "Quantity"
# and "Region" columns from this frame.
all_set_lists_df = fetch_all_set_lists()

## Fetch set properties

In [None]:
# Distinct set names found in the set lists; properties are requested once per set
sets = all_set_lists_df["Set"].unique()
# Per-set properties from the yugiquery helper (a "Set type" column is read
# from this frame further down)
set_info_df = fetch_set_info(sets)

## Merge data

In [None]:
# Add properties to set lists
# NOTE: this rebinds all_set_lists_df, so re-running the cell without first
# re-running the fetch cells merges into an already merged frame.
all_set_lists_df = merge_set_info(all_set_lists_df, set_info_df)

# Check changes

## Load previous data

In [None]:
# Collect the previously saved set lists, newest first.
# Every file name ends in an ISO timestamp (parsed below into previous_ts), so
# a reverse sort on the name is chronological. File-system times are not used:
# they are reset whenever the files are copied or checked out again, which
# would make the "latest" file arbitrary.
files_list = sorted(
    glob.glob("../data/all_sets_*.bz2"), key=os.path.basename, reverse=True
)
# Load the newest saved file, if any. When the cells run in order, the data of
# the current run has not been written yet, so this file is the previous data.
if files_list:
    latest_file = files_list[0]
    # Load csv avoiding converting "NA" to NaN; every column is read as object
    # and only empty fields count as missing
    previous_df = pd.read_csv(
        latest_file, dtype=object, keep_default_na=False, na_values=[""]
    )
    # Correct tuples: parse the text of these columns back into Python
    # literals, leaving missing values untouched
    tuple_cols = ["Rarity", "Cover card"]
    for col in tuple_cols:
        previous_df[col] = previous_df[col].dropna().apply(literal_eval)
    # Force dtypes to match current df, restricted to the columns both frames share
    previous_df = previous_df.astype(
        all_set_lists_df[
            previous_df.columns.intersection(all_set_lists_df.columns)
        ].dtypes.to_dict()
    )
    # Timestamp of the loaded file, taken from the last "_"-separated part of its name
    previous_ts = pd.to_datetime(
        os.path.basename(latest_file).split("_")[-1].split(".bz2")[0]
    )
    print("File loaded")
else:
    # Nothing to compare against; the changelog cell checks for this None
    previous_df = None
    print("No older files")

## Generate changelog

In [None]:
# Compare the previous data with the current one, keyed on the card number,
# and persist the differences when there are any.
if previous_df is not None:
    changelog = generate_changelog(previous_df, all_set_lists_df, col="Card number")
    has_changes = not changelog.empty
    if has_changes:
        display(changelog)
        # File name carries both the previous and the current timestamp
        changelog_path = f'../data/sets_changelog_{previous_ts.isoformat(timespec="minutes")}_{timestamp.isoformat(timespec="minutes")}.bz2'
        changelog.to_csv(changelog_path, index=True)
        print("Changelog saved")
else:
    # No earlier file was loaded, so there is nothing to diff
    changelog = None
    print("Skipped")

## Save data

In [None]:
# Write the current data unless a changelog was computed and turned out empty
unchanged = changelog is not None and changelog.empty
if not unchanged:
    data_path = f'../data/all_sets_{timestamp.isoformat(timespec="minutes")}.bz2'
    all_set_lists_df.to_csv(data_path, index=False)
    print("Data saved")
else:
    print("No changes. New data not saved")

# Data visualization

## Full data

In [None]:
# Show the merged set lists as this cell's output
all_set_lists_df

Full data available [here](../data)

## Set

In [None]:
# Number of distinct set names in the merged table
n_sets = all_set_lists_df["Set"].nunique()
print("Total number of sets:", n_sets)

In [None]:
# Distinct values of every other column per set; rows lacking a set form their own group
grouped_by_set = all_set_lists_df.groupby("Set", dropna=False)
grouped_by_set.nunique()

In [None]:
# Summary statistics of the per-set distinct-value counts
distinct_per_set = all_set_lists_df.groupby("Set", dropna=False).nunique()
distinct_per_set.describe()

## Card number

In [None]:
# Number of distinct card numbers in the merged table
n_card_numbers = all_set_lists_df["Card number"].nunique()
print("Total number of card numbers:", n_card_numbers)

In [None]:
# Distinct values of every other column per card number (missing numbers kept as a group)
grouped_by_number = all_set_lists_df.groupby("Card number", dropna=False)
grouped_by_number.nunique()

## Name

In [None]:
# Number of distinct card names in the merged table
n_card_names = all_set_lists_df["Name"].nunique()
print("Total number of card names:", n_card_names)

In [None]:
# Distinct values of every other column per card name (missing names kept as a group)
grouped_by_name = all_set_lists_df.groupby("Name", dropna=False)
grouped_by_name.nunique()

In [None]:
# Summary statistics of the per-name distinct-value counts; rows with a
# missing name are left out of the grouping here
distinct_per_name = all_set_lists_df.groupby("Name", dropna=True).nunique()
distinct_per_name.describe()

## Rarity

In [None]:
# "Rarity" holds a collection per row; flatten it before counting distinct values
individual_rarities = all_set_lists_df["Rarity"].explode()
print("Total number of rarities:", individual_rarities.nunique())

### By unique

In [None]:
# One row per individual rarity, then distinct values of the other columns per rarity
one_rarity_per_row = all_set_lists_df.explode("Rarity")
one_rarity_per_row.groupby("Rarity", dropna=False).nunique()

In [None]:
# Occurrences of each individual rarity, drawn as horizontal bars on a log axis
rarity_counts = all_set_lists_df["Rarity"].explode().value_counts()
rarity_counts.plot.barh(figsize=(10, 20), grid=True)
plt.xscale("log")
plt.show()

### By combination

In [None]:
# Group on the whole "Rarity" entry (the combination), not on its individual members
grouped_by_rarity_combo = all_set_lists_df.groupby("Rarity", dropna=False)
grouped_by_rarity_combo.nunique()

In [None]:
# Occurrences of each rarity combination, drawn as horizontal bars on a log axis
rarity_combo_counts = all_set_lists_df["Rarity"].value_counts()
rarity_combo_counts.plot.barh(figsize=(10, 40), grid=True)
plt.xscale("log")
plt.show()

## Print

In [None]:
# Number of distinct "Print" values in the merged table
n_prints = all_set_lists_df["Print"].nunique()
print("Total number of prints:", n_prints)

In [None]:
# Distinct values of every other column per print (missing prints kept as a group)
grouped_by_print = all_set_lists_df.groupby("Print", dropna=False)
grouped_by_print.nunique()

In [None]:
# Occurrences of each print value, drawn as vertical bars on a log axis
print_counts = all_set_lists_df["Print"].value_counts()
print_counts.plot.bar(figsize=(18, 6), grid=True, rot=45)
plt.yscale("log")
plt.show()

## Quantity

In [None]:
# Number of distinct "Quantity" values in the merged table
n_quantities = all_set_lists_df["Quantity"].nunique()
print("Total number of quantities:", n_quantities)

In [None]:
# Distinct values of every other column per quantity (missing quantities kept as a group)
grouped_by_quantity = all_set_lists_df.groupby("Quantity", dropna=False)
grouped_by_quantity.nunique()

## Region

In [None]:
# Number of distinct regions in the merged table
n_regions = all_set_lists_df["Region"].nunique()
print("Total number of regions:", n_regions)

In [None]:
# Distinct values of every other column per region (missing regions kept as a group)
grouped_by_region = all_set_lists_df.groupby("Region", dropna=False)
grouped_by_region.nunique()

In [None]:
# Summary statistics of the per-region distinct-value counts
distinct_per_region = all_set_lists_df.groupby("Region", dropna=False).nunique()
distinct_per_region.describe()

In [None]:
# Number of rows listed for each region, drawn as vertical bars
region_counts = all_set_lists_df["Region"].value_counts()
region_counts.plot.bar(figsize=(18, 6), grid=True, rot=0)
plt.show()

## Cover card

In [None]:
# "Cover card" holds a collection per row; flatten it before counting distinct cards.
# Series.explode() takes no column argument: the original passed "Cover card",
# which landed in the ignore_index parameter and only worked because a
# non-empty string is truthy.
cover_cards = all_set_lists_df["Cover card"].explode()
print(
    "Total number of cover cards:",
    cover_cards.nunique(),
)

In [None]:
# One row per individual cover card, then distinct values of the other columns per card
one_cover_per_row = all_set_lists_df.explode("Cover card")
one_cover_per_row.groupby("Cover card").nunique()

In [None]:
# Occurrences of each individual cover card; the figure is very tall to fit one bar per card
cover_card_counts = all_set_lists_df["Cover card"].explode().value_counts()
cover_card_counts.plot.barh(figsize=(10, 180), grid=True)
plt.show()

## Series

In [None]:
# Number of distinct series in the merged table
n_series = all_set_lists_df["Series"].nunique()
print("Total number of series:", n_series)

In [None]:
# Distinct values of every other column per series
grouped_by_series = all_set_lists_df.groupby("Series")
grouped_by_series.nunique()

In [None]:
# Number of rows listed for each series, drawn as horizontal bars
series_counts = all_set_lists_df["Series"].value_counts()
series_counts.plot.barh(figsize=(10, 15), grid=True)
plt.show()

## Set type

In [None]:
# Number of distinct set types in the merged table
n_set_types = all_set_lists_df["Set type"].nunique()
print("Total number of set types:", n_set_types)

In [None]:
# Distinct values of every other column per set type
grouped_by_set_type = all_set_lists_df.groupby("Set type")
grouped_by_set_type.nunique()

In [None]:
# Counts come from the set-properties frame (set_info_df), not from the merged lists
set_type_counts = set_info_df["Set type"].value_counts()
set_type_counts.plot.barh(figsize=(10, 10), grid=True)
plt.show()

## Release

In [None]:
# Number of distinct release dates in the merged table
n_release_dates = all_set_lists_df["Release"].nunique()
print("Total number of release dates:", n_release_dates)

In [None]:
# Distinct values of every other column per release date (missing dates kept as a group)
grouped_by_release = all_set_lists_df.groupby("Release", dropna=False)
grouped_by_release.nunique()

In [None]:
# Distinct sets per release date as a one-column frame; reused by the
# year / month / day breakdowns below
sets_per_release_date = all_set_lists_df.groupby("Release", dropna=False)["Set"].nunique()
set_by_release = pd.DataFrame(sets_per_release_date)

### By year

In [None]:
# Per-release distinct counts of every column except "Modification date",
# summed over the release year taken from set_by_release's index
distinct_per_release = (
    all_set_lists_df.drop(columns="Modification date")
    .groupby("Release", dropna=False)
    .nunique()
)
release_year = set_by_release.index.strftime("%Y")
distinct_per_release.groupby(release_year).sum()

In [None]:
# rate_plot is a yugiquery helper; it receives the sets-per-release-date frame.
# Presumably draws the release rate over time -- TODO confirm.
rate_plot(set_by_release)

### By month

In [None]:
def _month_number(month_names):
    """Translate full month names into their month number for sorting."""
    return pd.to_datetime(month_names, format="%B").month


# Sets per release month (all years pooled), ordered by month number rather
# than alphabetically
release_month_name = set_by_release.index.strftime("%B")
sets_per_month = set_by_release.groupby(release_month_name).sum()
sets_per_month = sets_per_month.sort_index(key=_month_number)
sets_per_month.plot.bar(figsize=(16, 8), grid=True, xlabel="Release month", rot=0)
plt.show()

In [None]:
# Sets per release day of the month (all months and years pooled)
release_day_of_month = set_by_release.index.strftime("%d")
sets_per_day_of_month = set_by_release.groupby(release_day_of_month).sum()
sets_per_day_of_month.plot.bar(
    figsize=(16, 8), grid=True, xlabel="Release day of the month", rot=0
)
plt.show()

### By day

In [None]:
# Sets per release day of the year (all years pooled), drawn as a line
release_day_of_year = set_by_release.index.dayofyear
sets_per_day_of_year = set_by_release.groupby(release_day_of_year).sum()
sets_per_day_of_year.plot(figsize=(16, 8), grid=True, xlabel="Release day of the year")
plt.show()

In [None]:
# Sets per release day of the week
release_weekday = set_by_release.index.weekday
set_by_release_wd = set_by_release.groupby(release_weekday).sum()
# Swap the numeric weekday index for day names; the index is cast to int
# first so it can be used to subscript calendar.day_name
weekday_numbers = set_by_release_wd.index.astype(int)
set_by_release_wd.index = [calendar.day_name[number] for number in weekday_numbers]
set_by_release_wd.plot.bar(
    figsize=(16, 8), grid=True, xlabel="Release day of the week", rot=0
)
plt.show()

### By region

In [None]:
# Distinct values of every other column for each (region, release date) pair
region_release_keys = ["Region", "Release"]
by_release_region = all_set_lists_df.groupby(by=region_release_keys).nunique()
by_release_region

In [None]:
# Release dates as rows, regions as columns, number of distinct sets as values
region_release_table = by_release_region["Set"].unstack(level=0).sort_index()
set_by_release_region = region_release_table.fillna(0).astype(int)
# Yearly totals, transposed so that regions are rows and years are columns
release_years_region = set_by_release_region.index.strftime("%Y")
set_by_release_region_y = (
    set_by_release_region.groupby(release_years_region).sum().transpose()
)
set_by_release_region_y

In [None]:
# Selecting with a boolean frame turns every non-positive count into NaN
nonzero_region_year = set_by_release_region_y[set_by_release_region_y > 0]
plt.figure(figsize=(20, 10))
sns.heatmap(nonzero_region_year, annot=True, fmt="g", cmap="viridis", square=True)
plt.show()

In [None]:
# rate_subplots is a yugiquery helper; it receives the per-date table with one
# column per region. Presumably draws one rate plot per column -- TODO confirm.
rate_subplots(set_by_release_region, title="Set releases")

### By series

In [None]:
# Flatten "Series", then distinct values of every other column for each
# (series, release date) pair
one_series_per_row = all_set_lists_df.explode("Series")
by_release_series = one_series_per_row.groupby(by=["Series", "Release"]).nunique()
by_release_series

In [None]:
# Release dates as rows, series as columns, number of distinct sets as values
series_release_table = by_release_series["Set"].unstack(level=0).sort_index()
set_by_release_series = series_release_table.fillna(0).astype(int)
# Yearly totals, transposed so that series are rows and years are columns
release_years_series = set_by_release_series.index.strftime("%Y")
set_by_release_series_y = (
    set_by_release_series.groupby(release_years_series).sum().transpose()
)
set_by_release_series_y

In [None]:
# Selecting with a boolean frame turns every non-positive count into NaN
nonzero_series_year = set_by_release_series_y[set_by_release_series_y > 0]
plt.figure(figsize=(20, 16))
sns.heatmap(nonzero_series_year, annot=True, fmt="g", cmap="viridis", square=True)
plt.show()

In [None]:
# rate_subplots is a yugiquery helper; it receives the per-date table with one
# column per series. Presumably draws one rate plot per column -- TODO confirm.
rate_subplots(set_by_release_series, title="Set releases")

### By set type

In [None]:
# Flatten "Set type", then distinct values of every other column for each
# (set type, release date) pair
one_type_per_row = all_set_lists_df.explode("Set type")
by_release_type = one_type_per_row.groupby(by=["Set type", "Release"]).nunique()
by_release_type

In [None]:
# Release dates as rows, set types as columns, number of distinct sets as values
type_release_table = by_release_type["Set"].unstack(level=0).sort_index()
set_by_release_type = type_release_table.fillna(0).astype(int)
# Yearly totals, transposed so that set types are rows and years are columns
release_years_type = set_by_release_type.index.strftime("%Y")
set_by_release_type_y = (
    set_by_release_type.groupby(release_years_type).sum().transpose()
)
set_by_release_type_y

In [None]:
# Selecting with a boolean frame turns every non-positive count into NaN
nonzero_type_year = set_by_release_type_y[set_by_release_type_y > 0]
plt.figure(figsize=(20, 16))
sns.heatmap(nonzero_type_year, annot=True, fmt="g", cmap="viridis", square=True)
plt.show()

In [None]:
# rate_subplots is a yugiquery helper; it receives the per-date table with one
# column per set type. Presumably draws one rate plot per column -- TODO confirm.
rate_subplots(set_by_release_type, title="Set releases")

# Debug

This section is used for debugging the extraction of data from MediaWiki query responses.

## Has nan name/number

In [None]:
# Rows where the card name or the card number is missing
name_missing = all_set_lists_df["Name"].isna()
number_missing = all_set_lists_df["Card number"].isna()
all_set_lists_df[name_missing | number_missing]

## Has nan rarity

In [None]:
# Sets containing at least one row whose rarity is missing after flattening.
# The frame is exploded once and reused; the original exploded it twice
# (once for the rows, once for the mask), doing the same work again.
rarity_exploded = all_set_lists_df.explode("Rarity")
rarity_missing = rarity_exploded["Rarity"].isna()
rarity_exploded.loc[rarity_missing, "Set"].unique()

## Has nan release

In [None]:
# Sets containing at least one row without a release date
release_missing = all_set_lists_df["Release"].isna()
all_set_lists_df.loc[release_missing, "Set"].unique()

# Epilogue

In [None]:
# Hands the timestamp taken at the start of the run to yugiquery's benchmark
# helper under the "sets" label; presumably records the elapsed time -- TODO confirm.
benchmark("sets", timestamp)

In [None]:
# yugiquery helper; presumably renders the closing block of the report -- TODO confirm.
footer()

## HTML export

In [None]:
# Save the notebook before the export cell below reads Sets.ipynb.
# May need to sleep for a few seconds after saving
save_notebook()

In [None]:
# Convert Sets.ipynb to HTML in the parent directory, removing cells tagged
# 'exclude' and leaving out code inputs and the input/output prompts.
! jupyter nbconvert Sets.ipynb --output-dir='../' --to=HTML --TagRemovePreprocessor.enabled=True --TagRemovePreprocessor.remove_cell_tags='exclude' --TemplateExporter.exclude_input=True --TemplateExporter.exclude_input_prompt=True --TemplateExporter.exclude_output_prompt=True

## Git

In [None]:
# Stage the paths under the parent directory matching "Sets"/"sets" followed
# by "." or "_"; the pattern is quoted so git, not the shell, expands it.
! git add "../*[Ss]ets[._]*"

In [None]:
# Commit with a message built in Python ({...} is interpolated by IPython):
# 'Sets update-' followed by the run timestamp in ISO format.
! git commit -m {"'Sets update-" + timestamp.isoformat() + "'"}