# All Printings Table

## Introduction

The purpose of this notebook is to process and upload all card data from MTGJSON into the postgresql database mtg_db. This is done through the following steps:
- Download the json file from MTGJSON's file server
- Check the version and date of the json file
- Pre-process the dictionary and convert it into a dataframe
- Push the keywords dataframe to the database "raw_data" schema

## Python Libraries

In [28]:
import json, requests, gzip, io, re
import numpy      as     np
import pandas     as     pd
from   tqdm       import tqdm
from   contextlib import redirect_stdout
from   sqlalchemy import create_engine, Table, Column, MetaData, Text, Date, text
from   sqlalchemy.dialects.postgresql import insert

## Functions

In [42]:
# Function for showing the hierarchy of a dictionary or the schema of a single level
def print_dict_structure(data, max_depth=None, _indent=0):

    """
    Recursively prints the hierarchical structure of a dictionary or list,
    including the length of each element where applicable.
    
    If max_depth=1, returns a DataFrame with columns: KEY_NAME, DATA_TYPE, LENGTH.
    
    Args:
        data: The dictionary or list to explore.
        max_depth: Limit how deep to traverse (None for full depth).
    
    Returns:
        pd.DataFrame if max_depth=1, otherwise None (prints output).
    """
    
    # Check if we are at the top level and max_depth=1 to return DataFrame instead of printing
    if max_depth == 1 and _indent == 0:
        # Initialize list to collect rows for the DataFrame
        rows = []

        # If data is a dictionary, iterate over its keys and values
        if isinstance(data, dict):
            for key, value in data.items():
                # Determine length if possible, otherwise set to 0
                length = len(value) if hasattr(value, "__len__") and not isinstance(value, (str, bytes)) else 0
                # Append a tuple of key name, data type, and length to rows
                rows.append((key, type(value).__name__, length))

        # If data is a list, take the first element (assumed dict) and do the same
        elif isinstance(data, list) and data:
            for key, value in data[0].items():
                # Determine length if possible, otherwise set to 0
                length = len(value) if hasattr(value, "__len__") and not isinstance(value, (str, bytes)) else 0
                # Append a tuple of key name, data type, and length to rows
                rows.append((key, type(value).__name__, length))

        # Convert collected rows into a DataFrame with specific column names
        return pd.DataFrame(rows, columns=["KEY_NAME", "DATA_TYPE", "LENGTH"])
    
    # Create a prefix for indentation when printing nested structures
    prefix = "  " * _indent

    # If data is a dictionary, iterate recursively
    if isinstance(data, dict):
        for key, value in data.items():
            # Prepare a string showing type and length for printing
            length_info = f", len={len(value)}" if hasattr(value, "__len__") and not isinstance(value, (str, bytes)) else ""
            # Print the key name, type, and length with proper indentation
            print(f"{prefix}{key} ({type(value).__name__}{length_info})")
            # Recurse into value if max_depth is not reached
            if max_depth is None or _indent < max_depth - 1:
                print_dict_structure(value, max_depth, _indent + 1)

    # If data is a list, recurse into the first element (assuming homogeneous elements)
    elif isinstance(data, list):
        if data and (max_depth is None or _indent < max_depth - 1):
            print_dict_structure(data[0], max_depth, _indent + 1)

    # For non-dict and non-list elements, print their type and length
    else:
        # Prepare a string showing type and length for printing
        length_info = f", len={len(data)}" if hasattr(data, "__len__") and not isinstance(data, (str, bytes)) else ""
        # Print the type and length with proper indentation
        print(f"{prefix}{type(data).__name__}{length_info}")

In [2]:
# Function for showing the data and version of the MTGJSON data
def data_recency_check(data, json_type):

    """
    Extract and display the version and date metadata from an MTGJSON dataset,
    and return this information as a DataFrame along with the JSON type.

    Parameters
    ----------
    data : dict
        MTGJSON data loaded from a JSON file, expected to contain a 'meta' key
        with 'date' and 'version' fields.

    json_type : str
        A string indicating the type or name of the JSON dataset being processed.
        This will be included in the output DataFrame.

    Returns
    -------
    pd.DataFrame
        A DataFrame with a single row and columns:
        - 'json_type': The provided JSON dataset type/name.
        - 'latest_date': The date the MTGJSON data was last updated.
        - 'latest_version': The MTGJSON model version.
    """

    # Create a DataFrame for the output
    df = pd.DataFrame({'json_type'      : [json_type]
                      ,'latest_date'    : [data['meta']['date']]
                      ,'latest_version' : [data['meta']['version']]})

    # Returning the values directly
    return(df)

In [3]:
# Function for uploading the recency check

def recency_check_upload(schema_name, table_name, dataframe):
    
    """
    Uploads recency check data from a Pandas DataFrame into a PostgreSQL table 
    with upsert (insert or update) logic.

    Each row from the DataFrame is inserted into the target table. If a row with the 
    same `json_type` (primary key) already exists, the corresponding `latest_date` 
    and `latest_version` values are updated instead.

    Parameters
    ----------
    schema_name : str
        Name of the PostgreSQL schema where the table resides.
    table_name : str
        Name of the PostgreSQL table to update or insert into.
    dataframe : pandas.DataFrame
        DataFrame containing the recency check data with columns:
        - "json_type" (str): Identifier for the JSON file type.
        - "latest_date" (datetime.date): Date of the latest file.
        - "latest_version" (str): Version string of the latest file.

    Notes
    -----
    - Requires a global SQLAlchemy `engine` object to be defined.
    - Uses PostgreSQL's ON CONFLICT clause for upsert behavior.
    """

    # Create a MetaData object
    metadata = MetaData(schema=schema_name)
    
    # Define the Table object matching your PostgreSQL table
    json_recency_table = Table(table_name
                              ,metadata
                              ,Column("json_type" ,Text ,primary_key = True)
                              ,Column("latest_date" ,Date)
                              ,Column("latest_version" ,Text))
    
    # Upsert each row from your DataFrame
    with engine.begin() as conn:
        
        # Iterate through rows of the DataFrame
        for _, row in dataframe.iterrows():
            
            # Create an insert statement for the current row
            stmt = insert(json_recency_table).values(json_type      = row["json_type"]
                                                    ,latest_date    = row["latest_date"]
                                                    ,latest_version = row["latest_version"])
            
            # Add upsert logic to update on conflict
            stmt = stmt.on_conflict_do_update(index_elements = ["json_type"]
                                             ,set_           = {"latest_date"    : row["latest_date"]
                                                               ,"latest_version" : row["latest_version"]})
            
            # Execute the statement
            conn.execute(stmt)

## Input

### All printings Schema

| Column           | Renamed   | Dataype | Description                                              |
| ---              | ---       | ---     | ---                                                      |
| data             | set       | STRING  | A list of ability words found in rules text on cards     |
| keywordAbilities | keywords  | STRING  | A list of keyword abilities found in rules text on cards |
| keywordActions   | actions   | STRING  | A list of keyword actiona found in rules text on cards   |

In [None]:
# URL for the MTGJSON file (example: AllPrintings)
url = "https://mtgjson.com/api/v5/AllPrintings.json.gz"

# Stream download the file to track progress
response = requests.get(url, stream=True)
response.raise_for_status()

# Prepare to track total size and read in chunks
total_size = int(response.headers.get('content-length', 0))  # total bytes, may be None
chunk_size = 1024 * 1024  # 1 MB per chunk
compressed_data = bytearray()  # store the downloaded bytes

# Iterate over response chunks, updating progress bar
with tqdm(total=total_size, unit='B', unit_scale=True, desc="Downloading") as pbar:
    for chunk in response.iter_content(chunk_size=chunk_size):
        if chunk:  # filter out keep-alive chunks
            compressed_data.extend(chunk)
            pbar.update(len(chunk))

# Decompress the downloaded data and parse JSON into a dictionary
dict__all_printings = json.loads(gzip.decompress(compressed_data))

Downloading: 100%|██████████| 128M/128M [00:03<00:00, 39.8MB/s] 


In [5]:
## Setting up credentials for accessing postgresql "mtg_db" database

# Credentials for setting up connection to postgresql
user     = "postgres"
password = "as:123bpostgresql"
host     = "localhost"
port     = "5432"
database = "mtg_db"

# Engine connection to postgresql
engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}")

In [6]:
## Creating the empty data_recency table if not exists
query = "CREATE TABLE IF NOT EXISTS raw_data.data_recency ("\
        " json_type      TEXT PRIMARY KEY"\
        ",latest_date    DATE"\
        ",latest_version TEXT);"
with engine.begin() as conn:
    conn.execute(text(query))

## Pre-processing

In [9]:
# Checking the latest version of the input data
df__data_recency = data_recency_check(dict__all_printings, 'all printings')

## Main Code

In [16]:
dict__all_printings['data'].keys()

dict_keys(['10E', '2ED', '2X2', '2XM', '30A', '3ED', '40K', '4BB', '4ED', '5DN', '5ED', '6ED', '7ED', '8ED', '9ED', 'A25', 'AA1', 'AA2', 'AACR', 'AAFR', 'ABLB', 'ABRO', 'ACLB', 'ACMM', 'ACR', 'ADFT', 'ADMU', 'ADSK', 'AEOE', 'AER', 'AFC', 'AFDN', 'AFIN', 'AFR', 'AINR', 'AJMP', 'AKH', 'AKHM', 'AKR', 'ALA', 'ALCI', 'ALL', 'ALTC', 'ALTR', 'AMH1', 'AMH2', 'AMH3', 'AMID', 'AMKM', 'AMOM', 'ANA', 'ANB', 'ANEO', 'AONE', 'AOTJ', 'APC', 'ARB', 'ARC', 'ARN', 'ASNC', 'ASTX', 'ATDM', 'ATH', 'ATQ', 'AVOW', 'AVR', 'AWOE', 'AZNR', 'BBD', 'BCHR', 'BFZ', 'BIG', 'BLB', 'BLC', 'BNG', 'BOK', 'BOT', 'BRB', 'BRC', 'BRO', 'BRR', 'BTD', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'CC1', 'CC2', 'CED', 'CEI', 'CHK', 'CHR', 'CLB', 'CLU', 'CM1', 'CM2', 'CMA', 'CMB1', 'CMB2', 'CMD', 'CMM', 'CMR', 'CN2', 'CNS', 'CON', 'CP1', 'CP2', 'CP3', 'CSP', 'CST', 'DBL', 'DCI', 'DD1', 'DD2', 'DD3', 'DDC', 'DDD', 'DDE', 'DDF', 'DDG', 'DDH', 'DDI', 'DDJ', 'DDK', 'DDL', 'DDM', 'DDN', 'DDO', 'DDP', 'DDQ', 'DDR', '

In [44]:
print_dict_structure(dict__all_printings['data']['10E'], max_depth=1)

Unnamed: 0,KEY_NAME,DATA_TYPE,LENGTH
0,baseSetSize,int,0
1,block,str,0
2,booster,dict,1
3,cards,list,510
4,cardsphereSetId,int,0
5,code,str,0
6,decks,list,17
7,isFoilOnly,bool,0
8,isOnlineOnly,bool,0
9,keyruneCode,str,0


In [55]:
dict__all_printings['data']['10E']['booster']['draft']['name']

'Tenth Edition Draft Booster'

In [None]:
## Converting the dictionary to a dataframe, renaming the columns and making empty values empty strings

# Converting the json dictionary to a dataframe
df__keywords = pd.DataFrame.from_dict(dict__all_printings['data']
                                     # The columns are different lengths
                                     ,orient = 'index').transpose()

# Renaming the columns
df__keywords.columns = ['abilities'
                       ,'keywords'
                       ,'actions']

# Sort each column independently, pushing NaNs and empty strings to the bottom
df__keywords = df__keywords.apply(lambda col: col.replace('', np.nan)             # Treat empty strings as NaN
                                                 .sort_values(na_position='last') # Sort values
                                                 .fillna('')                      # Put empty strings back if desired
                                                 .values)                         # Reset index

## Output

In [10]:
recency_check_upload(schema_name = "raw_data"
                    ,table_name  = "data_recency"
                    ,dataframe   = df__data_recency)

In [11]:
# Uploading the keywords dataframe to postgresql
df__keywords.to_sql(name      = 'keywords'
                   ,con       = engine
                   ,schema    = 'raw_data'
                   ,if_exists = 'replace'
                   ,index     = False)

212

## Checks

In [17]:
# Check the json file date and version
query = """
        SELECT *
        FROM raw_data.data_recency
        WHERE json_type = 'keyword'
        """
pd.read_sql_query(query, con=engine)

Unnamed: 0,json_type,latest_date,latest_version
0,keyword,2025-08-19,5.2.2+20250819


In [18]:
# Check the dataframe top 10 values
query = """
        SELECT *
        FROM raw_data.keywords
        LIMIT 10
        """
pd.read_sql_query(query, con=engine)

Unnamed: 0,abilities,keywords,actions
0,Adamant,Absorb,Abandon
1,Addendum,Affinity,Activate
2,Alliance,Afflict,Adapt
3,Battalion,Afterlife,Amass
4,Bloodrush,Aftermath,Assemble
5,Celebration,Amplify,Attach
6,Channel,Annihilator,Behold
7,Chroma,Ascend,Bolster
8,Cohort,Assist,Cast
9,Constellation,Augment,Clash
