# Indexing MDF Data in MySQL

A cut down version with all of the database bits moved into a separate file.

In [1]:
from mdf_models import *

import os
import random
import fsspec
mdf_paths=list()

s3_cfg = {
    "key": "mdf_minio_access_key",
    "secret": "mdf_minio_secret_key",
    "client_kwargs": {
        "endpoint_url": "http://minio:9000",
    },
}

fs = fsspec.filesystem("s3", **s3_cfg)
for bucket in fs.ls(""):
    for root, dirs, files in fs.walk(bucket):
        for file in files:
            if file.lower().endswith(".mf4") or file.lower().endswith(".mdf"):
                mdf_paths.append(os.path.join(root, file))
print(f"Found {len(mdf_paths)} MDF files")

Found 975 MDF files


In [2]:
from multiprocessing import Pool

In [3]:
def index_mdf(mdf_path):
    """ Index the mdf file itself. """
    info = fs.info(mdf_path)
    # Local File
    MDF_ = upsert(
        cls=MDF,
        get={"key": info["Key"]},
        set={
            "last_modified": info["LastModified"],
            "etag": info["ETag"],
            "size": info["size"],
            "size_mb": info["size"] / 1024 ** 2,
            "storage_class": info["StorageClass"],
            "type": info["type"],
            "name": info["name"],
            "basename": os.path.basename(info["name"]),
        },
    )
    
    return MDF_
    
def index_channels(mdf):
    """Given a MDF files, process the channels
    
    """
    name = mdf.name
    # Open the MDF file.
    with fs.open(mdf.name, "rb") as fid:
        mdf_ = asammdf.MDF(fid)
    #
    channels = list()
    # Loop through each of the channels in the database.
    for channel in mdf_.channels_db.keys():
        channel_ = upsert(Channel, {"name": channel})
        channels.append(channel_)
    MDF_ = upsert(cls=MDF, get={"name": name}, set={"channels": channels},)

def index_mdf_info(mdf):
    name = mdf.name
    
    """ Index company and product information in the database from the filename."""
    product = os.path.basename(os.path.dirname(name))
    company = os.path.basename(os.path.dirname(os.path.dirname(name)))
    # Local File
    MDF_ = upsert(
        cls=MDF,
        get={"name": name},
        set={"product": product, "company": company,},
    )

Index the first 8 MDFs:

In [None]:
p=Pool(8)
p.map(index_mdf, mdf_paths[:8]);

Exception in thread Thread-6:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/pool.py", line 470, in _handle_results
    task = get()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 251, in recv
    return _ForkingPickler.loads(buf.getbuffer())
  File "/opt/conda/lib/python3.7/site-packages/pony/orm/core.py", line 4594, in unpickle_entity
    cache = entity._database_._get_cache()
  File "/opt/conda/lib/python3.7/site-packages/pony/orm/core.py", line 855, in _get_cache
    ): throw(TransactionError, 'db_session is required when working with the database')
  File "/opt/conda/lib/python3.7/site-packages/pony/utils/utils.py", line 108, in throw
    raise exc  # Set "pony.options.CUT_TRACEBACK = False" to see full traceback
pony.orm

Index all of the MDFs:

In [None]:
p=Pool(8)
p.map(index_mdf, mdf_paths);

In [None]:
mdf