# Exploration of WZ files

In [14]:
# Root directory holding the exported WZ archives (relative path).
wzpath = "../../wzexports"

In [15]:
from pathlib import Path

# List every top-level entry of the export directory.
list(Path(wzpath).glob("*"))

[WindowsPath('../../wzexports/Base.wz'),
 WindowsPath('../../wzexports/Character.wz'),
 WindowsPath('../../wzexports/Effect.wz'),
 WindowsPath('../../wzexports/Etc.wz'),
 WindowsPath('../../wzexports/Item.wz'),
 WindowsPath('../../wzexports/Map.wz'),
 WindowsPath('../../wzexports/Mob.wz'),
 WindowsPath('../../wzexports/Morph.wz'),
 WindowsPath('../../wzexports/Npc.wz'),
 WindowsPath('../../wzexports/Quest.wz'),
 WindowsPath('../../wzexports/Reactor.wz'),
 WindowsPath('../../wzexports/Skill.wz'),
 WindowsPath('../../wzexports/Sound.wz'),
 WindowsPath('../../wzexports/String.wz'),
 WindowsPath('../../wzexports/TamingMob.wz'),
 WindowsPath('../../wzexports/UI.wz')]

In [16]:
from pyspark.sql import SparkSession, functions as F, types as T, Window

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [17]:
# Entries two levels below Map.wz whose file name does not contain ".xml".
[p for p in Path(wzpath).glob("Map.wz/*/*") if ".xml" not in p.name]

[WindowsPath('../../wzexports/Map.wz/Map/Map0'),
 WindowsPath('../../wzexports/Map.wz/Map/Map1'),
 WindowsPath('../../wzexports/Map.wz/Map/Map2'),
 WindowsPath('../../wzexports/Map.wz/Map/Map5'),
 WindowsPath('../../wzexports/Map.wz/Map/Map6'),
 WindowsPath('../../wzexports/Map.wz/Map/Map8'),
 WindowsPath('../../wzexports/Map.wz/Map/Map9')]

In [18]:
# Load all XML files under Map/Map0 with `imgdir` as the row tag; then inspect
# the inferred schema and a few rows.
df = spark.read.format("xml").options(rowTag="imgdir", valueTag="_value_tag").load(f"{wzpath}/Map.wz/Map/Map0/*.xml")
df.printSchema()
df.show(n=5, truncate=80)

root
 |-- _name: string (nullable = true)
 |-- imgdir: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _name: string (nullable = true)
 |    |    |-- _value_tag: string (nullable = true)
 |    |    |-- canvas: struct (nullable = true)
 |    |    |    |-- _basedata: string (nullable = true)
 |    |    |    |-- _height: long (nullable = true)
 |    |    |    |-- _name: string (nullable = true)
 |    |    |    |-- _value_tag: string (nullable = true)
 |    |    |    |-- _width: long (nullable = true)
 |    |    |-- float: struct (nullable = true)
 |    |    |    |-- _name: string (nullable = true)
 |    |    |    |-- _value: double (nullable = true)
 |    |    |    |-- _value_tag: string (nullable = true)
 |    |    |-- imgdir: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- _name: string (nullable = true)
 |    |    |    |    |-- _value_tag: string (nullable = true)
 |    |    |    |    |-- 

## Getting useful information out of maps

In [19]:
# Keep only the child node named "info" of every image, one row per image.
subdoc = (
    df
    .select(F.col("_name").alias("img"), F.explode("imgdir").alias("payload"))
    .where(F.col("payload._name") == "info")
)
# Check how the "float" field was inferred (array vs. struct).
subdoc.select("payload.*").schema["float"].dataType.typeName()

'struct'

In [20]:
def flatten_info(df, dtype):
    """Pivot the ``dtype`` attributes of each image's ``info`` node into columns.

    Parameters
    ----------
    df : DataFrame
        Frame exposing a ``_name`` column and an ``imgdir`` array column whose
        elements carry a ``_name`` field and a ``dtype`` field.
    dtype : str
        Name of the child field to extract from ``info`` (e.g. ``"string"``
        or ``"float"``).

    Returns
    -------
    DataFrame
        One row per ``img_name`` and one column per distinct attribute
        ``_name``, holding that attribute's ``_value``.
    """
    key = "img_name"
    subdoc = (
        df
        .select(F.col("_name").alias(key), F.explode("imgdir").alias("payload"))
        .where(F.col("payload._name") == "info")
        .select(key, f"payload.{dtype}")
    )
    # The field may be inferred either as an array of structs or as a single
    # struct; only the array form needs exploding.
    is_array = subdoc.schema[dtype].dataType.typeName() == "array"
    subdoc = subdoc.withColumn("data", F.explode(dtype) if is_array else F.col(dtype))
    return (
        subdoc
        .select(key, "data.*")
        .groupBy(key)
        .pivot("_name")
        # pivot needs an aggregate; presumably there is a single value per
        # (image, attribute) pair, so max simply picks it.
        .agg(F.max("_value"))
    )

In [21]:
# Preview the string attributes of the "info" node, one row per image.
flatten_info(df, "string").show(n=3)

+-------------+----------------+-------+---------------+
|     img_name|             bgm|mapDesc|        mapMark|
+-------------+----------------+-------+---------------+
|000010000.img|Bgm00/FloralLife|   null|MushroomVillage|
|001000001.img|Bgm00/RestNPeace|       |        Amherst|
|001000004.img|Bgm00/RestNPeace|       |        Amherst|
+-------------+----------------+-------+---------------+
only showing top 3 rows



In [22]:
# Same helper, this time for the float attributes of the "info" node.
flatten_info(df, "float").show(n=3)

+-------------+-------+
|     img_name|mobRate|
+-------------+-------+
|001000001.img|    1.3|
|001020001.img|    1.0|
|000010000.img|    1.0|
+-------------+-------+
only showing top 3 rows



Can we be even more generic, though? Rather than hard-coding the `info` node, the next helper walks an arbitrary path of node names.

In [23]:
def flatten(df, path: list, dtype: str):
    """Flatten a document based on a path.

    Parameters
    ----------
    df : DataFrame
        Frame exposing a ``_name`` column and an ``imgdir`` array column.
    path : list
        At least two names. ``path[0]`` is only the output label for the
        document's own ``_name``; every following element is the ``_name`` of
        the nested ``imgdir`` node to descend into at that level.
    dtype : str
        Field to extract from the node reached at the end of ``path``.
        ``"imgdir"`` returns the nested array as-is; any other value is
        pivoted into one column per attribute ``_name``.

    Returns
    -------
    DataFrame
        For ``dtype == "imgdir"``: the ``path`` columns plus ``imgdir``.
        Otherwise: one row per combination of ``path[:-1]`` and one column
        per attribute name holding its ``_value``.

    Raises
    ------
    ValueError
        If ``path`` has fewer than two elements.

    NOTE(review): paths longer than two elements have not been exercised
    against real data yet.
    """
    if len(path) < 2:
        raise ValueError("path must be at least two elements long")
    # Label the document's own name once; the loop then only deals with children.
    sub = df.select(F.col("_name").alias(path[0]), "imgdir")
    # walk the tree
    for depth in range(1, len(path)):
        parents = list(path[:depth])
        child = path[depth]
        sub = (
            sub.select(*parents, F.explode("imgdir").alias("_tmp"))
            .select(*parents, "_tmp.*")
            .withColumn(child, F.col("_name"))
            .where(F.col(child) == child)
            # `_name` now lives in the `child` column; dropping it keeps the
            # next level's `_name` unambiguous.
            .drop("_name", "_value_tag")
        )
        visited = parents + [child]
        # Path columns first, then whatever payload fields the node carries.
        sub = sub.select(*visited, *[c for c in sub.columns if c not in visited])
    if dtype == "imgdir":
        return sub.select(*path, dtype)
    # The field may be inferred as an array of structs or as a single struct.
    is_array = sub.schema[dtype].dataType.typeName() == "array"
    subdoc = sub.withColumn("_tmp", F.explode(dtype) if is_array else F.col(dtype))
    return (
        subdoc.select(*path, "_tmp.*")
        # Group on every ancestor so rows from different parents never merge;
        # the last path element is constant after the filter above.
        .groupBy(*path[:-1])
        .pivot("_name")
        .agg(F.max("_value"))
    )

In [24]:
# Sanity check: the generic helper should reproduce flatten_info(df, "float").
flatten(df, ["root", "info"], "float").show(n=3)

+-------------+-------+
|         root|mobRate|
+-------------+-------+
|001000001.img|    1.3|
|001020001.img|    1.0|
|000010000.img|    1.0|
+-------------+-------+
only showing top 3 rows



In [25]:
# The path can target another child node, here the int attributes of "miniMap".
flatten(df, ["root", "miniMap"], "int").show(n=3)

+-------------+-------+-------+------+---+-----+
|         root|centerX|centerY|height|mag|width|
+-------------+-------+-------+------+---+-----+
|001000001.img|    653|    752|  1696|  4| 2096|
|001000004.img|   -672|    717|  1511|  4| 1721|
|001020001.img|    526|    420|   974|  4| 1924|
+-------------+-------+-------+------+---+-----+
only showing top 3 rows



### Extract the links between maps

In [27]:
def get_portals(df):
    """Collect the ``tm`` values of every portal, grouped per source image.

    Parameters
    ----------
    df : DataFrame
        Frame accepted by ``flatten`` (``_name`` plus an ``imgdir`` array).

    Returns
    -------
    DataFrame
        ``src``: the image name up to its first ``"."``;
        ``dst``: list of the ``tm`` values found under that image's
        ``portal`` node.
    """
    keys = ["root", "portal"]
    portals = (
        flatten(df, keys, "imgdir")
        .select(*keys, F.explode("imgdir").alias("_tmp"))
        # `i` is the `_name` of each entry under the portal node.
        .select(*keys, F.col("_tmp._name").alias("i"), "_tmp.*")
        .drop("_name", "_value_tag")
        .select(*keys, "i", F.explode("int").alias("_tmp"))
        .where(F.col("_tmp._name") == "tm")
        # NOTE(review): `tm` looks like the target map id - confirm.
        .select(
            *keys,
            # Ordered window => each row holds the running list up to itself.
            # NOTE(review): `i` is a string, so the order is lexicographic
            # ("10" sorts before "2"); cast to int if numeric order matters.
            F.collect_list("_tmp._value")
            .over(Window.partitionBy(*keys).orderBy("i"))
            .alias("tm")
        )
        .groupBy(*keys)
        # The running lists are prefixes of one another; max keeps the full
        # one (relies on Spark ordering arrays element-wise, longer wins).
        .agg(F.max("tm").alias("tm"))
        # Raw string: "\." is an invalid escape sequence in a normal literal.
        .select(F.split(F.col("root"), r"\.")[0].alias("src"), F.col("tm").alias("dst"))
    )
    return portals

# Display the first rows of the source-image -> `tm` list table.
get_portals(df).show()

+---------+--------------------+
|      src|                 dst|
+---------+--------------------+
|000000002|      [999999999, 1]|
|000040000|[999999999, 99999...|
|000040001|[999999999, 99999...|
|001000003|[999999999, 99999...|
|001010001|[999999999, 1010000]|
|001010004|[999999999, 99999...|
|000060001|  [999999999, 60000]|
|001000006|[999999999, 99999...|
|000020000|[999999999, 99999...|
|000060000|[999999999, 99999...|
|001000005|[999999999, 99999...|
|000000000|[999999999, 99999...|
|000040002|[999999999, 99999...|
|001010000|[999999999, 10000...|
|001010002|[999999999, 10100...|
|001020001|[999999999, 99999...|
|000000003|[999999999, 99999...|
|000030000|[999999999, 99999...|
|001000000|[999999999, 50000...|
|000010000|[999999999, 99999...|
+---------+--------------------+
only showing top 20 rows

