In [1]:
import pandas as pd
import numpy as np 
import json
import re
from statistics import mean
from sklearn.linear_model import LinearRegression

In [2]:
# JSONL file read line by line as the input of the filtering step below.
input_path = "data/d2_250_1_timelines.jsonl"
# Destination JSONL file; it is opened in write mode, so an existing file
# with this name is overwritten.
output_path = "filtered_info_gold_win_minion.jsonl"

# Keywords to match against keys. Matching is case-insensitive, so each
# keyword only needs to be listed once, in any casing.
KEYWORDS = ["gold", "win", "minion"]


def key_matches(key):
    """Return True if ``key`` contains any entry of ``KEYWORDS``, ignoring case.

    Both the key and every keyword are lowercased before the substring
    test, so a mixed-case entry in ``KEYWORDS`` still matches. Note that
    this is a plain substring match: a keyword also matches keys that merely
    contain it as part of a longer word.
    """
    key_lower = key.lower()
    # Lowercase the keyword too; comparing a raw "Gold" against a lowercased
    # key could never succeed.
    return any(kw.lower() in key_lower for kw in KEYWORDS)

def filter_info(info_obj):
    """Recursively prune ``info_obj`` down to the entries with matching keys.

    - dict: a key accepted by ``key_matches`` keeps its value untouched;
      any other key is kept only if filtering its value yields something
      non-empty, in which case the filtered value is stored.
    - list: every element is filtered and empty results are dropped.
    - anything else: returns None.
    """
    if isinstance(info_obj, list):
        # Filter each element and discard the ones that end up empty.
        return [pruned for pruned in map(filter_info, info_obj) if pruned]

    if not isinstance(info_obj, dict):
        return None

    kept = {}
    for name, value in info_obj.items():
        if key_matches(name):
            kept[name] = value
            continue
        # Not a match itself: descend into nested dicts/lists.
        pruned = filter_info(value)
        if pruned:
            kept[name] = pruned
    return kept



# Process the JSONL file: stream it line by line, filter the "info" payload
# of every record, and write each non-empty result as one JSON line.
skipped_lines = 0

with open(input_path, "r", encoding="utf-8") as infile, \
     open(output_path, "w", encoding="utf-8") as outfile:

    for line in infile:
        # Blank lines (e.g. a trailing newline) carry no record.
        if not line.strip():
            continue

        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # Best effort: skip malformed lines, but count them so the loss
            # is reported instead of being silently swallowed.
            skipped_lines += 1
            continue

        # A syntactically valid line may still be a list or scalar, which
        # has no .get(); treat it like any other unusable line.
        if not isinstance(record, dict):
            skipped_lines += 1
            continue

        filtered_info = filter_info(record.get("info", {}))

        # Only write if there is something left after filtering.
        if filtered_info:
            json.dump(filtered_info, outfile)
            outfile.write("\n")

if skipped_lines:
    print(f"Skipped {skipped_lines} malformed or non-object line(s)")
print(f"Done! Saved filtered file to: {output_path}")

Done! Saved filtered file to: filtered_info_gold_win_minion.jsonl
