Import the necessary libraries and modules.

In [1]:
import pandas as pd
import datetime as dt
import csv
import math
import re
import json
from pandas import json_normalize

Change the `path` variable so that it points to your CSV file.

In [2]:
# NOTE(review): absolute, machine-specific Windows path - edit it to the
# location of your own CSV file before running the notebook.
path = r"C:\Users\joacosta\Dev\Python\ORF5\Data\S04.csv"

In [3]:
# Options for reading the raw log: ';'-separated, no header row, every column
# kept as text. QUOTE_NONE makes '"' an ordinary character, so the quote marks
# stay inside the field values. Rows the parser cannot handle are skipped.
read_options = dict(
    sep=";",
    header=None,
    engine="python",
    quoting=csv.QUOTE_NONE,
    skipinitialspace=True,
    on_bad_lines="skip",
    dtype=str,
)
df = pd.read_csv(path, **read_options)

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,"""250917 110405 027""","""N""","""SMC""","""10.158.244.100:7200""","""AWCS.Comm""","""""","""""","""22:45:29,656",MC01,54177,SQ 224,"->{event: ""AwcsConverterSendS04"", plcRecordNo:..."
1,"""250917 110405 267""","""N""","""SMC""","""10.158.244.100:7200""","""AWCS.Comm""","""""","""""","""22:45:29,895",MC01,54177,SQ 236,"->{event: ""AwcsConverterSendS04"", plcRecordNo:..."
2,"""250917 110405 505""","""N""","""SMC""","""10.158.244.100:7200""","""AWCS.Comm""","""""","""""","""22:45:30,133",MC01,54177,SQ 252,"->{event: ""AwcsConverterSendS04"", plcRecordNo:..."
3,"""250917 110405 745""","""N""","""SMC""","""10.158.244.100:7200""","""AWCS.Comm""","""""","""""","""22:45:30,374",MC01,54177,SQ 013,"->{event: ""AwcsConverterSendS04"", plcRecordNo:..."
4,"""250917 110406 031""","""N""","""SMC""","""10.158.244.100:7200""","""AWCS.Comm""","""""","""""","""22:45:30,625",MC01,54177,SQ 026,"->{event: ""AwcsConverterSendS04"", plcRecordNo:..."


In [4]:

# Relax pandas' display limits so long cell values are shown in full.
for option_name, option_value in (
    ('display.max_colwidth', None),  # do not truncate long column values
    ('display.width', 0),            # original note: adapt to the terminal width
    ('display.max_rows', None),      # (optional) show every row
):
    pd.set_option(option_name, option_value)

# Print the complete raw message (column 11) of the first row
print(df.iloc[0, 11])

->{event: "AwcsConverterSendS04", plcRecordNo: 0972, itemID: 14121U, indexNo: 0972, locationAWCS: "S01aa          ", barcodeAWCS: "SD0002GOOD_001_v                    ", actualDestMCID: 0, requestedDestMCID: [0, -1, -1, -1, -1, -1], sortCode: [0, -1, -1, -1, -1, -1], requestedDestStatus: ["Used,FromAwcs,Attempted", "Unused", "Unused", "Unused", "Unused", "Unused"], comHost: "AwcsConverterMcIf", comMode: "Send", telegramType: "S04"}<"


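Clean the raw message and parse it into Python dictionaries. The payload looks like JSON but is not valid JSON (keys and some values are unquoted), so a small tolerant parser is used instead of `json.loads`.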

In [5]:
def strip_decorators(s: str) -> str | None:
    """Quita adornos -> y <", y devuelve solo el contenido entre { ... }."""
    if not isinstance(s, str):
        return None
    s = s.strip()
    if s.startswith("->"):
        s = s[2:].strip()
    if s.endswith('<"'):
        s = s[:-2].strip()
    # quedarnos con { ... }
    i, j = s.find("{"), s.rfind("}")
    if i == -1 or j == -1 or j <= i:
        return None
    return s[i+1:j]  # SIN llaves externas; más fácil para tokenizar

def split_top_level_commas(s: str) -> list[str]:
    """
    Split ``s`` on top-level commas.

    Commas inside ``[...]`` or inside double-quoted strings do not split.
    Inside a quoted string a backslash escapes the next character. Every
    returned piece is stripped of surrounding whitespace; a trailing comma
    does not produce an extra empty piece.
    """
    pieces: list[str] = []
    current: list[str] = []
    bracket_level = 0
    inside_quotes = False
    escaped = False

    for char in s:
        if inside_quotes:
            # Every character of a quoted string is kept verbatim.
            current.append(char)
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                inside_quotes = False
            continue

        if char == ',' and bracket_level == 0:
            pieces.append(''.join(current).strip())
            current = []
            continue

        if char == '"':
            inside_quotes = True
        elif char == '[':
            bracket_level += 1
        elif char == ']':
            # Never go below zero on an unmatched closing bracket.
            bracket_level = max(0, bracket_level - 1)
        current.append(char)

    if current:
        pieces.append(''.join(current).strip())
    return pieces

def split_key_value(pair: str) -> tuple[str, str] | None:
    """
    Divide 'clave: valor' en clave y valor (solo el primer ':').
    """
    idx = pair.find(':')
    if idx == -1:
        return None
    key = pair[:idx].strip()
    val = pair[idx+1:].strip()
    # quitar comillas en la CLAVE si las hubiera (normalmente no)
    key = key.strip('"')
    return key, val

def parse_list(val: str):
    """
    Leniently parse a list such as ``[0, -1, "text", ABC123]``.

    The first and last characters of the stripped input are dropped as the
    brackets; the remainder is split on top-level commas and each item is
    converted with ``parse_value``. An empty body yields ``[]``.
    """
    body = val.strip()[1:-1].strip()
    if not body:
        return []
    return list(map(parse_value, split_top_level_commas(body)))

def looks_int(tok: str) -> bool:
    """Return True when ``tok`` is an optional '-' followed only by digits."""
    return re.fullmatch(r'-?\d+', tok) is not None

def parse_value(val: str):
    """
    Convert one raw value token into a Python object.

    - "quoted text"   -> str without the surrounding quotes
    - [ ... ]         -> list, parsed recursively through parse_list
    - "", null, None  -> None
    - integer tokens  -> int, except tokens that start with '0' (e.g. 0972),
                         which stay str so the leading zeros are preserved;
                         a bare "0" is still converted to the int 0
    - anything else   -> the stripped token as str (e.g. 14121U)
    """
    val = val.strip()

    # Quoted string: drop the surrounding quotes; escapes are not decoded.
    if len(val) >= 2 and val[0] == '"' and val[-1] == '"':
        return val[1:-1]
    # List
    if val.startswith('[') and val.endswith(']'):
        return parse_list(val)
    # Empty or null-like token
    if val in ("", "null", "None"):
        return None
    # Integer. The whole check is gated on looks_int so that tokens that merely
    # start with '-' (e.g. "-1_000", which int() would accept) are not
    # converted by accident.
    if looks_int(val) and (val == "0" or not val.startswith('0')):
        try:
            return int(val)
        except ValueError:
            # int() can still refuse extremely long digit strings; keep the text.
            pass
    # Everything else is kept as a string
    return val

def parse_log_object(cell: str) -> dict | None:
    """
    Leniently parse one raw log cell into a dict.

    The ``{ ... }`` payload is extracted with ``strip_decorators``, split on
    top-level commas, and each ``key: value`` piece is converted with
    ``parse_value``. Pieces without a ':' are ignored. String values of
    ``locationAWCS`` and ``barcodeAWCS`` are stripped of padding whitespace.
    Returns ``None`` when no payload can be extracted from ``cell``.
    """
    core = strip_decorators(cell)
    if core is None:
        return None

    padded_fields = {"locationAWCS", "barcodeAWCS"}
    out = {}
    for chunk in split_top_level_commas(core):
        key_and_value = split_key_value(chunk)
        if not key_and_value:
            continue
        name = key_and_value[0].strip()
        value = parse_value(key_and_value[1])
        # These known fields arrive padded with spaces inside the quotes.
        if name in padded_fields and isinstance(value, str):
            value = value.strip()
        out[name] = value
    return out

# 3) Run the parser over every raw message in column 11, keeping results in memory
parsed_dicts = [parse_log_object(raw) for raw in df[11]]
idx_ok = [label for label, parsed in zip(df.index, parsed_dicts) if isinstance(parsed, dict)]

# 4) Build a DataFrame from the parsed dicts; rows that failed to parse become empty rows
df_parsed = pd.DataFrame([parsed if isinstance(parsed, dict) else {} for parsed in parsed_dicts])
df_parsed.index = df.index  # align with the index of the original frame

# 5) Rename the parsed fields to clearer S04 column names
rename_map = dict(
    plcRecordNo="PLC_RecordNumber",
    itemID="Item_ID",
    indexNo="Index_No",
    locationAWCS="Location_ID",
    barcodeAWCS="Barcode",
    actualDestMCID="Actual_Dest_ID",
    requestedDestMCID="Requested_Dest_ID",        # list
    sortCode="Sort_Code",                         # list
    requestedDestStatus="Requested_Dest_Status",
    comHost="Com_Host",
    comMode="Com_Mode",
    telegramType="Telegram_Type",
    event="Event",
)
df_parsed = df_parsed.rename(columns=rename_map)

# 6) (Optional) Spread list-valued cells into one column per position
def explode_list_column(df_in: pd.DataFrame, col: str, n: int):
    """Add columns ``col[0]`` .. ``col[n-1]`` holding the items of list cells.

    Cells of ``col`` that are not lists are treated as empty lists, and
    positions beyond the end of a list yield ``None``. The columns are added
    to ``df_in`` itself, which is also returned. When ``col`` is not a column
    of ``df_in`` the frame is returned untouched.
    """
    if col in df_in.columns:
        lists = df_in[col].apply(lambda cell: cell if isinstance(cell, list) else [])
        for pos in range(n):
            df_in[f"{col}[{pos}]"] = lists.apply(
                lambda items: items[pos] if pos < len(items) else None
            )
    return df_in

# Set the count to the usual length of your arrays (6 in the sample message)
for list_col in ("Requested_Dest_ID", "Sort_Code"):
    df_parsed = explode_list_column(df_parsed, list_col, 6)

# 7) Join the parsed columns onto the original frame
df_final = pd.concat([df, df_parsed], axis=1)

# 8) Quick look
pd.set_option('display.max_colwidth', None)
print("Filas parseadas correctamente:", len(idx_ok), "de", len(df))
preview_cols = [
    "Event","Telegram_Type","PLC_RecordNumber","Item_ID","Index_No",
    "Location_ID","Barcode","Actual_Dest_ID",
    "Requested_Dest_ID","Sort_Code","Requested_Dest_Status"
]
print(df_final[[c for c in preview_cols if c in df_final.columns]].head(10))

# 9) (Optional) Save
df_final.to_csv("s04_dataframe.csv", index=False)
print("Guardado: s04_dataframe.csv")


Filas parseadas correctamente: 5911 de 5911
                  Event Telegram_Type PLC_RecordNumber Item_ID Index_No  \
0  AwcsConverterSendS04           S04             0972  14121U     0972   
1  AwcsConverterSendS04           S04             1330  14479U     1330   
2  AwcsConverterSendS04           S04             1313  14462U     1313   
3  AwcsConverterSendS04           S04             1318  14467U     1318   
4  AwcsConverterSendS04           S04             1326  14475U     1326   
5  AwcsConverterSendS04           S04             1088  14237U     1088   
6  AwcsConverterSendS04           S04             1322  14471U     1322   
7  AwcsConverterSendS04           S04             1329  14478U     1329   
8  AwcsConverterSendS04           S04             1336  14485U     1336   
9  AwcsConverterSendS04           S04             1316  14465U     1316   

  Location_ID           Barcode  Actual_Dest_ID           Requested_Dest_ID  \
0       S01aa  SD0002GOOD_001_v               0    

In [6]:
# Show the first row of the combined frame (raw columns plus parsed fields).
df_final.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Requested_Dest_ID[2],Requested_Dest_ID[3],Requested_Dest_ID[4],Requested_Dest_ID[5],Sort_Code[0],Sort_Code[1],Sort_Code[2],Sort_Code[3],Sort_Code[4],Sort_Code[5]
0,"""250917 110405 027""","""N""","""SMC""","""10.158.244.100:7200""","""AWCS.Comm""","""""","""""","""22:45:29,656",MC01,54177,...,-1,-1,-1,-1,0,-1,-1,-1,-1,-1


In [7]:
# Assumes the current combined frame is named df_final (the one that already has columns such as Requested_Dest_ID[0], Sort_Code[0], etc.)
df_work = df_final.copy()

# ---------------------------------------------------
# 1) Generic cleanup: remove wrapping quotes and trim
# ---------------------------------------------------
def strip_quotes(x):
    """Trim a string and remove one pair of matching wrapping quotes.

    A string that, after trimming, starts and ends with the same quote
    character (double or single) loses that pair and is trimmed again.
    Other strings are only trimmed; non-string values are returned unchanged.
    """
    if not isinstance(x, str):
        return x
    text = x.strip()
    wrapped = len(text) >= 2 and text[0] == text[-1] and text[0] in ('"', "'")
    return text[1:-1].strip() if wrapped else text

# Clean every text-like column. Object columns are handled as before; the
# is_string_dtype check additionally covers columns stored with pandas'
# dedicated string dtypes, which is_object_dtype does not report as object
# (NOTE(review): relevant when dtype=str yields such a dtype - confirm on
# your pandas version). strip_quotes leaves non-string cells untouched.
for c in df_work.columns:
    col_dtype = df_work[c].dtype
    if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
        df_work[c] = df_work[c].map(strip_quotes)

# ---------------------------------------------------
# 2) Type the numeric columns (if present)
# ---------------------------------------------------

# Only string-named columns (the raw CSV columns carry integer names)
str_cols = [c for c in df_work.columns if isinstance(c, str)]

req_cols  = [name for name in str_cols if name.startswith("Requested_Dest_ID[")]
sort_cols = [name for name in str_cols if name.startswith("Sort_Code[")]

num_cols = req_cols + sort_cols
if "Actual_Dest_ID" in str_cols:
    num_cols.append("Actual_Dest_ID")

# Values that cannot be converted become NaN (errors="coerce").
for numeric_col in num_cols:
    df_work[numeric_col] = pd.to_numeric(df_work[numeric_col], errors="coerce")

# ---------------------------------------------------
# 3) Build a minimal "clean" S04 table for analysis
# ---------------------------------------------------
# Keep the key columns that actually exist:
wanted_cols = (
    [
        "Event","Telegram_Type","PLC_RecordNumber","Item_ID","Index_No",
        "Location_ID","Barcode","Actual_Dest_ID",
    ]
    + [f"Requested_Dest_ID[{slot}]" for slot in range(6)]
    + [f"Sort_Code[{slot}]" for slot in range(6)]
)
keep_cols = [c for c in wanted_cols if c in df_work.columns]

s04 = df_work[keep_cols].copy()

# success = 1 when Sort_Code[0] == 0 (normal success)
s04["success"] = (
    (s04["Sort_Code[0]"] == 0).astype("Int64")
    if "Sort_Code[0]" in s04.columns
    else pd.NA
)

# reason_code = first Sort_Code entry that is present and different from -1
sort_cols = [name for name in s04.columns if name.startswith("Sort_Code[")]
def first_reason(row):
    """Return the first non-missing Sort_Code value of ``row`` that is not -1.

    When no such value exists, fall back to Sort_Code[0] if that entry is
    present and not missing; otherwise return None.
    """
    for column in sort_cols:
        code = row.get(column, None)
        if pd.isna(code) or code == -1:
            continue
        return int(code)
    if "Sort_Code[0]" not in row:
        return None
    fallback = row["Sort_Code[0]"]
    if pd.isna(fallback):
        return None
    return int(fallback)

s04["reason_code"] = s04.apply(first_reason, axis=1).astype("Int64")

# Map reason codes to text (you can extend it with your full code table)
# NOTE(review): codes 11 and 15 have no entry here, so rows carrying them get
# no reason_text - confirm whether that gap is intentional.
reason_map = {
    0:  "Success",
    1:  "Unknown",
    2:  "Unexpected_Container",
    3:  "Tracking_Error",
    4:  "Gap_Error",
    5:  "Destination_Full",
    6:  "Destination_Non_Operational",
    7:  "Invalid_Destination",
    8:  "No_Read",
    9:  "No_Code",
    10: "Multi_Label",
    12: "Destination_Disabled",
    13: "Throughput_Limit",
    14: "Failed_To_Divert",
    16: "No_Destination_Received",
    17: "Lost_Container",
    18: "Dimension_Error",
    19: "Weight_Error",
    20: "Container_Utilization",
    21: "Unable_To_Divert",
    22: "Destination_Not_Attempted",
}
s04["reason_text"] = s04["reason_code"].map(reason_map)

# ---------------------------------------------------
# 4) (Optional) "Long" format by index (useful for analysis)
#     - One record per requested destination with its sort code
# ---------------------------------------------------
# Derive the highest index from the columns that are present
req_cols = [name for name in s04.columns if name.startswith("Requested_Dest_ID[")]
max_idx = max((int(name.split("[")[1].rstrip("]")) for name in req_cols), default=-1)

# Stays None when there is no Requested_Dest_ID[i] / Sort_Code[i] pair to stack.
s04_long = None
if max_idx >= 0:
    id_cols = [c for c in ["PLC_RecordNumber","Item_ID","Location_ID","Barcode","Actual_Dest_ID"] if c in s04.columns]
    base_cols = id_cols + ["success","reason_code","reason_text"]
    slot_columns = (
        (slot, f"Requested_Dest_ID[{slot}]", f"Sort_Code[{slot}]")
        for slot in range(max_idx + 1)
    )
    long_frames = [
        s04[base_cols + [rd, sc]]
        .rename(columns={rd: "Requested_Dest_ID", sc: "Sort_Code"})
        .assign(option_index=slot)
        for slot, rd, sc in slot_columns
        if rd in s04.columns and sc in s04.columns
    ]
    if long_frames:
        stacked = pd.concat(long_frames, ignore_index=True)
        # NOTE(review): the original comment described this filter as "only
        # valid requested destinations (>= 0)", but the code only drops
        # missing values, so -1 entries are kept - confirm which is intended.
        s04_long = stacked[stacked["Requested_Dest_ID"].notna()]

# ---------------------------------------------------
# 5) Quick views
# ---------------------------------------------------
print("S04 (wide) - primeras filas:")
s04.head(10)




S04 (wide) - primeras filas:


Unnamed: 0,Event,Telegram_Type,PLC_RecordNumber,Item_ID,Index_No,Location_ID,Barcode,Actual_Dest_ID,Requested_Dest_ID[0],Requested_Dest_ID[1],...,Requested_Dest_ID[5],Sort_Code[0],Sort_Code[1],Sort_Code[2],Sort_Code[3],Sort_Code[4],Sort_Code[5],success,reason_code,reason_text
0,AwcsConverterSendS04,S04,972,14121U,972,S01aa,SD0002GOOD_001_v,0,0,-1,...,-1,0,-1,-1,-1,-1,-1,1,0,Success
1,AwcsConverterSendS04,S04,1330,14479U,1330,S01aa,SD0003GOOD_001_v,16,16,-1,...,-1,0,-1,-1,-1,-1,-1,1,0,Success
2,AwcsConverterSendS04,S04,1313,14462U,1313,S01aa,SD0009GOOD_001_v,0,0,-1,...,-1,0,-1,-1,-1,-1,-1,1,0,Success
3,AwcsConverterSendS04,S04,1318,14467U,1318,S01aa,SD0006GOOD_001_v,2,2,-1,...,-1,0,-1,-1,-1,-1,-1,1,0,Success
4,AwcsConverterSendS04,S04,1326,14475U,1326,S01aa,SD0009GOOD_001_v,4,4,-1,...,-1,0,-1,-1,-1,-1,-1,1,0,Success
5,AwcsConverterSendS04,S04,1088,14237U,1088,S01ab,SD0004GOOD_001_v,3002,3001,-1,...,-1,0,-1,-1,-1,-1,-1,1,0,Success
6,AwcsConverterSendS04,S04,1322,14471U,1322,S01aa,SD0010GOOD_001_v,6,6,-1,...,-1,0,-1,-1,-1,-1,-1,1,0,Success
7,AwcsConverterSendS04,S04,1329,14478U,1329,S01aa,SD0010GOOD_001_v,10,10,-1,...,-1,0,-1,-1,-1,-1,-1,1,0,Success
8,AwcsConverterSendS04,S04,1336,14485U,1336,S01aa,SD0003GOOD_001_v,14,14,-1,...,-1,0,-1,-1,-1,-1,-1,1,0,Success
9,AwcsConverterSendS04,S04,1316,14465U,1316,S01aa,SD0006GOOD_001_v,16,16,-1,...,-1,0,-1,-1,-1,-1,-1,1,0,Success


In [8]:
# Print a heading for the long-format preview, but only when that table was built.
if s04_long is not None:
    print("\nS04 (long) - primeras filas:")
    


S04 (long) - primeras filas:


In [9]:
# Preview the long table. s04_long is None when no long table could be built,
# in which case the expression evaluates to None and the cell shows nothing
# instead of raising AttributeError.
s04_long.head(15) if s04_long is not None else None

Unnamed: 0,PLC_RecordNumber,Item_ID,Location_ID,Barcode,Actual_Dest_ID,success,reason_code,reason_text,Requested_Dest_ID,Sort_Code,option_index
0,972,14121U,S01aa,SD0002GOOD_001_v,0,1,0,Success,0,0,0
1,1330,14479U,S01aa,SD0003GOOD_001_v,16,1,0,Success,16,0,0
2,1313,14462U,S01aa,SD0009GOOD_001_v,0,1,0,Success,0,0,0
3,1318,14467U,S01aa,SD0006GOOD_001_v,2,1,0,Success,2,0,0
4,1326,14475U,S01aa,SD0009GOOD_001_v,4,1,0,Success,4,0,0
5,1088,14237U,S01ab,SD0004GOOD_001_v,3002,1,0,Success,3001,0,0
6,1322,14471U,S01aa,SD0010GOOD_001_v,6,1,0,Success,6,0,0
7,1329,14478U,S01aa,SD0010GOOD_001_v,10,1,0,Success,10,0,0
8,1336,14485U,S01aa,SD0003GOOD_001_v,14,1,0,Success,14,0,0
9,1316,14465U,S01aa,SD0006GOOD_001_v,16,1,0,Success,16,0,0
