In [1]:
from pathlib import Path
import json, gzip, shutil, re, glob
import pandas as pd
import numpy as np
from tqdm import tqdm
from statsmodels.tsa.seasonal import STL
from collections import defaultdict
import matplotlib.pyplot as plt
from pprint import pprint

In [2]:
raw_dir = Path('/content/drive/MyDrive/TAQA/eval/P8-36-short')#
csv_paths = sorted(raw_dir.glob('*.csv'))
clean_dir = Path('/content/drive/MyDrive/TAQA/clean')
meta_dir = Path('/content/drive/MyDrive/TAQA/clean/meta') #contains per tool scaler

for d in (clean_dir, meta_dir):
  d.mkdir(parents=True, exist_ok=True)

In [3]:
frames= []
for file in tqdm(csv_paths, desc="Reading"):
  df = pd.read_csv(file, parse_dates = ['Timestamp'])

  value_col = df.columns[1]
  df = df.rename(columns={value_col: 'Value'})[['Timestamp', 'Value']]

  parts = file.stem.split('.')
  _,tool,kind,param = parts
  df['Tool'] = tool
  df['Kind'] = kind
  df['param']= param
  frames.append(df)

raw = pd.concat(frames, ignore_index=True)

wide = (raw.pivot_table(index=['Tool', 'Timestamp'],columns= 'param', values='Value').reset_index()) #reset index will max the index into a column

cols = ['Timestamp', 'Tool'] + [c for c in wide.columns if c not in ('Timestamp', 'Tool')] #rearranging the columns
wide = wide[cols]
print(wide.shape)
wide.head()

Reading: 100%|██████████| 9/9 [00:05<00:00,  1.73it/s]

(6991, 11)





param,Timestamp,Tool,Battery-Voltage,Choke-Position,Downstream-Pressure,Downstream-Temperature,Downstream-Upstream-Difference,Target-Position,Tool-State,Upstream-Pressure,Upstream-Temperature
0,2024-09-20 11:56:10.266816500+00:00,P8-36,10.15,100.0,32.6,19.74,-17.86,100.0,1.0,14.74,19.74
1,2024-09-27 09:13:04.400897300+00:00,P8-36,14.18,-0.38,4.54,18.03,9.99,0.0,6.0,14.53,18.56
2,2024-09-27 09:13:05.406325300+00:00,P8-36,14.18,-0.41,4.54,18.03,9.99,0.0,6.0,14.53,18.56
3,2024-09-27 09:13:06.394713500+00:00,P8-36,14.18,-0.38,4.54,18.03,9.98,0.0,6.0,14.53,18.56
4,2024-09-27 09:13:07.398477+00:00,P8-36,14.18,-0.38,4.54,18.03,9.98,0.0,6.0,14.53,18.56


In [4]:
wide["Timestamp"] = pd.to_datetime(wide["Timestamp"], utc=True)
wide = (wide.set_index("Timestamp").sort_index())


orig_wide = wide.copy()
wide

param,Tool,Battery-Voltage,Choke-Position,Downstream-Pressure,Downstream-Temperature,Downstream-Upstream-Difference,Target-Position,Tool-State,Upstream-Pressure,Upstream-Temperature
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-09-20 11:56:10.266816500+00:00,P8-36,10.15,100.00,32.60,19.74,-17.86,100.0,1.0,14.74,19.74
2024-09-27 09:13:04.400897300+00:00,P8-36,14.18,-0.38,4.54,18.03,9.99,0.0,6.0,14.53,18.56
2024-09-27 09:13:05.406325300+00:00,P8-36,14.18,-0.41,4.54,18.03,9.99,0.0,6.0,14.53,18.56
2024-09-27 09:13:06.394713500+00:00,P8-36,14.18,-0.38,4.54,18.03,9.98,0.0,6.0,14.53,18.56
2024-09-27 09:13:07.398477+00:00,P8-36,14.18,-0.38,4.54,18.03,9.98,0.0,6.0,14.53,18.56
...,...,...,...,...,...,...,...,...,...,...
2024-09-27 11:48:17.430328900+00:00,P8-36,14.19,100.05,18.41,17.69,-3.89,100.0,2.0,14.52,17.81
2024-09-27 11:48:18.434758100+00:00,P8-36,14.18,100.08,18.41,17.69,-3.89,100.0,2.0,14.52,17.81
2024-09-27 11:48:20.451474300+00:00,P8-36,14.19,100.05,18.42,17.69,-3.89,100.0,2.0,14.52,17.81
2024-09-27 11:49:24.571723400+00:00,P8-36,14.18,100.05,1118.44,17.69,-3.92,100.0,2.0,114.53,117.81


In [5]:
wide.dtypes

Unnamed: 0_level_0,0
param,Unnamed: 1_level_1
Tool,object
Battery-Voltage,float64
Choke-Position,float64
Downstream-Pressure,float64
Downstream-Temperature,float64
Downstream-Upstream-Difference,float64
Target-Position,float64
Tool-State,float64
Upstream-Pressure,float64
Upstream-Temperature,float64


In [6]:
wide = (
    wide.reset_index()
        .drop_duplicates(subset=["Tool", "Timestamp"])   # same row twice
        .sort_values(["Tool", "Timestamp"])
        .set_index("Timestamp")
)

print("rows :", len(wide))
wide.head()

rows : 6991


param,Tool,Battery-Voltage,Choke-Position,Downstream-Pressure,Downstream-Temperature,Downstream-Upstream-Difference,Target-Position,Tool-State,Upstream-Pressure,Upstream-Temperature
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-09-20 11:56:10.266816500+00:00,P8-36,10.15,100.0,32.6,19.74,-17.86,100.0,1.0,14.74,19.74
2024-09-27 09:13:04.400897300+00:00,P8-36,14.18,-0.38,4.54,18.03,9.99,0.0,6.0,14.53,18.56
2024-09-27 09:13:05.406325300+00:00,P8-36,14.18,-0.41,4.54,18.03,9.99,0.0,6.0,14.53,18.56
2024-09-27 09:13:06.394713500+00:00,P8-36,14.18,-0.38,4.54,18.03,9.98,0.0,6.0,14.53,18.56
2024-09-27 09:13:07.398477+00:00,P8-36,14.18,-0.38,4.54,18.03,9.98,0.0,6.0,14.53,18.56


In [7]:
from pandas import NA
CHOKE_OPEN_THRESHOLD = 10.0   # % open

wide["IsOpen"] = (wide["Choke-Position"] > CHOKE_OPEN_THRESHOLD).astype(int)

# ΔT
wide["DeltaTemperature"] = (
    wide["Upstream-Temperature"] - wide["Downstream-Temperature"]
)

tmp = wide["Tool-State"].round()

# 2️⃣  keep NaNs as <NA> and cast to nullable Int8
wide["Tool-State"] = tmp.astype("Int16")
# keep a float32 copy for models (Int8 → float32 later anyway)
wide["ToolStateNum"] = wide["Tool-State"].astype(np.float32)

print("ToolState codes:", wide["Tool-State"].dropna().unique())

wide.head()

ToolState codes: <IntegerArray>
[1, 6, 11, 2]
Length: 4, dtype: Int16


param,Tool,Battery-Voltage,Choke-Position,Downstream-Pressure,Downstream-Temperature,Downstream-Upstream-Difference,Target-Position,Tool-State,Upstream-Pressure,Upstream-Temperature,IsOpen,DeltaTemperature,ToolStateNum
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2024-09-20 11:56:10.266816500+00:00,P8-36,10.15,100.0,32.6,19.74,-17.86,100.0,1,14.74,19.74,1,0.0,1.0
2024-09-27 09:13:04.400897300+00:00,P8-36,14.18,-0.38,4.54,18.03,9.99,0.0,6,14.53,18.56,0,0.53,6.0
2024-09-27 09:13:05.406325300+00:00,P8-36,14.18,-0.41,4.54,18.03,9.99,0.0,6,14.53,18.56,0,0.53,6.0
2024-09-27 09:13:06.394713500+00:00,P8-36,14.18,-0.38,4.54,18.03,9.98,0.0,6,14.53,18.56,0,0.53,6.0
2024-09-27 09:13:07.398477+00:00,P8-36,14.18,-0.38,4.54,18.03,9.98,0.0,6,14.53,18.56,0,0.53,6.0


In [8]:
def rule_flat_line(s, window=5):
    repeat = s.rolling(window).apply(lambda x: len(set(np.round(x, 6))) == 1)
    return repeat.fillna(0).astype(bool)

def rule_zero(s):      return s == 0
def rule_high(s, thr): return s >= thr

# apply rule catalogue
rules = {
    "FlatLine_DownP" : rule_flat_line(wide["Downstream-Pressure"]),
    "Zero_DownP"     : rule_zero(wide["Downstream-Pressure"]),
    "High_DownP"     : rule_high(wide["Downstream-Pressure"], 12_000),

    "FlatLine_UpP"   : rule_flat_line(wide["Upstream-Pressure"]),
    "Zero_UpP"       : rule_zero(wide["Upstream-Pressure"]),
    "High_UpP"       : rule_high(wide["Upstream-Pressure"], 12_000),

    "Zero_Batt"      : rule_zero(wide["Battery-Voltage"]),
}

rule_alert = pd.DataFrame(rules).any(axis=1)         # OR of all rules
wide["RuleAlert"] = rule_alert
print("rows flagged by rule layer:", wide["RuleAlert"].mean()*100, "%")

rows flagged by rule layer: 45.801745100843945 %


In [9]:
# optional forward-fill very short gaps (≤2 samples) to keep continuity
wide = wide.groupby("Tool").apply(
    lambda g: g.ffill(limit=2)
).reset_index(level=0, drop=True)

# after ffill, drop rows with remaining NaNs in core signals
CORE_COLS = ["Battery-Voltage", "Choke-Position",
             "Upstream-Pressure", "Downstream-Pressure",
             "Upstream-Temperature", "Downstream-Temperature", "DeltaTemperature","Target-Position","Tool-State"]

wide = wide.dropna(subset=CORE_COLS)
ml_df = wide.loc[~wide["RuleAlert"]].copy()
print("rows after NaN clean:", len(wide))
print("rows after Alert clean:", len(ml_df))

rows after NaN clean: 6991
rows after Alert clean: 3789


  wide = wide.groupby("Tool").apply(


In [10]:
wide.to_parquet("/content/drive/MyDrive/TAQA/clean/wide_P8-36_full.parquet")
ml_df.to_parquet("/content/drive/MyDrive/TAQA/clean/wide_P8-36_flat.parquet")

The latest is up to this only

In [None]:
HARD_LIMITS = {
    "Battery-Voltage":          (10, 16),
    "Upstream-Pressure":        (0, 7000),
    "Downstream-Pressure":      (0, 7000),
    "Upstream-Temperature":     (0, 150),
    "Downstream-Temperature":   (0, 150),
    "Choke-Position":           (0, 100),
    "Target-Position":          (0, 100),
    "Tool-State":               (0, 20),
}

def apply_hard_limits(df):
    for col, (lo, hi) in HARD_LIMITS.items():
        if col in df.columns:
            df.loc[(df[col] < lo) | (df[col] > hi), col] = np.nan
    return df

In [None]:
RESAMPLE_RULE = "10s"   # target grid
FFILL_LIMIT   = 3       # 3×10 s = 30 s ffill window

clean_frames = []

for tool, df_tool in wide.groupby("Tool"):

    res = (df_tool
           .drop(columns="Tool")              # resample numeric cols only
           .resample(RESAMPLE_RULE).median()
           .pipe(apply_hard_limits)
           .ffill(limit=FFILL_LIMIT)
           .dropna(how="all"))                # trim power-off stretches

    # re-add tool column for later grouping
    res["Tool"] = tool
    clean_frames.append(res)

clean = pd.concat(clean_frames).sort_index()
worst_nan = round(clean.drop(columns="Tool").isna().mean().max(), 3)

print("Rows after resample & trim:", len(clean))
print("Worst NaN ratio:", worst_nan)

Rows after resample & trim: 171659
Worst NaN ratio: 0.225


In [None]:
# ❶ Per-column NaN ratios (global view)
print(clean.drop(columns="Tool")
          .isna()
          .mean()
          .sort_values(ascending=False)
          .round(3))

# ❷ Per-tool × column matrix (helps pinpoint if the gap is only in P8-41, etc.)
nan_matrix = (clean
              .groupby("Tool")
              .apply(lambda g: g.isna().mean())
              .round(3))
print(nan_matrix)


param
Battery-Voltage                   0.225
Upstream-Pressure                 0.006
Downstream-Pressure               0.003
Downstream-Temperature            0.000
Upstream-Temperature              0.000
Choke-Position                    0.000
Downstream-Upstream-Difference    0.000
dtype: float64
param  Battery-Voltage  Choke-Position  Downstream-Pressure  \
Tool                                                          
P8-1             0.000           0.000                0.000   
P8-38            0.611           0.001                0.034   
P8-41            1.000           0.000                0.000   
P8-59            0.000           0.000                0.000   
P8-7             0.006           0.000                0.000   

param  Downstream-Temperature  Downstream-Upstream-Difference  \
Tool                                                            
P8-1                    0.000                           0.000   
P8-38                   0.001                           0.001 

  .apply(lambda g: g.isna().mean())


In [None]:
#purpose is to drop entire tool
DROP_NAN_THRESH = 0.15
MIN_GOOD_COLS   = 4

good_mask = {}
tool_good_cols = {}

for tool, g in clean.groupby("Tool"):
    good_cols = []
    for col in g.columns.difference(["Tool"]):
        nan_frac = g[col].isna().mean()
        is_const = g[col].nunique(dropna=True) <= 1
        keep_it  = (nan_frac <= DROP_NAN_THRESH) and (not is_const)
        good_mask[(tool, col)] = keep_it
        if keep_it:
            good_cols.append(col)
    tool_good_cols[tool] = good_cols

drop_tools = [t for t, cols in tool_good_cols.items() if len(cols) < MIN_GOOD_COLS]
clean      = clean[~clean["Tool"].isin(drop_tools)]
print("Dropped entire tools:", drop_tools)

# ── build the final DataFrame with NaNs left in place for sparse cols ────
# (training code will .dropna() on a per-model basis)
final_cols = {col for (t,col), ok in good_mask.items() if ok}
clean      = clean[["Tool"] + list(final_cols)]
print("Columns retained:", final_cols)
print("Shape after prune:", clean.shape)


Dropped entire tools: []
Columns retained: {'Choke-Position', 'Upstream-Pressure', 'Upstream-Temperature', 'Downstream-Upstream-Difference', 'Battery-Voltage', 'Downstream-Pressure', 'Downstream-Temperature'}
Shape after prune: (171659, 8)


In [None]:
nan_matrix = (clean
              .groupby("Tool")
              .apply(lambda g: g.drop(columns="Tool").isna().mean())
              .round(3))

print("\n▶︎ NaN ratio per tool–sensor:")
print(nan_matrix.to_string())


▶︎ NaN ratio per tool–sensor:
param  Choke-Position  Upstream-Pressure  Upstream-Temperature  Downstream-Upstream-Difference  Battery-Voltage  Downstream-Pressure  Downstream-Temperature
Tool                                                                                                                                                        
P8-1            0.000               0.00                   0.0                           0.000            0.000                0.000                   0.000
P8-38           0.001               0.06                   0.0                           0.001            0.611                0.034                   0.001
P8-41           0.000               0.00                   0.0                           0.000            1.000                0.000                   0.000
P8-59           0.000               0.00                   0.0                           0.000            0.000                0.000                   0.000
P8-7            0.000      

  .apply(lambda g: g.drop(columns="Tool").isna().mean())


In [None]:
scalers = {}
for tool, g in clean.groupby("Tool"):
    scalers[tool] = {col: {"mean": g[col].mean(),
                           "std":  g[col].std(ddof=0)}
                     for col in clean.columns.difference(["Tool"])}

with open(meta_dir / "scalers.json", "w") as f:
    json.dump(scalers, f, indent=2)
print("✓ scalers.json written")

✓ scalers.json written


In [None]:
with open("/content/drive/MyDrive/TAQA/clean/meta/scalers.json") as f:
    scalers = json.load(f)

print(f"Tools saved  : {list(scalers.keys())}")
print(f"Sensors saved: {list(next(iter(scalers.values())).keys())}")


Tools saved  : ['P8-1', 'P8-38', 'P8-41', 'P8-59', 'P8-7']
Sensors saved: ['Battery-Voltage', 'Choke-Position', 'Downstream-Pressure', 'Downstream-Temperature', 'Downstream-Upstream-Difference', 'Upstream-Pressure', 'Upstream-Temperature']


In [None]:
tool = "P8-41"
print(f"\nScaler snapshot for {tool}:")
pprint(scalers[tool])


Scaler snapshot for P8-41:
{'Battery-Voltage': {'mean': nan, 'std': nan},
 'Choke-Position': {'mean': 99.79870395296118, 'std': 1.6527570634971624},
 'Downstream-Pressure': {'mean': 2629.7071606687446, 'std': 185.58085196869618},
 'Downstream-Temperature': {'mean': 105.53092235760839,
                            'std': 3.466191861727839},
 'Downstream-Upstream-Difference': {'mean': -3.461282587135166,
                                    'std': 2.2085987293254012},
 'Upstream-Pressure': {'mean': 2626.246881907056, 'std': 186.1521254876871},
 'Upstream-Temperature': {'mean': 109.67286766789458,
                          'std': 3.6286673070297475}}


In [None]:
# ── set where the file will live ───────────────────────────────────────────
BV_META_PATH = meta_dir / "battery_voltage_mad.json"

batt_meta = {}
if "Battery-Voltage" in clean.columns:

    # we already have Battery-Voltage in float dtype; NaNs mark gaps / outages
    for tool, g in clean.groupby("Tool"):

        series = g["Battery-Voltage"]
        nan_ratio = series.isna().mean()

        # skip tools with sparse or absent battery data
        if series.empty or (series.isna().mean() > 0.15):
            print(f"skipping Battery STL for {tool} "
                  f"(nan_ratio = {nan_ratio:.2%})")
            continue

        # 1-minute grid for STL
        s_1m  = series.resample("1min").mean().dropna()
        stl   = STL(s_1m, period=1440, robust=False).fit()
        resid = stl.resid

        mad   = np.median(np.abs(resid - np.median(resid)))   # robust MAD
        batt_meta[tool] = {"mad": mad, "cutoff": 5 * mad}

# write the metadata
with open(BV_META_PATH, "w") as f:
    json.dump(batt_meta, f, indent=2)

print("battery_voltage_mad.json written")


skipping Battery STL for P8-38 (nan_ratio = 61.11%)
skipping Battery STL for P8-41 (nan_ratio = 100.00%)
battery_voltage_mad.json written


In [None]:
from pprint import pprint

print("\nTools included in STL-MAD:")
pprint(batt_meta)


Tools included in STL-MAD:
{'P8-1': {'cutoff': np.float64(0.000975368205211602),
          'mad': np.float64(0.0001950736410423204)},
 'P8-59': {'cutoff': np.float64(5.773159728050814e-14),
           'mad': np.float64(1.1546319456101628e-14)},
 'P8-7': {'cutoff': np.float64(0.9099507245250571),
          'mad': np.float64(0.18199014490501142)}}


In [None]:
# save dataset in Parquet
clean_out = (clean                      # ← dataframe from Step 4
             .set_index("Tool", append=True)   # Timestamp · Tool → swap
             .swaplevel()                      # Tool first, Timestamp second
             .sort_index())                    # tidy ordering

out_path = clean_dir / "wide_tools_flat.parquet"
clean_out.to_parquet(out_path, compression="snappy")

print("✓ wide_tools_flat.parquet written")
print("  file :", out_path)
print("  shape:", clean_out.shape)
print("  columns retained:", list(clean_out.columns))


✓ wide_tools_flat.parquet written
  file : /content/drive/MyDrive/TAQA/clean/wide_tools_flat.parquet
  shape: (171659, 7)
  columns retained: ['Choke-Position', 'Upstream-Pressure', 'Upstream-Temperature', 'Downstream-Upstream-Difference', 'Battery-Voltage', 'Downstream-Pressure', 'Downstream-Temperature']


In [None]:
PQ = Path("/content/drive/MyDrive/TAQA/clean/wide_tools_flat.parquet")
df = pd.read_parquet(PQ)

# 1) basic shape & columns
print("shape :", df.shape)
print("index :", df.index.names)          # ['Tool', 'Timestamp']
print("cols  :", df.columns.tolist())

# 2) per-tool row counts
tool_sizes = df.groupby(level="Tool").size()
print("\nrows per tool:")
print(tool_sizes)

# 3) confirm Battery-Voltage present only where expected
batt_nan = df["Battery-Voltage"].isna().groupby(level="Tool").mean().round(3)
print("\nBattery-Voltage NaN-ratio by tool:")
print(batt_nan)


shape : (171659, 7)
index : ['Tool', 'Timestamp']
cols  : ['Choke-Position', 'Upstream-Pressure', 'Upstream-Temperature', 'Downstream-Upstream-Difference', 'Battery-Voltage', 'Downstream-Pressure', 'Downstream-Temperature']

rows per tool:
Tool
P8-1      17602
P8-38     15883
P8-41     28232
P8-59      6573
P8-7     103369
dtype: int64

Battery-Voltage NaN-ratio by tool:
Tool
P8-1     0.000
P8-38    0.611
P8-41    1.000
P8-59    0.000
P8-7     0.006
Name: Battery-Voltage, dtype: float64
