In [1]:
from pathlib import Path
import json, gzip, shutil, re, glob
import pandas as pd
import numpy as np
from tqdm import tqdm
from statsmodels.tsa.seasonal import STL
from collections import defaultdict
import matplotlib.pyplot as plt
from pprint import pprint

In [2]:
# Locations on the mounted drive: raw CSV exports in, cleaned artefacts out.
raw_dir = Path('/content/drive/MyDrive/TAQA/raw')
clean_dir = Path('/content/drive/MyDrive/TAQA/clean')
meta_dir = Path('/content/drive/MyDrive/TAQA/clean/meta')  # contains per tool scaler

# Every raw CSV, in a deterministic (sorted) order.
csv_paths = sorted(raw_dir.glob('*.csv'))

# Create the output folders up front; an already existing folder is fine.
clean_dir.mkdir(parents=True, exist_ok=True)
meta_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Read every raw CSV into long format (Timestamp, Value, Tool, Kind, param),
# then pivot to one row per (Tool, Timestamp) with one column per parameter.
if not csv_paths:
  # pd.concat([]) would fail with a generic message; say what is actually wrong.
  raise ValueError("csv_paths is empty - no raw CSV files to read")

frames = []
for file in tqdm(csv_paths, desc="Reading"):
  # The file stem must have exactly four dot-separated parts:
  # <ignored prefix>.<tool>.<kind>.<param>
  parts = file.stem.split('.')
  if len(parts) != 4:
    raise ValueError(
        f"{file.name}: expected 4 dot-separated name parts "
        f"(<prefix>.<tool>.<kind>.<param>), got {len(parts)}"
    )
  _, tool, kind, param = parts

  df = pd.read_csv(file, parse_dates=['Timestamp'])

  # The reading is taken from the second column, whatever its header is.
  value_col = df.columns[1]
  df = df.rename(columns={value_col: 'Value'})[['Timestamp', 'Value']]

  df['Tool'] = tool
  df['Kind'] = kind
  df['param'] = param
  frames.append(df)

raw = pd.concat(frames, ignore_index=True)

# pivot_table aggregates with its default (mean), so repeated readings for the
# same (Tool, Timestamp, param) are averaged into a single cell.
# reset_index turns the (Tool, Timestamp) index levels back into columns.
wide = (
    raw.pivot_table(index=['Tool', 'Timestamp'], columns='param', values='Value')
       .reset_index()
)

# Put Timestamp and Tool first, parameters after them.
cols = ['Timestamp', 'Tool'] + [c for c in wide.columns if c not in ('Timestamp', 'Tool')]
wide = wide[cols]
print(wide.shape)
wide.head()

Reading: 100%|██████████| 54/54 [02:59<00:00,  3.32s/it]


(1646521, 11)


param,Timestamp,Tool,Battery-Voltage,Choke-Position,Downstream-Pressure,Downstream-Temperature,Downstream-Upstream-Difference,Target-Position,Tool-State,Upstream-Pressure,Upstream-Temperature
0,2025-02-17 09:12:56.373838+00:00,P8-1,14.14728,100.1176,15.36703,14.12418,-4.313263,100.0,2.0,19.6803,14.20404
1,2025-02-17 09:12:57.384841500+00:00,P8-1,14.14728,100.1176,15.36871,14.12436,-4.12323,100.0,2.0,19.49194,14.20425
2,2025-02-17 09:12:58.383848+00:00,P8-1,14.15137,100.1176,15.36885,14.12439,-3.998016,100.0,2.0,19.36687,14.20432
3,2025-02-17 09:12:59.397851+00:00,P8-1,14.14728,100.147,15.36862,14.12445,-3.916,100.0,2.0,19.28462,14.20441
4,2025-02-17 09:13:00.396942400+00:00,P8-1,14.14319,100.1176,15.36815,14.12457,-3.86116,100.0,2.0,19.22931,14.2045


In [4]:
# Parse the timestamps as timezone-aware UTC, then make them the
# chronologically sorted index of the frame.
wide = (
    wide.assign(Timestamp=pd.to_datetime(wide["Timestamp"], utc=True))
        .set_index("Timestamp")
        .sort_index()
)

# Keep an independent copy of the frame as it looks at this stage.
orig_wide = wide.copy()
wide

param,Tool,Battery-Voltage,Choke-Position,Downstream-Pressure,Downstream-Temperature,Downstream-Upstream-Difference,Target-Position,Tool-State,Upstream-Pressure,Upstream-Temperature
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-09-20 09:17:54.336897+00:00,P8-38,10.18000,99.670000,6.90000,20.140000,8.250000,100.0,1.0,15.150000,20.150000
2024-09-20 09:17:55.356612500+00:00,P8-38,10.18000,99.670000,6.91000,20.140000,8.240000,100.0,1.0,15.150000,20.150000
2024-09-20 09:17:56.359871100+00:00,P8-38,10.18000,99.670000,6.93000,20.140000,8.220000,100.0,1.0,15.150000,20.150000
2024-09-20 09:17:57.363915300+00:00,P8-38,10.18000,99.670000,6.94000,20.140000,8.210000,100.0,1.0,15.150000,20.150000
2024-09-20 09:17:58.384392800+00:00,P8-38,10.18000,99.670000,6.96000,20.140000,8.190000,100.0,1.0,15.150000,20.150000
...,...,...,...,...,...,...,...,...,...,...
2025-04-24 23:35:23.629452900+00:00,P8-38,14.93687,-0.675674,14.48557,7.410736,6.845551,0.0,0.0,7.640015,7.273041
2025-04-24 23:35:24.636833500+00:00,P8-38,14.93687,-0.702698,14.48550,7.410736,6.845612,0.0,0.0,7.639893,7.272949
2025-04-24 23:35:25.637804400+00:00,P8-38,14.94096,-0.702698,14.48546,7.410675,6.845963,0.0,0.0,7.639496,7.272949
2025-04-24 23:35:26.644175600+00:00,P8-38,14.94096,-0.702698,14.48558,7.410614,6.846359,0.0,0.0,7.639221,7.272949


In [5]:
# Show the dtype of every column of the pivoted frame.
wide.dtypes

Unnamed: 0_level_0,0
param,Unnamed: 1_level_1
Tool,object
Battery-Voltage,float64
Choke-Position,float64
Downstream-Pressure,float64
Downstream-Temperature,float64
Downstream-Upstream-Difference,float64
Target-Position,float64
Tool-State,float64
Upstream-Pressure,float64
Upstream-Temperature,float64


In [6]:
# A row is identified by its tool and its timestamp.
row_key = ["Tool", "Timestamp"]

# Bring Timestamp back as a column, keep the first row per key, order the
# rows tool by tool and chronologically, then index by Timestamp again.
wide = (
    wide.reset_index()
        .drop_duplicates(subset=row_key)
        .sort_values(by=row_key)
        .set_index("Timestamp")
)

print("rows :", len(wide))
wide.head()

rows : 1646521


param,Tool,Battery-Voltage,Choke-Position,Downstream-Pressure,Downstream-Temperature,Downstream-Upstream-Difference,Target-Position,Tool-State,Upstream-Pressure,Upstream-Temperature
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2025-02-17 09:12:56.373838+00:00,P8-1,14.14728,100.1176,15.36703,14.12418,-4.313263,100.0,2.0,19.6803,14.20404
2025-02-17 09:12:57.384841500+00:00,P8-1,14.14728,100.1176,15.36871,14.12436,-4.12323,100.0,2.0,19.49194,14.20425
2025-02-17 09:12:58.383848+00:00,P8-1,14.15137,100.1176,15.36885,14.12439,-3.998016,100.0,2.0,19.36687,14.20432
2025-02-17 09:12:59.397851+00:00,P8-1,14.14728,100.147,15.36862,14.12445,-3.916,100.0,2.0,19.28462,14.20441
2025-02-17 09:13:00.396942400+00:00,P8-1,14.14319,100.1176,15.36815,14.12457,-3.86116,100.0,2.0,19.22931,14.2045


In [7]:
from pandas import NA
CHOKE_OPEN_THRESHOLD = 10.0   # % open

# 1 where the choke position is strictly above the threshold, otherwise 0
# (a missing position compares False and therefore also yields 0).
wide["IsOpen"] = wide["Choke-Position"].gt(CHOKE_OPEN_THRESHOLD).astype(int)

# Temperature difference: upstream minus downstream.
wide["DeltaTemperature"] = wide["Upstream-Temperature"].sub(
    wide["Downstream-Temperature"]
)

# Round the state codes to whole numbers and store them as nullable Int16,
# so missing states stay <NA> instead of forcing a float column.
tool_state_rounded = wide["Tool-State"].round()
wide["Tool-State"] = tool_state_rounded.astype("Int16")

# float32 copy of the state code; <NA> becomes NaN here.
wide["ToolStateNum"] = wide["Tool-State"].astype(np.float32)

state_codes = wide["Tool-State"].dropna().unique()
print("ToolState codes:", state_codes)

wide.head()

ToolState codes: <IntegerArray>
[2, 1, 4, 5, 9, 0, 6, 10, 3, 11, 12, -1, 15969, 7, 8, 7680]
Length: 16, dtype: Int16


param,Tool,Battery-Voltage,Choke-Position,Downstream-Pressure,Downstream-Temperature,Downstream-Upstream-Difference,Target-Position,Tool-State,Upstream-Pressure,Upstream-Temperature,IsOpen,DeltaTemperature,ToolStateNum
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2025-02-17 09:12:56.373838+00:00,P8-1,14.14728,100.1176,15.36703,14.12418,-4.313263,100.0,2,19.6803,14.20404,1,0.07986,2.0
2025-02-17 09:12:57.384841500+00:00,P8-1,14.14728,100.1176,15.36871,14.12436,-4.12323,100.0,2,19.49194,14.20425,1,0.07989,2.0
2025-02-17 09:12:58.383848+00:00,P8-1,14.15137,100.1176,15.36885,14.12439,-3.998016,100.0,2,19.36687,14.20432,1,0.07993,2.0
2025-02-17 09:12:59.397851+00:00,P8-1,14.14728,100.147,15.36862,14.12445,-3.916,100.0,2,19.28462,14.20441,1,0.07996,2.0
2025-02-17 09:13:00.396942400+00:00,P8-1,14.14319,100.1176,15.36815,14.12457,-3.86116,100.0,2,19.22931,14.2045,1,0.07993,2.0


In [8]:
def rule_flat_line(s, window=5):
    """Flag rows that complete a run of `window` identical consecutive readings.

    Readings are compared after rounding to 6 decimals.

    Parameters
    ----------
    s : pandas.Series
        Numeric readings in the order they should be scanned.
    window : int, default 5
        Number of consecutive readings that must be equal.

    Returns
    -------
    pandas.Series of bool
        Same index as `s`. True where the current reading and the
        `window - 1` readings before it are all equal after rounding.
        The first `window - 1` rows, and any window containing a NaN,
        are False.
    """
    rounded = s.round(6)
    trailing = rounded.rolling(window)
    # All values in a window are equal exactly when its max equals its min.
    # Incomplete windows and windows with a NaN give NaN on both sides, and
    # NaN == NaN is False, which matches "not flat".
    # Vectorised replacement for a per-window Python callback.
    return trailing.max() == trailing.min()

def rule_zero(s):
    """Return an element-wise mask that is True where `s` equals 0."""
    is_zero = s == 0
    return is_zero
def rule_high(s, thr):
    """Return an element-wise mask that is True where `s` is at or above `thr`."""
    at_or_above = s >= thr
    return at_or_above

# apply rule catalogue
# Pressure readings at or above this value are flagged as "high".
PRESSURE_HIGH_THRESHOLD = 12_000


def _flat_line_per_tool(column):
    """Evaluate rule_flat_line on `column` separately within each tool.

    The frame holds the rows of all tools one after another, so a rolling
    window over the whole column would mix the last readings of one tool
    with the first readings of the next.
    """
    return (
        wide.groupby("Tool", sort=False)[column]
            .transform(rule_flat_line)
            .astype(bool)
    )


rules = {
    "FlatLine_DownP" : _flat_line_per_tool("Downstream-Pressure"),
    "Zero_DownP"     : rule_zero(wide["Downstream-Pressure"]),
    "High_DownP"     : rule_high(wide["Downstream-Pressure"], PRESSURE_HIGH_THRESHOLD),

    "FlatLine_UpP"   : _flat_line_per_tool("Upstream-Pressure"),
    "Zero_UpP"       : rule_zero(wide["Upstream-Pressure"]),
    "High_UpP"       : rule_high(wide["Upstream-Pressure"], PRESSURE_HIGH_THRESHOLD),

    "Zero_Batt"      : rule_zero(wide["Battery-Voltage"]),
}

# Combine positionally (every rule has one value per row of `wide`), so
# repeated timestamps across tools cannot disturb index alignment.
rule_flags = pd.DataFrame(
    {name: flags.to_numpy() for name, flags in rules.items()},
    index=wide.index,
)
rule_alert = rule_flags.any(axis=1)                  # OR of all rules
wide["RuleAlert"] = rule_alert
print("rows flagged by rule layer:", wide["RuleAlert"].mean()*100, "%")

rows flagged by rule layer: 21.58915677358503 %


In [9]:
# Forward-fill very short gaps (at most MAX_FFILL_GAP consecutive samples)
# inside each tool to keep continuity.
# GroupBy.ffill fills per group without handing the grouping column to a
# user function (groupby(...).apply(...) did, and pandas warns about that).
# Its result leaves out the grouping column, so "Tool" is put back as the
# first column afterwards.
# NOTE(review): IsOpen and RuleAlert were derived before this fill, so on
# filled rows they still reflect the pre-fill (missing) readings - confirm
# that this is intended.
MAX_FFILL_GAP = 2
tool_labels = wide["Tool"].to_numpy()
wide = wide.groupby("Tool").ffill(limit=MAX_FFILL_GAP)
wide.insert(0, "Tool", tool_labels)

# after ffill, drop rows with remaining NaNs in core signals
CORE_COLS = ["Battery-Voltage", "Choke-Position",
             "Upstream-Pressure", "Downstream-Pressure",
             "Upstream-Temperature", "Downstream-Temperature", "DeltaTemperature","Target-Position","Tool-State"]

wide = wide.dropna(subset=CORE_COLS)

# Rows not flagged by the rule layer, as an independent frame.
ml_df = wide.loc[~wide["RuleAlert"]].copy()
print("rows after NaN clean:", len(wide))
print("rows after Alert clean:", len(ml_df))

  wide = wide.groupby("Tool").apply(


rows after NaN clean: 1643736
rows after Alert clean: 1288266


In [10]:
# Persist both frames: the NaN-cleaned frame and the subset without rule alerts.
for frame, destination in (
    (wide, "/content/drive/MyDrive/TAQA/clean/wide36_tools_full.parquet"),
    (ml_df, "/content/drive/MyDrive/TAQA/clean/wide36_tools_flat.parquet"),
):
    frame.to_parquet(destination)

Note: this notebook is current only up to this point.