In [1]:
import pandas as pd

In [2]:
# Load the raw TOI table export. The first 69 lines of the file are skipped
# before parsing (presumably the export's comment preamble -- confirm if the
# file is ever re-downloaded, since that count may change).
toi_export_path = "../data/TOI_2025.10.04_08.44.51.csv"
tess_data = pd.read_csv(toi_export_path, skiprows=69)


In [3]:
# Persist only the target id and disposition columns as the label table
# (without the DataFrame index).
label_columns = ["tid", "tfopwg_disp"]
tess_data.loc[:, label_columns].to_csv("../data/tess_labels.csv", index=False)

In [4]:
# Read the label table back from disk and preview its first five rows.
labels_csv_path = "../data/tess_labels.csv"
tess_labels = pd.read_csv(labels_csv_path)
tess_labels.head(n=5)

Unnamed: 0,tid,tfopwg_disp
0,50365310,FP
1,88863718,PC
2,124709665,FP
3,106997505,FP
4,238597883,FP


In [7]:
import os
import pandas as pd
import numpy as np

# Build a fixed-width table of time series: one row per entry of the labels
# file whose "<tid>.csv" exists in `timeseries_folder`, holding the last
# N_POINTS samples of that file (left-padded with NaN when shorter) plus the
# entry's `tfopwg_disp` label. The result is kept in `final_df` and written
# to `output_csv`.

# Paths
labels_path = "../data/tess_labels.csv"   # or your main labels file
timeseries_folder = "../data/data"        # folder with time series files
output_folder = "../data/processed_tess"

# Number of trailing samples kept per time series; also the number of
# p1..pN feature columns in the output.
N_POINTS = 20000
output_csv = os.path.join(output_folder, f"tess_data_{N_POINTS}.csv")

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Read label file
labels_df = pd.read_csv(labels_path)

# Samples and labels are collected separately. Appending the string label to
# the numeric array (np.append) would promote the whole row to a string
# dtype, turning every sample into text.
data_list = []    # one float array of length N_POINTS per kept entry
label_list = []   # label for the row at the same position in data_list
not_found_count = 0
for tid, label in zip(labels_df["tid"], labels_df["tfopwg_disp"]):
    file_path = os.path.join(timeseries_folder, f"{tid}.csv")

    # Skip entries without a time series file, keeping a running count.
    # The "\r" makes each update overwrite the previous one on the same line.
    if not os.path.exists(file_path):
        not_found_count += 1
        print(f"⚠️ Not found count: {not_found_count}", end="\r")
        continue

    # Read the time series file and flatten it row by row into a 1-D float
    # array (for a single-column file this is just that column). Converting
    # to float up front guarantees the array can hold the NaN padding below,
    # even when the file parses as integers.
    ts = pd.read_csv(file_path)
    try:
        values = ts.to_numpy(dtype=float).ravel()
    except (TypeError, ValueError) as exc:
        raise ValueError(f"Non-numeric time series data in {file_path}") from exc

    # Take the last N_POINTS samples, left-padding with NaN if shorter
    last_points = values[-N_POINTS:]
    n_missing = N_POINTS - len(last_points)
    if n_missing > 0:
        last_points = np.pad(last_points, (n_missing, 0), constant_values=np.nan)

    data_list.append(last_points)
    label_list.append(label)

# Convert to DataFrame: numeric feature columns p1..pN followed by the label.
# The explicit empty fallback keeps the expected columns even when no
# time series file was found.
point_columns = [f"p{i+1}" for i in range(N_POINTS)]
points = np.vstack(data_list) if data_list else np.empty((0, N_POINTS))
final_df = pd.DataFrame(points, columns=point_columns)
final_df["label"] = label_list

# Save to CSV
final_df.to_csv(output_csv, index=False)

print(f"✅ Saved processed data to: {output_csv}")
print(f"Shape: {final_df.shape}")


✅ Saved processed data to: ../data/processed_tess/tess_data_20000.csv
Shape: (4756, 20001)


In [8]:
# Class balance: number of rows per label value, most frequent first.
final_df.loc[:, "label"].value_counts()

label
PC     2733
FP      787
CP      486
KP      391
APC     298
FA       61
Name: count, dtype: int64

In [9]:
# Binary encoding of the labels that are kept: FP and FA become 0,
# CP and KP become 1. Rows with any other label (e.g. PC, APC) are dropped.
label_map = {"FP": 0, "FA": 0, "CP": 1, "KP": 1}
valid_labels = list(label_map)

# Select the rows to keep, then replace the string label with its integer
# class; `assign` returns a new frame, so `final_df` is left untouched.
keep_mask = final_df["label"].isin(valid_labels)
filtered_df = final_df.loc[keep_mask].assign(
    label=lambda frame: frame["label"].map(label_map).astype(int)
)

# Write the filtered table next to the unfiltered one.
output_csv_filtered = os.path.join(output_folder, "tess_data_20000_filtered.csv")
filtered_df.to_csv(output_csv_filtered, index=False)

# Report where the file went, its shape, and the resulting class balance.
print(f"✅ Saved filtered binary data to: {output_csv_filtered}")
print(f"Shape: {filtered_df.shape}")
print(filtered_df["label"].value_counts())


✅ Saved filtered binary data to: ../data/processed_tess/tess_data_20000_filtered.csv
Shape: (1725, 20001)
label
1    877
0    848
Name: count, dtype: int64
