In [1]:
import pandas as pd
import json
import os

In [2]:
# Date bounds (YYYY-MM-DD strings) for the train / validation / test splits,
# written as (from, to) pairs.
train_date_from, train_date_to = "2024-05-18", "2024-06-30"
val_date_from, val_date_to = "2024-07-01", "2024-07-21"
test_date_from, test_date_to = "2024-11-10", "2024-11-21"

In [14]:
# Input file and output locations used throughout the notebook.
base_file = "dataset/new_dataset.csv"
setup_folder = "dataset/setup"
cleaned_folder = "dataset/cleaned"

# One sub-folder of `cleaned_folder` per split, plus one for the per-day test files.
(
    cleaned_train_folder,
    cleaned_val_folder,
    cleaned_test_folder,
    cleaned_test_per_day_folder,
) = (f"{cleaned_folder}/{split}" for split in ("train", "val", "test", "test_per_day"))

In [4]:
# Create every output directory up front; exist_ok=True keeps the cell safe to re-run.
for output_dir in (
    setup_folder,
    cleaned_folder,
    cleaned_train_folder,
    cleaned_val_folder,
    cleaned_test_folder,
    cleaned_test_per_day_folder,
):
    os.makedirs(output_dir, exist_ok=True)

In [15]:
# Load the raw readings, parse `created_at` as timezone-aware UTC timestamps,
# order the rows by time (then sensor name) and add a calendar-date column
# derived from the UTC timestamp.
base_dataset = (
    pd.read_csv(base_file)
    .assign(
        created_at=lambda frame: pd.to_datetime(
            frame["created_at"], format="ISO8601", utc=True
        )
    )
    .sort_values(by=["created_at", "name"])
    .assign(date=lambda frame: frame["created_at"].dt.date)
)

print("Dataset Columns", base_dataset.columns)

Dataset Columns Index(['id', 'created_at', 'updated_at', 'deleted_at', 'name', 'voltage',
       'current', 'power', 'power_factor', 'frequency', 'energy',
       'apparent_power', 'reactive_power', 'date'],
      dtype='object')


In [6]:
def save_json(data, path):
    """Write ``data`` to ``path`` as JSON indented by four spaces.

    The file is opened in write mode, so an existing file at ``path`` is
    overwritten. A confirmation line is printed once the file is closed.
    """
    with open(path, "w") as handle:
        json.dump(data, handle, indent=4)

    confirmation = f"Files saved successfully: {path}"
    print(confirmation)


def save_csv(data, path):
    """Write the DataFrame ``data`` to ``path`` as CSV without the index column.

    A confirmation line is printed after the file has been written.
    """
    data.to_csv(path_or_buf=path, index=False)

    confirmation = f"Files saved successfully: {path}"
    print(confirmation)

In [7]:
# Per-day, per-sensor summary: number of readings, mean sampling interval in
# seconds, and an energy estimate in kWh. The result is written to
# dataset/daily_counts_by_sensor.json as a list of
# {"date": ..., "result": [{"name", "count", "interval", "kWh"}, ...]} entries.

# Kept so the flat count table stays available for inspection in later cells.
daily_counts = base_dataset.groupby(["date", "name"]).size().reset_index(name="count")
daily_counts["date"] = daily_counts["date"].astype(str)

structured_result = {}

# Iterate the groups directly instead of re-filtering the whole dataset for
# every (date, sensor) pair. Groups come out sorted by (date, name), which is
# the same order the rows of `daily_counts` have.
for (day, sensor_name), sensor_data in base_dataset.groupby(["date", "name"]):
    date = str(day)
    sensor_data = sensor_data.sort_values(by="created_at")

    # Mean gap between consecutive readings, in seconds. A group with a single
    # reading has no gaps, so the mean is NaN (json.dump then writes `NaN`).
    time_diff = sensor_data["created_at"].diff().dt.total_seconds()
    average_interval = float(time_diff.mean())

    # Every reading is weighted by the same mean interval (converted to hours);
    # dividing by 1000 assumes `power` is in watts -- TODO confirm the unit.
    # skipna=False keeps the NaN propagation of a plain running sum: one
    # missing power value, or a NaN interval, makes the whole total NaN.
    interval_hours = average_interval / 3600
    total_energy_kWh = float(
        (sensor_data["power"] * interval_hours / 1000).sum(skipna=False)
    )

    day_entry = structured_result.setdefault(date, {"date": date, "result": []})
    day_entry["result"].append(
        {
            "name": sensor_name,
            "count": len(sensor_data),
            "interval": average_interval,
            "kWh": total_energy_kWh,
        }
    )

final_result = list(structured_result.values())
save_json(final_result, "dataset/daily_counts_by_sensor.json")

Files saved successfully: dataset/daily_counts_by_sensor.json


In [8]:
final_result_df = pd.DataFrame(final_result)

train_result = []
val_result = []
test_result = []

# (first day, last day, destination list) for each split. Dates are ISO
# YYYY-MM-DD strings, so plain string comparison orders them chronologically.
split_windows = (
    (train_date_from, train_date_to, train_result),
    (val_date_from, val_date_to, val_result),
    (test_date_from, test_date_to, test_result),
)

for date_entry in final_result:
    date = date_entry["date"]
    # First matching window wins; entries outside every window are dropped.
    for window_start, window_end, destination in split_windows:
        if window_start <= date <= window_end:
            destination.append(date_entry)
            break

for split_name, split_entries in (
    ("train", train_result),
    ("val", val_result),
    ("test", test_result),
):
    save_json(split_entries, f"{setup_folder}/{split_name}.json")

Files saved successfully: dataset/setup/train.json
Files saved successfully: dataset/setup/val.json
Files saved successfully: dataset/setup/test.json


In [9]:
def get_date_ranges(json_data):
    """Return the ``"date"`` field of each entry, parsed with ``pd.to_datetime``.

    ``json_data`` is an iterable of mappings that each have a ``"date"`` key;
    the result is a list with one parsed value per entry, in input order.
    """
    parsed_dates = []
    for entry in json_data:
        parsed_dates.append(pd.to_datetime(entry["date"]))
    return parsed_dates


# Parsed day lists for each split, taken from the per-day summaries.
train_dates, val_dates, test_dates = (
    get_date_ranges(split_entries)
    for split_entries in (train_result, val_result, test_result)
)

for split_dates in (train_dates, val_dates, test_dates):
    print(split_dates)

[Timestamp('2024-05-18 00:00:00'), Timestamp('2024-05-19 00:00:00'), Timestamp('2024-05-20 00:00:00'), Timestamp('2024-05-21 00:00:00'), Timestamp('2024-05-22 00:00:00'), Timestamp('2024-05-23 00:00:00'), Timestamp('2024-05-24 00:00:00'), Timestamp('2024-05-25 00:00:00'), Timestamp('2024-05-26 00:00:00'), Timestamp('2024-05-27 00:00:00'), Timestamp('2024-05-28 00:00:00'), Timestamp('2024-05-29 00:00:00'), Timestamp('2024-05-30 00:00:00'), Timestamp('2024-05-31 00:00:00'), Timestamp('2024-06-01 00:00:00'), Timestamp('2024-06-02 00:00:00'), Timestamp('2024-06-03 00:00:00'), Timestamp('2024-06-04 00:00:00'), Timestamp('2024-06-05 00:00:00'), Timestamp('2024-06-06 00:00:00'), Timestamp('2024-06-07 00:00:00'), Timestamp('2024-06-08 00:00:00'), Timestamp('2024-06-09 00:00:00'), Timestamp('2024-06-10 00:00:00'), Timestamp('2024-06-11 00:00:00'), Timestamp('2024-06-12 00:00:00'), Timestamp('2024-06-13 00:00:00'), Timestamp('2024-06-14 00:00:00'), Timestamp('2024-06-15 00:00:00'), Timestamp('20

In [10]:
def save_sensor_data_to_csv(sensor, filter, file_type="train"):
    """Write one sensor's rows for the given days to a CSV file.

    Parameters:
        sensor: value matched against the ``name`` column of ``base_dataset``.
        filter: iterable of days to keep. Elements may be pandas Timestamps,
            ``datetime.date`` objects, or anything else ``pd.to_datetime``
            accepts. (The name shadows the builtin but is kept for callers.)
        file_type: split name; used both as the sub-folder of
            ``cleaned_folder`` and as the file-name suffix.

    The file is written to ``<cleaned_folder>/<file_type>/<sensor>_<file_type>.csv``
    via ``save_csv``.
    """
    # Normalise through pd.to_datetime rather than calling `.date()` directly:
    # plain `datetime.date` objects have no `.date()` method, so a list of
    # dates (which `test_dates` becomes once it is rebound later in the
    # notebook) would otherwise raise AttributeError.
    wanted_days = [pd.to_datetime(day).date() for day in filter]

    on_wanted_day = base_dataset["created_at"].dt.date.isin(wanted_days)
    is_sensor = base_dataset["name"] == sensor
    filtered = base_dataset[on_wanted_day & is_sensor]

    file_path = os.path.join(cleaned_folder, file_type, f"{sensor}_{file_type}.csv")
    save_csv(filtered, file_path)

In [11]:
sensors = base_dataset["name"].unique()

# Export one CSV per sensor and split, skipping missing sensor names
# (real NaN values, non-strings, and the literal text "nan").
for sensor in sensors:
    if pd.isna(sensor) or not isinstance(sensor, str) or sensor.strip() == "nan":
        continue

    for split_dates, split_name in (
        (train_dates, "train"),
        (val_dates, "val"),
        (test_dates, "test"),
    ):
        save_sensor_data_to_csv(sensor, split_dates, split_name)

Files saved successfully: dataset/cleaned\train\Sensor 1_train.csv
Files saved successfully: dataset/cleaned\val\Sensor 1_val.csv
Files saved successfully: dataset/cleaned\test\Sensor 1_test.csv
Files saved successfully: dataset/cleaned\train\Sensor 2_train.csv
Files saved successfully: dataset/cleaned\val\Sensor 2_val.csv
Files saved successfully: dataset/cleaned\test\Sensor 2_test.csv
Files saved successfully: dataset/cleaned\train\Sensor 3_train.csv
Files saved successfully: dataset/cleaned\val\Sensor 3_val.csv
Files saved successfully: dataset/cleaned\test\Sensor 3_test.csv


In [12]:
def save_sensor_data_per_day_to_csv(sensor, date, data, folder):
    """Save ``data`` as ``<sensor>_<date>.csv`` inside ``folder``.

    ``sensor`` and ``date`` are only interpolated into the file name; the
    actual writing is delegated to ``save_csv``.
    """
    destination = os.path.join(folder, f"{sensor}_{date}.csv")
    save_csv(data, destination)

In [13]:
# NOTE: this rebinds `test_dates` from pandas Timestamps to plain
# `datetime.date` objects. Going through pd.to_datetime makes the conversion
# safe to re-run on its own output.
test_dates = [pd.to_datetime(date).date() for date in test_dates]

# Calendar day of every row, computed once instead of once per test day.
row_dates = base_dataset["created_at"].dt.date

# Write one CSV per (test day, sensor) into the per-day test folder.
for date in test_dates:
    date_str = date.strftime("%Y-%m-%d")
    date_data = base_dataset[row_dates == date]
    sensors_for_date = date_data["name"].unique()
    for sensor in sensors_for_date:
        # Same missing-name guard as the per-split export: without it a NaN
        # name matches no rows and produces an empty `nan_<date>.csv`.
        if pd.isna(sensor) or not isinstance(sensor, str) or sensor.strip() == "nan":
            continue
        sensor_data = date_data[date_data["name"] == sensor]
        save_sensor_data_per_day_to_csv(
            sensor, date_str, sensor_data, cleaned_test_per_day_folder
        )

Files saved successfully: dataset/cleaned/test_per_day\Sensor 1_2024-11-10.csv
Files saved successfully: dataset/cleaned/test_per_day\Sensor 2_2024-11-10.csv
Files saved successfully: dataset/cleaned/test_per_day\Sensor 3_2024-11-10.csv
Files saved successfully: dataset/cleaned/test_per_day\Sensor 1_2024-11-11.csv
Files saved successfully: dataset/cleaned/test_per_day\Sensor 2_2024-11-11.csv
Files saved successfully: dataset/cleaned/test_per_day\Sensor 3_2024-11-11.csv
Files saved successfully: dataset/cleaned/test_per_day\Sensor 1_2024-11-12.csv
Files saved successfully: dataset/cleaned/test_per_day\Sensor 2_2024-11-12.csv
Files saved successfully: dataset/cleaned/test_per_day\Sensor 3_2024-11-12.csv
Files saved successfully: dataset/cleaned/test_per_day\Sensor 1_2024-11-13.csv
Files saved successfully: dataset/cleaned/test_per_day\Sensor 2_2024-11-13.csv
Files saved successfully: dataset/cleaned/test_per_day\Sensor 3_2024-11-13.csv
Files saved successfully: dataset/cleaned/test_per_d