# 0.Data Ingestion

This notebook is responsible for loading, standardizing, and validating
all datasets used in the project.

Datasets:
- Phase 1 (Training): Jan 2016 – Sep 2020
- Phase 2 (Validation): Jan 2016 – Oct 2020 (forecast window: Oct 2020)
- Test (Final Test): Jan 2016 – Nov 2020 (forecast window: Nov 2020)

Important:
- No feature engineering is performed here.
- No modeling is performed here.
- Target values (`value`) are preserved but not used.

In [1]:
import pandas as pd
from datetime import timedelta

def parse_tsf(path, freq_minutes=15):
    records = []
    in_data_section = False

    with open(path, "r") as file:
        for line in file:
            line = line.strip()

            # Skip empty lines
            if not line:
                continue

            # Detect start of data section (robust)
            if line.lower().startswith("@data"):
                in_data_section = True
                continue

            # Skip metadata before data section
            if not in_data_section:
                continue

            # Ignore any accidental metadata lines
            if line.startswith("@"):
                continue

            # Split only on first colon
            try:
                series_id, rest = line.split(":", 1)
            except ValueError:
                continue

            values = rest.split(",")

            # Parse start timestamp
            start_time = pd.to_datetime(values[0], errors="coerce")
            if pd.isna(start_time):
                continue

            # Parse values
            for i, v in enumerate(values[1:]):
                try:
                    val = float(v)
                except ValueError:
                    continue

                records.append({
                    "series_id": series_id.strip(),
                    "timestamp": start_time + timedelta(minutes=freq_minutes * i),
                    "value": val
                })

    return pd.DataFrame(records)

def normalize_timestamp(df):
    df["timestamp"] = pd.to_datetime(
        df["timestamp"],
        utc=True,
        errors="coerce"
    ).dt.tz_localize(None)
    return df

phase1 = parse_tsf("../data/raw/phase_1_data.tsf")
phase2 = parse_tsf("../data/raw/phase_2_data.tsf")
test   = parse_tsf("../data/raw/final_test_data/nov_data.tsf")

# Normalize timestamps safely
phase1 = normalize_timestamp(phase1)
phase2 = normalize_timestamp(phase2)
test   = normalize_timestamp(test)

# Save parsed datasets
phase1.to_csv("../data/processed/phase1.csv", index=False)
phase2.to_csv("../data/processed/phase2.csv", index=False)
test.to_csv("../data/processed/test.csv", index=False)

In [2]:
print(phase1.head())
print(phase1.tail())
print(phase1.info())

   series_id           timestamp   value
0  Building3 2016-03-02 02:01:00  1321.0
1  Building3 2016-03-02 02:16:00  1321.0
2  Building3 2016-03-02 02:31:00  1321.0
3  Building3 2016-03-02 02:46:00  1293.0
4  Building3 2016-03-02 03:01:00  1293.0
       series_id           timestamp  value
473652    Solar5 2020-09-30 22:30:00  25.28
473653    Solar5 2020-09-30 22:45:00  25.30
473654    Solar5 2020-09-30 23:00:00  28.62
473655    Solar5 2020-09-30 23:15:00  31.94
473656    Solar5 2020-09-30 23:30:00  32.92
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473657 entries, 0 to 473656
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   series_id  473657 non-null  object        
 1   timestamp  473657 non-null  datetime64[ns]
 2   value      473657 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 10.8+ MB
None


In [3]:
print(phase2.head())
print(phase2.tail())
print(phase2.info())

   series_id           timestamp   value
0  Building3 2016-03-02 02:01:00  1321.0
1  Building3 2016-03-02 02:16:00  1321.0
2  Building3 2016-03-02 02:31:00  1321.0
3  Building3 2016-03-02 02:46:00  1293.0
4  Building3 2016-03-02 03:01:00  1293.0
       series_id           timestamp  value
496308    Solar5 2020-10-31 22:30:00  13.47
496309    Solar5 2020-10-31 22:45:00  15.93
496310    Solar5 2020-10-31 23:00:00  17.11
496311    Solar5 2020-10-31 23:15:00  18.29
496312    Solar5 2020-10-31 23:30:00  20.64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496313 entries, 0 to 496312
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   series_id  496313 non-null  object        
 1   timestamp  496313 non-null  datetime64[ns]
 2   value      496313 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 11.4+ MB
None


In [4]:
print(test.head())
print(test.tail())
print(test.info())

   series_id           timestamp   value
0  Building3 2016-03-02 02:01:00  1321.0
1  Building3 2016-03-02 02:16:00  1321.0
2  Building3 2016-03-02 02:31:00  1321.0
3  Building3 2016-03-02 02:46:00  1293.0
4  Building3 2016-03-02 03:01:00  1293.0
       series_id           timestamp  value
519273    Solar5 2020-11-30 22:30:00   9.86
519274    Solar5 2020-11-30 22:45:00   6.22
519275    Solar5 2020-11-30 23:00:00   7.14
519276    Solar5 2020-11-30 23:15:00   5.64
519277    Solar5 2020-11-30 23:30:00   4.45
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 519278 entries, 0 to 519277
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   series_id  519278 non-null  object        
 1   timestamp  519278 non-null  datetime64[ns]
 2   value      519278 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 11.9+ MB
None
