In [1]:
import pandas as pd

# Datei laden (z. B. B4S)
df_b4s = pd.read_csv("EDM_B4S.csv", sep=";")
df_b4s = df_b4s.rename(columns={df_b4s.columns[0]: "time"})
df_b4s["time"] = pd.to_datetime(df_b4s["time"])
df_b4s = df_b4s.set_index("time")
df_b4s.head()  # Ausgabe der ersten Zeilen des DataFrames  

 

Unnamed: 0_level_0,1-1:1290*255,1-1:2290*255,1-1:5290*255,1-1:6290*255,1-1:7290*255,1-1:8290*255,Nettolast_P_kW,Nettolast_Q_kvar
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-03-02 23:15:00+00:00,30014,0,2612,0,0,8200,30.014,-5.588
2020-03-02 23:30:00+00:00,27356,0,2538,0,0,8322,27.356,-5.784
2020-03-02 23:45:00+00:00,25930,0,2544,0,0,8302,25.93,-5.758
2020-03-03 00:00:00+00:00,24318,0,2432,0,0,8296,24.318,-5.864
2020-03-03 00:15:00+00:00,24300,0,2460,0,0,8082,24.3,-5.622


In [3]:
# Entfernen von Spalten mit nur NaN
df_b4s = df_b4s.dropna(axis=1, how="all")
# Entfernen von Zeilen mit nur NaN
df_b4s = df_b4s.dropna(axis=0, how="all")

 # Vorwärtsauffüllen der NaN-Werte
df_b4s = df_b4s.fillna(method="ffill") 
# Rückwärtsauffüllen der NaN-Werte
df_b4s = df_b4s.fillna(method="bfill")  
# df_b4s = df_b4s.dropna(axis=1, how="all") 

# df_b4s = df_b4s.dropna(axis=0, how="all")  # Entfernen von Zeilen mit nur NaN
df_b4s = df_b4s.drop_duplicates()  # Entfernen von Duplikaten
df_b4s = df_b4s.sort_index()  # Sortieren nach Zeitindex  
df_b4s.head()  # Ausgabe der ersten Zeilen des DataFrames  


  df_b4s = df_b4s.fillna(method="ffill")
  df_b4s = df_b4s.fillna(method="bfill")


Unnamed: 0_level_0,1-1:1290*255,1-1:2290*255,1-1:5290*255,1-1:6290*255,1-1:7290*255,1-1:8290*255,Nettolast_P_kW,Nettolast_Q_kvar
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-03-02 23:15:00+00:00,30014,0,2612,0,0,8200,30.014,-5.588
2020-03-02 23:30:00+00:00,27356,0,2538,0,0,8322,27.356,-5.784
2020-03-02 23:45:00+00:00,25930,0,2544,0,0,8302,25.93,-5.758
2020-03-03 00:00:00+00:00,24318,0,2432,0,0,8296,24.318,-5.864
2020-03-03 00:15:00+00:00,24300,0,2460,0,0,8082,24.3,-5.622


In [4]:
weather_raw = pd.read_excel("Meteodaten_korrigiert.xlsx", sheet_name="MeasurementMeteo15Min")
weather_df = weather_raw[["MeasurementTimestampUtc", "Parameter", "MeasurementValue"]].dropna()
weather_df["MeasurementTimestampUtc"] = pd.to_datetime(weather_df["MeasurementTimestampUtc"])
weather_df = weather_df.rename(columns={"MeasurementTimestampUtc": "time"})

# Pivot: Wetterparameter als Spalten
weather_wide = weather_df.pivot(index="time", columns="Parameter", values="MeasurementValue")


In [1]:
import pandas as pd
import holidays

# -------------------------
# 1. Lastdaten laden
# -------------------------
df_b4s = pd.read_csv("EDM_B4S.csv", sep=";")
df_b4s = df_b4s.rename(columns={df_b4s.columns[0]: "time"})
df_b4s["time"] = pd.to_datetime(df_b4s["time"], errors="coerce")
df_b4s = df_b4s.set_index("time")

# -------------------------
# 2. Wetterdaten laden (limitiert für Performance)
# -------------------------
weather_raw = pd.read_excel("Meteodaten_korrigiert.xlsx", sheet_name="MeasurementMeteo15Min")
weather_df = weather_raw[["MeasurementTimestampUtc", "Parameter", "MeasurementValue"]].dropna()

# Zeitspalte auf "tz-naiv" konvertieren
weather_df["MeasurementTimestampUtc"] = pd.to_datetime(weather_df["MeasurementTimestampUtc"]).dt.tz_localize(None)
weather_df = weather_df.rename(columns={"MeasurementTimestampUtc": "time"})

# Pivotieren
weather_wide = weather_df.pivot(index="time", columns="Parameter", values="MeasurementValue")

if df_b4s.index.tz is not None:
    df_b4s.index = df_b4s.index.tz_localize(None)

if weather_wide.index.tz is not None:
    weather_wide.index = weather_wide.index.tz_localize(None)


# -------------------------
# 3. Zusammenführen (nur Zeitschnitt)
# -------------------------
df = df_b4s.join(weather_wide, how="inner")

# -------------------------
# 4. Feature Engineering: Zeitmerkmale
# -------------------------
df["hour"] = df.index.hour
df["weekday"] = df.index.weekday
df["is_weekend"] = df["weekday"].isin([5, 6]).astype(int)

# -------------------------
# 5. Bereinigung
# -------------------------
# Anzahl NaNs
nan_summary = df.isna().sum()

# Entfernen
df = df.dropna()

# Doppelte Zeitstempel zählen und entfernen
duplicate_count = df.index.duplicated().sum()
df = df[~df.index.duplicated(keep="first")]




# Zeitspalte in datetime umwandeln
df['time'] = pd.to_datetime(df['time'])

# Feiertage für die Schweiz (optional: prov="ZH" für Zürich, "BE" für Bern, etc.)
ch_holidays = holidays.CH(years=[2020, 2021], prov="ZH")

# Neue Spalte 'Holiday': 1 für Feiertag, sonst 0
df['Holiday'] = df['time'].dt.date.apply(lambda x: 1 if x in ch_holidays else 0)


# Ergebnis anzeigen
print("Fehlende Werte pro Spalte:\n", nan_summary[nan_summary > 0])
print("Doppelte Zeitstempel:", duplicate_count)
print("Endgültige Form:", df.shape)

# Optional speichern
df.to_csv("eda_b4s_clean.csv")




ModuleNotFoundError: No module named 'holidays'

In [14]:
df_b4s_clean = pd.read_csv("eda_b4s_clean.csv")
#head der bereinigten Daten anzeigen
df_b4s_clean.head(5)  # Ausgabe der ersten Zeilen des bereinigten Data

Unnamed: 0,time,1-1:1290*255,1-1:2290*255,1-1:5290*255,1-1:6290*255,1-1:7290*255,1-1:8290*255,Nettolast_P_kW,Nettolast_Q_kvar,dkl010h0,fkl010h0,fkl010h1,gre000h0,rre150h0,tre200h0,hour,weekday,is_weekend
0,2020-03-02 23:15:00,30014,0,2612,0,0,8200,30.014,-5.588,244.0,5.925,11.6,0.0,0.1,3.45,23,0,0
1,2020-03-02 23:30:00,27356,0,2538,0,0,8322,27.356,-5.784,244.0,5.7,11.6,0.0,0.1,3.3,23,0,0
2,2020-03-02 23:45:00,25930,0,2544,0,0,8302,25.93,-5.758,245.0,5.75,11.6,0.0,0.1,3.275,23,0,0
3,2020-03-03 00:00:00,24318,0,2432,0,0,8296,24.318,-5.864,245.0,5.8,11.6,0.0,0.1,3.25,0,1,0
4,2020-03-03 00:15:00,24300,0,2460,0,0,8082,24.3,-5.622,245.0,5.85,11.9,0.0,0.025,3.225,0,1,0


In [15]:
print("Lastdaten-Zeitraum:", df_b4s.index.min(), "bis", df_b4s.index.max())
print("Wetterdaten-Zeitraum:", weather_wide.index.min(), "bis", weather_wide.index.max())


Lastdaten-Zeitraum: 2020-03-02 23:15:00 bis 2021-05-31 22:00:00
Wetterdaten-Zeitraum: 2019-09-02 23:15:00 bis 2020-12-06 15:00:00


In [None]:
import pandas as pd

# 1. Bereinigte Datei laden
df_clean = pd.read_csv("../data/eda_b4s_clean.csv", parse_dates=["time"])
df_clean = df_clean.set_index("time")
zeitraum_clean = (df_clean.index.min(), df_clean.index.max())


# 2. Vollständige Datei laden
df_b4s_voll = pd.read_csv("../data/EDM_B4S_voll.csv", sep=";", encoding="utf-8")
df_b4s_voll = df_b4s_voll.rename(columns={df_b4s_voll.columns[0]: "time"})
df_b4s_voll["time"] = pd.to_datetime(df_b4s_voll["time"], errors="coerce")
df_b4s_voll = df_b4s_voll.set_index("time")

# 👉 Zeitzonen entfernen (bevor irgendetwas gefiltert wird)
df_b4s_voll.index = df_b4s_voll.index.tz_localize(None)
start_new = pd.Timestamp("2019-12-06 00:00:00")
end_existing = df_clean.index.min()
start_new_naive = start_new
end_existing_naive = end_existing

# 3. Neuen Zeitabschnitt ausschneiden
df_new_part = df_b4s_voll.loc[start_new_naive:end_existing_naive - pd.Timedelta(minutes=15)]

# 4. Ausgabe prüfen
print("Zeitraum bestehender Clean-Datensatz:", zeitraum_clean)
print("Neuer Zeitabschnitt:", df_new_part.index.min(), "bis", df_new_part.index.max())
print("Neue Zeilen:", df_new_part.shape[0])


Zeitraum bestehender Clean-Datensatz: (Timestamp('2020-03-02 23:15:00'), Timestamp('2020-12-06 14:30:00'))
Neuer Zeitabschnitt: 2019-12-06 00:00:00 bis 2020-03-02 23:00:00
Neue Zeilen: 8349


In [None]:
# Speichern des neuen Zeitabschnitts als CSV in den data-Ordner

df_new_part.to_csv("../data/eda_b4s_new_part.csv")

In [None]:
# Überblick über die neue Datei für EDA
print(df_new_part.info())
print(df_new_part.describe())
df_new_part.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8349 entries, 2019-12-06 00:00:00 to 2020-03-02 23:00:00
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   1-1:1290*255      8349 non-null   float64
 1   1-1:2290*255      8349 non-null   float64
 2   1-1:5290*255      8349 non-null   float64
 3   1-1:6290*255      8349 non-null   float64
 4   1-1:7290*255      8349 non-null   float64
 5   1-1:8290*255      8349 non-null   float64
 6   Nettolast_P_kW    8349 non-null   float64
 7   Nettolast_Q_kvar  8349 non-null   float64
 8   Unnamed: 9        0 non-null      float64
 9   Unnamed: 10       8349 non-null   float64
 10  Unnamed: 11       0 non-null      float64
dtypes: float64(11)
memory usage: 782.7 KB
None
       1-1:1290*255  1-1:2290*255  1-1:5290*255  1-1:6290*255  1-1:7290*255  \
count   8349.000000   8349.000000   8349.000000   8349.000000   8349.000000   
mean      14.913530      0.668049      0

Unnamed: 0_level_0,1-1:1290*255,1-1:2290*255,1-1:5290*255,1-1:6290*255,1-1:7290*255,1-1:8290*255,Nettolast_P_kW,Nettolast_Q_kvar,Unnamed: 9,Unnamed: 10,Unnamed: 11
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-12-06 00:00:00,12.764,0.0,1.101,0.0,0.0,4.051,51.056,-11.8,,51.056,
2019-12-06 00:15:00,11.792,0.0,1.104,0.0,0.0,3.963,47.168,-11.436,,47.168,
2019-12-06 00:30:00,12.493,0.0,1.088,0.0,0.0,4.054,49.972,-11.864,,49.972,
2019-12-06 00:45:00,12.596,0.0,1.11,0.0,0.0,4.053,50.384,-11.772,,50.384,
2019-12-06 01:00:00,11.687,0.0,1.083,0.0,0.0,4.025,46.748,-11.768,,46.748,


In [11]:
import pandas as pd

# Datei laden
df = pd.read_csv("../data/EDM_B4S.csv", sep=";", encoding="utf-8")

# Zeitspalte vorbereiten
df = df.rename(columns={df.columns[0]: "time"})
df["time"] = pd.to_datetime(df["time"], errors="coerce")
df = df.set_index("time")

# --- WICHTIG: Erst formatieren, dann umbenennen ---
# Relevante Originalspalten
original_spalten = ["1-1:1290*255", "1-1:2290*255", "1-1:5290*255", "1-1:6290*255", "1-1:8290*255"]

# Zuverlässige Umwandlung in float
for s in original_spalten:
    df[s] = (
        df[s].astype(str)
             .str.replace(",", ".", regex=False)
             .str.strip()
             .replace("", "0")
             .astype(float)
    )

# Dann umbenennen
df = df.rename(columns={
    "1-1:1290*255": "Z_1290",
    "1-1:2290*255": "Z_2290",
    "1-1:5290*255": "Z_5290",
    "1-1:6290*255": "Z_6290",
    "1-1:8290*255": "Z_8290"
})

# Differenzberechnung mit 15-Minuten-Intervall
df["P_berechnet_kW"] = (df["Z_1290"].diff() - df["Z_2290"].diff())
df["Q_berechnet_kvar"] = (df["Z_8290"].diff() - df["Z_5290"].diff() - df["Z_6290"].diff()) 

# Differenz zur Messung
df["P_Abweichung"] = df["P_berechnet_kW"] - df["Nettolast_P_kW"]
df["Q_Abweichung"] = df["Q_berechnet_kvar"] - df["Nettolast_Q_kvar"]

# Ergebnis speichern
df.to_csv("EDM_B4S_mit_P_Q_Berechnung.csv")

# Beispiel
print(df[["P_berechnet_kW", "Nettolast_P_kW", "P_Abweichung", 
          "Q_berechnet_kvar", "Nettolast_Q_kvar", "Q_Abweichung"]].dropna().head(10))


                           P_berechnet_kW  Nettolast_P_kW  P_Abweichung  \
time                                                                      
2020-03-02 23:30:00+00:00         -2658.0          27.356     -2685.356   
2020-03-02 23:45:00+00:00         -1426.0          25.930     -1451.930   
2020-03-03 00:00:00+00:00         -1612.0          24.318     -1636.318   
2020-03-03 00:15:00+00:00           -18.0          24.300       -42.300   
2020-03-03 00:30:00+00:00         -1350.0          22.950     -1372.950   
2020-03-03 00:45:00+00:00           638.0          23.588       614.412   
2020-03-03 01:00:00+00:00           798.0          24.386       773.614   
2020-03-03 01:15:00+00:00            66.0          24.452        41.548   
2020-03-03 01:30:00+00:00          2078.0          26.530      2051.470   
2020-03-03 01:45:00+00:00         -2798.0          23.732     -2821.732   

                           Q_berechnet_kvar  Nettolast_Q_kvar  Q_Abweichung  
time                 

In [19]:
import pandas as pd

# ----------------------------
# 1. CSV-Datei laden
# ----------------------------
df = pd.read_csv("../data/EDM_B4S.csv", sep=";", encoding="utf-8")

# Zeitspalte vorbereiten
df = df.rename(columns={df.columns[0]: "time"})
df["time"] = pd.to_datetime(df["time"], errors="coerce")
df = df.set_index("time")

# ----------------------------
# 2. Formatierung korrigieren
# ----------------------------
# Spalten, die bereinigt werden müssen
alle_spalten = [
    "1-1:1290*255", "1-1:2290*255", "1-1:5290*255",
    "1-1:6290*255", "1-1:8290*255", "Nettolast_P_kW", "Nettolast_Q_kvar"
]

# Alle als Float bereinigen
for s in alle_spalten:
    df[s] = (
        df[s].astype(str)
             .str.replace(",", ".", regex=False)              # Komma zu Punkt
             .str.replace("Mär", "3", regex=False)             # Sonderfall März
             .str.extract(r"([-+]?[0-9]*\.?[0-9]+)")[0]        # Nur Zahl extrahieren
             .astype(float)
    )

# ----------------------------
# 3. Umbenennung für Klarheit
# ----------------------------
df = df.rename(columns={
    "1-1:1290*255": "Z_1290",
    "1-1:2290*255": "Z_2290",
    "1-1:5290*255": "Z_5290",
    "1-1:6290*255": "Z_6290",
    "1-1:8290*255": "Z_8290",
    "Nettolast_P_kW": "P_gemessen",
    "Nettolast_Q_kvar": "Q_gemessen"

})


df["P_berechnet_kW"] = (df["Z_1290"] - df["Z_2290"]) / 1000
df["Q_berechnet_kvar"] = -(df["Z_8290"] - df["Z_5290"] - df["Z_6290"]) / 1000



# ----------------------------
# 5. Abweichungen berechnen
# ----------------------------
df["P_Abweichung"] = df["P_berechnet_kW"] - df["P_gemessen"]
df["Q_Abweichung"] = df["Q_berechnet_kvar"] - df["Q_gemessen"]

# ----------------------------
# 6. Ergebnis speichern
# ----------------------------
df.to_csv("EDM_B4S_mit_P_Q_Berechnung.csv")

# ----------------------------
# 7. Ergebnis anzeigen
# ----------------------------
print(df[[
    "P_berechnet_kW", "P_gemessen", "P_Abweichung",
    "Q_berechnet_kvar", "Q_gemessen", "Q_Abweichung"
]].dropna().head(10))


                           P_berechnet_kW  P_gemessen  P_Abweichung  \
time                                                                  
2020-03-02 23:15:00+00:00          30.014      30.014           0.0   
2020-03-02 23:30:00+00:00          27.356      27.356           0.0   
2020-03-02 23:45:00+00:00          25.930      25.930           0.0   
2020-03-03 00:00:00+00:00          24.318      24.318           0.0   
2020-03-03 00:15:00+00:00          24.300      24.300           0.0   
2020-03-03 00:30:00+00:00          22.950      22.950           0.0   
2020-03-03 00:45:00+00:00          23.588      23.588           0.0   
2020-03-03 01:00:00+00:00          24.386      24.386           0.0   
2020-03-03 01:15:00+00:00          24.452      24.452           0.0   
2020-03-03 01:30:00+00:00          26.530      26.530           0.0   

                           Q_berechnet_kvar  Q_gemessen  Q_Abweichung  
time                                                                   
202

In [21]:
import pandas as pd

# ----------------------------
# 1. CSV-Datei laden
# ----------------------------
df = pd.read_csv("../data/EDM_B4S_voll.csv", sep=";", encoding="utf-8")

# Zeitspalte vorbereiten
df = df.rename(columns={df.columns[0]: "time"})
df["time"] = pd.to_datetime(df["time"], errors="coerce")
df = df.set_index("time")

# ----------------------------
# 2. Formatierung korrigieren
# ----------------------------
# Spalten, die bereinigt werden müssen
alle_spalten = [
    "1-1:1290*255", "1-1:2290*255", "1-1:5290*255",
    "1-1:6290*255", "1-1:8290*255", "Nettolast_P_kW", "Nettolast_Q_kvar"
]

# Alle als Float bereinigen
for s in alle_spalten:
    df[s] = (
        df[s].astype(str)
             .str.replace(",", ".", regex=False)              # Komma zu Punkt
             .str.replace("Mär", "3", regex=False)             # Sonderfall März
             .str.extract(r"([-+]?[0-9]*\.?[0-9]+)")[0]        # Nur Zahl extrahieren
             .astype(float)
    )

# ----------------------------
# 3. Umbenennung für Klarheit
# ----------------------------
df = df.rename(columns={
    "1-1:1290*255": "Z_1290",
    "1-1:2290*255": "Z_2290",
    "1-1:5290*255": "Z_5290",
    "1-1:6290*255": "Z_6290",
    "1-1:8290*255": "Z_8290",
    "Nettolast_P_kW": "P_gemessen",
    "Nettolast_Q_kvar": "Q_gemessen"

})


df["P_berechnet_kW"] = (df["Z_1290"] - df["Z_2290"]) 
df["Q_berechnet_kvar"] = -(df["Z_8290"] - df["Z_5290"] - df["Z_6290"]) 



# ----------------------------
# 5. Abweichungen berechnen
# ----------------------------
df["P_Abweichung"] = df["P_berechnet_kW"] - df["P_gemessen"]
df["Q_Abweichung"] = df["Q_berechnet_kvar"] - df["Q_gemessen"]

# ----------------------------
# 6. Ergebnis speichern
# ----------------------------
df.to_csv("EDM_B4S_mit_P_Q_Berechnung.csv")

# ----------------------------
# 7. Ergebnis anzeigen
# ----------------------------
print(df[[
    "P_berechnet_kW", "P_gemessen", "P_Abweichung",
    "Q_berechnet_kvar", "Q_gemessen", "Q_Abweichung"
]].dropna().head(10))


                           P_berechnet_kW  P_gemessen  P_Abweichung  \
time                                                                  
2019-09-11 00:00:00+00:00           9.318      37.272       -27.954   
2019-09-11 00:15:00+00:00           8.050      32.200       -24.150   
2019-09-11 00:30:00+00:00           6.826      27.304       -20.478   
2019-09-11 00:45:00+00:00           7.298      29.192       -21.894   
2019-09-11 01:00:00+00:00           6.275      25.100       -18.825   
2019-09-11 01:15:00+00:00           6.377      25.508       -19.131   
2019-09-11 01:30:00+00:00           6.098      24.392       -18.294   
2019-09-11 01:45:00+00:00           6.118      24.472       -18.354   
2019-09-11 02:00:00+00:00           5.949      23.796       -17.847   
2019-09-11 02:15:00+00:00           6.027      24.108       -18.081   

                           Q_berechnet_kvar  Q_gemessen  Q_Abweichung  
time                                                                   
201

In [23]:
# Neuimport nach Kernel-Reset
import pandas as pd

# Beide Dateien erneut laden
df_full = pd.read_csv("../data/EDM_B4S_voll.csv", sep=";", encoding="utf-8")
df_short = pd.read_csv("../data/EDM_B4S.csv", sep=";", encoding="utf-8")

# Zeitspalten vorbereiten
df_full = df_full.rename(columns={df_full.columns[0]: "time"})
df_short = df_short.rename(columns={df_short.columns[0]: "time"})

df_full["time"] = pd.to_datetime(df_full["time"], errors="coerce")
df_short["time"] = pd.to_datetime(df_short["time"], errors="coerce")

# Extrahieren der Nettolast-P-Werte
df_full_sub = df_full[["time", "Nettolast_P_kW"]].copy()
df_short_sub = df_short[["time", "Nettolast_P_kW"]].copy()

df_full_sub = df_full_sub.rename(columns={"Nettolast_P_kW": "P_full"})
df_short_sub = df_short_sub.rename(columns={"Nettolast_P_kW": "P_short"})

# Zusammenführen
df_compare = pd.merge(df_full_sub, df_short_sub, on="time", how="inner")

# Formatierung bereinigen
for col in ["P_full", "P_short"]:
    df_compare[col] = (
        df_compare[col].astype(str)
                        .str.replace(",", ".", regex=False)
                        .str.extract(r"([-+]?[0-9]*\.?[0-9]+)")[0]
                        .astype(float)
    )

# Abweichung berechnen
df_compare["Abweichung"] = df_compare["P_full"] - df_compare["P_short"]
# Ergebnis anzeigen
print(df_compare.head(10))




                       time  P_full  P_short  Abweichung
0 2020-03-02 23:15:00+00:00  60.028   30.014      30.014
1 2020-03-02 23:30:00+00:00  54.712   27.356      27.356
2 2020-03-02 23:45:00+00:00  51.860   25.930      25.930
3 2020-03-03 00:00:00+00:00  48.636   24.318      24.318
4 2020-03-03 00:15:00+00:00  48.600   24.300      24.300
5 2020-03-03 00:30:00+00:00  45.900   22.950      22.950
6 2020-03-03 00:45:00+00:00  47.176   23.588      23.588
7 2020-03-03 01:00:00+00:00  48.772   24.386      24.386
8 2020-03-03 01:15:00+00:00  48.904   24.452      24.452
9 2020-03-03 01:30:00+00:00  53.060   26.530      26.530


In [24]:
# Vergleich aller gemeinsamen Spalten mit gleichen Namen

# Gleiche Zeitspalte
df_full = pd.read_csv("../data/EDM_B4S_voll.csv", sep=";", encoding="utf-8")
df_short = pd.read_csv("../data/EDM_B4S.csv", sep=";", encoding="utf-8")

# Zeitspalten anpassen
df_full = df_full.rename(columns={df_full.columns[0]: "time"})
df_short = df_short.rename(columns={df_short.columns[0]: "time"})

df_full["time"] = pd.to_datetime(df_full["time"], errors="coerce")
df_short["time"] = pd.to_datetime(df_short["time"], errors="coerce")

# Nur gemeinsame Spaltennamen
gemeinsame_spalten = list(set(df_full.columns).intersection(set(df_short.columns)))
gemeinsame_spalten.remove("time")

# Beide DataFrames reduzieren auf gemeinsame Spalten
df_full_sub = df_full[["time"] + gemeinsame_spalten].copy()
df_short_sub = df_short[["time"] + gemeinsame_spalten].copy()

# Zusammenführen
df_merged = pd.merge(df_full_sub, df_short_sub, on="time", how="inner", suffixes=("_full", "_short"))

# Vergleichs-DataFrame erzeugen: Differenzen
vergleich_df = pd.DataFrame({"time": df_merged["time"]})
for spalte in gemeinsame_spalten:
    col_full = f"{spalte}_full"
    col_short = f"{spalte}_short"
    
    # Formatierung: Kommas zu Punkten, extrahiere numerische Werte
    df_merged[col_full] = df_merged[col_full].astype(str).str.replace(",", ".", regex=False).str.extract(r"([-+]?[0-9]*\.?[0-9]+)")[0].astype(float)
    df_merged[col_short] = df_merged[col_short].astype(str).str.replace(",", ".", regex=False).str.extract(r"([-+]?[0-9]*\.?[0-9]+)")[0].astype(float)

    vergleich_df[spalte + "_Abweichung"] = df_merged[col_full] - df_merged[col_short]

# Ergebnis anzeigen
print(vergleich_df.head(10))

                       time  1-1:5290*255_Abweichung  1-1:8290*255_Abweichung  \
0 2020-03-02 23:15:00+00:00                -2610.694                -8195.930   
1 2020-03-02 23:30:00+00:00                -2536.731                -8317.899   
2 2020-03-02 23:45:00+00:00                -2542.728                -8297.909   
3 2020-03-03 00:00:00+00:00                -2430.784                -8291.912   
4 2020-03-03 00:15:00+00:00                -2458.770                -8078.019   
5 2020-03-03 00:30:00+00:00                -2390.804                -8247.934   
6 2020-03-03 00:45:00+00:00                -2332.833                -8527.794   
7 2020-03-03 01:00:00+00:00                -2480.759                -8375.870   
8 2020-03-03 01:15:00+00:00                -2656.671                -8469.823   
9 2020-03-03 01:30:00+00:00                -2698.650                -8519.798   

   Nettolast_Q_kvar_Abweichung  1-1:7290*255_Abweichung  \
0                       -5.468                   