In [121]:
import pandas as pd
import plotly.graph_objects as go
from sklearn.ensemble import IsolationForest


### Preprocessing

In [122]:
df = pd.read_csv("E-Building-Data.csv")
df["date_time"] = pd.to_datetime(df["date_time"])
df.head()



Unnamed: 0,date_time,device_id,tmp,hum,CO2,VOC,vis,IR,WIFI,BLE,rssi,channel_rssi,snr,gateway,channel_index,spreading_factor,bandwidth,f_cnt
0,2022-06-01 11:16:16,hka-aqm-e001,24.38,31.88,468,3633,365,148,3,7,-102,-102,7.2,drag-lps8-02,4,7,125000,2
1,2022-06-01 11:16:56,hka-aqm-e001,24.46,31.58,473,5755,348,141,3,5,-112,-112,2.8,drag-outd-01,2,7,125000,3
2,2022-06-01 11:34:26,hka-aqm-e001,24.39,32.21,486,5755,1126,420,4,7,-115,-115,-2.0,drag-outd-01,2,7,125000,1
3,2022-06-01 11:34:46,hka-aqm-e001,24.51,32.65,485,7169,1047,383,255,255,-111,-111,-2.8,drag-lps8-02,1,7,125000,2
4,2022-06-01 11:49:11,hka-aqm-e001,24.87,32.31,521,7169,1967,943,255,255,-109,-109,2.2,drag-lps8-02,4,7,125000,1


In [123]:
df.loc[:, "room_number"] = df["device_id"].str.split("-").str[-1]
df.drop("device_id", inplace=True, axis=1)

In [124]:
df.loc[:, "year"] = df["date_time"].dt.year
df.loc[:, "month"] = df["date_time"].dt.month
df.loc[:, "dayofweek"] = df["date_time"].dt.dayofweek
df.loc[:, "hour"] = df["date_time"].dt.hour

In [125]:
def resample_dataframes(dataframe:pd.DataFrame,resampling_type:str,verteilung=False):
    df = dataframe.copy()
    if verteilung:
        df['date_time'] = df['date_time'].dt.weekday
        df = df.groupby('date_time').mean()

    else:

        df = df.set_index('date_time').resample(resampling_type).mean().reset_index()
        return df

In [126]:
features = ["date_time", "CO2", "tmp", "vis", "hum", "VOC"]

In [127]:
df_anomalie = df[df["room_number"] == "e001"]
df_anomalie = resample_dataframes(df_anomalie[features], "h")
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_anomalie.date_time,
    y=df_anomalie.tmp,
    marker_color='#64378C'
))
fig.update_layout(
    title='Temperatur vor dem IsolationForest für E001',
    xaxis_title='Datum',
    yaxis_title='Temperatur',
    showlegend=True
)
fig.show()

In [128]:
isoforest = IsolationForest(n_estimators = 100, contamination = 0.0075, max_samples = int(df.shape[0]*0.8))
# Isolation Forest auf den wichtigsten numerischen Werten durchführen (CO2, tmp, vis, hum und VOC).
prediction = isoforest.fit_predict(df[["CO2", "tmp", "vis", "hum", "VOC"]])
print("Number of outliers detected: {}".format(prediction[prediction < 0].sum()))
print("Number of normal samples detected: {}".format(prediction[prediction >= 0].sum()))
score = isoforest.decision_function(df[["CO2", "tmp", "vis", "hum", "VOC"]])
df["anomaly_score"] = score
# Zeilen mit anomaly_score < 0 werden vom Isolation Forest als Ausreißer interpretiert.
df = df[df.anomaly_score >= 0]

Number of outliers detected: -9632
Number of normal samples detected: 1274579


In [129]:
df_anomalie_detected = df[df["room_number"] == "e001"]
df_anomalie_detected = resample_dataframes(df_anomalie_detected[features], "h")
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_anomalie_detected.date_time,
    y=df_anomalie_detected.tmp,
    marker_color='#64378C'
))

fig.update_layout(
    title='Temperatur nach dem IsolationForest für E001',
    xaxis_title='Datum',
    yaxis_title='Temperatur',
    showlegend=True
)
fig.show()

### Anomalies

In [130]:
from scipy.stats import zscore

unique_rooms = df['room_number'].unique()


anomalies_summary_df = []


for room in unique_rooms:
    room_data = df[df['room_number'] == room].copy()
    
    # Calculate Z-scores for temperature, humidity, CO2, and VOC levels
    room_data['tmp_zscore'] = zscore(room_data['tmp'])
    room_data['hum_zscore'] = zscore(room_data['hum'])
    room_data['CO2_zscore'] = zscore(room_data['CO2'])
    room_data['VOC_zscore'] = zscore(room_data['VOC'])
    
    
    threshold = 3
    
   
    room_data['tmp_anomaly'] = room_data['tmp_zscore'].abs() > threshold
    room_data['hum_anomaly'] = room_data['hum_zscore'].abs() > threshold
    room_data['CO2_anomaly'] = room_data['CO2_zscore'].abs() > threshold
    room_data['VOC_anomaly'] = room_data['VOC_zscore'].abs() > threshold
    
    
    anomalies_summary = {
        'room_number': room,
        'tmp_anomaly': room_data['tmp_anomaly'].sum(),
        'hum_anomaly': room_data['hum_anomaly'].sum(),
        'CO2_anomaly': room_data['CO2_anomaly'].sum(),
        'VOC_anomaly': room_data['VOC_anomaly'].sum()
    }
    
    
    anomalies_summary_df.append(anomalies_summary)

anomalies = pd.DataFrame.from_dict(anomalies_summary_df)

In [131]:
room_counts = df['room_number'].value_counts().reset_index()
room_counts.columns = ['room_number', 'count']


anomalies = anomalies.merge(room_counts, on='room_number')

# Calculate the anomaly rate
anomalies['anomaly_rate'] = (
    anomalies['tmp_anomaly'] +
    anomalies['hum_anomaly'] +
    anomalies['CO2_anomaly'] +
    anomalies['VOC_anomaly']
) / anomalies['count'].round(4)

anomalies['anomaly_count'] = (
    anomalies['tmp_anomaly'] +
    anomalies['hum_anomaly'] +
    anomalies['CO2_anomaly'] +
    anomalies['VOC_anomaly'] )


anomalies.sort_values(by="anomaly_rate", ascending=False)

In [133]:
anomalies.sort_values(by="anomaly_rate", ascending=False, inplace=True)

fig = go.Figure()

fig.add_trace(go.Bar(
    x=anomalies.room_number,
    y=anomalies.anomaly_rate,
    marker_color='#64378C'

))

fig.update_layout(
    title='Anomalierate pro Raum',
    xaxis_title='Raum',
    yaxis_title='Anomalierate',
    showlegend=True
)

fig.show()

In [134]:
anomalies.sort_values(by="anomaly_count", ascending=False, inplace=True)
fig = go.Figure()
fig.add_bar(x=anomalies.room_number,y=anomalies.VOC_anomaly, name="VOC anomalies")
fig.add_bar(x=anomalies.room_number,y=anomalies.CO2_anomaly, name="CO2_anomalies")
fig.add_bar(x=anomalies.room_number,y=anomalies.anomaly_count, name="anomalies")
fig.update_layout(barmode="group")
fig.update_layout(
    title='Anomalien mit den größten Anteilen',
    xaxis_title='Raum',
    yaxis_title='Anomalien',
    showlegend=True
)
fig.show()

### CO2-Ampel Feature

In [135]:
# Farben der CO2-Ampeln laut der Quelle https://www.h-ka.de/fileadmin/Hochschule_Karlsruhe_HKA/Bilder_VW-EBI/HKA_VW-EBI_Anleitung_CO2-Ampeln.pdf
df.loc[(df.CO2 < 850), "color"] = "green"
df.loc[(df.CO2 >= 850) & (df.CO2 < 1200), "color"] = "yellow"
df.loc[(df.CO2 >= 1200) & (df.CO2 < 1600), "color"] = "red"
df.loc[(df.CO2 >= 1600), "color"] = "red_blinking"

In [136]:
df_ampel = df.sort_values(by="room_number")[["room_number","CO2", "color"]]
df_ampel = df_ampel.merge(room_counts, on='room_number')
df_ampel = df_ampel.groupby(['room_number', 'color']).size().unstack(fill_value=0).reset_index()

In [137]:
df_ampel["count"] = df_ampel["green"] + df_ampel["red"] + df_ampel["red_blinking"] + df_ampel["yellow"]

In [138]:
df_ampel["red_blinking_rate"] = df_ampel["red_blinking"] / df_ampel["count"]
df_ampel["red_rate"] =  df_ampel["red"] / df_ampel["count"]
df_ampel["red_yellow_rate"] =  df_ampel["yellow"] / df_ampel["count"]

In [139]:
df_ampel.sort_values(by="red_yellow_rate", ascending=False, inplace=True)

In [141]:
fig = go.Figure()
fig.add_trace(go.Bar(x=df_ampel.room_number,y=df_ampel.red_yellow_rate, name="yellow", marker_color="orange"))
fig.add_trace(go.Bar(x=df_ampel.room_number,y=df_ampel.red_rate, name="red", marker_color="red"))
fig.add_trace(go.Bar(x=df_ampel.room_number,y=df_ampel.red_blinking_rate, name="red_blinking", marker_color="black"))
fig.update_layout(barmode="stack")
fig.update_layout(
    title='Räume nach Ampelfarbe "gelb" sortiert',
    xaxis_title='Raum',
    yaxis_title='Rate',
    showlegend=True,
    
)
fig.show()