# Setup

In [None]:
# Basics
from pathlib import Path
import json

# Data Manipulation
import pandas as pd
import numpy as np

# Data Visualization
import plotly.offline as pyo
import plotly.io as pio
import plotly.express as px
from matplotlib import pyplot as plt

In [None]:
# Enable Plotly rendering inside the notebook.
# NOTE(review): with connected=True plotly.js is loaded from a CDN instead of being
# embedded in the notebook, so rendering needs network access - confirm this is intended.
pyo.init_notebook_mode(connected=True)

# Use the built-in "plotly_white" template as the default for every figure
pio.templates.default = "plotly_white"

# Data folders are resolved relative to the parent of the current working directory:
#   <cwd>/../data/raw        -> input *.jsonl files
#   <cwd>/../data/processed  -> exported CSV files
_data_root = Path.cwd().resolve().parent / "data"
raw_data_folder = _data_root / "raw"
processed_data_folder = _data_root / "processed"

# 1. Getting Data

In [None]:
# Equipment (identified by its IP) whose readings are analysed in this notebook
EQUIPMENT_IP = "10.115.17.226"

# Decode every line of every *.jsonl file into one flat list of records.
# Building the frame once from a list avoids re-copying the accumulated data on each
# iteration (concat-in-a-loop) and avoids concatenating onto an empty DataFrame.
records = []
for jsonl_path in sorted(raw_data_folder.glob("*.jsonl")):  # sorted -> reproducible load order
    # The path and the open handle get distinct names so neither shadows the other
    with open(jsonl_path, "r", encoding="utf-8") as jsonl_handle:
        records.extend(
            json.loads(line)  # one JSON document per line
            for line in jsonl_handle
            if line.strip()  # tolerate blank / trailing empty lines
        )

if not records:
    # Fail early with a clear message instead of a KeyError on "IP" further down
    raise ValueError(f"No records found in *.jsonl files under {raw_data_folder}")

origin_df = (
    # Normalizes "levels" in the json schema to (dot-separated) columns
    pd.json_normalize(records)
    # Keep only the top-level key as the column name (readability only).
    # NOTE(review): two nested keys under the same parent would collapse to the same
    # column name here - confirm the schema has a single leaf per top-level key.
    .rename(columns=lambda col_name: col_name.split(".")[0])
)

# Creating DataFrame for analysis
df = (
    origin_df
    # Selecting the correct equipment and only the columns used downstream
    .loc[lambda _df: _df["IP"].eq(EQUIPMENT_IP), ["Data", "IP", "HeliumLevel", "BoreTemperature"]]
    .copy()
    .set_index("Data")
    # Parse the timestamps and drop the timezone; tz_convert(None) requires tz-aware
    # values and leaves naive timestamps expressed in UTC
    .pipe(lambda _df: _df.set_index(pd.to_datetime(_df.index).tz_convert(None)))
    .astype({"HeliumLevel": "float64", "BoreTemperature": "float64"})
    # Calendar day of each measurement, used later for per-day counts
    .assign(Day=lambda _df: _df.index.date)
    .sort_index()
)


# 2. Exploratory Data Analysis (EDA)

The main questions to be explored are:
- How is the data distributed over time?
  - Do the measurements show any pattern?
  - Is the sampling frequency constant within a given window (e.g., X measurements per day or per hour)?

### Observing data

In [4]:
# Peek at both ends of the frame: the first four and the last four rows, by position
_edge_positions = np.concatenate((np.arange(4), np.arange(-4, 0)))
df.iloc[_edge_positions]

Unnamed: 0_level_0,IP,HeliumLevel,BoreTemperature,Day
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-10-21 05:00:18,10.115.17.226,91.5,59.799999,2022-10-21
2022-10-21 09:40:14,10.115.17.226,91.5,59.799999,2022-10-21
2022-10-21 09:40:24,10.115.17.226,91.5,59.799999,2022-10-21
2022-10-21 12:23:49,10.115.17.226,91.5,59.799999,2022-10-21
2022-10-24 21:11:51,10.115.17.226,91.199997,77.0,2022-10-24
2022-10-24 21:12:27,10.115.17.226,91.199997,77.0,2022-10-24
2022-10-25 05:00:18,10.115.17.226,90.699997,67.400002,2022-10-25
2022-10-25 09:59:37,10.115.17.226,90.699997,67.400002,2022-10-25


In [6]:
# Bore temperature over time (x defaults to the datetime index), one line per equipment IP
fig = px.line(
    df.sort_index(),
    y="BoreTemperature",
    line_group="IP",
    color="IP",
    markers=True,
    hover_data=["IP"],
    title="Time Series for Bore temperature",
    height=500
)
# Custom hover box: time and date, then the IP (customdata[0] comes from hover_data) and the value
fig.update_traces(
    hovertemplate="<br>%{x|%H:%M - %d/%m}<br>IP: %{customdata[0]}<br>BoreTemperature: %{y}<extra></extra>"
)
# Hide horizontal grid lines; as the last expression, the returned figure is rendered by the cell
fig.update_layout(
    yaxis=dict(showgrid=False)
)

In [7]:
# Helium level over time (x defaults to the datetime index), one line per equipment IP
fig = px.line(
    df.sort_index(),
    y="HeliumLevel",
    line_group="IP",
    color="IP",
    markers=True,
    hover_data=["IP"],
    title="Time Series for Helium level",
    height=500
)
# Custom hover box: time and date, then the IP (customdata[0] comes from hover_data) and the value.
# The label must name the plotted column (HeliumLevel), not BoreTemperature.
fig.update_traces(
    hovertemplate="<br>%{x|%H:%M - %d/%m}<br>IP: %{customdata[0]}<br>HeliumLevel: %{y}<extra></extra>"
)
# Hide horizontal grid lines; as the last expression, the returned figure is rendered by the cell
fig.update_layout(
    yaxis=dict(showgrid=False)
)


### Analyzing the number of measurements (general and daily)

In [11]:
# Count of measurements per equipment IP (rows) and calendar day (columns)
pd.crosstab(index=df["IP"], columns=df["Day"])

Day,2022-10-21,2022-10-22,2022-10-23,2022-10-24,2022-10-25
IP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10.115.17.226,5,5,3,3,2


# 3. Exporting data

In [12]:
# to_csv does not create missing directories, so make sure the target folder exists first
processed_data_folder.mkdir(parents=True, exist_ok=True)

# Export only the two measurement columns; the datetime index is written as the first CSV column
df[["HeliumLevel", "BoreTemperature"]].to_csv(
    path_or_buf=processed_data_folder.joinpath("10.115.17.226.csv")
)