# 1. Obtendo os dados

## Preparando a caixa de ferramentas 🧰

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
import plotly.offline as pyo
import plotly.io as pio
import plotly.express as px
from matplotlib import pyplot as plt

# Plotly setup: enable notebook rendering and use the "plotly_white"
# template as the default for every figure created below.
pyo.init_notebook_mode(connected=True)
pio.templates.default = "plotly_white"

# Data folders live next to the notebook's folder: <parent of cwd>/data/{raw,processed}
project_root = Path.cwd().resolve().parent
raw_data_folder = project_root / "data" / "raw"
processed_data_folder = project_root / "data" / "processed"

In [3]:
# Load every JSON Lines file from the raw-data folder into a single DataFrame.
# Frames are collected in a list and concatenated once: calling pd.concat
# inside the loop would re-copy all previously loaded rows on each iteration.
jsonl_paths = sorted(raw_data_folder.glob("*.jsonl"))  # sorted -> reproducible row order
if not jsonl_paths:
    # Fail early with a clear message instead of a KeyError further down.
    raise FileNotFoundError(f"No .jsonl files found in {raw_data_folder}")

raw_frames = []
for jsonl_path in jsonl_paths:
    # Explicit encoding so parsing does not depend on the platform default.
    with open(jsonl_path, "r", encoding="utf-8") as jsonl_handle:
        # One JSON document per line; blank lines are skipped.
        records = [json.loads(line) for line in jsonl_handle if line.strip()]
    # json_normalize flattens nested "levels" of the JSON schema into columns.
    raw_frames.append(pd.json_normalize(records))

origin_df = pd.concat(raw_frames, ignore_index=True)

# Keep only the part of each column name before the first "." (purely for
# readability). NOTE(review): two nested fields under the same top-level key
# would end up with the same column name - confirm this cannot happen.
origin_df = origin_df.rename(columns=lambda col_name: col_name.split(".")[0])

# Creating DataFrame for analysis: timestamp index plus the two measurements.
analysis_columns = ["Data", "IP", "HeliumLevel", "BoreTemperature"]
df = (
    origin_df[analysis_columns]
    .set_index("Data")
    .astype({"HeliumLevel": "float64", "BoreTemperature": "float64"})
)
df.index = pd.to_datetime(df.index).tz_convert(None)  # Removing UTC Time Zone info
df["Day"] = df.index.date  # calendar day of each measurement, used for daily counts
df = df.sort_index()


# 2. Analisando os dados de forma exploratória (EDA)

As principais questões a serem exploradas são:
- Como os dados estão distribuídos ao longo do tempo?
  - As medições exibem algum padrão?
  - A frequência de dados é constante dentro de alguma janela (X medições por dia ou hora)?
- Há algum equipamento com dados muito discrepantes?
- Quais e quantos equipamentos escolher para a modelagem?
- Qual critério usar para a divisão das janelas de treino e teste?
- Quais modelos podem ser promissores?

Como não há contexto do negócio sobre os dados, a EDA será direcionada a responder essas perguntas.

#### Observando os dados

In [4]:
# Preview the first four and the last four rows of the frame.
edge_positions = np.concatenate((np.arange(0, 4), np.arange(-4, 0)))
df.iloc[edge_positions]

Unnamed: 0_level_0,IP,HeliumLevel,BoreTemperature,Day
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-10-21 02:00:19,172.22.8.31,76.900002,46.5,2022-10-21
2022-10-21 05:00:17,10.115.16.230,85.0,53.5,2022-10-21
2022-10-21 05:00:18,10.115.17.226,91.5,59.799999,2022-10-21
2022-10-21 05:00:19,172.17.216.6,91.400002,53.5,2022-10-21
2022-10-25 17:25:13,10.23.149.30,0.0,0.0,2022-10-25
2022-10-25 17:25:49,10.23.153.13,79.699997,44.400002,2022-10-25
2022-10-25 17:49:53,10.23.153.13,79.699997,44.400002,2022-10-25
2022-10-25 20:45:17,172.19.152.53,0.0,0.0,2022-10-25


#### Visualizando todas as séries temporais

In [9]:
# BoreTemperature over time for every device: one line (and colour) per IP.
fig = px.line(
    df.sort_index(),
    y="BoreTemperature",
    line_group="IP",
    color="IP",
    markers=True,
    hover_data=["IP"],  # read back in the hover template as customdata[0]
    title="Séries Temporais para BoreTemperature",  # repaired mis-encoded "é"
    height=500,
)
# Custom hover box: time and day, device IP, measured value.
fig.update_traces(
    hovertemplate="<br>%{x|%H:%M - %d/%m}<br>IP: %{customdata[0]}<br>BoreTemperature: %{y}<extra></extra>"
)
fig.update_layout(
    yaxis=dict(showgrid=False)
)
fig.show(renderer="colab")

In [6]:
# HeliumLevel over time for every device: one line (and colour) per IP.
fig = px.line(
    df.sort_index(),
    y="HeliumLevel",
    line_group="IP",
    color="IP",
    markers=True,
    hover_data=["IP"],  # read back in the hover template as customdata[0]
    title="Séries Temporais para HeliumLevel",  # repaired mis-encoded "é"
    height=500,
)
# Custom hover box: time and day, device IP, measured value.
# The value label must name the plotted column (HeliumLevel); it previously
# read "BoreTemperature", copied over from the temperature chart.
fig.update_traces(
    hovertemplate="<br>%{x|%H:%M - %d/%m}<br>IP: %{customdata[0]}<br>HeliumLevel: %{y}<extra></extra>"
)
fig.update_layout(
    yaxis=dict(showgrid=False)
)
fig.show(renderer="colab")

#### Anotações
- As medições não são constantes em horário e número de marcações por dia.
  - O equipamento 172.19.28.70 se destaca: tem 12 medições no mesmo dia (bem superior à média). Pode ser válido checar a causa raiz desses números.
- Vários equipamentos têm valores constantes ao longo do tempo - e no geral apresentam baixa variação.
- Vários equipamentos não têm medições todos os dias.

#### Analisando o número de medições (geral e diária)

In [7]:
# Count measurements per device (rows) and per day (columns). margins=True
# adds the "All" totals row/column; rows are then ordered by total, descending.
counts_by_day = pd.crosstab(index=df["IP"], columns=df["Day"], margins=True)
df_cross = counts_by_day.sort_values(by="All", ascending=False)

df_cross

Day,2022-10-21,2022-10-22,2022-10-23,2022-10-24,2022-10-25,All
IP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
All,25,29,44,60,53,211
172.19.28.70,0,2,5,12,2,21
10.115.17.226,5,5,3,3,2,18
192.100.16.66,3,3,3,3,3,15
10.99.80.179,3,3,3,3,3,15
10.23.152.32,3,3,1,3,3,13
10.23.153.13,1,3,1,2,4,11
10.115.16.230,2,2,3,2,2,11
10.115.21.239,0,0,5,3,3,11
172.16.228.91,0,0,3,3,3,9


#### Anotações

Os equipamentos serão selecionados com os critérios:
-   Top 3 equipamentos em número de medições.
-   O equipamento deve ter medições em todos os dias.

In [8]:
# Keep devices that have at least one measurement on every day, drop the
# "All" totals row, then take the three devices with the most measurements.
measured_every_day = df_cross.ne(0).all(axis=1)
is_device_row = df_cross.index != "All"
eligible_devices = df_cross.loc[measured_every_day & is_device_row]
selected_ips = eligible_devices.nlargest(n=3, columns="All").index.tolist()

display(selected_ips)

['10.115.17.226', '192.100.16.66', '10.99.80.179']

#### Visualizando as séries temporais selecionadas

In [23]:
# BoreTemperature over time, restricted to the selected devices.
fig = px.line(
    df[df["IP"].isin(selected_ips)].sort_index(),
    y="BoreTemperature",
    line_group="IP",
    color="IP",
    markers=True,
    hover_data=["IP"],  # read back in the hover template as customdata[0]
    title="Séries Temporais para BoreTemperature",  # repaired mis-encoded "é"
    height=300,
)
# Custom hover box: time and day, device IP, measured value.
fig.update_traces(
    hovertemplate="<br>%{x|%H:%M - %d/%m}<br>IP: %{customdata[0]}<br>BoreTemperature: %{y}<extra></extra>"
)
fig.update_layout(
    yaxis=dict(showgrid=False),
    # One tick per day. On date axes Plotly expects dtick in milliseconds
    # (or "M<n>" for months); the previous value "d" is not a valid setting.
    xaxis=dict(dtick=24 * 60 * 60 * 1000, tickformat="%b %d"),
)
fig.show(renderer="colab")

In [24]:
# HeliumLevel over time, restricted to the selected devices.
fig = px.line(
    df[df["IP"].isin(selected_ips)].sort_index(),
    y="HeliumLevel",
    line_group="IP",
    color="IP",
    markers=True,
    hover_data=["IP"],  # read back in the hover template as customdata[0]
    title="Séries Temporais para HeliumLevel",  # repaired mis-encoded "é"
    height=300,
)
# Custom hover box: time and day, device IP, measured value.
# The value label must name the plotted column (HeliumLevel); it previously
# read "BoreTemperature", copied over from the temperature chart.
fig.update_traces(
    hovertemplate="<br>%{x|%H:%M - %d/%m}<br>IP: %{customdata[0]}<br>HeliumLevel: %{y}<extra></extra>"
)
fig.update_layout(
    yaxis=dict(showgrid=False),
    # One tick per day. On date axes Plotly expects dtick in milliseconds
    # (or "M<n>" for months); the previous value "d" is not a valid setting.
    xaxis=dict(dtick=24 * 60 * 60 * 1000, tickformat="%b %d"),
)
fig.show(renderer="colab")

## Exportando os dados

A análise de componentes das séries temporais e a criação dos modelos serão feitas em notebooks separados (um para cada equipamento).

In [11]:
# Export one CSV per selected device, holding its two measurement columns
# indexed by timestamp.
# to_csv does not create missing folders, so make sure the target exists.
processed_data_folder.mkdir(parents=True, exist_ok=True)

for selected_ip in selected_ips:
    # Single .loc call selects the device's rows and the exported columns.
    device_df = df.loc[df["IP"] == selected_ip, ["HeliumLevel", "BoreTemperature"]]
    # The index is labelled with the device IP, so the first header cell of
    # the CSV identifies which device the file belongs to.
    device_df.rename_axis(f"{selected_ip}").to_csv(
        path_or_buf=processed_data_folder / f"{selected_ip}.csv"
    )