# Pandas

## Init

In [1]:
!pip install plotly
!pip install plotly_express

[0m

In [1]:
%matplotlib inline

import sys
import pandas
# Report the interpreter and pandas versions this notebook was run with.
for label, version in (
    ("Python Version : ", sys.version),
    ("Pandas Version : ", pandas.__version__),
):
    print(label, version)

Python Version :  3.11.6 | packaged by conda-forge | (main, Oct  3 2023, 11:57:02) [GCC 12.3.0]
Pandas Version :  2.0.3


# Grafana util functions

In [3]:
import json

def normalizeJsonGrafanaHeader(it):
    """Parse a Grafana/Prometheus inline label header into a dict.

    The header looks like ``{instance="host:8889", job="collector"}``: label
    names are bare words, label values are double-quoted strings.

    Args:
        it: The header text to parse.

    Returns:
        The decoded JSON value; for a well-formed header this is a dict
        mapping each label name to its string value (``{}`` for ``"{}"``).

    Raises:
        json.JSONDecodeError: If the text is not valid JSON once the label
            names have been quoted.
    """
    import re

    def quote_key(match):
        key = match.group(1)
        if key is None:
            # The match is a complete quoted value: leave it untouched so that
            # "=", "," or "{" inside a label value are never rewritten.
            return match.group(0)
        return json.dumps(key) + ":"

    # First alternative consumes whole double-quoted strings (honouring
    # backslash escapes); second captures a bare label name followed by "=".
    as_json = re.sub(r'"(?:[^"\\]|\\.)*"|([^\s"{},=]+)\s*=', quote_key, it)
    return json.loads(as_json)

# Quick check of the parser on a sample two-label header.
sample_header = '{instance="otel-collector:8889", job="otel-collector"}'
print(normalizeJsonGrafanaHeader(sample_header))

{'instance': 'otel-collector:8889', 'job': 'otel-collector'}


In [4]:
def headerSeriesToColumnsDF(inline_headers_series):
    """Expand inline label headers into a DataFrame.

    Each element of ``inline_headers_series`` is parsed with
    ``normalizeJsonGrafanaHeader``; the parsed results are passed, in input
    order, to ``pd.DataFrame.from_records`` to build the returned frame.
    """
    parsed_rows = [normalizeJsonGrafanaHeader(raw) for raw in inline_headers_series]
    return pd.DataFrame.from_records(parsed_rows)

# Data loading

## Create DataFrame from CSV file

* To avoid spending memory on guessing data types, it is recommended to pass the column types (`dtype`) explicitly: https://stackoverflow.com/questions/24251219/pandas-read-csv-low-memory-and-dtype-options

In [6]:
import pandas as pd

# Load the raw Grafana CSV export. index_col=False tells pandas not to use the
# first column as the index, so "Time" stays a regular column.
df = pd.read_csv(
    filepath_or_buffer="/data/grafana/scrape-data-2024-04-13 17_34_26.csv",
    sep=',',
    index_col=False,
)

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 2 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   Time                                                    33 non-null     int64  
 1   {instance="otel-collector:8889", job="otel-collector"}  33 non-null     float64
dtypes: float64(1), int64(1)
memory usage: 660.0 bytes


Unnamed: 0,Time,"{instance=""otel-collector:8889"", job=""otel-collector""}"
0,1712999820000,0.022383
1,1712999850000,0.016509
2,1712999880000,0.00528
3,1712999910000,0.015655
4,1712999940000,0.018002


## Normalize table

The goal is to reorganize the table into a flexible dataset by:

* flatten the headers into rows that we can manipulate

In [7]:
# Unpivot to long format: "Time" is kept as the identifier column, and every
# other column header becomes a row entry in `variable` with its cell in `value`.
data = df.melt(
    id_vars=['Time'],
    value_vars=df.columns[df.columns != "Time"].tolist(),
)
data.head()

Unnamed: 0,Time,variable,value
0,1712999820000,"{instance=""otel-collector:8889"", job=""otel-col...",0.022383
1,1712999850000,"{instance=""otel-collector:8889"", job=""otel-col...",0.016509
2,1712999880000,"{instance=""otel-collector:8889"", job=""otel-col...",0.00528
3,1712999910000,"{instance=""otel-collector:8889"", job=""otel-col...",0.015655
4,1712999940000,"{instance=""otel-collector:8889"", job=""otel-col...",0.018002


In [8]:
# Place the parsed label columns side by side with the time/value columns.
normDf = pd.concat(
    [
        data.drop(columns="variable"),  # time and value
        headerSeriesToColumnsDF(data["variable"]),  # one column per label
    ],
    axis="columns",
)

normDf.info()
normDf.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Time      33 non-null     int64  
 1   value     33 non-null     float64
 2   instance  33 non-null     object 
 3   job       33 non-null     object 
dtypes: float64(1), int64(1), object(2)
memory usage: 1.2+ KB


Unnamed: 0,Time,value,instance,job
0,1712999820000,0.022383,otel-collector:8889,otel-collector
1,1712999850000,0.016509,otel-collector:8889,otel-collector
2,1712999880000,0.00528,otel-collector:8889,otel-collector
3,1712999910000,0.015655,otel-collector:8889,otel-collector
4,1712999940000,0.018002,otel-collector:8889,otel-collector


# Data cleaning

## Sampling

In [9]:
# Show only the first row as a quick sample of the normalized frame.
normDf.iloc[:1]

Unnamed: 0,Time,value,instance,job
0,1712999820000,0.022383,otel-collector:8889,otel-collector


## DataSet Validation
### Column analysis
#### String values

In [10]:
# List every distinct value found in a single column
pd.unique(normDf['instance'])

array(['otel-collector:8889'], dtype=object)

## Exporting
### Persist to CSV file

In [11]:
# Persist the normalized frame. With the pandas defaults used here, the row
# index is written as the first (unnamed) CSV column.
normDf.to_csv(path_or_buf='/data/clean.csv')

## Plot
### Plot the normalized data with Plotly Express

In [None]:
# Summarize the columns, dtypes and non-null counts of the normalized frame.
normDf.info()

In [None]:
import plotly.express as px

In [None]:
# "Time" holds epoch milliseconds (e.g. 1712999820000). Convert it for display
# only, so the x axis shows real timestamps instead of 13-digit integers;
# normDf itself is left unchanged.
px.line(
    normDf.assign(Time=pd.to_datetime(normDf["Time"], unit="ms")),
    x="Time",
    y="value",
    color="instance",
)

In [None]:
# Render the first two rows (every column except "Time") as an image.
# NOTE(review): the remaining columns include the object-dtype "instance" and
# "job" strings, so this array is not purely numeric — confirm this is intended.
px.imshow(
    normDf.drop(columns="Time").head(2).to_numpy()
)