# Exploratory Data Analysis

# 0. Environment

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
# Show every column when rendering wide DataFrames (None removes the cap).
pd.set_option("display.max_columns", None)

# 1. Data

The dataset contains anonymized time series records of a machine's operation from an FPSO. The recorded variables are as follows:
1.	Cycle: Sequential identifier of measurement cycles.
2.	Preset_1 & Preset_2: Variables that control a specific operating point of the machine.
3.	Temperature: Temperature recorded in the equipment.
4.	Pressure: Pressure recorded in the equipment.
5.	Vibrations (X, Y & Z): Vibrations along the machine's axes.
6.	Frequency: Operating frequency of the machine.
7.	Fail: Variable indicating whether the machine is in a failure state at the given timestamp.

In [None]:
# Read the raw vessel records from the Excel workbook into a DataFrame.
df_vessel = pd.read_excel(
    io="../data/01_raw/oge_vessel_data.xlsx",
)

## 1.1. Understanding the dataset
Simple validation of schema, shape and presets.

In [51]:
# Order the records by cycle id (ascending); the index labels are left as-is.
df_vessel = df_vessel.sort_values(by="Cycle", ascending=True)

In [52]:
# Quick overview: first rows, (rows, columns) shape, and per-column dtypes.
overview_items = (
    df_vessel.head(),
    df_vessel.shape,
    df_vessel.dtypes,
)
for overview_item in overview_items:
    display(overview_item)

Unnamed: 0,Cycle,Preset_1,Preset_2,Temperature,Pressure,VibrationX,VibrationY,VibrationZ,Frequency,Fail
0,1,3,6,44.235186,47.657254,46.441769,64.820327,66.45452,44.48325,False
1,2,2,4,60.807234,63.172076,62.005951,80.714431,81.246405,60.228715,False
2,3,2,1,79.027536,83.03219,82.64211,98.254386,98.785196,80.993479,False
3,4,2,3,79.716242,100.508634,122.362321,121.363429,118.652538,80.315567,False
4,5,2,5,39.989054,51.764833,42.514302,61.03791,50.716469,64.245166,False


(800, 10)

Cycle            int64
Preset_1         int64
Preset_2         int64
Temperature    float64
Pressure       float64
VibrationX     float64
VibrationY     float64
VibrationZ     float64
Frequency      float64
Fail              bool
dtype: object

In [53]:
# Validate the cycle ids: each row's index label, shifted to be 1-based,
# must equal its Cycle value.
# NOTE(review): reset_index() moves the labels df_vessel currently carries
# into the "index" column; sort_values("Cycle") does not relabel rows, so
# the comparison is against the pre-sort labels — confirm this is intended.
df_id = df_vessel[["Cycle"]].reset_index()
df_id = df_id.assign(index=df_id["index"] + 1)

# Compare the columns explicitly rather than through eval, where the name
# `index` is also the special identifier for the frame's own index.
df_id = df_id.assign(check=df_id["index"] == df_id["Cycle"])

# Any mismatching row stops the notebook here.
assert df_id["check"].all()

In [54]:
# For each preset variable, show how many rows use each preset value,
# ordered by the preset value itself rather than by frequency.
for preset_column in ("Preset_1", "Preset_2"):
    preset_counts = df_vessel[preset_column].value_counts()
    display(preset_counts.sort_index())

Preset_1
1    264
2    281
3    255
Name: count, dtype: int64

Preset_2
1     95
2    101
3    101
4     93
5    100
6    101
7    109
8    100
Name: count, dtype: int64

## 1.2. 