# Notebook for data exploration and visualization

### Import all necessary libraries and check whether MPS is available for faster computation


In [2]:
from src.utils.check_mps_device import check_mps_device
import matplotlib.pyplot as plt
import plotly.express as px

from src.utils.data_loading import load_data
from src.utils.filtering import filter_data
import seaborn as sns
from src.utils.label_encoding import label_encode_column

# Check whether PyTorch's MPS backend (Apple Metal Performance Shaders, i.e. the GPU) is available.
# NOTE(review): the recorded output of this cell is "ModuleNotFoundError: No module named 'torch'",
# so torch has to be installed in the kernel environment before this cell can run.
check_mps_device()

ModuleNotFoundError: No module named 'torch'

In [None]:
# Load the data set with the project's load_data() helper; `df` feeds the filtering cells below
df = load_data()

## Filtering
Based on the proposed [filters](https://link.springer.com/article/10.1007/s42064-021-0101-5#preview), the following cells apply and test the individual filter options:
1. The event must contain at least two CDMs, one to infer from and one to use as the target.
2. The last CDM released for the event must be within a day (time_to_tca < 1) of the TCA.
3. The first CDM released for the event must be at least two days before the TCA (time_to_tca ≥ 2),
   and all the CDMs that were within two days from the TCA (time_to_tca < 2) are removed.

All filters combined can be applied by calling `filter_data()`.

In [None]:
# Count how many CDMs (rows) belong to each event_id
cdms_per_event = df.groupby("event_id").size()
event_counts = cdms_per_event.reset_index(name="cdm_count")
event_counts.info()

### The event must contain at least two CDMs, one to infer from and one to use as the target.

In [None]:
# Event IDs that occur in at least two CDMs
has_two_or_more = event_counts["cdm_count"] >= 2
valid_events = event_counts.loc[has_two_or_more, "event_id"]
valid_events.info()

In [None]:
# Restrict the loaded data to CDMs that belong to one of the valid events
in_valid_event = df["event_id"].isin(valid_events)
filtered_data = df.loc[in_valid_event]
filtered_data.info()

### The last CDM released for the event must be within a day (time_to_tca < 1) of the TCA.

In [None]:
# Keep only events whose last CDM was released within a day of TCA (time_to_tca < 1).
# The last CDM of an event is the one with the smallest time_to_tca, so the per-event
# minimum is broadcast back onto every row with transform("min"). All CDMs of a
# qualifying event are kept; previously each event was collapsed to its single last
# CDM and the < 1 threshold was never applied.
last_cdm_time_to_tca = filtered_data.groupby("event_id")["time_to_tca"].transform("min")
filtered_data = filtered_data[last_cdm_time_to_tca < 1]
filtered_data.info()

### The first CDM released for the event must be at least two days before the TCA (time_to_tca ≥ 2)

In [None]:
# Row-level filter: drop every CDM closer than two days to TCA. Events that have no
# CDM with time_to_tca >= 2 lose all their rows and therefore disappear entirely.
two_days_or_more = filtered_data["time_to_tca"] >= 2
filtered_data = filtered_data.loc[two_days_or_more]
filtered_data.info()

## Plots for data visualisation

In [None]:
# Reload the data via load_data() so the plots start from a freshly loaded frame
df = load_data()

# Scatter plot of time_to_tca (y axis) over risk (x axis), coloured by event_id
fig = px.scatter(data_frame=df, x="risk", y="time_to_tca", color="event_id")
fig.update_traces(marker={"size": 4})

# Display the figure (the PNG export happens in the next cell)
fig.show()

In [None]:
# White plot/paper background and black font for the exported image
fig.update_layout(
    plot_bgcolor='rgba(255, 255, 255, 1)',
    paper_bgcolor='rgba(255, 255, 255, 1)',
    font={'color': 'black'},
)
# Export the current plotly figure as PNG (path is relative to the notebook's working directory)
fig.write_image("../../figures/scatter_plot.png")

In [None]:
# Apply the combined challenge filters to the loaded data
df_filtered = filter_data(df)
# info() prints its summary itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
df_filtered.info()

# Scatter plot of the filtered data: time_to_tca (y axis) over risk (x axis), coloured by event_id
fig = px.scatter(df_filtered, x="risk", y="time_to_tca", color="event_id")
fig.update_traces(marker=dict(size=4))
fig.show()

In [None]:
# White plot/paper background and black font for the exported image
fig.update_layout(
    plot_bgcolor='rgba(255, 255, 255, 1)',
    paper_bgcolor='rgba(255, 255, 255, 1)',
    font={'color': 'black'},
)
# Export the current plotly figure as PNG (path is relative to the notebook's working directory)
fig.write_image("../../figures/scatter_plot_filtered.png")

In [None]:
# Split the filtered data: "risk" is the target, every other column is a feature
target = df_filtered["risk"]
features = df_filtered.drop(columns=["risk"])

In [None]:
# Distribution of the target variable "risk" of filtered data

# White background with black grid lines; configured before the figure is created
# so the style is already active when the figure and axes are built.
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "grid.color": "black"})

risk_fig, risk_ax = plt.subplots(figsize=(8, 5))

# Histogram (30 bins) with a KDE overlay, drawn in blue
sns.histplot(target, bins=30, kde=True, color="blue", ax=risk_ax)

# Title and axis labels
risk_ax.set_title("Distribution of Risk", color="black")
risk_ax.set_xlabel("Risk (base 10 log)", color="black")
risk_ax.set_ylabel("Frequency", color="black")

# Save the histogram as a PNG image *before* showing it, and in the same cell:
# once plt.show() has run / the cell has finished, the figure is no longer the
# current pyplot figure, so a plt.savefig() call in a later cell writes a blank image.
risk_fig.savefig('../../figures/risk_distribution.png', bbox_inches='tight')

plt.show()

In [None]:
# Box plot of risk for each value of the categorical feature "c_object_type"
plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=df_filtered, x="c_object_type", y="risk")
ax.set_title("Box Plot of Risk by Object Type")
ax.set_xlabel("Object Type")
ax.set_ylabel("Risk (base 10 log)")
plt.show()

In [None]:
# Correlation heatmap of the filtered data.
# DataFrame.corr() uses the Pearson coefficient by default, which lies in [-1, 1]:
#   close to +1 -> positive correlation: both variables tend to increase together
#   close to -1 -> negative correlation: one increases while the other decreases
#   close to  0 -> no linear relationship between the variables

# NOTE(review): the return value is not used, so label_encode_column presumably
# encodes "c_object_type" in place on df_filtered — confirm against the helper.
label_encode_column(df_filtered, "c_object_type")
correlation_matrix = df_filtered.corr()

plt.figure(figsize=(20, 20))

ax = sns.heatmap(correlation_matrix, cmap="BrBG", annot=False, fmt=".2f")
ax.set_title("Correlation Matrix")
plt.show()

## NaN Values

In [None]:
# Reload the data via load_data() (no filtering is applied in this section)
df = load_data()

# Count missing values per column and list only the columns that have any
missing_values = df.isna().sum()
print("Columns with missing values:")
print(missing_values.loc[missing_values > 0])

In [None]:
# Heatmap of the boolean missing-value mask returned by df.isnull()
plt.figure(figsize=(20, 20))
ax = sns.heatmap(df.isnull(), cmap="inferno", cbar=False)
ax.set_title("Missing Values Heatmap")
plt.show()