# Data Preprocessing

In [None]:
from src.utils.check_mps_device import check_mps_device
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from src.utils.data_loading import load_data
from src.utils.filtering import filter_data
from src.utils.label_encoding import label_encode_column

# Check whether PyTorch's MPS device (GPU acceleration) is available.
# NOTE(review): in PyTorch, "mps" normally refers to Apple's Metal Performance
# Shaders backend, not a "Multi-Process Service" — confirm against
# src/utils/check_mps_device.py.
check_mps_device()

In [None]:
# Columns kept for preprocessing. Each entry is preceded by a short description
# of the column; the final entry, "risk", is the target variable.
selected_features = [
    # Latitude of the conjunction point [deg]
    "geocentric_latitude",
    # Covariance; radial velocity standard deviation (sigma) of chaser [m/s]
    "c_sigma_rdot",
    # Number of observations used for orbit determination (per CDM) of chaser
    "c_obs_used",
    # Start of the time interval, in days, of the last accepted observation
    # used in the orbit determination of chaser
    "c_time_lastob_start",
    # End of the time interval, in days, of the last accepted observation
    # used in the orbit determination of chaser
    "c_time_lastob_end",
    # The distance between the chaser and target
    "mahalanobis_distance",
    # Relative position between chaser and target at TCA [m]
    "miss_distance",
    # Time interval between CDM creation and time of closest approach [days]
    "time_to_tca",
    # Covariance; correlation of normal (cross-track) velocity vs radial position.
    # NOTE(review): previously described as "of chaser", but the "t_" prefix
    # suggests the target object — confirm against the dataset documentation.
    "t_cndot_r",
    # Solar radiation coefficient * A/m (ballistic coefficient equivalent) of chaser
    "c_cr_area_over_mass",
    # Maximum collision probability obtained by scaling the combined covariance
    "max_risk_estimate",
    # Size used by the collision risk computation algorithm of chaser [m]
    "c_span",
    # Scaling factor used to compute the maximum collision probability
    "max_risk_scaling",
    # Radar cross-sectional area of target [m^2]
    "t_rcs_estimate",
    # Covariance; transverse (along-track) position standard deviation (sigma)
    # of chaser [m]
    "c_sigma_t",
    # Number of observations available for orbit determination (per CDM)
    "c_obs_available",
    # Target variable
    "risk",
]

### a. Data Exploration:

In [None]:
# Load the data, then apply the project's filtering step to it.
df = load_data()
df_filtered = filter_data(df)

# Label encode the categorical column "c_object_type".
# NOTE(review): the return value is discarded, so this presumably modifies
# df_filtered in place — confirm against label_encode_column.
label_encode_column(df_filtered, "c_object_type")

In [None]:
# Histogram (30 bins) of the target variable "risk"; labels are set on the
# Axes object that the pandas plotting call returns.
risk_ax = df_filtered["risk"].hist(bins=30)
risk_ax.set_title("Distribution of Risk")
risk_ax.set_xlabel("Risk")
risk_ax.set_ylabel("Frequency")
plt.show()

In [None]:
# One 20-bin histogram per selected feature, laid out on a 15x12 inch figure.
df_filtered.hist(column=selected_features, figsize=(15, 12), bins=20)
plt.suptitle("Distribution of Selected Features")
plt.show()

### b. Handling Missing Values (NaN)

In [None]:
# Restrict the filtered frame to the selected feature columns.
df_filtered_selected = df_filtered[selected_features]

# Count NaNs per column and report only the columns that contain at least one.
missing_before_imputation = df_filtered_selected.isna().sum()
print("Missing values before imputation:")
print(missing_before_imputation.loc[missing_before_imputation.gt(0)])

In [None]:
# Heatmap of the boolean NaN mask of the selected features (rows x columns),
# drawn on an explicit 8x8 inch figure without a colour bar.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(df_filtered_selected.isna(), cmap="inferno", cbar=False, ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

#### Imputing with RandomForestRegressor

In [None]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# Columns of df_filtered_selected that are imputed. Only these three columns
# are handed to the imputer, so each one is estimated from the other two.
features_with_nan = ["c_sigma_rdot", "t_cndot_r", "t_rcs_estimate"]

# Iterative imputer that models each column with a random forest.
# The forest is seeded explicitly as well, so the imputed values are
# reproducible regardless of whether the imputer forwards its own seed to the
# estimator.
imputer = IterativeImputer(
    estimator=RandomForestRegressor(random_state=42), max_iter=10, random_state=42
)

# Work on a copy so df_filtered_selected keeps its original (pre-imputation)
# values.
df_filtered_selected_copy = df_filtered_selected.copy()

# fit_transform returns a bare NumPy array. Re-attach the source frame's index:
# without it the new frame would get a fresh 0..n-1 RangeIndex, and the
# label-aligned assignment below would write values to the wrong rows (or leave
# NaNs) whenever filtering has left a non-contiguous index.
df_filtered_imputed = pd.DataFrame(
    imputer.fit_transform(df_filtered_selected_copy[features_with_nan]),
    columns=features_with_nan,
    index=df_filtered_selected_copy.index,
)

# Write the imputed columns back; rows are matched by index label.
df_filtered_selected_copy[features_with_nan] = df_filtered_imputed

In [None]:
# Take an independent copy of the imputed frame.
df_new = df_filtered_selected_copy.copy()

# Count NaNs per column after imputation; only columns that still contain
# missing values are printed.
missing_after_imputation = df_new.isna().sum()
print("Missing values after imputation:")
print(missing_after_imputation.loc[missing_after_imputation.gt(0)])

In [None]:
# Rebuild the pre-imputation frame from scratch so it can be compared with the
# imputed one.
# NOTE(review): this reloads the data and rebinds `df`; df_filtered_selected
# appears to still hold the pre-imputation values, so the reload may be
# redundant — confirm before removing.
df = load_data()
df_filtered_comp = filter_data(df)
df_filtered_selected_comp = df_filtered_comp[selected_features]

# describe() summaries of the imputed columns, before and after imputation.
stats_before_imputation = df_filtered_selected_comp[features_with_nan].describe()
stats_after_imputation = df_new[features_with_nan].describe()

# Relative change of every summary statistic, expressed in percent:
# (after - before) / before * 100.
percentage_change = (
    stats_after_imputation.sub(stats_before_imputation)
    .div(stats_before_imputation)
    .mul(100)
)

# Display the results
print("\nPercentage Change After Imputation:")
print(percentage_change)