In [17]:
from src.utils.check_mps_device import check_mps_device

from src.utils.data_loading import load_data
from src.utils.filtering import filter_data

from src.utils.label_encoding import label_encode_column

import numpy as np

# Check if PyTorch Multi-Process Service (MPS) is available (GPU)
check_mps_device()

tensor([1.], device='mps:0')


In [18]:
selected_features = [
    "geocentric_latitude",  # Latitude of conjunction point [deg]
    "c_sigma_rdot",  # covariance; radial velocity standard deviation (sigma) of chaser [m/s]
    "c_obs_used",  # number of observations used for orbit determination (per CDM) of chaser
    "c_time_lastob_start",
    # start of the time in days of the last accepted observation used in the orbit determination of chaser
    "c_time_lastob_end",
    # end of the time interval in days of the last accepted observation used in the orbit determination of chaser
    "mahalanobis_distance",  # The distance between the chaser and target
    "miss_distance",  # relative position between chaser & target at tca [m
    "time_to_tca",  # Time interval between CDM creation and time-of-closest approach [days]
    "t_cndot_r",
    # covariance; correlation of normal (cross-track) velocity vs radial position of chaser
    "c_cr_area_over_mass",
    # solar radiation coefficient . A/m (ballistic coefficient equivalent) of chaser
    "max_risk_estimate",  # maximum collision probability obtained by scaling combined covariance
    "c_span",  # size used by the collision risk computation algorithm of chaser [m]
    "max_risk_scaling",  # scaling factor used to compute maximum collision probability
    "t_rcs_estimate",  # radar cross-sectional area [m2m2] of target
    "c_sigma_t",
    # covariance; transverse (along-track) position standard deviation (sigma) of chaser [m]
    "c_obs_available",  # number of observations available for orbit determination (per CDM),
    "risk",
]

In [19]:
# Load and data and filter it
df = load_data()
df_filtered = filter_data(df)

# Remove rows with missing values
df_filtered.dropna(axis=0, how="any", inplace=True)

# Label encode the categorical column "c_object_type"
label_encode_column(df_filtered, "c_object_type")

# Call the function to get the processed DataFrame
df_processed = df_filtered[selected_features]

# Separate features and target variable
X = df_processed.drop('risk', axis=1)  # features (17)
y = df_processed['risk']  # target

Raw data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162634 entries, 0 to 162633
Columns: 103 entries, event_id to AP
dtypes: float64(98), int64(4), object(1)
memory usage: 127.8+ MB
Filtered data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3032 entries, 0 to 3031
Columns: 103 entries, event_id to AP
dtypes: float64(98), int64(4), object(1)
memory usage: 2.4+ MB


In [20]:
from sklearn.model_selection import train_test_split

y_class = np.where(y >= -6, 1, 0)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True,
                                                    stratify=y_class)
# 
# X_train, X_val, y_train_real, y_val_real = train_test_split(X_t, y_t_real, 
#                                                             test_size=0.20, 
#                                                             random_state=21,  
#                                                             shuffle=True,
#                                                             stratify=y_t_class)
# 
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1058, 16)
(1058,)
(265, 16)
(265,)


## SMOTE

In [21]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data to balance the classes
y_train = np.where(y_train >= -6, 1, 0)

smote = SMOTE(random_state=42, k_neighbors=20)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [22]:
from sklearn.preprocessing import StandardScaler

# Perform scaling after splitting the data
scaler = StandardScaler()

# Fit the scaler on the training set
X_train_final = scaler.fit_transform(X_train_resampled)
y_train_final = y_train_resampled

X_test_final = scaler.transform(X_test)
y_test_final = y_test
# # Display the shapes of the sets
print("Testing set shape:", X_test_final.shape, y_test_final.shape)
print("Training set shape:", X_train_final.shape, y_train_final.shape)

Testing set shape: (265, 16) (265,)
Training set shape: (1962, 16) (1962,)


## Model Selection

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hyperparameter Tuning
param_grid = {
    'Random Forest': {
        'n_estimators': [50, 100],
        'max_depth': [None, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }
}

best_models = {}

# Only consider Random Forest Regressor
model_name = 'Random Forest'
model = RandomForestRegressor()

if model_name in param_grid:
    # Use GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(model, param_grid[model_name], scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(X_train_final, y_train_final)
    best_models[model_name] = grid_search.best_estimator_
else:
    # For Random Forest Regressor without hyperparameters, use the default configuration
    best_models[model_name] = model.fit(X_train_final, y_train_final)

In [24]:
from src.utils.f_beta import calculate_f_beta_sklearn

# Create dictionaries to store results
f_beta_scores = {}
mse_hr_scores = {}

# Train the Random Forest model
best_model_name = 'Random Forest'

# Predict on the test set
y_pred_test = best_models[model_name].predict(X_test_final)
y_pred = np.where(y_pred_test >= -6, -5, -6.00001)

# Check the shapes before calculating the scores
print(f"Shapes before calculation for {best_model_name}:")
print("y_test_final shape:", y_test_final.shape)
print("y_pred shape:", y_pred.shape)

# Calculate F-beta and MSE_HR scores
score2, f_beta2, mse_hr2 = calculate_f_beta_sklearn(y_test_final, y_pred, beta=2, threshold=-6)

print(f"Best Model: {best_model_name}")
print(f"Score: {score2}")
print(f"F-beta Score: {f_beta2}")
print(f"MSE_HR Score: {mse_hr2}")

Shapes before calculation for Random Forest:
y_test_final shape: (265,)
y_pred shape: (265,)
Best Model: Random Forest
Score: 1.289534146107888
F-beta Score: 0.2785923753665689
MSE_HR Score: 0.3592543808804966


In [25]:
print(y_pred)

[-5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5.
 -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5.
 -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5.
 -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5.
 -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5.
 -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5.
 -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5.
 -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5.
 -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5.
 -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5.
 -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5.
 -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5.
 -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5.
 -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5. -5

In [33]:
print(y_pred_test)

[0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.98 0.   0.   0.   0.   0.   0.   0.   0.   0.02 0.   0.   0.
 0.   0.   0.   1.   0.   0.   0.   0.   0.01 0.   0.02 0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.58 0.02 0.   0.   0.   0.   1.   0.
 0.01 0.   0.   0.   0.   0.   0.   0.   0.   0.5  0.   0.   0.   0.27
 0.13 0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.
 0.03 1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.56 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.96 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.99 0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.
 0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   1

In [31]:
y_test_final

2790    -8.111821
1921   -30.000000
1990   -30.000000
556    -10.236048
1829    -9.498804
          ...    
249     -5.264880
2387   -30.000000
1391    -5.813326
2202   -11.787280
1982    -4.388595
Name: risk, Length: 265, dtype: float64