# NO PCA and no NaN imputing

## Train-Test Split

In [1]:
import pandas as pd

from src.utils.filtering import filter_data
from src.utils.data_loading import load_data
from sklearn.model_selection import train_test_split
from src.utils.label_encoding import label_encode_column
from sklearn.preprocessing import StandardScaler

In [2]:
selected_features = [
    "geocentric_latitude",  # Latitude of conjunction point [deg]
    "c_sigma_rdot",  # covariance; radial velocity standard deviation (sigma) of chaser [m/s]
    "c_obs_used",  # number of observations used for orbit determination (per CDM) of chaser
    "c_time_lastob_start",
    # start of the time in days of the last accepted observation used in the orbit determination of chaser
    "c_time_lastob_end",
    # end of the time interval in days of the last accepted observation used in the orbit determination of chaser
    "mahalanobis_distance",  # The distance between the chaser and target
    "miss_distance",  # relative position between chaser & target at tca [m
    "time_to_tca",  # Time interval between CDM creation and time-of-closest approach [days]
    "t_cndot_r",
    # covariance; correlation of normal (cross-track) velocity vs radial position of chaser
    "c_cr_area_over_mass",
    # solar radiation coefficient . A/m (ballistic coefficient equivalent) of chaser
    "max_risk_estimate",  # maximum collision probability obtained by scaling combined covariance
    "c_span",  # size used by the collision risk computation algorithm of chaser [m]
    "max_risk_scaling",  # scaling factor used to compute maximum collision probability
    "t_rcs_estimate",  # radar cross-sectional area [m2m2] of target
    "c_sigma_t",
    # covariance; transverse (along-track) position standard deviation (sigma) of chaser [m]
    "c_obs_available",  # number of observations available for orbit determination (per CDM),
    "risk",
]

In [4]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
nan = np.nan
class StopExecution(Exception):
    def _render_traceback_(self):
        return []

# Load and data and filter it
df = load_data()
df_filtered = filter_data(df)



# Remove rows with missing values
#df_filtered.dropna(axis=0, how="any", inplace=True)

#print("hi")


# Label encode the categorical column "c_object_type"
label_encode_column(df_filtered, "c_object_type")


# Call the function to get the processed DataFrame
df_processed = df_filtered[selected_features]
arr_processed = df_processed.to_numpy()
imputer = KNNImputer(n_neighbors=2, weights="uniform")
arr_imputed=imputer.fit_transform(arr_processed)
#print(arr_imputed.shape)
df_imputed = pd.DataFrame(data=arr_imputed[0:,0:],index=[i for i in range(arr_imputed.shape[0])],columns=selected_features)
# Separate features and target variable
X = df_imputed.drop('risk', axis=1)
y = df_imputed['risk']

# Split the data into training, testing, and validation sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)

# Perform scaling after splitting the data
scaler = StandardScaler()

# Fit the scaler on the training set
X_train_scaled = scaler.fit_transform(X_train)

# Apply the same scaler to the validation and test sets
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Display the shapes of the sets
print("Full Training set shape:", X_train_full.shape, y_train_full.shape)
print("Testing set shape:", X_test.shape, y_test.shape)
print("Training set shape:", X_train_scaled.shape, y_train.shape)
print("Validation set shape:", X_val_scaled.shape, y_val.shape)


Raw data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162634 entries, 0 to 162633
Columns: 103 entries, event_id to AP
dtypes: float64(98), int64(4), object(1)
memory usage: 127.8+ MB
Filtered data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3032 entries, 0 to 3031
Columns: 103 entries, event_id to AP
dtypes: float64(98), int64(4), object(1)
memory usage: 2.4+ MB
(3032, 17)
Full Training set shape: (2425, 16) (2425,)
Testing set shape: (607, 16) (607,)
Training set shape: (1818, 16) (1818,)
Validation set shape: (607, 16) (607,)


Added the KNNI imputer in the above cell.

## Model Selection

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

# Choose Models
models = {
    # 'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor()
}

# Hyperparameter Tuning
param_grid = {
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
}

best_models = {}

for model_name, model in models.items():
    if model_name in param_grid:
        # Use GridSearchCV for hyperparameter tuning
        grid_search = GridSearchCV(model, param_grid[model_name], scoring='neg_mean_squared_error', cv=5)
        grid_search.fit(X_train_scaled, y_train)
        best_models[model_name] = grid_search.best_estimator_
    else:
        # For models without hyperparameters, use the default configuration
        best_models[model_name] = model.fit(X_train_scaled, y_train)


## Model Training:

In [6]:
# Train Models
for model_name, model in best_models.items():
    model.fit(X_train_scaled, y_train)

# Evaluate Performance
from sklearn.metrics import mean_squared_error

# Evaluate on training set
for model_name, model in best_models.items():
    y_pred_train = model.predict(X_train_scaled)
    mse_train = mean_squared_error(y_train, y_pred_train)
    print(f'{model_name} - MSE on training set: {mse_train}')

# Evaluate on validation set
for model_name, model in best_models.items():
    y_pred_val = model.predict(X_val_scaled)
    mse_val = mean_squared_error(y_val, y_pred_val)
    print(f'{model_name} - MSE on validation set: {mse_val}')


Random Forest - MSE on training set: 0.029886637427188643
Random Forest - MSE on validation set: 0.09465651937298585


In [7]:
from sklearn.metrics import mean_absolute_error

# Assuming y_pred is your predicted values and y_true is the true target values
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)

# Calculate MAE for training set
mae_train = mean_absolute_error(y_train, y_pred_train)

# Calculate MAE for validation set
mae_val = mean_absolute_error(y_val, y_pred_val)

# Print the MAE values
print(f"MAE on training set: {mae_train}")
print(f"MAE on validation set: {mae_val}")


MAE on training set: 16.625454950015342
MAE on validation set: 16.258605110271596




In [8]:
from sklearn.metrics import r2_score

# Calculate R-squared for training set
r2_train = r2_score(y_train, y_pred_train)

# Calculate R-squared for validation set
r2_val = r2_score(y_val, y_pred_val)

# Print the R-squared values
print(f"R-squared on training set: {r2_train}")
print(f"R-squared on validation set: {r2_val}")
print("hi")

R-squared on training set: -4.119008021062443
R-squared on validation set: -3.7891234703833687
hi


## Model Comparison:

## Fine-Tuning