In [1]:
from src.utils.check_mps_device import check_mps_device
import pandas as pd

from src.utils.data_loading import load_data
from src.utils.filtering import filter_data

from src.utils.label_encoding import label_encode_column

# Optional: uncomment to make pandas display all rows and columns
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

# Check whether PyTorch's Metal Performance Shaders (MPS) backend, i.e. the Apple GPU, is available.
# NOTE(review): check_mps_device is a project helper; judging by the recorded output it
# prints a tensor placed on the "mps" device — confirm against src.utils.check_mps_device.
check_mps_device()

tensor([1.], device='mps:0')


In [2]:
# Load the raw dataset, then apply the project's filter to get the subset analysed below.
# NOTE(review): both helpers live in src.utils; the recorded output suggests each one
# prints a DataFrame.info() summary ("Raw data" / "Filtered data") — confirm.
df = load_data()
df_filtered = filter_data(df)

Raw data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162634 entries, 0 to 162633
Columns: 103 entries, event_id to AP
dtypes: float64(98), int64(4), object(1)
memory usage: 127.8+ MB
Filtered data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3032 entries, 0 to 3031
Columns: 103 entries, event_id to AP
dtypes: float64(98), int64(4), object(1)
memory usage: 2.4+ MB


In [3]:
# Label encode categorical columns (e.g., "c_object_type") so they can take part
# in the correlation.
# NOTE(review): the return value is unused, so label_encode_column presumably
# modifies df_filtered in place — confirm against src.utils.label_encoding.
label_encode_column(df_filtered, "c_object_type")

# Calculate the pairwise correlation matrix of all columns
correlation_matrix = df_filtered.corr()

# Rank features by absolute correlation with the target variable ("risk").
target_correlation = correlation_matrix["risk"].abs().sort_values(ascending=False)

# Exclude the target by label, not by position: slicing off the first row drops
# the wrong entry whenever another column ties with the target's self-correlation
# of 1.0, or when the target's own entry is NaN and therefore sorted last.
best_features = target_correlation.drop("risk")
top_features = best_features.head(10)
print("Top Features:")
print(top_features)

Top Features:
c_obs_used              0.388148
c_time_lastob_end       0.351368
c_obs_available         0.322001
c_time_lastob_start     0.309940
c_span                  0.277618
miss_distance           0.247441
time_to_tca             0.235292
mahalanobis_distance    0.222178
max_risk_estimate       0.214653
t_h_apo                 0.206298
Name: risk, dtype: float64


In [4]:
from sklearn.feature_selection import mutual_info_regression

# mutual_info_regression adds small random noise to continuous features, so
# without a fixed random_state the scores (and the resulting ranking) differ
# from run to run.
MI_SEED = 42

# Load data again so this cell does not depend on frames modified by earlier cells
df = load_data()

# Keep only complete rows (the estimator does not accept NaN). Chaining dropna()
# produces a new frame instead of mutating the filtered frame in place.
df_filtered = filter_data(df).dropna(axis=0, how="any")

# Label encode categorical columns (e.g., "c_object_type")
label_encode_column(df_filtered, "c_object_type")

# Every column except the target is treated as a candidate feature
features = df_filtered.drop(columns=["risk"])
target = df_filtered["risk"]

# Calculate mutual information between features and target (seeded for reproducibility)
mutual_info_scores = mutual_info_regression(features, target, random_state=MI_SEED)

# Create a DataFrame with feature names and their mutual information scores
mi_df = pd.DataFrame({"Feature": features.columns, "Mutual_Info_Score": mutual_info_scores})
mi_df = mi_df.sort_values(by="Mutual_Info_Score", ascending=False)

# Print the features with the highest mutual information scores
print("Features with the highest mutual information scores:")
print(mi_df.head(10))


Raw data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162634 entries, 0 to 162633
Columns: 103 entries, event_id to AP
dtypes: float64(98), int64(4), object(1)
memory usage: 127.8+ MB
Filtered data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3032 entries, 0 to 3031
Columns: 103 entries, event_id to AP
dtypes: float64(98), int64(4), object(1)
memory usage: 2.4+ MB
Features with the highest mutual information scores:
                 Feature  Mutual_Info_Score
4       max_risk_scaling           0.458331
3      max_risk_estimate           0.383761
89             c_sigma_t           0.247939
93          c_sigma_rdot           0.235979
21        t_rcs_estimate           0.232890
55                c_sedr           0.201011
83  mahalanobis_distance           0.200237
25             t_j2k_sma           0.199910
16      t_actual_od_span           0.196539
27             t_j2k_inc           0.192831


In [5]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

# mutual_info_regression adds small random noise to continuous features, so
# the scoring function is seeded to make the selected set reproducible.
MI_SEED = 42


def _seeded_mutual_info(X, y):
    """Score features with mutual_info_regression using the fixed MI_SEED."""
    return mutual_info_regression(X, y, random_state=MI_SEED)


# Load data again so this cell does not depend on frames modified by earlier cells
df = load_data()

# Keep only complete rows (the estimator does not accept NaN). Chaining dropna()
# produces a new frame instead of mutating the filtered frame in place.
df_filtered = filter_data(df).dropna(axis=0, how="any")

# Label encode categorical columns (e.g., "c_object_type")
label_encode_column(df_filtered, "c_object_type")

# Every column except the target is treated as a candidate feature
features = df_filtered.drop(columns=["risk"])
target = df_filtered["risk"]

# Feature selection using SelectKBest with (seeded) mutual_info_regression
selector = SelectKBest(score_func=_seeded_mutual_info, k=10)
X_selected = selector.fit_transform(features, target)

# Get the selected feature names
selected_feature_names = features.columns[selector.get_support()]

# Create a DataFrame with selected features. fit_transform returns a bare array,
# so the original row index is passed back in; otherwise the frame would get a
# fresh RangeIndex that no longer lines up with `features` / `target` after dropna.
selected_features_df = pd.DataFrame(
    X_selected, columns=selected_feature_names, index=features.index
)

# Print selected feature names
print("Selected Feature Names:")
print(selected_feature_names)

Raw data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162634 entries, 0 to 162633
Columns: 103 entries, event_id to AP
dtypes: float64(98), int64(4), object(1)
memory usage: 127.8+ MB
Filtered data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3032 entries, 0 to 3031
Columns: 103 entries, event_id to AP
dtypes: float64(98), int64(4), object(1)
memory usage: 2.4+ MB
Selected Feature Names:
Index(['max_risk_estimate', 'max_risk_scaling', 't_rcs_estimate',
       't_cd_area_over_mass', 't_j2k_sma', 't_j2k_inc', 'c_sedr',
       'mahalanobis_distance', 'c_sigma_t', 'c_sigma_rdot'],
      dtype='object')


In [6]:
# Inputs come from the earlier cells: 'mi_df' is a DataFrame of mutual information
# scores, 'top_features' is a Series indexed by feature name, and
# 'selected_feature_names' is an Index of column names.

# Names of the ten best features by mutual information, and of the correlation-ranked ones
mi_features = set(mi_df["Feature"].head(10))
top_features_set = set(top_features.index)

# Names kept by SelectKBest
selected_features_set = set(selected_feature_names)

# Features that all three selection methods agree on
common_features = mi_features & top_features_set & selected_features_set

# Report the overlap
print("Common Features:", common_features)


Common Features: {'mahalanobis_distance', 'max_risk_estimate'}
