In [None]:
from src.utils.filtering import filter_data
from src.utils.label_encoding import label_encode_column
from src.utils.data_loading import load_data
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

df = load_data(data_path="../../data/testing/test_data.csv")
print(df.head())
df_filtered = filter_data(df)

df_filtered.dropna(axis=0, how='any', inplace=True)

label_encode_column(df_filtered, "c_object_type")

features = df_filtered.drop(["risk"], axis=1)
target = df_filtered["risk"]

# Assuming 'selected_features' is a DataFrame and 'target' is a Series
selected_features = ['time_to_tca', 'mahalanobis_distance', 'max_risk_estimate']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_filtered[selected_features], target, test_size=0.2, random_state=42)

# Train the model (Random Forest as an example)
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Transform predictions and true values back to the original scale
# y_test_original = 10 ** y_test
# y_pred_original = 10 ** y_pred
# 
# # Calculate MSE in the original scale
# mse_original = mean_squared_error(y_test_original, y_pred_original)
# print("Mean Squared Error in Original Scale:", mse_original)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error in Original Scale:", mse)


In [9]:
from imblearn.over_sampling import SMOTE
from src.utils.filtering import filter_data
from src.utils.label_encoding import label_encode_column
from src.utils.data_loading import load_data
from sklearn.model_selection import train_test_split

df = load_data(data_path="../../data/testing/test_data.csv")
print(df.head())
df_filtered = filter_data(df)

df_filtered.dropna(axis=0, how='any', inplace=True)

label_encode_column(df_filtered, "c_object_type")

X = df_filtered.drop(["risk"], axis=1)
y = df_filtered["risk"]

# Assuming 'selected_features' is a DataFrame and 'target' is a Series
selected_features = ['time_to_tca', 'mahalanobis_distance', 'max_risk_estimate']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_filtered[selected_features], y, test_size=0.2, random_state=42)

# Convert y_train to a numpy array and then reshape it
y_train = y_train.values.reshape(-1,)

sm = SMOTE(random_state=42, k_neighbors=58)
X_train, y_train = sm.fit_resample(X_train, y_train.re)
print(X_train.shape)
print(y_train.shape)


Raw data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24484 entries, 0 to 24483
Columns: 103 entries, event_id to AP
dtypes: float64(98), int64(4), object(1)
memory usage: 19.2+ MB
   event_id  time_to_tca  mission_id      risk  max_risk_estimate  \
0         0     6.842095          19 -7.296967          -7.208941   
1         0     6.571818          19 -7.282496          -7.199833   
2         0     6.112986          19 -7.316053          -7.217886   
3         0     5.921955          19 -7.334138          -7.228707   
4         0     2.228761          19 -7.332267          -7.227312   

   max_risk_scaling  miss_distance  relative_speed  relative_position_r  \
0          1.787894        31816.0          7929.0               -365.5   
1          1.759386        31095.0          7929.0               -361.8   
2          1.824263        32202.0          7929.0               -370.7   
3          1.865396        32878.0          7929.0               -376.3   
4          1.863127     

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
from sklearn.metrics import r2_score

r2 = r2_score(y_test_original, y_pred_original)
print("R-squared Score:", r2)
 

In [None]:
import matplotlib.pyplot as plt

plt.scatter(y_test_original, y_pred_original)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values")
plt.show()


In [None]:
residuals = y_test_original - y_pred_original
plt.scatter(y_test_original, residuals)
plt.axhline(y=0, color='red', linestyle='--')
plt.xlabel("True Values")
plt.ylabel("Residuals")
plt.title("Residuals Plot")
plt.show()


In [None]:
import seaborn as sns

sns.histplot(residuals, bins=30, kde=True)
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Distribution of Residuals")
plt.show()
