# SPACECODE_PS3_BEETLEJUICE2.0

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, recall_score, precision_score, f1_score, roc_curve
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import xgboost as xgb
import lightgbm as lgb
import time
import warnings
# Silence every warning for the rest of the session, including any deprecation
# or convergence warnings the libraries used below may emit.
warnings.filterwarnings('ignore')

# Plot appearance: apply the Matplotlib style sheet first, then set the
# seaborn colour palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

In [None]:
# Read the NeoWs CSV from its fixed local path. low_memory=False makes pandas
# parse the file in a single pass so column dtypes are inferred consistently.
csv_path = r'd:\as\nasa_neows_1950_2025.csv'
df = pd.read_csv(csv_path, low_memory=False)

n_rows, n_cols = df.shape
print(f"Dataset: {n_rows} rows, {n_cols} columns")

In [None]:
# Columns removed before modelling. Only the ones actually present in the
# loaded frame are dropped, so a missing column never raises.
columns_to_drop = [
    'Neo Reference ID',
    'Name',
    'Date',
    'Close Approach Date',
    'Orbit Determination Date',
    'Equinox',
    'Orbiting Body',
]
present_columns = set(df.columns)
cols_to_drop = [col for col in columns_to_drop if col in present_columns]
df = df.drop(cols_to_drop, axis=1)

In [None]:
# --- Missing-value handling and target encoding -----------------------------
target_col = 'Hazardous'

# Median-impute the numeric *feature* columns only. The target is excluded on
# purpose: if 'Hazardous' is stored numerically, imputing it would invent
# labels for unlabeled rows and make the dropna() below a no-op.
# NOTE(review): medians are computed on the full dataset, i.e. before the
# train/test split performed in a later cell.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
numeric_cols = numeric_cols.drop(target_col, errors='ignore')
for col in numeric_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].median())

# Rows without a label cannot be used for supervised training.
df = df.dropna(subset=[target_col])

# Normalise the label to 0/1. Comparing on the lower-cased string form accepts
# booleans, 'True'/'False' strings (any case, surrounding whitespace ignored)
# and 0/1 numbers alike.
label_lookup = {
    'true': 1, 'false': 0,
    '1': 1, '0': 0,
    '1.0': 1, '0.0': 0,
}
label_text = df[target_col].astype(str).str.strip().str.lower()
encoded = label_text.map(label_lookup)

# Series.map leaves unmapped values as NaN; fail with a readable message
# instead of the opaque NaN-to-int casting error.
unknown = encoded.isna()
if unknown.any():
    examples = df.loc[unknown, target_col].unique()[:5].tolist()
    raise ValueError(
        f"Unrecognized values in '{target_col}' column: {examples}"
    )

# Explicit int64: a plain `int` cast can produce int32 on some platforms,
# which the float64/int64 dtype selection in the next cell would not match.
df[target_col] = encoded.astype('int64')

print(f"Safe: {(df['Hazardous'] == 0).sum()}, Hazardous: {(df['Hazardous'] == 1).sum()}")

In [None]:
# Keep an unscaled copy so the exploratory plots show values in original units.
df_viz = df.copy()

# Numeric feature columns, with the target filtered out. Filtering (rather than
# list.remove) avoids a ValueError when 'Hazardous' is not int64/float64 and
# therefore is not part of the selection in the first place.
feature_cols = [
    col
    for col in df.select_dtypes(include=['float64', 'int64']).columns
    if col != 'Hazardous'
]

# Standardise every feature to zero mean / unit variance, in place on `df`.
# NOTE(review): the scaler is fit on the full dataset here, before the
# train/test split in a later cell, so held-out rows influence the scaling
# statistics.
scaler = StandardScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])

In [None]:
# Overlaid histograms of absolute magnitude for the two classes, drawn from the
# unscaled copy of the data.
fig, ax = plt.subplots(figsize=(10, 6))

magnitude = df_viz['Absolute Magnitude']
hazardous = magnitude[df_viz['Hazardous'] == 1]
safe = magnitude[df_viz['Hazardous'] == 0]

# Safe is drawn first so the hazardous bars sit on top of it.
for values, label, color in (
    (safe, 'Safe', '#2ecc71'),
    (hazardous, 'Hazardous', '#e74c3c'),
):
    ax.hist(values, bins=50, alpha=0.7, label=label, color=color, edgecolor='white')

# Vertical reference line at H = 22.0, labelled as the NASA threshold.
ax.axvline(x=22.0, color='#f39c12', linestyle='--', linewidth=2.5, label='NASA Threshold (H=22.0)')

ax.set(
    xlabel='Absolute Magnitude (H)',
    ylabel='Frequency',
    title='Absolute Magnitude Distribution',
)
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Candidate columns for the correlation heatmap; any that are absent from the
# frame are skipped.
key_features = [
    'Absolute Magnitude',
    'Est Dia in KM(max)',
    'Relative Velocity km per sec',
    'Miss Dist.(Astronomical)',
    'Minimum Orbit Intersection',
    'Eccentricity',
    'Semi Major Axis',
    'Inclination',
    'Orbital Period',
    'Hazardous',
]
available_columns = df_viz.columns
key_features = [f for f in key_features if f in available_columns]

corr_matrix = df_viz[key_features].corr()

# Hide the strict upper triangle (k=1 keeps the diagonal visible) because the
# matrix is symmetric.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    square=True,
    ax=ax,
)
ax.set_title('Feature Correlations')
plt.tight_layout()
plt.show()

In [None]:
# Separate features from the binary target.
X = df.drop('Hazardous', axis=1)
y = df['Hazardous']

# Stratified 80/20 split so both partitions keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# The features in `df` were standardised using statistics of the *whole*
# dataset, which leaks information from the held-out rows into training.
# Re-standardising with train-only statistics removes that leak: composing two
# standardisations is equivalent to a single one fit on the training rows.
# Explicit copies keep the assignments below from touching `X` / `df`.
X_train, X_test = X_train.copy(), X_test.copy()
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
if len(numeric_features) > 0:
    train_scaler = StandardScaler().fit(X_train[numeric_features])
    X_train[numeric_features] = train_scaler.transform(X_train[numeric_features])
    X_test[numeric_features] = train_scaler.transform(X_test[numeric_features])

print(f"Imbalance ratio: {(y == 0).sum() / (y == 1).sum():.1f}:1")

In [None]:
# Oversample the training split with SMOTE (test data is left untouched).
# NOTE(review): the training loop in the cells shown here fits on
# X_train / y_train, not on these resampled arrays.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

counts_before_smote = Counter(y_train)
counts_after_smote = Counter(y_train_smote)
print(f"SMOTE - Before: {counts_before_smote}, After: {counts_after_smote}")

In [None]:
# Randomly undersample the training split (test data is left untouched).
# NOTE(review): the training loop in the cells shown here fits on
# X_train / y_train, not on these resampled arrays.
undersample = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)

counts_before_under = Counter(y_train)
counts_after_under = Counter(y_train_under)
print(f"Undersampling - Before: {counts_before_under}, After: {counts_after_under}")

In [None]:
# Negative-to-positive ratio in the training labels; passed to XGBoost as
# scale_pos_weight. The other models use class_weight='balanced'.
class_weight_ratio = y_train.eq(0).sum() / y_train.eq(1).sum()

logistic_regression = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
random_forest = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    max_depth=15,
    random_state=42,
    n_jobs=-1,
)
xgboost_classifier = xgb.XGBClassifier(
    n_estimators=100,
    scale_pos_weight=class_weight_ratio,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
)
lightgbm_classifier = lgb.LGBMClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)

# Display name -> unfitted estimator; insertion order is the training order.
models = {
    'Logistic Regression': logistic_regression,
    'Random Forest': random_forest,
    'XGBoost': xgboost_classifier,
    'LightGBM': lightgbm_classifier,
}

In [None]:
# Fit each model on the training split and score it on the test split.
# One dict per model is collected in `results`.
results = []

# Metrics computed from hard predictions; AUC is handled separately because it
# needs the positive-class probabilities.
label_scorers = {
    'Recall': recall_score,
    'Precision': precision_score,
    'F1': f1_score,
}

for name, model in models.items():
    # Wall-clock time of the fit call only.
    start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]  # probability of class 1

    metrics = {
        'Model': name,
        **{metric: scorer(y_test, y_pred) for metric, scorer in label_scorers.items()},
        'AUC-ROC': roc_auc_score(y_test, y_proba),
        'Time': train_time,
        'Probabilities': y_proba,  # kept for the ROC curves plotted later
    }
    results.append(metrics)
    print(f"{name}: Recall={metrics['Recall']:.4f}, AUC={metrics['AUC-ROC']:.4f}")

In [None]:
# Tabulate the per-model metrics, best recall first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('Recall', ascending=False)

summary_columns = ['Model', 'Recall', 'Precision', 'F1', 'AUC-ROC']
print(results_df[summary_columns].to_string(index=False))

In [None]:
# Two side-by-side panels: ROC curves (left) and a Recall / AUC bar chart (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left panel: one ROC curve per model, from the stored test-set probabilities.
for r in results:
    fpr, tpr, _ = roc_curve(y_test, r['Probabilities'])
    curve_label = f"{r['Model']} (AUC={r['AUC-ROC']:.3f})"
    ax1.plot(fpr, tpr, label=curve_label, linewidth=2)
ax1.plot([0, 1], [0, 1], 'k--', alpha=0.5)  # diagonal reference line
ax1.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curves',
)
ax1.legend(loc='lower right')

# Right panel: grouped bars in the same (recall-sorted) order as results_df.
x = np.arange(len(results_df))
width = 0.35
for offset, metric, color in (
    (-width / 2, 'Recall', '#e74c3c'),
    (width / 2, 'AUC-ROC', '#3498db'),
):
    ax2.bar(x + offset, results_df[metric], width, label=metric, color=color)
ax2.set_xticks(x)
ax2.set_xticklabels(results_df['Model'], rotation=15, ha='right')
ax2.set_title('Model Comparison')
ax2.legend()

plt.tight_layout()
plt.show()

In [None]:
# The first row of results_df is the model with the highest recall.
best_model_name = results_df['Model'].iloc[0]
best_model = models[best_model_name]
y_pred_best = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_best)

# Row-normalise so each cell is the share of that actual class.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = cm / row_totals

class_names = ['Safe', 'Hazardous']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_norm,
    annot=True,
    fmt='.2%',
    cmap='RdYlGn_r',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Feature importances are always taken from the fitted XGBoost model,
# regardless of which model ranked best above.
xgb_importances = models['XGBoost'].feature_importances_
importance = pd.DataFrame({'Feature': X.columns, 'Importance': xgb_importances})
importance = importance.sort_values('Importance', ascending=False)

top = importance.head(10)
bar_positions = range(len(top))
bar_colors = plt.cm.RdYlGn_r(np.linspace(0.2, 0.8, len(top)))

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, top['Importance'], color=bar_colors)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top['Feature'])
ax.invert_yaxis()  # most important feature at the top
ax.set(xlabel='Importance', title='Top 10 Feature Importance')
plt.tight_layout()
plt.show()

In [None]:
# Persist the scalar metrics; the per-row probability arrays are left out.
output_columns = ['Model', 'Recall', 'Precision', 'F1', 'AUC-ROC', 'Time']
output_path = r'd:\as\final_results.csv'
results_df[output_columns].to_csv(output_path, index=False)
print("Results saved.")