In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings


# Notebook-wide setup: install a catch-all "ignore" filter so no warning of
# any category is shown in cell output.
# NOTE(review): this also hides warnings that may matter for the analysis
# (numerical issues, deprecations) — consider narrowing the filter.
warnings.simplefilter('ignore')

# Seed NumPy's legacy global RNG so any np.random calls are repeatable.
SEED = 42
np.random.seed(SEED)

print("Imports loaded")

Imports loaded


In [3]:
# Load the exoplanet table from a path relative to the working directory.
df = pd.read_csv('../data/Exoplanet_Dataset.csv')

# Model inputs and the regression target used throughout this notebook.
features = ['radius', 'orbital_period', 'star_mass']
target = 'mass'

# Keep only the modelling columns, then drop every row with a missing value
# in any of them.
stage1_columns = [*features, target]
df_stage1 = df[stage1_columns].dropna()

n_rows = df_stage1.shape[0]
print(f"Dataset shape: {df_stage1.shape}")
print(f"Samples available: {n_rows}")
print("\nFeature columns:")
print(df_stage1[features].describe())

Dataset shape: (1521, 4)
Samples available: 1521

Feature columns:
            radius  orbital_period    star_mass
count  1521.000000    1.521000e+03  1521.000000
mean      0.702337    5.583841e+03     0.960060
std       0.548382    2.061013e+05     0.323457
min       0.026000    6.655134e-02     0.015400
25%       0.211000    3.119063e+00     0.784000
50%       0.554000    5.095717e+00     0.980000
75%       1.160000    1.272070e+01     1.148000
max       6.900000    8.035500e+06     2.520000


In [4]:
# Store log1p(mass) alongside the raw target as a new column.
df_stage1['mass_log'] = np.log1p(df_stage1['mass'])

# Print skewness, mean and median for the raw target first, then for the
# transformed one (the second heading carries the blank separator line).
mass_reports = (
    ("Original mass stats:", 'mass'),
    ("\nLog-transformed mass stats:", 'mass_log'),
)
for report_title, report_col in mass_reports:
    values = df_stage1[report_col]
    print(report_title)
    print(f"  Skewness: {values.skew():.3f}")
    print(f"  Mean: {values.mean():.3f}")
    print(f"  Median: {values.median():.3f}")

Original mass stats:
  Skewness: 3.616
  Mean: 0.880
  Median: 0.168

Log-transformed mass stats:
  Skewness: 1.619
  Mean: 0.428
  Median: 0.155


In [5]:
# Design matrix (the three raw features) and the log1p-transformed target.
X = df_stage1[features]
y = df_stage1['mass_log']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

# Ordinary least squares on the untransformed features.
lr_baseline = LinearRegression()
lr_baseline.fit(X_train, y_train)
y_pred_baseline = lr_baseline.predict(X_test)

# All three metrics compare predictions with y_test, so they are expressed
# in log1p(mass) units rather than in raw mass.
rmse_baseline = np.sqrt(mean_squared_error(y_test, y_pred_baseline))
mae_baseline = mean_absolute_error(y_test, y_pred_baseline)
r2_baseline = r2_score(y_test, y_pred_baseline)

# Emit the whole report in one call; the trailing "" yields the blank line.
banner = "=" * 50
print(
    banner,
    "BASELINE: Linear Regression + Log Transform",
    banner,
    f"Test R²:   {r2_baseline:.4f}",
    f"Test MAE:  {mae_baseline:.4f}",
    f"Test RMSE: {rmse_baseline:.4f}",
    "",
    sep="\n",
)

Training samples: 1216
Test samples: 305
BASELINE: Linear Regression + Log Transform
Test R²:   0.3173
Test MAE:  0.2722
Test RMSE: 0.4179



In [6]:
# Degree-2 polynomial expansion of the inputs, without the constant (bias)
# column. The transformer is fitted on the training split and then applied
# unchanged to the test split. No feature scaling is applied in this cell.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Human-readable names of the generated columns, derived from `features`.
feature_names = poly.get_feature_names_out(features)
print(f"Original features: {len(features)}")
print(f"Polynomial features: {len(feature_names)}")
print("\nNew features created:")
for position, term in enumerate(feature_names, start=1):
    print(f"  {position}. {term}")

Original features: 3
Polynomial features: 9

New features created:
  1. radius
  2. orbital_period
  3. star_mass
  4. radius^2
  5. radius orbital_period
  6. radius star_mass
  7. orbital_period^2
  8. orbital_period star_mass
  9. star_mass^2


In [7]:
# Ordinary least squares on the degree-2 expanded training features.
lr_poly = LinearRegression()
lr_poly.fit(X_train_poly, y_train)
y_pred_poly = lr_poly.predict(X_test_poly)

# Same three test metrics as the baseline, computed against y_test.
rmse_poly = np.sqrt(mean_squared_error(y_test, y_pred_poly))
mae_poly = mean_absolute_error(y_test, y_pred_poly)
r2_poly = r2_score(y_test, y_pred_poly)

# Emit the whole report in one call; the trailing "" yields the blank line.
poly_banner = "=" * 50
print(
    poly_banner,
    "LINEAR REGRESSION + POLYNOMIAL (degree=2)",
    poly_banner,
    f"Test R²:   {r2_poly:.4f}",
    f"Test MAE:  {mae_poly:.4f}",
    f"Test RMSE: {rmse_poly:.4f}",
    "",
    sep="\n",
)

# Relative change in test R² versus the baseline, in percent.
# R² can be zero or negative, so the change is divided by |r2_baseline|:
# with a plain ratio a negative baseline would flip the sign (a better model
# would be reported as a negative "improvement"), and a zero baseline would
# divide by zero. For a positive baseline this equals the plain-ratio form
# (r2_poly / r2_baseline - 1) * 100.
r2_delta = r2_poly - r2_baseline
if r2_baseline != 0:
    improvement = r2_delta / abs(r2_baseline) * 100
    print(f"Improvement over baseline: {improvement:+.1f}%")
else:
    # A percentage is undefined here; report the absolute change instead.
    improvement = float('nan')
    print(f"Improvement over baseline: n/a (baseline R² is 0, absolute change {r2_delta:+.4f})")

LINEAR REGRESSION + POLYNOMIAL (degree=2)
Test R²:   0.2632
Test MAE:  0.2773
Test RMSE: 0.4342

Improvement over baseline: -17.0%
