In [1]:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Generate synthetic data
X, y = make_regression(n_samples=1000, n_features=5, noise=0.1, random_state=42)


In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [7]:
# Define regression models
models = {
    "Linear Regression": LinearRegression(),
    "Elastic Net": ElasticNet(random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42)
}

In [8]:
# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2

In [9]:
# Train and evaluate models
results = {}
for name, model in models.items():
    mse, r2 = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)
    results[name] = {"MSE": mse, "R2": r2}
    
    # Perform k-fold cross-validation
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
    results[name]["CV_MSE"] = -cv_scores.mean()

In [10]:
cv_scores

array([-420.4442946 , -368.23838509, -501.78841082, -360.04808479,
       -455.25057276])

In [11]:
# Print results
print("Model Evaluation Results:")
for name, metrics in results.items():
    print(f"\n{name}:")
    print(f"  MSE: {metrics['MSE']:.4f}")
    print(f"  R2 Score: {metrics['R2']:.4f}")
    print(f"  Cross-validation MSE: {metrics['CV_MSE']:.4f}")


Model Evaluation Results:

Linear Regression:
  MSE: 0.0111
  R2 Score: 1.0000
  Cross-validation MSE: 0.0107

Elastic Net:
  MSE: 421.8960
  R2 Score: 0.8857
  Cross-validation MSE: 494.7647

Decision Tree:
  MSE: 1257.3113
  R2 Score: 0.6595
  Cross-validation MSE: 1052.7665

Random Forest:
  MSE: 287.4665
  R2 Score: 0.9221
  Cross-validation MSE: 421.1539


In [12]:
# Feature importance for Random Forest
rf_model = models["Random Forest"]
rf_model.fit(X_train_scaled, y_train)
feature_importance = rf_model.feature_importances_
print("\nRandom Forest Feature Importances:")
for i, importance in enumerate(feature_importance):
    print(f"Feature {i+1}: {importance:.4f}")

# Predictions on new data
new_data = np.random.randn(5, 5)  # 5 samples with 5 features
new_data_scaled = scaler.transform(new_data)
print("\nPredictions on new data:")
for name, model in models.items():
    predictions = model.predict(new_data_scaled)
    print(f"\n{name} predictions:")
    print(predictions)



Random Forest Feature Importances:
Feature 1: 0.1744
Feature 2: 0.5509
Feature 3: 0.0565
Feature 4: 0.1429
Feature 5: 0.0754

Predictions on new data:

Linear Regression predictions:
[-92.36023744  45.25618869 101.92074446 -46.24919157  48.12860369]

Elastic Net predictions:
[-59.79264174  30.11116806  68.78762484 -29.3996923   32.79799676]

Decision Tree predictions:
[-66.02238338  85.63467548  91.67081579 -30.78130271  76.77816146]

Random Forest predictions:
[-64.36531148  28.99323804  86.76561245 -47.23289757  38.69939278]
