### Setup and Common Code

In [13]:
# Import all necessary libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Import Datasets
from sklearn.datasets import load_diabetes, fetch_california_housing, make_regression

In [14]:
# Function to quickly evaluate a model
def evaluate_model(model, X_test, y_test):
    """Print the test-set MSE and R2 score of an already-fitted model.

    Parameters
    ----------
    model : object exposing ``predict``; it is called once on ``X_test``.
    X_test : features handed to ``model.predict``.
    y_test : true targets the predictions are scored against.

    Returns
    -------
    None. Both metrics are printed with four decimals, not returned.
    """
    y_pred = model.predict(X_test)
    # Compute both metrics up front, then emit one line per metric.
    metrics = (
        ("MSE", mean_squared_error(y_test, y_pred)),
        ("R2 Score", r2_score(y_test, y_pred)),
    )
    for label, value in metrics:
        print(f"{label}: {value:.4f}")

### Model-Specific Code

In [15]:
# Choose exactly ONE dataset by setting DATASET_CHOICE.
# This cell used to run Options A, B and C back-to-back, which loaded all
# three datasets but kept only the last assignment to X, y and dataset_name.
# The default "C" reproduces the X, y and dataset_name the cell previously
# ended with; `data` is only assigned for Options A and B.
DATASET_CHOICE = "C"  # "A" = Diabetes, "B" = California Housing, "C" = Synthetic

if DATASET_CHOICE == "A":
    # Option A: Dataset 1 - load_diabetes()
    data = load_diabetes()
    X, y = data.data, data.target
    dataset_name = "Diabetes"
elif DATASET_CHOICE == "B":
    # Option B: Dataset 2 - fetch_california_housing()
    data = fetch_california_housing()
    X, y = data.data, data.target
    dataset_name = "California Housing"
elif DATASET_CHOICE == "C":
    # Option C: Dataset 3 - make_regression()
    X, y = make_regression(n_samples=1000, n_features=5, noise=0.1, random_state=42)
    dataset_name = "Synthetic Regression"
else:
    raise ValueError(
        f"Unknown DATASET_CHOICE {DATASET_CHOICE!r}; expected 'A', 'B' or 'C'"
    )

# Regression Models

### 1. Linear Regression

In [16]:
from sklearn.linear_model import LinearRegression

# Load the Diabetes data (swap in Option B or C from the dataset cell if preferred)
data = load_diabetes()
X = data.data
y = data.target

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the model (fit returns the estimator itself) and report test-set metrics
print("Linear Regression:")
model = LinearRegression().fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Linear Regression:
MSE: 2900.1936
R2 Score: 0.4526


### 2. Ridge Regression

In [17]:
from sklearn.linear_model import Ridge

# Re-create the 80/20 split from the X, y already present in the kernel
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Build the estimator, fit it on the training rows, then print test-set metrics
print("Ridge Regression:")
model = Ridge(alpha=1.0).fit(X_train, y_train)  # alpha = regularization strength
evaluate_model(model, X_test, y_test)

Ridge Regression:
MSE: 3077.4159
R2 Score: 0.4192


### 3. Lasso Regression

In [18]:
from sklearn.linear_model import Lasso

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Build the estimator, fit it on the training rows, then print test-set metrics
print("Lasso Regression:")
model = Lasso(alpha=0.1).fit(X_train, y_train)  # alpha = regularization strength
evaluate_model(model, X_test, y_test)

Lasso Regression:
MSE: 2798.1935
R2 Score: 0.4719


### 4. Support Vector Regressor (SVR)

In [19]:
# Note: SVMs often benefit from feature scaling.
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Two-step pipeline: standardise the features, then fit an RBF-kernel SVR
print("Support Vector Regressor (SVR):")
model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svr', SVR(C=1.0, kernel='rbf')),  # 'rbf' kernel is common for non-linear problems
]).fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Support Vector Regressor (SVR):
MSE: 4332.7385
R2 Score: 0.1822


### 5. Decision Tree Regressor

In [20]:
from sklearn.tree import DecisionTreeRegressor

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Build the estimator, fit it on the training rows, then print test-set metrics
print("Decision Tree Regressor:")
model = DecisionTreeRegressor(
    random_state=42,
    max_depth=4,  # Limiting depth prevents overfitting
).fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Decision Tree Regressor:
MSE: 3568.9653
R2 Score: 0.3264


### 6. Random Forest Regressor

In [21]:
from sklearn.ensemble import RandomForestRegressor

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Build the estimator, fit it on the training rows, then print test-set metrics
print("Random Forest Regressor:")
model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,  # n_estimators = number of trees
).fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Random Forest Regressor:
MSE: 2952.0106
R2 Score: 0.4428


### 7. Gradient Boosting Regressor

In [22]:
from sklearn.ensemble import GradientBoostingRegressor

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Build the estimator, fit it on the training rows, then print test-set metrics
print("Gradient Boosting Regressor:")
model = GradientBoostingRegressor(
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

Gradient Boosting Regressor:
MSE: 2898.4367
R2 Score: 0.4529


### 8. K-Nearest Neighbors Regressor (KNN)

In [23]:
# Note: KNN also benefits greatly from feature scaling.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Two-step pipeline: standardise the features, then fit the KNN regressor
print("K-Nearest Neighbors Regressor (KNN):")
model = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=5)),  # n_neighbors is the 'k' value
]).fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

K-Nearest Neighbors Regressor (KNN):
MSE: 3047.4499
R2 Score: 0.4248


### Complete Example Run

In [None]:
# Every name this cell uses is imported here, so the "complete example" runs
# on a fresh kernel instead of relying on imports made by earlier cells.
import numpy as np
from sklearn.datasets import load_diabetes, fetch_california_housing, make_regression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


def evaluate_model(model, X_test, y_test):
    """Print the test-set MSE and R2 of a fitted model on a single line.

    Parameters
    ----------
    model : object exposing ``predict``; it is called once on ``X_test``.
    X_test : features handed to ``model.predict``.
    y_test : true targets the predictions are scored against.

    Returns
    -------
    None. The metrics are printed, not returned.
    """
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    print(f"  MSE: {mse:.4f}, R2: {r2:.4f}")


# Define the datasets.
# The two loaders return objects with .data/.target; make_regression returns
# a plain (X, y) tuple.
datasets = {
    "Diabetes": load_diabetes(),
    "California Housing": fetch_california_housing(),
    "Synthetic": make_regression(n_samples=1000, n_features=5, noise=0.1, random_state=42)
}

# Model registry: display name -> unfitted estimator.
# Insertion order is the order in which results are printed.
all_models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),  # alpha is the regularization strength
    'Lasso Regression': Lasso(alpha=0.1),  # alpha is the regularization strength
    'Support Vector Regressor (SVR)': Pipeline([
        ('scaler', StandardScaler()),
        ('svr', SVR(kernel='rbf', C=1.0)),  # 'rbf' kernel is common for non-linear problems
    ]),
    'Decision Tree Regressor': DecisionTreeRegressor(max_depth=4, random_state=42),  # Limiting depth prevents overfitting
    'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=42),  # n_estimators = number of trees
    'Gradient Boosting Regressor': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    'K-Nearest Neighbors Regressor (KNN)': Pipeline([
        ('scaler', StandardScaler()),
        ('knn', KNeighborsRegressor(n_neighbors=5)),  # n_neighbors is the 'k' value
    ]),
}

# Train and evaluate every model on every dataset
for model_name, model in all_models.items():
    print(model_name)
    for name, data in datasets.items():
        # Detect the (X, y) tuple by type rather than by its dictionary key,
        # so adding or renaming a tuple-style dataset keeps working.
        if isinstance(data, tuple):
            X, y = data
        else:
            X, y = data.data, data.target

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # The same estimator object is refitted for each dataset in turn.
        model.fit(X_train, y_train)

        print(f"{name} Dataset:")
        evaluate_model(model, X_test, y_test)
    print("*"*50)

Linear Regression
Diabetes Dataset:
  MSE: 2900.1936, R2: 0.4526
California Housing Dataset:
  MSE: 0.5559, R2: 0.5758
Synthetic Dataset:
  MSE: 0.0111, R2: 1.0000
**************************************************
Ridge Regression
Diabetes Dataset:
  MSE: 3077.4159, R2: 0.4192
California Housing Dataset:
  MSE: 0.5558, R2: 0.5759
Synthetic Dataset:
  MSE: 0.0179, R2: 1.0000
**************************************************
Lasso Regression
Diabetes Dataset:
  MSE: 2798.1935, R2: 0.4719
California Housing Dataset:
  MSE: 0.6135, R2: 0.5318
Synthetic Dataset:
  MSE: 0.0565, R2: 1.0000
**************************************************
Support Vector Regressor (SVR)
Diabetes Dataset:
  MSE: 4332.7385, R2: 0.1822
California Housing Dataset:
  MSE: 0.3570, R2: 0.7276
Synthetic Dataset:
  MSE: 1060.6264, R2: 0.7128
**************************************************
Decision Tree Regressor
Diabetes Dataset:
  MSE: 3568.9653, R2: 0.3264
California Housing Dataset:
  MSE: 0.5844, R2: 0.5540
S

In [27]:
# --- COMPLETE EXAMPLE: Linear Regression on all 3 datasets ---
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_diabetes, fetch_california_housing, make_regression

def evaluate_model(model, X_test, y_test):
    """Score a fitted model on the test set, print the metrics, return predictions.

    Parameters
    ----------
    model : object exposing ``predict``; it is called once on ``X_test``.
    X_test : features handed to ``model.predict``.
    y_test : true targets the predictions are scored against.

    Returns
    -------
    The value returned by ``model.predict(X_test)``.
    """
    y_pred = model.predict(X_test)
    scores = {
        "mse": mean_squared_error(y_test, y_pred),
        "r2": r2_score(y_test, y_pred),
    }
    print(f"  MSE: {scores['mse']:.4f}, R2: {scores['r2']:.4f}")
    return y_pred

# Datasets keyed by display name. "Synthetic" holds an (X, y) tuple; the other
# two entries are accessed through .data/.target below.
datasets = {
    "Diabetes": load_diabetes(),
    "California Housing": fetch_california_housing(),
    "Synthetic": make_regression(n_samples=1000, n_features=5, noise=0.1, random_state=42),
}

# One estimator, refitted on each dataset in turn
model = LinearRegression()

print("Linear Regression Performance:\n")
all_predictions = {}  # dataset name -> test-set predictions

for name, data in datasets.items():
    # Unpack features/targets according to how this dataset is stored
    X, y = data if name == "Synthetic" else (data.data, data.target)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    model.fit(X_train, y_train)

    print(f"{name} Dataset:")
    # evaluate_model prints the metrics and hands back the predictions,
    # which are kept per dataset for demonstration.
    predictions = evaluate_model(model, X_test, y_test)
    all_predictions[name] = predictions

    # --- NEW PREDICTION CODE ---
    print("  Making new predictions on sample data:")

    # Use the first three held-out rows as the "new" samples
    sample_X = X_test[:3]
    sample_predictions = model.predict(sample_X)

    # Actual vs predicted, numbered from 1
    for sample_no, (actual, predicted) in enumerate(zip(y_test[:3], sample_predictions), start=1):
        print(f"    Sample {sample_no}: Actual = {actual:.2f}, Predicted = {predicted:.2f}")

    # Learned parameters are shown for one dataset only, to avoid repetition
    if name == "Diabetes":
        leading_coefs = model.coef_[:3]
        print(f"  Model coefficients: {leading_coefs}...")
        print(f"  Model intercept: {model.intercept_:.2f}")

    print("-" * 50)

Linear Regression Performance:

Diabetes Dataset:
  MSE: 2900.1936, R2: 0.4526
  Making new predictions on sample data:
    Sample 1: Actual = 219.00, Predicted = 139.55
    Sample 2: Actual = 70.00, Predicted = 179.52
    Sample 3: Actual = 202.00, Predicted = 134.04
  Model coefficients: [  37.90402135 -241.96436231  542.42875852]...
  Model intercept: 151.35
--------------------------------------------------
California Housing Dataset:
  MSE: 0.5559, R2: 0.5758
  Making new predictions on sample data:
    Sample 1: Actual = 0.48, Predicted = 0.72
    Sample 2: Actual = 0.46, Predicted = 1.76
    Sample 3: Actual = 5.00, Predicted = 2.71
--------------------------------------------------
Synthetic Dataset:
  MSE: 0.0111, R2: 1.0000
  Making new predictions on sample data:
    Sample 1: Actual = -54.69, Predicted = -54.75
    Sample 2: Actual = -64.36, Predicted = -64.43
    Sample 3: Actual = 52.88, Predicted = 52.92
--------------------------------------------------


In [28]:
# --- Additional demonstration: predicting on hand-written, never-seen rows ---
banner = "=" * 60
print("\n" + banner)
print("MAKING PREDICTIONS ON NEW SYNTHETIC DATA")
print(banner)

# Three new rows with five features each, matching n_features=5 below
new_synthetic_data = np.array([
    [0.5, -1.2, 0.8, 0.3, -0.5],   # Sample 1
    [-0.8, 1.5, -0.2, 0.9, 0.1],   # Sample 2
    [1.2, -0.3, 0.4, -0.7, 0.6],   # Sample 3
])

# Rebuild the synthetic dataset and fit a dedicated model on its training split
X_synth, y_synth = make_regression(n_samples=1000, n_features=5, noise=0.1, random_state=42)
X_train_synth, X_test_synth, y_train_synth, y_test_synth = train_test_split(
    X_synth, y_synth, test_size=0.2, random_state=42
)
model_synth = LinearRegression().fit(X_train_synth, y_train_synth)

# Predict the new rows and list each one next to its prediction
new_predictions = model_synth.predict(new_synthetic_data)

print("New synthetic data predictions:")
for sample_no, (features, prediction) in enumerate(zip(new_synthetic_data, new_predictions), start=1):
    print(f"  Sample {sample_no}: Features = {features}, Predicted = {prediction:.2f}")

# Score the same model on the held-out synthetic test split
print("\nModel performance on synthetic test data:")
synth_test_pred = model_synth.predict(X_test_synth)
synth_mse = mean_squared_error(y_test_synth, synth_test_pred)
synth_r2 = r2_score(y_test_synth, synth_test_pred)
print(f"  MSE: {synth_mse:.4f}")
print(f"  R2 Score: {synth_r2:.4f}")


MAKING PREDICTIONS ON NEW SYNTHETIC DATA
New synthetic data predictions:
  Sample 1: Features = [ 0.5 -1.2  0.8  0.3 -0.5], Predicted = -29.60
  Sample 2: Features = [-0.8  1.5 -0.2  0.9  0.1], Predicted = 67.01
  Sample 3: Features = [ 1.2 -0.3  0.4 -0.7  0.6], Predicted = 21.32

Model performance on synthetic test data:
  MSE: 0.0111
  R2 Score: 1.0000
