In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import os
import imageio
import glob

# Load dataset
data = fetch_california_housing()
X = data.data[:, [0]]  # Feature: MedInc (for simplicity)
y = data.target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Frame settings
n_frames = 20
output_dir = "model_frames_california"
os.makedirs(output_dir, exist_ok=True)

# Prediction storage
linear_preds, tree_preds, forest_preds, xgb_preds, knn_preds = [], [], [], [], []
linear_scores, tree_scores, forest_scores, xgb_scores, knn_scores = [], [], [], [], []

# Linear Regression - increasing data
for i in range(50, len(X_train), int(len(X_train) / n_frames)):
    model = LinearRegression()
    model.fit(X_train[:i], y_train[:i])
    preds = model.predict(X_test)
    linear_preds.append(preds)
    linear_scores.append(r2_score(y_test, preds))

# Decision Tree - increasing depth
for depth in range(1, n_frames + 1):
    model = DecisionTreeRegressor(max_depth=depth)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    tree_preds.append(preds)
    tree_scores.append(r2_score(y_test, preds))

# Random Forest - increasing n_estimators
for n in range(1, n_frames + 1):
    model = RandomForestRegressor(n_estimators=n)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    forest_preds.append(preds)
    forest_scores.append(r2_score(y_test, preds))

# XGBoost - boosting iterations
y_pred_total = np.zeros_like(y_test)
for i in range(n_frames):
    residuals = y_test - y_pred_total
    model = XGBRegressor(n_estimators=1, learning_rate=1.0, max_depth=3, verbosity=0)
    model.fit(X_test, residuals)
    y_pred_total += 0.3 * model.predict(X_test)
    xgb_preds.append(y_pred_total.copy())
    xgb_scores.append(r2_score(y_test, y_pred_total))

# KNN - increasing neighbors
for k in range(1, n_frames + 1):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    knn_preds.append(preds)
    knn_scores.append(r2_score(y_test, preds))

# Plot frames
for i in range(n_frames):
    plt.figure(figsize=(20, 12))

    def subplot_model(index, name, y_pred, score, color, annotation):
        plt.subplot(3, 2, index)
        plt.scatter(X_test, y_test, color='lightgray', alpha=0.4, label='Test Data')
        plt.scatter(X_test, y_pred[i], color=color, alpha=0.6, label='Prediction')
        plt.title(name, fontsize=14, fontweight='bold')
        plt.xlabel("Median Income")
        plt.ylabel("Target")
        plt.legend()
        plt.grid(True)
        plt.text(0.05, 0.9, f"R² Score: {score[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.8, annotation, transform=plt.gca().transAxes,
                 fontsize=9, bbox=dict(facecolor='white', alpha=0.7))

    subplot_model(1, "Linear Regression", linear_preds, linear_scores, 'red',
                  "Fast, linear, struggles with non-linearity")
    subplot_model(2, "Decision Tree", tree_preds, tree_scores, 'blue',
                  "Non-linear splits, interpretable, prone to overfitting")
    subplot_model(3, "Random Forest", forest_preds, forest_scores, 'yellow',
                  "Ensemble trees, robust, handles noise")
    subplot_model(4, "XGBoost", xgb_preds, xgb_scores, 'blue',
                  "Boosted learners, accurate, optimized")
    subplot_model(5, "KNN", knn_preds, knn_scores, 'red',
                  "Local method, sensitive to k")

    plt.suptitle(f"Model Evolution – Frame {i+1}/{n_frames}", fontsize=18, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f"{output_dir}/frame_{i+1:02d}.png")
    plt.close()

# Create GIF
frame_paths = sorted(glob.glob(f"{output_dir}/frame_*.png"))
gif_path = "model_comparison_california.gif"

with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)

print(f"✅ GIF saved as: {gif_path}")


  image = imageio.imread(frame_path)


✅ GIF saved as: model_comparison_california.gif


In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import os
import imageio
import glob

# Load dataset
data = fetch_california_housing()
X = data.data[:, [0]]  # Feature: MedInc (for simplicity)
y = data.target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Frame settings
n_frames = 20
output_dir = "model_frames_california"
os.makedirs(output_dir, exist_ok=True)

# Prediction storage
linear_preds, tree_preds, forest_preds, xgb_preds, knn_preds = [], [], [], [], []
linear_scores, tree_scores, forest_scores, xgb_scores, knn_scores = [], [], [], [], []

# Linear Regression - increasing data
for i in range(50, len(X_train), int(len(X_train) / n_frames)):
    model = LinearRegression()
    model.fit(X_train[:i], y_train[:i])
    preds = model.predict(X_test)
    linear_preds.append(preds)
    linear_scores.append(r2_score(y_test, preds))

# Decision Tree - increasing depth
for depth in range(1, n_frames + 1):
    model = DecisionTreeRegressor(max_depth=depth)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    tree_preds.append(preds)
    tree_scores.append(r2_score(y_test, preds))

# Random Forest - increasing n_estimators
for n in range(1, n_frames + 1):
    model = RandomForestRegressor(n_estimators=n)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    forest_preds.append(preds)
    forest_scores.append(r2_score(y_test, preds))

# XGBoost - boosting iterations
y_pred_total = np.zeros_like(y_test)
for i in range(n_frames):
    residuals = y_test - y_pred_total
    model = XGBRegressor(n_estimators=1, learning_rate=1.0, max_depth=3, verbosity=0)
    model.fit(X_test, residuals)
    y_pred_total += 0.3 * model.predict(X_test)
    xgb_preds.append(y_pred_total.copy())
    xgb_scores.append(r2_score(y_test, y_pred_total))

# KNN - increasing neighbors
for k in range(1, n_frames + 1):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    knn_preds.append(preds)
    knn_scores.append(r2_score(y_test, preds))

# Plot frames
for i in range(n_frames):
    plt.figure(figsize=(20, 12))

    def subplot_model(index, name, y_pred, score, color, annotation):
        plt.subplot(3, 2, index)
        plt.scatter(X_test, y_test, color='lightgray', alpha=0.4, label='Test Data')
        plt.scatter(X_test, y_pred[i], color=color, alpha=0.6, label='Prediction')
        plt.title(name, fontsize=14, fontweight='bold')
        plt.xlabel("Median Income")
        plt.ylabel("Target")
        plt.legend()
        plt.grid(True)
        plt.text(0.05, 0.9, f"R² Score: {score[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.8, annotation, transform=plt.gca().transAxes,
                 fontsize=9, bbox=dict(facecolor='white', alpha=0.7))

    subplot_model(1, "Linear Regression", linear_preds, linear_scores, 'red',
                  "Fast, linear, struggles with non-linearity")
    subplot_model(2, "Decision Tree", tree_preds, tree_scores, 'blue',
                  "Non-linear splits, interpretable, prone to overfitting")
    subplot_model(3, "Random Forest", forest_preds, forest_scores, 'yellow',
                  "Ensemble trees, robust, handles noise")
    subplot_model(4, "XGBoost", xgb_preds, xgb_scores, 'blue',
                  "Boosted learners, accurate, optimized")
    subplot_model(5, "KNN", knn_preds, knn_scores, 'red',
                  "Local method, sensitive to k")

    plt.suptitle(f"Model Evolution – Frame {i+1}/{n_frames}", fontsize=18, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f"{output_dir}/frame_{i+1:02d}.png")
    plt.close()

# Create GIF
frame_paths = sorted(glob.glob(f"{output_dir}/frame_*.png"))
gif_path = "model_comparison_california.gif"

with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)

print(f"✅ GIF saved as: {gif_path}")

  image = imageio.imread(frame_path)


✅ GIF saved as: model_comparison_california.gif


In [5]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
import os
import imageio
import glob

# Load dataset
data = fetch_california_housing()
X = data.data[:, [0]]  # Feature: MedInc (for simplicity)
y = data.target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Frame settings
n_frames = 20
output_dir = "model_frames_california"
os.makedirs(output_dir, exist_ok=True)

# Prediction storage
linear_preds, tree_preds, forest_preds, xgb_preds, knn_preds = [], [], [], [], []
linear_scores, tree_scores, forest_scores, xgb_scores, knn_scores = [], [], [], [], []
best_params = {}

# Linear Regression - increasing data
for i in range(50, len(X_train), int(len(X_train) / n_frames)):
    model = LinearRegression()
    model.fit(X_train[:i], y_train[:i])
    preds = model.predict(X_test)
    linear_preds.append(preds)
    linear_scores.append(r2_score(y_test, preds))
best_params['Linear Regression'] = {}

# Decision Tree - hyperparameter tuning
param_grid = {'max_depth': range(1, 21)}
grid_search = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params['Decision Tree'] = grid_search.best_params_

for depth in range(1, n_frames + 1):
    model = DecisionTreeRegressor(max_depth=depth)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    tree_preds.append(preds)
    tree_scores.append(r2_score(y_test, preds))

# Random Forest - hyperparameter tuning
param_grid = {'n_estimators': range(1, 21)}
grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params['Random Forest'] = grid_search.best_params_

for n in range(1, n_frames + 1):
    model = RandomForestRegressor(n_estimators=n)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    forest_preds.append(preds)
    forest_scores.append(r2_score(y_test, preds))

# XGBoost - hyperparameter tuning
param_grid = {'n_estimators': range(1, 21), 'learning_rate': [0.01, 0.1, 0.3, 0.5, 1.0]}
grid_search = GridSearchCV(XGBRegressor(max_depth=3, verbosity=0), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params['XGBoost'] = grid_search.best_params_

y_pred_total = np.zeros_like(y_test)
for i in range(n_frames):
    residuals = y_test - y_pred_total
    model = XGBRegressor(n_estimators=1, learning_rate=1.0, max_depth=3, verbosity=0)
    model.fit(X_test, residuals)
    y_pred_total += 0.3 * model.predict(X_test)
    xgb_preds.append(y_pred_total.copy())
    xgb_scores.append(r2_score(y_test, y_pred_total))

# KNN - hyperparameter tuning
param_grid = {'n_neighbors': range(1, 21)}
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params['KNN'] = grid_search.best_params_

for k in range(1, n_frames + 1):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    knn_preds.append(preds)
    knn_scores.append(r2_score(y_test, preds))

# Plot frames
for i in range(n_frames):
    plt.figure(figsize=(20, 12))

    def subplot_model(index, name, y_pred, score, color, annotation):
        plt.subplot(3, 2, index)
        plt.scatter(X_test, y_test, color='lightgray', alpha=0.4, label='Test Data')
        plt.scatter(X_test, y_pred[i], color=color, alpha=0.6, label='Prediction')
        plt.title(name, fontsize=14, fontweight='bold')
        plt.xlabel("Median Income")
        plt.ylabel("Target")
        plt.legend()
        plt.grid(True)
        plt.text(0.05, 0.9, f"R² Score: {score[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.8, annotation, transform=plt.gca().transAxes,
                 fontsize=9, bbox=dict(facecolor='white', alpha=0.7))

    subplot_model(1, "Linear Regression", linear_preds, linear_scores, 'red',
                  "Fast, linear, struggles with non-linearity")
    subplot_model(2, "Decision Tree", tree_preds, tree_scores, 'blue',
                  f"Non-linear splits, interpretable, prone to overfitting\nBest Params: {best_params['Decision Tree']}")
    subplot_model(3, "Random Forest", forest_preds, forest_scores, 'yellow',
                  f"Ensemble trees, robust, handles noise\nBest Params: {best_params['Random Forest']}")
    subplot_model(4, "XGBoost", xgb_preds, xgb_scores, 'blue',
                  f"Boosted learners, accurate, optimized\nBest Params: {best_params['XGBoost']}")
    subplot_model(5, "KNN", knn_preds, knn_scores, 'red',
                  f"Local method, sensitive to k\nBest Params: {best_params['KNN']}")

    plt.suptitle(f"Model Evolution – Frame {i+1}/{n_frames}", fontsize=18, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f"{output_dir}/frame_{i+1:02d}.png")
    plt.close()

# Create GIF
frame_paths = sorted(glob.glob(f"{output_dir}/frame_*.png"))
gif_path = "finalmodel_comparison_california.gif"

with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)

print(f"✅ GIF saved as: {gif_path}")

  image = imageio.imread(frame_path)


✅ GIF saved as: finalmodel_comparison_california.gif


In [9]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
import os
import imageio
import glob

# Load dataset
data = fetch_openml(name="house_prices", as_frame=True)
X = data.data.select_dtypes(include=[np.number]).dropna(axis=1)  # Use numerical features only
y = data.target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Frame settings
n_frames = 20
output_dir = "model_frames_ames"
os.makedirs(output_dir, exist_ok=True)

# Prediction storage
linear_preds, tree_preds, forest_preds, xgb_preds, knn_preds = [], [], [], [], []
linear_scores, tree_scores, forest_scores, xgb_scores, knn_scores = [], [], [], [], []
best_params = {}

# Linear Regression - increasing data
for i in range(50, len(X_train), int(len(X_train) / n_frames)):
    model = LinearRegression()
    model.fit(X_train[:i], y_train[:i])
    preds = model.predict(X_test)
    linear_preds.append(preds)
    linear_scores.append(r2_score(y_test, preds))
best_params['Linear Regression'] = {}

# Decision Tree - hyperparameter tuning
param_grid = {'max_depth': range(1, 21)}
grid_search = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params['Decision Tree'] = grid_search.best_params_

for depth in range(1, n_frames + 1):
    model = DecisionTreeRegressor(max_depth=depth)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    tree_preds.append(preds)
    tree_scores.append(r2_score(y_test, preds))

# Random Forest - hyperparameter tuning
param_grid = {'n_estimators': range(1, 21)}
grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params['Random Forest'] = grid_search.best_params_

for n in range(1, n_frames + 1):
    model = RandomForestRegressor(n_estimators=n)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    forest_preds.append(preds)
    forest_scores.append(r2_score(y_test, preds))

# XGBoost - hyperparameter tuning
param_grid = {'n_estimators': range(1, 21), 'learning_rate': [0.01, 0.1, 0.3, 0.5, 1.0]}
grid_search = GridSearchCV(XGBRegressor(max_depth=3, verbosity=0), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params['XGBoost'] = grid_search.best_params_

y_pred_total = np.zeros_like(y_test, dtype=np.float64)
for i in range(n_frames):
    residuals = y_test - y_pred_total
    model = XGBRegressor(n_estimators=1, learning_rate=1.0, max_depth=3, verbosity=0)
    model.fit(X_test, residuals)
    y_pred_total += 0.3 * model.predict(X_test)
    xgb_preds.append(y_pred_total.copy())
    xgb_scores.append(r2_score(y_test, y_pred_total))

# KNN - hyperparameter tuning
param_grid = {'n_neighbors': range(1, 21)}
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params['KNN'] = grid_search.best_params_

for k in range(1, n_frames + 1):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    knn_preds.append(preds)
    knn_scores.append(r2_score(y_test, preds))

# Plot frames
for i in range(n_frames):
    plt.figure(figsize=(20, 12))

    def subplot_model(index, name, y_pred, score, color, annotation):
        plt.subplot(3, 2, index)
        plt.scatter(range(len(y_test)), y_test, color='lightgray', alpha=0.4, label='Test Data')
        plt.scatter(range(len(y_test)), y_pred[i], color=color, alpha=0.6, label='Prediction')
        plt.title(name, fontsize=14, fontweight='bold')
        plt.xlabel("Sample Index")
        plt.ylabel("Target")
        plt.legend()
        plt.grid(True)
        plt.text(0.05, 0.9, f"R² Score: {score[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.8, annotation, transform=plt.gca().transAxes,
                 fontsize=9, bbox=dict(facecolor='white', alpha=0.7))

    subplot_model(1, "Linear Regression", linear_preds, linear_scores, 'red',
                  "Fast, linear, struggles with non-linearity")
    subplot_model(2, "Decision Tree", tree_preds, tree_scores, 'blue',
                  f"Non-linear splits, interpretable, prone to overfitting\nBest Params: {best_params['Decision Tree']}")
    subplot_model(3, "Random Forest", forest_preds, forest_scores, 'yellow',
                  f"Ensemble trees, robust, handles noise\nBest Params: {best_params['Random Forest']}")
    subplot_model(4, "XGBoost", xgb_preds, xgb_scores, 'blue',
                  f"Boosted learners, accurate, optimized\nBest Params: {best_params['XGBoost']}")
    subplot_model(5, "KNN", knn_preds, knn_scores, 'red',
                  f"Local method, sensitive to k\nBest Params: {best_params['KNN']}")

    plt.suptitle(f"Model Evolution – Frame {i+1}/{n_frames}", fontsize=18, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f"{output_dir}/frame_{i+1:02d}.png")
    plt.close()

# Create GIF
frame_paths = sorted(glob.glob(f"{output_dir}/frame_*.png"))
gif_path = "model_comparison_ames.gif"

with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)

print(f"✅ GIF saved as: {gif_path}")

  image = imageio.imread(frame_path)


✅ GIF saved as: model_comparison_ames.gif


In [10]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
import os
import imageio
import glob

# Load dataset
data = fetch_openml(name="house_prices", as_frame=True)
X = data.data.select_dtypes(include=[np.number]).dropna(axis=1)  # Use numerical features only
y = data.target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Frame settings
n_frames = 20
output_dir = "model_frames_ames"
os.makedirs(output_dir, exist_ok=True)

# Prediction storage
linear_preds, tree_preds, forest_preds, xgb_preds, knn_preds = [], [], [], [], []
linear_scores, tree_scores, forest_scores, xgb_scores, knn_scores = [], [], [], [], []
best_params = {}

# Linear Regression - increasing data
for i in range(50, len(X_train), int(len(X_train) / n_frames)):
    model = LinearRegression()
    model.fit(X_train[:i], y_train[:i])
    preds = model.predict(X_test)
    linear_preds.append(preds)
    linear_scores.append(r2_score(y_test, preds))
best_params['Linear Regression'] = {}

# Decision Tree - hyperparameter tuning
param_grid = {'max_depth': range(1, 21)}
grid_search = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params['Decision Tree'] = grid_search.best_params_

for depth in range(1, n_frames + 1):
    model = DecisionTreeRegressor(max_depth=depth)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    tree_preds.append(preds)
    tree_scores.append(r2_score(y_test, preds))

# Random Forest - hyperparameter tuning
param_grid = {'n_estimators': range(1, 21)}
grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params['Random Forest'] = grid_search.best_params_

for n in range(1, n_frames + 1):
    model = RandomForestRegressor(n_estimators=n)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    forest_preds.append(preds)
    forest_scores.append(r2_score(y_test, preds))

# XGBoost - hyperparameter tuning
param_grid = {'n_estimators': range(1, 21), 'learning_rate': [0.01, 0.1, 0.3, 0.5, 1.0]}
grid_search = GridSearchCV(XGBRegressor(max_depth=3, verbosity=0), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params['XGBoost'] = grid_search.best_params_

y_pred_total = np.zeros_like(y_test, dtype=np.float64)
for i in range(n_frames):
    residuals = y_test - y_pred_total
    model = XGBRegressor(n_estimators=1, learning_rate=1.0, max_depth=3, verbosity=0)
    model.fit(X_test, residuals)
    y_pred_total += 0.3 * model.predict(X_test)
    xgb_preds.append(y_pred_total.copy())
    xgb_scores.append(r2_score(y_test, y_pred_total))

# KNN - hyperparameter tuning
param_grid = {'n_neighbors': range(1, 21)}
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_params['KNN'] = grid_search.best_params_

for k in range(1, n_frames + 1):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    knn_preds.append(preds)
    knn_scores.append(r2_score(y_test, preds))

# Plot frames
for i in range(n_frames):
    plt.figure(figsize=(20, 12))

    def subplot_model(index, name, y_pred, score, color, annotation):
        plt.subplot(3, 2, index)
        plt.scatter(range(len(y_test)), y_test, color='lightgray', alpha=0.4, label='Test Data')
        plt.scatter(range(len(y_test)), y_pred[i], color=color, alpha=0.6, label='Prediction')
        plt.title(name, fontsize=14, fontweight='bold')
        plt.xlabel("Sample Index")
        plt.ylabel("Target")
        plt.legend()
        plt.grid(True)
        plt.text(0.05, 0.9, f"R² Score: {score[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.8, annotation, transform=plt.gca().transAxes,
                 fontsize=9, bbox=dict(facecolor='white', alpha=0.7))

    subplot_model(1, "Linear Regression", linear_preds, linear_scores, 'red',
                  "Fast, linear, struggles with non-linearity")
    subplot_model(2, "Decision Tree", tree_preds, tree_scores, 'blue',
                  f"Non-linear splits, interpretable, prone to overfitting\nBest Params: {best_params['Decision Tree']}")
    subplot_model(3, "Random Forest", forest_preds, forest_scores, 'yellow',
                  f"Ensemble trees, robust, handles noise\nBest Params: {best_params['Random Forest']}")
    subplot_model(4, "XGBoost", xgb_preds, xgb_scores, 'blue',
                  f"Boosted learners, accurate, optimized\nBest Params: {best_params['XGBoost']}")
    subplot_model(5, "KNN", knn_preds, knn_scores, 'red',
                  f"Local method, sensitive to k\nBest Params: {best_params['KNN']}")

    plt.suptitle(f"Model Evolution – Frame {i+1}/{n_frames}", fontsize=18, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f"{output_dir}/frame_{i+1:02d}.png")
    plt.close()

# Create GIF
frame_paths = sorted(glob.glob(f"{output_dir}/frame_*.png"))
gif_path = "model_comparison_ames.gif"

with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)

print(f"✅ GIF saved as: {gif_path}")

  image = imageio.imread(frame_path)


✅ GIF saved as: model_comparison_ames.gif


In [11]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import os
import imageio
import glob

# Load dataset
data = fetch_california_housing()
X = data.data[:, [0]]  # Feature: MedInc (for simplicity)
y = data.target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Frame settings
n_frames = 20
output_dir = "model_frames_california"
os.makedirs(output_dir, exist_ok=True)

# Prediction storage
linear_preds_train, tree_preds_train, forest_preds_train, xgb_preds_train, knn_preds_train = [], [], [], [], []
linear_preds_test, tree_preds_test, forest_preds_test, xgb_preds_test, knn_preds_test = [], [], [], [], []
linear_scores_train, tree_scores_train, forest_scores_train, xgb_scores_train, knn_scores_train = [], [], [], [], []
linear_scores_test, tree_scores_test, forest_scores_test, xgb_scores_test, knn_scores_test = [], [], [], [], []

# Linear Regression - increasing data
for i in range(50, len(X_train), int(len(X_train) / n_frames)):
    model = LinearRegression()
    model.fit(X_train[:i], y_train[:i])
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    linear_preds_train.append(preds_train)
    linear_preds_test.append(preds_test)
    linear_scores_train.append(r2_score(y_train, preds_train))
    linear_scores_test.append(r2_score(y_test, preds_test))

# Decision Tree - increasing depth
for depth in range(1, n_frames + 1):
    model = DecisionTreeRegressor(max_depth=depth)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    tree_preds_train.append(preds_train)
    tree_preds_test.append(preds_test)
    tree_scores_train.append(r2_score(y_train, preds_train))
    tree_scores_test.append(r2_score(y_test, preds_test))

# Random Forest - increasing n_estimators
for n in range(1, n_frames + 1):
    model = RandomForestRegressor(n_estimators=n)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    forest_preds_train.append(preds_train)
    forest_preds_test.append(preds_test)
    forest_scores_train.append(r2_score(y_train, preds_train))
    forest_scores_test.append(r2_score(y_test, preds_test))

# XGBoost - boosting iterations
y_pred_total_train = np.zeros_like(y_train)
y_pred_total_test = np.zeros_like(y_test)
for i in range(n_frames):
    residuals_train = y_train - y_pred_total_train
    residuals_test = y_test - y_pred_total_test
    model = XGBRegressor(n_estimators=1, learning_rate=1.0, max_depth=3, verbosity=0)
    model.fit(X_train, residuals_train)
    y_pred_total_train += 0.3 * model.predict(X_train)
    y_pred_total_test += 0.3 * model.predict(X_test)
    xgb_preds_train.append(y_pred_total_train.copy())
    xgb_preds_test.append(y_pred_total_test.copy())
    xgb_scores_train.append(r2_score(y_train, y_pred_total_train))
    xgb_scores_test.append(r2_score(y_test, y_pred_total_test))

# KNN - increasing neighbors
for k in range(1, n_frames + 1):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    knn_preds_train.append(preds_train)
    knn_preds_test.append(preds_test)
    knn_scores_train.append(r2_score(y_train, preds_train))
    knn_scores_test.append(r2_score(y_test, preds_test))

# Plot frames
for i in range(n_frames):
    plt.figure(figsize=(20, 12))

    def subplot_model(index, name, y_pred_train, y_pred_test, score_train, score_test, color, annotation):
        plt.subplot(3, 2, index)
        plt.scatter(X_test, y_test, color='lightgray', alpha=0.4, label='Test Data')
        plt.scatter(X_test, y_pred_test[i], color=color, alpha=0.6, label='Test Prediction')
        plt.scatter(X_train, y_pred_train[i], color='green', alpha=0.6, label='Train Prediction')
        plt.title(name, fontsize=14, fontweight='bold')
        plt.xlabel("Median Income")
        plt.ylabel("Target")
        plt.legend()
        plt.grid(True)
        plt.text(0.05, 0.9, f"Train R²: {score_train[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.8, f"Test R²: {score_test[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.7, annotation, transform=plt.gca().transAxes,
                 fontsize=9, bbox=dict(facecolor='white', alpha=0.7))

    subplot_model(1, "Linear Regression", linear_preds_train, linear_preds_test, linear_scores_train, linear_scores_test, 'red',
                  "Fast, linear, struggles with non-linearity")
    subplot_model(2, "Decision Tree", tree_preds_train, tree_preds_test, tree_scores_train, tree_scores_test, 'blue',
                  "Non-linear splits, interpretable, prone to overfitting")
    subplot_model(3, "Random Forest", forest_preds_train, forest_preds_test, forest_scores_train, forest_scores_test, 'yellow',
                  "Ensemble trees, robust, handles noise")
    subplot_model(4, "XGBoost", xgb_preds_train, xgb_preds_test, xgb_scores_train, xgb_scores_test, 'blue',
                  "Boosted learners, accurate, optimized")
    subplot_model(5, "KNN", knn_preds_train, knn_preds_test, knn_scores_train, knn_scores_test, 'red',
                  "Local method, sensitive to k")

    plt.suptitle(f"Model Evolution – Frame {i+1}/{n_frames}", fontsize=18, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f"{output_dir}/frame_{i+1:02d}.png")
    plt.close()

# Create GIF
frame_paths = sorted(glob.glob(f"{output_dir}/frame_*.png"))
gif_path = "model_comparison_california.gif"

with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)

print(f"✅ GIF saved as: {gif_path}")

  image = imageio.imread(frame_path)


✅ GIF saved as: model_comparison_california.gif


In [12]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import os
import imageio
import glob

# Load dataset
data = fetch_california_housing()
X = data.data[:, [0]]  # Feature: MedInc (for simplicity)
y = data.target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Frame settings
n_frames = 20
output_dir = "model_frames_california"
os.makedirs(output_dir, exist_ok=True)

# Prediction storage
linear_preds_train, tree_preds_train, forest_preds_train, xgb_preds_train, knn_preds_train = [], [], [], [], []
linear_preds_test, tree_preds_test, forest_preds_test, xgb_preds_test, knn_preds_test = [], [], [], [], []
linear_scores_train, tree_scores_train, forest_scores_train, xgb_scores_train, knn_scores_train = [], [], [], [], []
linear_scores_test, tree_scores_test, forest_scores_test, xgb_scores_test, knn_scores_test = [], [], [], [], []

# Linear Regression - increasing data
for i in range(50, len(X_train), int(len(X_train) / n_frames)):
    model = LinearRegression()
    model.fit(X_train[:i], y_train[:i])
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    linear_preds_train.append(preds_train)
    linear_preds_test.append(preds_test)
    linear_scores_train.append(r2_score(y_train, preds_train))
    linear_scores_test.append(r2_score(y_test, preds_test))

# Decision Tree - increasing depth
for depth in range(1, n_frames + 1):
    model = DecisionTreeRegressor(max_depth=depth)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    tree_preds_train.append(preds_train)
    tree_preds_test.append(preds_test)
    tree_scores_train.append(r2_score(y_train, preds_train))
    tree_scores_test.append(r2_score(y_test, preds_test))

# Random Forest - increasing n_estimators
for n in range(1, n_frames + 1):
    model = RandomForestRegressor(n_estimators=n)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    forest_preds_train.append(preds_train)
    forest_preds_test.append(preds_test)
    forest_scores_train.append(r2_score(y_train, preds_train))
    forest_scores_test.append(r2_score(y_test, preds_test))

# XGBoost - boosting iterations
y_pred_total_train = np.zeros_like(y_train)
y_pred_total_test = np.zeros_like(y_test)
for i in range(n_frames):
    residuals_train = y_train - y_pred_total_train
    residuals_test = y_test - y_pred_total_test
    model = XGBRegressor(n_estimators=1, learning_rate=1.0, max_depth=3, verbosity=0)
    model.fit(X_train, residuals_train)
    y_pred_total_train += 0.3 * model.predict(X_train)
    y_pred_total_test += 0.3 * model.predict(X_test)
    xgb_preds_train.append(y_pred_total_train.copy())
    xgb_preds_test.append(y_pred_total_test.copy())
    xgb_scores_train.append(r2_score(y_train, y_pred_total_train))
    xgb_scores_test.append(r2_score(y_test, y_pred_total_test))

# KNN - increasing neighbors
for k in range(1, n_frames + 1):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    knn_preds_train.append(preds_train)
    knn_preds_test.append(preds_test)
    knn_scores_train.append(r2_score(y_train, preds_train))
    knn_scores_test.append(r2_score(y_test, preds_test))

# Plot frames
for i in range(n_frames):
    plt.figure(figsize=(20, 12))

    def subplot_model(index, name, y_pred_train, y_pred_test, score_train, score_test, color, annotation):
        plt.subplot(3, 2, index)
        plt.scatter(X_test, y_test, color='lightgray', alpha=0.4, label='Test Data')
        plt.scatter(X_test, y_pred_test[i], color=color, alpha=0.6, label='Test Prediction')
        plt.scatter(X_train, y_pred_train[i], color='green', alpha=0.6, label='Train Prediction')
        plt.title(name, fontsize=14, fontweight='bold')
        plt.xlabel("Median Income")
        plt.ylabel("Target")
        plt.legend()
        plt.grid(True)
        plt.text(0.05, 0.9, f"Train R²: {score_train[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.8, f"Test R²: {score_test[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.7, annotation, transform=plt.gca().transAxes,
                 fontsize=9, bbox=dict(facecolor='white', alpha=0.7))

    def check_overfitting(score_train, score_test):
        if score_train[i] > score_test[i] + 0.1:
            return "Overfitting"
        elif score_train[i] < score_test[i] - 0.1:
            return "Underfitting"
        else:
            return "Good Fit"

    subplot_model(1, "Linear Regression", linear_preds_train, linear_preds_test, linear_scores_train, linear_scores_test, 'red',
                  f"Fast, linear, struggles with non-linearity\n{check_overfitting(linear_scores_train, linear_scores_test)}")
    subplot_model(2, "Decision Tree", tree_preds_train, tree_preds_test, tree_scores_train, tree_scores_test, 'blue',
                  f"Non-linear splits, interpretable, prone to overfitting\n{check_overfitting(tree_scores_train, tree_scores_test)}")
    subplot_model(3, "Random Forest", forest_preds_train, forest_preds_test, forest_scores_train, forest_scores_test, 'yellow',
                  f"Ensemble trees, robust, handles noise\n{check_overfitting(forest_scores_train, forest_scores_test)}")
    subplot_model(4, "XGBoost", xgb_preds_train, xgb_preds_test, xgb_scores_train, xgb_scores_test, 'blue',
                  f"Boosted learners, accurate, optimized\n{check_overfitting(xgb_scores_train, xgb_scores_test)}")
    subplot_model(5, "KNN", knn_preds_train, knn_preds_test, knn_scores_train, knn_scores_test, 'red',
                  f"Local method, sensitive to k\n{check_overfitting(knn_scores_train, knn_scores_test)}")

    plt.suptitle(f"Model Evolution – Frame {i+1}/{n_frames}", fontsize=18, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f"{output_dir}/frame_{i+1:02d}.png")
    plt.close()

# Create GIF
frame_paths = sorted(glob.glob(f"{output_dir}/frame_*.png"))
gif_path = "model_comparison_california.gif"

with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)

# Determine the best model based on the final test R² score
best_model_index = np.argmax([linear_scores_test[-1], tree_scores_test[-1], forest_scores_test[-1], xgb_scores_test[-1], knn_scores_test[-1]])
best_model_name = ["Linear Regression", "Decision Tree", "Random Forest", "XGBoost", "KNN"][best_model_index]
best_model_score = [linear_scores_test[-1], tree_scores_test[-1], forest_scores_test[-1], xgb_scores_test[-1], knn_scores_test[-1]][best_model_index]

print(f"✅ GIF saved as: {gif_path}")
print(f"🏆 Best Model: {best_model_name} with R² Score: {best_model_score:.3f}")

# Add final frame with the best model summary
plt.figure(figsize=(20, 12))
plt.text(0.5, 0.5, f"🏆 Best Model: {best_model_name}\n\nR² Score: {best_model_score:.3f}\n\nWhy: {best_model_name} performed the best because it had the highest test R² score, indicating it generalizes well to unseen data.", 
         fontsize=18, ha='center', va='center', bbox=dict(facecolor='white', edgecolor='black'))
plt.axis('off')
plt.savefig(f"{output_dir}/frame_{n_frames+1:02d}.png")
plt.close()

# Add the final frame to the GIF
with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)
    final_frame_path = f"{output_dir}/frame_{n_frames+1:02d}.png"
    final_image = imageio.imread(final_frame_path)
    writer.append_data(final_image)

print(f"✅ GIF updated with the best model summary: {gif_path}")

  image = imageio.imread(frame_path)


✅ GIF saved as: model_comparison_california.gif
🏆 Best Model: XGBoost with R² Score: 0.470


  plt.savefig(f"{output_dir}/frame_{n_frames+1:02d}.png")
  image = imageio.imread(frame_path)
  final_image = imageio.imread(final_frame_path)


✅ GIF updated with the best model summary: model_comparison_california.gif


In [13]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
import os
import imageio
import glob

# Load dataset
data = fetch_openml(name="house_prices", as_frame=True)
X = data.data.select_dtypes(include=[np.number]).dropna(axis=1)  # Use numerical features only
y = data.target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Frame settings
n_frames = 20
output_dir = "model_frames_ames"
os.makedirs(output_dir, exist_ok=True)

# Prediction storage
linear_preds_train, tree_preds_train, forest_preds_train, xgb_preds_train, knn_preds_train = [], [], [], [], []
linear_preds_test, tree_preds_test, forest_preds_test, xgb_preds_test, knn_preds_test = [], [], [], [], []
linear_scores_train, tree_scores_train, forest_scores_train, xgb_scores_train, knn_scores_train = [], [], [], [], []
linear_scores_test, tree_scores_test, forest_scores_test, xgb_scores_test, knn_scores_test = [], [], [], [], []

# Linear Regression - increasing data
for i in range(50, len(X_train), int(len(X_train) / n_frames)):
    model = LinearRegression()
    model.fit(X_train[:i], y_train[:i])
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    linear_preds_train.append(preds_train)
    linear_preds_test.append(preds_test)
    linear_scores_train.append(r2_score(y_train, preds_train))
    linear_scores_test.append(r2_score(y_test, preds_test))

# Decision Tree - increasing depth
for depth in range(1, n_frames + 1):
    model = DecisionTreeRegressor(max_depth=depth)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    tree_preds_train.append(preds_train)
    tree_preds_test.append(preds_test)
    tree_scores_train.append(r2_score(y_train, preds_train))
    tree_scores_test.append(r2_score(y_test, preds_test))

# Random Forest - increasing n_estimators
for n in range(1, n_frames + 1):
    model = RandomForestRegressor(n_estimators=n)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    forest_preds_train.append(preds_train)
    forest_preds_test.append(preds_test)
    forest_scores_train.append(r2_score(y_train, preds_train))
    forest_scores_test.append(r2_score(y_test, preds_test))

# XGBoost - boosting iterations
y_pred_total_train = np.zeros_like(y_train, dtype=np.float64)
y_pred_total_test = np.zeros_like(y_test, dtype=np.float64)
for i in range(n_frames):
    residuals_train = y_train - y_pred_total_train
    residuals_test = y_test - y_pred_total_test
    model = XGBRegressor(n_estimators=1, learning_rate=1.0, max_depth=3, verbosity=0)
    model.fit(X_train, residuals_train)
    y_pred_total_train += 0.3 * model.predict(X_train)
    y_pred_total_test += 0.3 * model.predict(X_test)
    xgb_preds_train.append(y_pred_total_train.copy())
    xgb_preds_test.append(y_pred_total_test.copy())
    xgb_scores_train.append(r2_score(y_train, y_pred_total_train))
    xgb_scores_test.append(r2_score(y_test, y_pred_total_test))

# KNN - increasing neighbors
for k in range(1, n_frames + 1):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    knn_preds_train.append(preds_train)
    knn_preds_test.append(preds_test)
    knn_scores_train.append(r2_score(y_train, preds_train))
    knn_scores_test.append(r2_score(y_test, preds_test))

# Plot frames
for i in range(n_frames):
    plt.figure(figsize=(20, 12))

    def subplot_model(index, name, y_pred_train, y_pred_test, score_train, score_test, color, annotation):
        plt.subplot(3, 2, index)
        plt.scatter(range(len(y_test)), y_test, color='lightgray', alpha=0.4, label='Test Data')
        plt.scatter(range(len(y_test)), y_pred_test[i], color=color, alpha=0.6, label='Test Prediction')
        plt.scatter(range(len(y_train)), y_pred_train[i], color='green', alpha=0.6, label='Train Prediction')
        plt.title(name, fontsize=14, fontweight='bold')
        plt.xlabel("Sample Index")
        plt.ylabel("Target")
        plt.legend()
        plt.grid(True)
        plt.text(0.05, 0.9, f"Train R²: {score_train[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.8, f"Test R²: {score_test[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.7, annotation, transform=plt.gca().transAxes,
                 fontsize=9, bbox=dict(facecolor='white', alpha=0.7))

    def check_overfitting(score_train, score_test):
        if score_train[i] > score_test[i] + 0.1:
            return "Overfitting"
        elif score_train[i] < score_test[i] - 0.1:
            return "Underfitting"
        else:
            return "Good Fit"

    subplot_model(1, "Linear Regression", linear_preds_train, linear_preds_test, linear_scores_train, linear_scores_test, 'red',
                  f"Fast, linear, struggles with non-linearity\n{check_overfitting(linear_scores_train, linear_scores_test)}")
    subplot_model(2, "Decision Tree", tree_preds_train, tree_preds_test, tree_scores_train, tree_scores_test, 'blue',
                  f"Non-linear splits, interpretable, prone to overfitting\n{check_overfitting(tree_scores_train, tree_scores_test)}")
    subplot_model(3, "Random Forest", forest_preds_train, forest_preds_test, forest_scores_train, forest_scores_test, 'yellow',
                  f"Ensemble trees, robust, handles noise\n{check_overfitting(forest_scores_train, forest_scores_test)}")
    subplot_model(4, "XGBoost", xgb_preds_train, xgb_preds_test, xgb_scores_train, xgb_scores_test, 'blue',
                  f"Boosted learners, accurate, optimized\n{check_overfitting(xgb_scores_train, xgb_scores_test)}")
    subplot_model(5, "KNN", knn_preds_train, knn_preds_test, knn_scores_train, knn_scores_test, 'red',
                  f"Local method, sensitive to k\n{check_overfitting(knn_scores_train, knn_scores_test)}")

    plt.suptitle(f"Model Evolution – Frame {i+1}/{n_frames}", fontsize=18, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f"{output_dir}/frame_{i+1:02d}.png")
    plt.close()

# Create GIF
frame_paths = sorted(glob.glob(f"{output_dir}/frame_*.png"))
gif_path = "model_comparison_ames.gif"

with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)

# Determine the best model based on the final test R² score
best_model_index = np.argmax([linear_scores_test[-1], tree_scores_test[-1], forest_scores_test[-1], xgb_scores_test[-1], knn_scores_test[-1]])
best_model_name = ["Linear Regression", "Decision Tree", "Random Forest", "XGBoost", "KNN"][best_model_index]
best_model_score = [linear_scores_test[-1], tree_scores_test[-1], forest_scores_test[-1], xgb_scores_test[-1], knn_scores_test[-1]][best_model_index]

print(f"✅ GIF saved as: {gif_path}")
print(f"🏆 Best Model: {best_model_name} with R² Score: {best_model_score:.3f}")

# Add final frame with the best model summary
plt.figure(figsize=(20, 12))
plt.text(0.5, 0.5, f"🏆 Best Model: {best_model_name}\n\nR² Score: {best_model_score:.3f}\n\nWhy: {best_model_name} performed the best because it had the highest test R² score, indicating it generalizes well to unseen data.", 
         fontsize=18, ha='center', va='center', bbox=dict(facecolor='white', edgecolor='black'))
plt.axis('off')
plt.savefig(f"{output_dir}/frame_{n_frames+1:02d}.png")
plt.close()

# Add the final frame to the GIF
with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)
    final_frame_path = f"{output_dir}/frame_{n_frames+1:02d}.png"
    final_image = imageio.imread(final_frame_path)
    writer.append_data(final_image)

print(f"✅ GIF updated with the best model summary: {gif_path}")

  image = imageio.imread(frame_path)


✅ GIF saved as: model_comparison_ames.gif
🏆 Best Model: XGBoost with R² Score: 0.883


  plt.savefig(f"{output_dir}/frame_{n_frames+1:02d}.png")
  image = imageio.imread(frame_path)
  final_image = imageio.imread(final_frame_path)


✅ GIF updated with the best model summary: model_comparison_ames.gif


In [14]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import os
import imageio
import glob

# Load dataset
data = fetch_california_housing()
X = data.data[:, [0]]  # Feature: MedInc (for simplicity)
y = data.target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Frame settings
n_frames = 20
output_dir = "model_frames_california"
os.makedirs(output_dir, exist_ok=True)

# Prediction storage
linear_preds_train, tree_preds_train, forest_preds_train, xgb_preds_train, knn_preds_train = [], [], [], [], []
linear_preds_test, tree_preds_test, forest_preds_test, xgb_preds_test, knn_preds_test = [], [], [], [], []
linear_scores_train, tree_scores_train, forest_scores_train, xgb_scores_train, knn_scores_train = [], [], [], [], []
linear_scores_test, tree_scores_test, forest_scores_test, xgb_scores_test, knn_scores_test = [], [], [], [], []

# Linear Regression - increasing data
for i in range(50, len(X_train), int(len(X_train) / n_frames)):
    model = LinearRegression()
    model.fit(X_train[:i], y_train[:i])
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    linear_preds_train.append(preds_train)
    linear_preds_test.append(preds_test)
    linear_scores_train.append(r2_score(y_train, preds_train))
    linear_scores_test.append(r2_score(y_test, preds_test))

# Decision Tree - increasing depth
for depth in range(1, n_frames + 1):
    model = DecisionTreeRegressor(max_depth=depth)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    tree_preds_train.append(preds_train)
    tree_preds_test.append(preds_test)
    tree_scores_train.append(r2_score(y_train, preds_train))
    tree_scores_test.append(r2_score(y_test, preds_test))

# Random Forest - increasing n_estimators
for n in range(1, n_frames + 1):
    model = RandomForestRegressor(n_estimators=n)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    forest_preds_train.append(preds_train)
    forest_preds_test.append(preds_test)
    forest_scores_train.append(r2_score(y_train, preds_train))
    forest_scores_test.append(r2_score(y_test, preds_test))

# XGBoost - boosting iterations
y_pred_total_train = np.zeros_like(y_train, dtype=np.float64)
y_pred_total_test = np.zeros_like(y_test, dtype=np.float64)
for i in range(n_frames):
    residuals_train = y_train - y_pred_total_train
    residuals_test = y_test - y_pred_total_test
    model = XGBRegressor(n_estimators=1, learning_rate=1.0, max_depth=3, verbosity=0)
    model.fit(X_train, residuals_train)
    y_pred_total_train += 0.3 * model.predict(X_train)
    y_pred_total_test += 0.3 * model.predict(X_test)
    xgb_preds_train.append(y_pred_total_train.copy())
    xgb_preds_test.append(y_pred_total_test.copy())
    xgb_scores_train.append(r2_score(y_train, y_pred_total_train))
    xgb_scores_test.append(r2_score(y_test, y_pred_total_test))

# KNN - increasing neighbors
for k in range(1, n_frames + 1):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    knn_preds_train.append(preds_train)
    knn_preds_test.append(preds_test)
    knn_scores_train.append(r2_score(y_train, preds_train))
    knn_scores_test.append(r2_score(y_test, preds_test))

# Plot frames
for i in range(n_frames):
    plt.figure(figsize=(20, 12))

    def subplot_model(index, name, y_pred_train, y_pred_test, score_train, score_test, color, annotation):
        plt.subplot(3, 2, index)
        plt.scatter(X_test, y_test, color='lightgray', alpha=0.4, label='Test Data')
        plt.scatter(X_test, y_pred_test[i], color=color, alpha=0.6, label='Test Prediction')
        plt.scatter(X_train, y_pred_train[i], color='green', alpha=0.6, label='Train Prediction')
        plt.title(name, fontsize=14, fontweight='bold')
        plt.xlabel("Median Income")
        plt.ylabel("Target")
        plt.legend()
        plt.grid(True)
        plt.text(0.05, 0.9, f"Train R²: {score_train[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.8, f"Test R²: {score_test[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.7, annotation, transform=plt.gca().transAxes,
                 fontsize=9, bbox=dict(facecolor='white', alpha=0.7))

    def check_overfitting(score_train, score_test):
        if score_train[i] > score_test[i] + 0.1:
            return "Overfitting"
        elif score_train[i] < score_test[i] - 0.1:
            return "Underfitting"
        else:
            return "Good Fit"

    subplot_model(1, "Linear Regression", linear_preds_train, linear_preds_test, linear_scores_train, linear_scores_test, 'red',
                  f"Fast, linear, struggles with non-linearity\n{check_overfitting(linear_scores_train, linear_scores_test)}")
    subplot_model(2, "Decision Tree", tree_preds_train, tree_preds_test, tree_scores_train, tree_scores_test, 'blue',
                  f"Non-linear splits, interpretable, prone to overfitting\n{check_overfitting(tree_scores_train, tree_scores_test)}")
    subplot_model(3, "Random Forest", forest_preds_train, forest_preds_test, forest_scores_train, forest_scores_test, 'yellow',
                  f"Ensemble trees, robust, handles noise\n{check_overfitting(forest_scores_train, forest_scores_test)}")
    subplot_model(4, "XGBoost", xgb_preds_train, xgb_preds_test, xgb_scores_train, xgb_scores_test, 'blue',
                  f"Boosted learners, accurate, optimized\n{check_overfitting(xgb_scores_train, xgb_scores_test)}")
    subplot_model(5, "KNN", knn_preds_train, knn_preds_test, knn_scores_train, knn_scores_test, 'red',
                  f"Local method, sensitive to k\n{check_overfitting(knn_scores_train, knn_scores_test)}")

    plt.suptitle(f"Model Evolution – Frame {i+1}/{n_frames}", fontsize=18, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f"{output_dir}/frame_{i+1:02d}.png")
    plt.close()

# Create GIF
frame_paths = sorted(glob.glob(f"{output_dir}/frame_*.png"))
gif_path = "model_comparison_california.gif"

with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)

# Determine the best model based on the final test R² score
best_model_index = np.argmax([linear_scores_test[-1], tree_scores_test[-1], forest_scores_test[-1], xgb_scores_test[-1], knn_scores_test[-1]])
best_model_name = ["Linear Regression", "Decision Tree", "Random Forest", "XGBoost", "KNN"][best_model_index]
best_model_score = [linear_scores_test[-1], tree_scores_test[-1], forest_scores_test[-1], xgb_scores_test[-1], knn_scores_test[-1]][best_model_index]

print(f"✅ GIF saved as: {gif_path}")
print(f"🏆 Best Model: {best_model_name} with R² Score: {best_model_score:.3f}")

# Add final frame with the best model summary
plt.figure(figsize=(20, 12))
plt.text(0.5, 0.5, f"🏆 Best Model: {best_model_name}\n\nR² Score: {best_model_score:.3f}\n\nWhy: {best_model_name} performed the best because it had the highest test R² score, indicating it generalizes well to unseen data.", 
         fontsize=18, ha='center', va='center', bbox=dict(facecolor='white', edgecolor='black'))
plt.axis('off')
plt.savefig(f"{output_dir}/frame_{n_frames+1:02d}.png")
plt.close()

# Add the final frame to the GIF
with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)
    final_frame_path = f"{output_dir}/frame_{n_frames+1:02d}.png"
    final_image = imageio.imread(final_frame_path)
    writer.append_data(final_image)

print(f"✅ GIF updated with the best model summary: {gif_path}")

  image = imageio.imread(frame_path)


✅ GIF saved as: model_comparison_california.gif
🏆 Best Model: XGBoost with R² Score: 0.470


  plt.savefig(f"{output_dir}/frame_{n_frames+1:02d}.png")
  image = imageio.imread(frame_path)
  final_image = imageio.imread(final_frame_path)


✅ GIF updated with the best model summary: model_comparison_california.gif


In [16]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import os
import imageio
import glob

# Load dataset
data = fetch_california_housing()
X = data.data[:, [0]]  # Feature: MedInc (for simplicity)
y = data.target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Frame settings
n_frames = 20
output_dir = "model_frames_california"
os.makedirs(output_dir, exist_ok=True)

# Prediction storage
linear_preds_train, tree_preds_train, forest_preds_train, xgb_preds_train, knn_preds_train = [], [], [], [], []
linear_preds_test, tree_preds_test, forest_preds_test, xgb_preds_test, knn_preds_test = [], [], [], [], []
linear_scores_train, tree_scores_train, forest_scores_train, xgb_scores_train, knn_scores_train = [], [], [], [], []
linear_scores_test, tree_scores_test, forest_scores_test, xgb_scores_test, knn_scores_test = [], [], [], [], []

# Linear Regression - increasing data
for i in range(50, len(X_train), int(len(X_train) / n_frames)):
    model = LinearRegression()
    model.fit(X_train[:i], y_train[:i])
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    linear_preds_train.append(preds_train)
    linear_preds_test.append(preds_test)
    linear_scores_train.append(r2_score(y_train, preds_train))
    linear_scores_test.append(r2_score(y_test, preds_test))

# Decision Tree - increasing depth
for depth in range(1, n_frames + 1):
    model = DecisionTreeRegressor(max_depth=depth)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    tree_preds_train.append(preds_train)
    tree_preds_test.append(preds_test)
    tree_scores_train.append(r2_score(y_train, preds_train))
    tree_scores_test.append(r2_score(y_test, preds_test))

# Random Forest - increasing n_estimators
for n in range(1, n_frames + 1):
    model = RandomForestRegressor(n_estimators=n)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    forest_preds_train.append(preds_train)
    forest_preds_test.append(preds_test)
    forest_scores_train.append(r2_score(y_train, preds_train))
    forest_scores_test.append(r2_score(y_test, preds_test))

# XGBoost - boosting iterations
y_pred_total_train = np.zeros_like(y_train, dtype=np.float64)
y_pred_total_test = np.zeros_like(y_test, dtype=np.float64)
for i in range(n_frames):
    residuals_train = y_train - y_pred_total_train
    residuals_test = y_test - y_pred_total_test
    model = XGBRegressor(n_estimators=1, learning_rate=1.0, max_depth=3, verbosity=0)
    model.fit(X_train, residuals_train)
    y_pred_total_train += 0.3 * model.predict(X_train)
    y_pred_total_test += 0.3 * model.predict(X_test)
    xgb_preds_train.append(y_pred_total_train.copy())
    xgb_preds_test.append(y_pred_total_test.copy())
    xgb_scores_train.append(r2_score(y_train, y_pred_total_train))
    xgb_scores_test.append(r2_score(y_test, y_pred_total_test))

# KNN - increasing neighbors
for k in range(1, n_frames + 1):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    knn_preds_train.append(preds_train)
    knn_preds_test.append(preds_test)
    knn_scores_train.append(r2_score(y_train, preds_train))
    knn_scores_test.append(r2_score(y_test, preds_test))

# Plot frames
for i in range(n_frames):
    plt.figure(figsize=(20, 12))

    def subplot_model(index, name, y_pred_train, y_pred_test, score_train, score_test, color, annotation):
        plt.subplot(3, 2, index)
        plt.scatter(X_test, y_test, color='lightgray', alpha=0.4, label='Test Data')
        plt.scatter(X_test, y_pred_test[i], color=color, alpha=0.6, label='Test Prediction')
        plt.scatter(X_train, y_pred_train[i], color='green', alpha=0.6, label='Train Prediction')
        plt.title(name, fontsize=14, fontweight='bold')
        plt.xlabel("Median Income")
        plt.ylabel("Target")
        plt.legend()
        plt.grid(True)
        plt.text(0.05, 0.9, f"Train R²: {score_train[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.8, f"Test R²: {score_test[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.7, annotation, transform=plt.gca().transAxes,
                 fontsize=9, bbox=dict(facecolor='white', alpha=0.7))

    def check_overfitting(score_train, score_test):
        if score_train[i] > score_test[i] + 0.1:
            return "Overfitting"
        elif score_train[i] < score_test[i] - 0.1:
            return "Underfitting"
        else:
            return "Good Fit"

    subplot_model(1, "Linear Regression", linear_preds_train, linear_preds_test, linear_scores_train, linear_scores_test, 'red',
                  f"Fast, linear, struggles with non-linearity\n{check_overfitting(linear_scores_train, linear_scores_test)}")
    subplot_model(2, "Decision Tree", tree_preds_train, tree_preds_test, tree_scores_train, tree_scores_test, 'blue',
                  f"Non-linear splits, interpretable, prone to overfitting\n{check_overfitting(tree_scores_train, tree_scores_test)}")
    subplot_model(3, "Random Forest", forest_preds_train, forest_preds_test, forest_scores_train, forest_scores_test, 'yellow',
                  f"Ensemble trees, robust, handles noise\n{check_overfitting(forest_scores_train, forest_scores_test)}")
    subplot_model(4, "XGBoost", xgb_preds_train, xgb_preds_test, xgb_scores_train, xgb_scores_test, 'blue',
                  f"Boosted learners, accurate, optimized\n{check_overfitting(xgb_scores_train, xgb_scores_test)}")
    subplot_model(5, "KNN", knn_preds_train, knn_preds_test, knn_scores_train, knn_scores_test, 'red',
                  f"Local method, sensitive to k\n{check_overfitting(knn_scores_train, knn_scores_test)}")

    plt.suptitle(f"Model Evolution – Frame {i+1}/{n_frames}", fontsize=18, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f"{output_dir}/frame_{i+1:02d}.png")
    plt.close()

# Create GIF
frame_paths = sorted(glob.glob(f"{output_dir}/frame_*.png"))
gif_path = "model_comparison_california.gif"

with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)

# Determine the best model based on the final test R² score
best_model_index = np.argmax([linear_scores_test[-1], tree_scores_test[-1], forest_scores_test[-1], xgb_scores_test[-1], knn_scores_test[-1]])
best_model_name = ["Linear Regression", "Decision Tree", "Random Forest", "XGBoost", "KNN"][best_model_index]
best_model_score = [linear_scores_test[-1], tree_scores_test[-1], forest_scores_test[-1], xgb_scores_test[-1], knn_scores_test[-1]][best_model_index]

print(f"✅ GIF saved as: {gif_path}")
print(f"🏆 Best Model: {best_model_name} with R² Score: {best_model_score:.3f}")

# Add final frame with the best model summary
plt.figure(figsize=(20, 12))
plt.text(0.5, 0.5, f"🏆 Best Model: {best_model_name}\n\nR² Score: {best_model_score:.3f}\n\nWhy: {best_model_name} performed the best because it had the highest test R² score, indicating it generalizes well to unseen data.", 
         fontsize=18, ha='center', va='center', bbox=dict(facecolor='white', edgecolor='black'))
plt.axis('off')
plt.savefig(f"{output_dir}/frame_{n_frames+1:02d}.png")
plt.close()

# Add the final frame to the GIF
with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)
    final_frame_path = f"{output_dir}/frame_{n_frames+1:02d}.png"
    final_image = imageio.imread(final_frame_path)
    writer.append_data(final_image)

print(f"✅ GIF updated with the best model summary: {gif_path}")

  image = imageio.imread(frame_path)


✅ GIF saved as: model_comparison_california.gif
🏆 Best Model: XGBoost with R² Score: 0.470


  plt.savefig(f"{output_dir}/frame_{n_frames+1:02d}.png")
  image = imageio.imread(frame_path)
  final_image = imageio.imread(final_frame_path)


✅ GIF updated with the best model summary: model_comparison_california.gif


In [17]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import os
import imageio
import glob

# Load dataset
data = fetch_california_housing()
X = data.data[:, [0]]  # Feature: MedInc (for simplicity)
y = data.target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Frame settings
n_frames = 20
output_dir = "model_frames_california"
os.makedirs(output_dir, exist_ok=True)

# Prediction storage
linear_preds_train, tree_preds_train, forest_preds_train, xgb_preds_train, knn_preds_train = [], [], [], [], []
linear_preds_test, tree_preds_test, forest_preds_test, xgb_preds_test, knn_preds_test = [], [], [], [], []
linear_scores_train, tree_scores_train, forest_scores_train, xgb_scores_train, knn_scores_train = [], [], [], [], []
linear_scores_test, tree_scores_test, forest_scores_test, xgb_scores_test, knn_scores_test = [], [], [], [], []

# Linear Regression - increasing data
for i in range(50, len(X_train), int(len(X_train) / n_frames)):
    model = LinearRegression()
    model.fit(X_train[:i], y_train[:i])
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    linear_preds_train.append(preds_train)
    linear_preds_test.append(preds_test)
    linear_scores_train.append(r2_score(y_train, preds_train))
    linear_scores_test.append(r2_score(y_test, preds_test))

# Decision Tree - increasing depth
for depth in range(1, n_frames + 1):
    model = DecisionTreeRegressor(max_depth=depth)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    tree_preds_train.append(preds_train)
    tree_preds_test.append(preds_test)
    tree_scores_train.append(r2_score(y_train, preds_train))
    tree_scores_test.append(r2_score(y_test, preds_test))

# Random Forest - increasing n_estimators
for n in range(1, n_frames + 1):
    model = RandomForestRegressor(n_estimators=n)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    forest_preds_train.append(preds_train)
    forest_preds_test.append(preds_test)
    forest_scores_train.append(r2_score(y_train, preds_train))
    forest_scores_test.append(r2_score(y_test, preds_test))

# XGBoost - boosting iterations
y_pred_total_train = np.zeros_like(y_train, dtype=np.float64)
y_pred_total_test = np.zeros_like(y_test, dtype=np.float64)
for i in range(n_frames):
    residuals_train = y_train - y_pred_total_train
    residuals_test = y_test - y_pred_total_test
    model = XGBRegressor(n_estimators=1, learning_rate=1.0, max_depth=3, verbosity=0)
    model.fit(X_train, residuals_train)
    y_pred_total_train += 0.3 * model.predict(X_train)
    y_pred_total_test += 0.3 * model.predict(X_test)
    xgb_preds_train.append(y_pred_total_train.copy())
    xgb_preds_test.append(y_pred_total_test.copy())
    xgb_scores_train.append(r2_score(y_train, y_pred_total_train))
    xgb_scores_test.append(r2_score(y_test, y_pred_total_test))

# KNN - increasing neighbors
for k in range(1, n_frames + 1):
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)
    knn_preds_train.append(preds_train)
    knn_preds_test.append(preds_test)
    knn_scores_train.append(r2_score(y_train, preds_train))
    knn_scores_test.append(r2_score(y_test, preds_test))

# Plot frames
for i in range(n_frames):
    plt.figure(figsize=(20, 12))

    def subplot_model(index, name, y_pred_train, y_pred_test, score_train, score_test, color, annotation):
        plt.subplot(3, 2, index)
        plt.scatter(X_test, y_test, color='lightgray', alpha=0.4, label='Test Data')
        plt.scatter(X_test, y_pred_test[i], color=color, alpha=0.6, label='Test Prediction')
        plt.scatter(X_train, y_pred_train[i], color='green', alpha=0.6, label='Train Prediction')
        plt.title(name, fontsize=14, fontweight='bold')
        plt.xlabel("Median Income")
        plt.ylabel("Target")
        plt.legend()
        plt.grid(True)
        plt.text(0.05, 0.9, f"Train R²: {score_train[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.8, f"Test R²: {score_test[i]:.3f}", transform=plt.gca().transAxes,
                 fontsize=10, bbox=dict(facecolor='white', edgecolor='black'))
        plt.text(0.05, 0.7, annotation, transform=plt.gca().transAxes,
                 fontsize=9, bbox=dict(facecolor='white', alpha=0.7))

    def check_overfitting(score_train, score_test):
        if score_train[i] > score_test[i] + 0.1:
            return "Overfitting"
        elif score_train[i] < score_test[i] - 0.1:
            return "Underfitting"
        else:
            return "Good Fit"

    subplot_model(1, "Linear Regression", linear_preds_train, linear_preds_test, linear_scores_train, linear_scores_test, 'red',
                  f"Fast, linear, struggles with non-linearity\n{check_overfitting(linear_scores_train, linear_scores_test)}")
    subplot_model(2, "Decision Tree", tree_preds_train, tree_preds_test, tree_scores_train, tree_scores_test, 'blue',
                  f"Non-linear splits, interpretable, prone to overfitting\n{check_overfitting(tree_scores_train, tree_scores_test)}")
    subplot_model(3, "Random Forest", forest_preds_train, forest_preds_test, forest_scores_train, forest_scores_test, 'yellow',
                  f"Ensemble trees, robust, handles noise\n{check_overfitting(forest_scores_train, forest_scores_test)}")
    subplot_model(4, "XGBoost", xgb_preds_train, xgb_preds_test, xgb_scores_train, xgb_scores_test, 'blue',
                  f"Boosted learners, accurate, optimized\n{check_overfitting(xgb_scores_train, xgb_scores_test)}")
    subplot_model(5, "KNN", knn_preds_train, knn_preds_test, knn_scores_train, knn_scores_test, 'red',
                  f"Local method, sensitive to k\n{check_overfitting(knn_scores_train, knn_scores_test)}")

    plt.suptitle(f"Model Evolution – Frame {i+1}/{n_frames}", fontsize=18, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.savefig(f"{output_dir}/frame_{i+1:02d}.png")
    plt.close()

# Create GIF
frame_paths = sorted(glob.glob(f"{output_dir}/frame_*.png"))
gif_path = "model_comparison_california.gif"

with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)

# Determine the best model based on the final test R² score
best_model_index = np.argmax([linear_scores_test[-1], tree_scores_test[-1], forest_scores_test[-1], xgb_scores_test[-1], knn_scores_test[-1]])
best_model_name = ["Linear Regression", "Decision Tree", "Random Forest", "XGBoost", "KNN"][best_model_index]
best_model_score = [linear_scores_test[-1], tree_scores_test[-1], forest_scores_test[-1], xgb_scores_test[-1], knn_scores_test[-1]][best_model_index]

print(f"✅ GIF saved as: {gif_path}")
print(f"🏆 Best Model: {best_model_name} with R² Score: {best_model_score:.3f}")

# Add final frame with the best model summary
plt.figure(figsize=(20, 12))
plt.text(0.5, 0.5, f"🏆 Best Model: {best_model_name}\n\nR² Score: {best_model_score:.3f}\n\nWhy: {best_model_name} performed the best because it had the highest test R² score, indicating it generalizes well to unseen data.", 
         fontsize=18, ha='center', va='center', bbox=dict(facecolor='white', edgecolor='black'))
plt.axis('off')
plt.savefig(f"{output_dir}/frame_{n_frames+1:02d}.png")
plt.close()

# Add the final frame to the GIF
with imageio.get_writer(gif_path, mode='I', duration=0.6, loop=0) as writer:
    for frame_path in frame_paths:
        image = imageio.imread(frame_path)
        writer.append_data(image)
    final_frame_path = f"{output_dir}/frame_{n_frames+1:02d}.png"
    final_image = imageio.imread(final_frame_path)
    writer.append_data(final_image)

print(f"✅ GIF updated with the best model summary: {gif_path}")

  image = imageio.imread(frame_path)


✅ GIF saved as: model_comparison_california.gif
🏆 Best Model: XGBoost with R² Score: 0.470


  plt.savefig(f"{output_dir}/frame_{n_frames+1:02d}.png")
  image = imageio.imread(frame_path)
  final_image = imageio.imread(final_frame_path)


✅ GIF updated with the best model summary: model_comparison_california.gif
