In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Load dataset
# Assuming 'df' is the dataset containing the features and target variable
# df = pd.read_csv("your_dataset.csv")

# Preprocess data (Example: handling missing values and encoding categorical variables)
# df.fillna(df.mean(), inplace=True)
# df = pd.get_dummies(df, drop_first=True)

# Splitting features and target variable
# Assuming 'target' is the column to predict
# X = df.drop(columns=['target'])
# y = df['target']

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train SGDRegressor with warm_start=True
lin_reg = SGDRegressor(max_iter=1, tol=None, warm_start=True, learning_rate='constant', eta0=0.01)
loss_curve = []

for i in range(100):  # 100 iterations
    lin_reg.partial_fit(X_train, y_train)
    y_pred = lin_reg.predict(X_train)
    loss = mean_squared_error(y_train, y_pred)
    loss_curve.append(loss)

# Evaluate model
y_test_pred = lin_reg.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

print(f"Test MSE: {mse:.4f}")
print(f"Test R^2 Score: {r2:.4f}")

# Plot loss curve
plt.plot(loss_curve, label='Train Loss')
plt.xlabel("Iterations")
plt.ylabel("Loss")
plt.title("Linear Regression Loss Curve")
plt.legend()
plt.show()

# Train and evaluate other models
models = {
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    print(f"{name} - MSE: {mse:.4f}, R^2 Score: {r2:.4f}")

# Save best model
joblib.dump(lin_reg, "best_model.pkl")

NameError: name 'X' is not defined