In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score

# --- 1. Load and Prepare Data ---
# Load your data from a local CSV file
# IMPORTANT: Make sure to change 'your_ames_housing.csv' to the correct file path.
file_path = '/content/AmesHousing.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    print("Please make sure the CSV file is in the same directory as the script or provide the full path.")
    # Exit or handle the error as needed
    exit()


# Select the same subset of features for the example.
# You can adjust this list for your specific needs.
features = [
    'Lot Area', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add',
    'BsmtFin SF 1', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Gr Liv Area',
    'Full Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces',
    'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'MS Zoning', 'Neighborhood'
]
target = 'SalePrice'

# Ensure all selected features and the target exist in the DataFrame
required_columns = features + [target]
missing_cols = [col for col in required_columns if col not in df.columns]
if missing_cols:
    print(f"Error: The following required columns are missing from your CSV file: {missing_cols}")
    exit()

df_subset = df[required_columns].copy()


# --- 2. Preprocessing ---

# A) Log-transform the target variable
df_subset['SalePrice'] = np.log1p(df_subset['SalePrice'])

# B) Separate numerical and categorical features
numerical_features = df_subset.select_dtypes(include=np.number).columns.drop('SalePrice')
categorical_features = df_subset.select_dtypes(include='object').columns

# C) Handle Missing Values
for col in numerical_features:
    df_subset[col] = df_subset[col].fillna(df_subset[col].median())
for col in categorical_features:
    df_subset[col] = df_subset[col].fillna('None')

# D) One-Hot Encode Categorical Features
df_processed = pd.get_dummies(df_subset, columns=categorical_features, drop_first=True)

# --- 3. Splitting and Scaling ---

X = df_processed.drop('SalePrice', axis=1)
y = df_processed['SalePrice']

# A) Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# B) Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# --- 4. Modeling ---

alphas = [0.001, 0.01, 0.1, 1, 10, 100]

print("Training Ridge model...")
ridge = RidgeCV(alphas=alphas, store_cv_values=True)
ridge.fit(X_train, y_train)

print("Training Lasso model...")
lasso = LassoCV(alphas=alphas, cv=5, random_state=42)
lasso.fit(X_train, y_train)

print("Training ElasticNet model...")
elastic_net = ElasticNetCV(alphas=alphas, cv=5, random_state=42)
elastic_net.fit(X_train, y_train)

# --- 5. Evaluation ---

models = {'Ridge': ridge, 'Lasso': lasso, 'ElasticNet': elastic_net}
for name, model in models.items():
    y_pred = model.predict(X_test)
    y_pred_orig = np.expm1(y_pred)
    y_test_orig = np.expm1(y_test)
    rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred_orig))
    r2 = r2_score(y_test_orig, y_pred_orig)

    print(f"\n--- {name} Results ---")
    print(f"Best Alpha: {model.alpha_}")
    print(f"R-squared (R²): {r2:.4f}")
    print(f"Root Mean Squared Error (RMSE): ${rmse:,.2f}")

# --- 6. Lasso Feature Selection ---
lasso_coefs = pd.Series(lasso.coef_, index=X.columns)
removed_features = lasso_coefs[lasso_coefs == 0].index.tolist()

print("\n--- Lasso Feature Selection ---")
print(f"Lasso removed {len(removed_features)} features by setting their coefficients to zero.")

Training Ridge model...
Training Lasso model...




Training ElasticNet model...

--- Ridge Results ---
Best Alpha: 100.0
R-squared (R²): 0.8693
Root Mean Squared Error (RMSE): $32,365.32

--- Lasso Results ---
Best Alpha: 0.001
R-squared (R²): 0.8627
Root Mean Squared Error (RMSE): $33,176.13

--- ElasticNet Results ---
Best Alpha: 0.001
R-squared (R²): 0.8640
Root Mean Squared Error (RMSE): $33,025.48

--- Lasso Feature Selection ---
Lasso removed 4 features by setting their coefficients to zero.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, r2_score

# --- 1. Generate a Synthetic Dataset ---
# This mimics the properties of the Ames Housing data.
print("Generating a synthetic dataset...")
np.random.seed(42)
n_samples = 1500

# Key predictive features
overall_qual = np.random.randint(1, 11, n_samples)
gr_liv_area = np.random.normal(1500, 400, n_samples) + overall_qual * 100
garage_cars = np.random.randint(0, 5, n_samples)

# Create multicollinearity
total_bsmt_sf = gr_liv_area * np.random.normal(0.6, 0.1, n_samples)
first_flr_sf = total_bsmt_sf * np.random.normal(0.7, 0.1, n_samples)

# Categorical feature
neighborhoods = ['Urban', 'Suburban', 'Rural']
neighborhood = np.random.choice(neighborhoods, n_samples, p=[0.5, 0.4, 0.1])
neighborhood_encoded = pd.get_dummies(neighborhood, drop_first=True, prefix='Neighborhood')

# Noise features (less predictive)
lot_area = np.random.normal(10000, 3000, n_samples)
year_built = np.random.randint(1950, 2022, n_samples)

# Define the relationship for the target variable (SalePrice)
log_sale_price = (3 +
                  overall_qual * 0.2 +
                  gr_liv_area * 0.001 +
                  garage_cars * 0.15 +
                  total_bsmt_sf * 0.0005 +
                  neighborhood_encoded['Neighborhood_Suburban'] * 0.1 +
                  neighborhood_encoded['Neighborhood_Urban'] * 0.2 +
                  np.random.normal(0, 0.1, n_samples)) # noise

# Create the skewed SalePrice by taking the exponent
sale_price = np.exp(log_sale_price)

# Assemble the DataFrame
df = pd.DataFrame({
    'SalePrice': sale_price,
    'Overall Qual': overall_qual,
    'Gr Liv Area': gr_liv_area,
    'Garage Cars': garage_cars,
    'Total Bsmt SF': total_bsmt_sf,
    '1st Flr SF': first_flr_sf,
    'Lot Area': lot_area,
    'Year Built': year_built,
    'Neighborhood': neighborhood
})

# Introduce some missing values
df.loc[df.sample(frac=0.05).index, 'Total Bsmt SF'] = np.nan
print("Synthetic dataset created successfully.")

# --- Visualization 1: Target Variable Distribution ---
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.histplot(df['SalePrice'], kde=True, bins=50)
plt.title('Distribution of SalePrice (Original Synthetic)')

df['SalePrice_log'] = np.log1p(df['SalePrice'])

plt.subplot(1, 2, 2)
sns.histplot(df['SalePrice_log'], kde=True, color='green', bins=50)
plt.title('Distribution of SalePrice (Log-Transformed)')
plt.xlabel('Log(SalePrice + 1)')
plt.tight_layout()
plt.savefig('saleprice_distribution.png')
print("Saved SalePrice distribution plot to saleprice_distribution.png")
plt.close()


# --- Visualization 2: Feature Correlation Heatmap ---
corrmat = df.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corrmat, cbar=True, annot=True, fmt='.2f', cmap='viridis')
plt.title('Correlation Heatmap of Synthetic Features')
plt.savefig('correlation_heatmap.png')
print("Saved correlation heatmap to correlation_heatmap.png")
plt.close()


# --- 2. Preprocessing ---
df_subset = df.drop('SalePrice', axis=1)
df_subset = df_subset.rename(columns={'SalePrice_log': 'SalePrice'})

# Handle missing values for numerical columns
for col in df_subset.select_dtypes(include=[np.number]).columns:
    df_subset[col] = df_subset[col].fillna(df_subset[col].median())

# One-hot encode categorical features
df_processed = pd.get_dummies(df_subset, drop_first=True)

# --- 3. Splitting and Scaling ---
X = df_processed.drop('SalePrice', axis=1)
y = df_processed['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# --- 4. Modeling (Lasso) ---
alphas = [0.0001, 0.0005, 0.001, 0.01, 0.1]
print("Training Lasso model...")
lasso = LassoCV(alphas=alphas, cv=5, random_state=42, max_iter=10000)
lasso.fit(X_train_scaled, y_train)

print(f"\n--- Lasso Model Results ---")
print(f"Best Alpha found by Cross-Validation: {lasso.alpha_}")


# --- 5. Evaluation & Visualization ---
y_pred_log = lasso.predict(X_test_scaled)
y_pred_orig = np.expm1(y_pred_log)
y_test_orig = np.expm1(y_test)

rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred_orig))
r2 = r2_score(y_test_orig, y_pred_orig)

print(f"R-squared (R²): {r2:.4f}")
print(f"Root Mean Squared Error (RMSE): ${rmse:,.2f}")

# --- Visualization 3: Predicted vs. Actual Values ---
plt.figure(figsize=(8, 8))
plt.scatter(y_test_orig, y_pred_orig, alpha=0.5, color='blue')
plt.plot([min(y_test_orig), max(y_test_orig)], [min(y_test_orig), max(y_test_orig)], '--', color='red', lw=2)
plt.xlabel('Actual Sale Price ($)')
plt.ylabel('Predicted Sale Price ($)')
plt.title('Lasso Model: Actual vs. Predicted Prices')
plt.savefig('actual_vs_predicted.png')
print("Saved Actual vs. Predicted plot to actual_vs_predicted.png")
plt.close()

# --- Visualization 4: Residuals Plot ---
residuals = y_test_orig - y_pred_orig
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_pred_orig, y=residuals, alpha=0.5)
plt.axhline(y=0, color='red', linestyle='--')
plt.xlabel('Predicted Sale Price ($)')
plt.ylabel('Residuals ($)')
plt.title('Lasso Model: Residuals Plot')
plt.savefig('residuals_plot.png')
print("Saved Residuals plot to residuals_plot.png")
plt.close()

# --- Visualization 5: Feature Importance (Coefficients) ---
lasso_coefs = pd.Series(lasso.coef_, index=X.columns)
important_coefs = lasso_coefs[lasso_coefs.abs() > 0].sort_values()

plt.figure(figsize=(10, 8))
important_coefs.plot(kind='barh', color=important_coefs.apply(lambda x: 'g' if x > 0 else 'r'))
plt.title('Lasso Model: Feature Coefficients (Synthetic Data)')
plt.xlabel('Coefficient Value (Impact on Log-Price)')
plt.ylabel('Feature')
plt.tight_layout()
plt.savefig('feature_importance.png')
print("Saved Feature Importance plot to feature_importance.png")
plt.close()

Generating a synthetic dataset...
Synthetic dataset created successfully.
Saved SalePrice distribution plot to saleprice_distribution.png
Saved correlation heatmap to correlation_heatmap.png
Training Lasso model...

--- Lasso Model Results ---
Best Alpha found by Cross-Validation: 0.0001
R-squared (R²): 0.9853
Root Mean Squared Error (RMSE): $391.83
Saved Actual vs. Predicted plot to actual_vs_predicted.png
Saved Residuals plot to residuals_plot.png
Saved Feature Importance plot to feature_importance.png
