# 🏠 Housing Price Prediction - Advanced Regression

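This notebook walks through predicting house sale prices (`SalePrice`) with linear regression models: an ordinary least-squares baseline, followed by Ridge and Lasso regression whose regularization strength is tuned by cross-validation.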

**Dataset:** [Kaggle: House Prices - Advanced Regression Techniques](https://www.kaggle.com/c/house-prices-advanced-regression-techniques)

**Skills:**
- Data preprocessing
- Feature engineering
- Linear, Ridge, and Lasso regression
- Cross-validation
- Feature importance visualization


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error

# Read the training data (path is resolved relative to the current working
# directory) and preview the first rows as the cell's output.
df = pd.read_csv(filepath_or_buffer="train.csv")
df.head(n=5)


In [None]:
# Turn the loaded frame into a fully numeric feature matrix and a train/test split.
# Each step returns a new frame instead of mutating in place, and tolerates input
# that has already been processed, so the cell can be re-run without a KeyError.

# Drop high-missing columns (errors="ignore": no failure if they are already gone)
df = df.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature'], errors="ignore")

# Fill missing values in numeric columns with the column median.
# select_dtypes("number") keeps the median restricted to true numeric columns
# (the boolean dummy columns that exist on a re-run are skipped).
# NOTE(review): the medians are computed on the full frame *before* the split, so
# test rows influence the imputed values; consider imputing from X_train only.
df = df.fillna(df.select_dtypes(include="number").median())

# Convert categoricals to dummies (missing categories become all-zero dummy rows,
# because get_dummies ignores NaN by default)
df = pd.get_dummies(df)

# Prepare data
# `Id` is only a row identifier, not a property of the house, so it must not be
# used as a predictor; errors="ignore" keeps this working if the column is absent.
X = df.drop(columns="SalePrice").drop(columns="Id", errors="ignore")
y = df["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Baseline model: ordinary least squares on every feature, scored by RMSE on the
# held-out test split.
lr = LinearRegression().fit(X_train, y_train)
preds = lr.predict(X_test)

lr_rmse = np.sqrt(mean_squared_error(y_test, preds))
print("Linear Regression RMSE:", lr_rmse)


In [None]:
# Tune the regularization strength (alpha) of Ridge and Lasso on the training
# split. Both searches share the same protocol: 5-fold cross-validation, with
# candidates ranked by negated RMSE (higher is better).
search_settings = dict(scoring='neg_root_mean_squared_error', cv=5)

# Ridge tuning
ridge_alphas = [0.01, 0.1, 1, 10, 100]
ridge_grid = GridSearchCV(Ridge(), {'alpha': ridge_alphas}, **search_settings)
ridge_grid.fit(X_train, y_train)
print("Best Ridge alpha:", ridge_grid.best_params_['alpha'])

# Lasso tuning
lasso_alphas = [0.001, 0.01, 0.1, 1]
lasso_grid = GridSearchCV(Lasso(), {'alpha': lasso_alphas}, **search_settings)
lasso_grid.fit(X_train, y_train)
print("Best Lasso alpha:", lasso_grid.best_params_['alpha'])


In [None]:
# Ridge Feature Importance
# The grid search was created with the default refit=True, so it has already refit
# the best-alpha Ridge on the whole training split; reuse that model instead of
# fitting an identical one a second time.
ridge = ridge_grid.best_estimator_

coef = ridge.coef_
features = X_train.columns  # labels taken from the matrix the model was fitted on

# A raw coefficient is "price change per one unit of the feature", so its size
# depends on the feature's units and raw magnitudes are not comparable across
# features. Multiplying by each feature's training standard deviation expresses
# every effect as "price change per one standard deviation", which makes the
# ranking meaningful.
feature_std = X_train.to_numpy(dtype=float).std(axis=0)
scaled_coef = coef * feature_std
top_idx = np.argsort(np.abs(scaled_coef))[-20:]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(features[top_idx], scaled_coef[top_idx])
ax.set_title("Top 20 Feature Importances (Ridge)")
ax.set_xlabel("Coefficient x feature std. dev. (SalePrice change per 1 SD)")
fig.tight_layout()
plt.show()
