# In [None]:
# House Prices - Advanced Regression Techniques
# Full Data Preprocessing, Feature Engineering, and Baseline Modeling

# Import Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load Dataset
# Both CSV files are read from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
print(f"Train Shape: {train.shape}, Test Shape: {test.shape}")

# Check Missing Values
# Count nulls per training column, order them largest first, and report only
# the columns that actually contain gaps.
null_counts = train.isnull().sum()
missing = null_counts.sort_values(ascending=False)
print("Missing Values:\n", missing[missing > 0])

# Fill Missing Values
# Categorical columns where a missing entry becomes its own 'None' category.
# GarageYrBlt is deliberately NOT in this list: it holds years, and filling it
# with a string would turn the column into text that one-hot encoding later
# expands into one dummy column per distinct year.
cols_none = ['GarageType','GarageFinish','GarageQual','GarageCond',
             'BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
             'MasVnrType','FireplaceQu', 'Alley', 'PoolQC', 'Fence', 'MiscFeature']

for col in cols_none:
    train[col] = train[col].fillna('None')
    test[col] = test[col].fillna('None')

# LotFrontage: impute with the median of the house's neighbourhood. The
# medians are learned from the training set only and reused for the test set,
# so no test-set statistics are involved. The overall training median is the
# fallback for neighbourhoods that have no usable median.
lot_frontage_by_nbhd = train.groupby('Neighborhood')['LotFrontage'].median()
lot_frontage_overall = train['LotFrontage'].median()
for dataset in (train, test):
    lot_fill = dataset['Neighborhood'].map(lot_frontage_by_nbhd).fillna(lot_frontage_overall)
    dataset['LotFrontage'] = dataset['LotFrontage'].fillna(lot_fill)

# Numeric columns where a missing value is filled with zero.
# Assign the result back instead of `df[col].fillna(..., inplace=True)`:
# the chained inplace form operates on a temporary under pandas
# Copy-on-Write and would leave the frame unchanged.
for col in ['MasVnrArea', 'GarageCars', 'GarageArea', 'GarageYrBlt']:
    train[col] = train[col].fillna(0)
    test[col] = test[col].fillna(0)

# Fill whatever is still missing with the most frequent value of the column,
# taken from the training set when the column exists there.
for dataset in [train, test]:
    for col in dataset.columns:
        if dataset[col].isnull().any():
            reference = train[col] if col in train.columns else dataset[col]
            dataset[col] = dataset[col].fillna(reference.mode()[0])

# Log-transform the target
# Plot the raw SalePrice distribution first so the effect of the transform
# can be judged visually.
price_ax = sns.histplot(train['SalePrice'], kde=True)
price_ax.set_title('SalePrice Distribution Before Log Transform')
plt.show()

# log1p(x) = log(1 + x). NOTE: this overwrites the column in place, so
# re-running this step applies the transform a second time.
train['SalePrice'] = train['SalePrice'].pipe(np.log1p)

# Feature Engineering
# The same derived columns are added to both frames so their schemas match.
for frame in (train, test):
    # Above-ground living area plus total basement area.
    frame['LivingArea'] = frame['GrLivArea'].add(frame['TotalBsmtSF'])

    # Years between construction and sale.
    frame['HouseAge'] = frame['YrSold'].sub(frame['YearBuilt'])

    # 1 when the remodel year differs from the build year, else 0.
    frame['IsRemodeled'] = frame['YearBuilt'].ne(frame['YearRemodAdd']).astype(int)

    # Bathrooms above and below ground; each half bath counts as 0.5.
    full_baths = frame['FullBath'] + frame['BsmtFullBath']
    half_baths = frame['HalfBath'] + frame['BsmtHalfBath']
    frame['TotalBath'] = full_baths + 0.5 * half_baths

# Ordinal Encoding for Quality Features
# Grades are mapped onto 0..5 in ascending order: None < Po < Fa < TA < Gd < Ex.
quality_map = dict(zip(['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], range(6)))
quality_features = [
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond',
    'HeatingQC', 'KitchenQual', 'FireplaceQu',
]

# Series.map turns any value that is not a key of quality_map into NaN.
for frame in (train, test):
    for col in quality_features:
        frame[col] = frame[col].map(quality_map)

# One-Hot Encoding
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# Give the test frame exactly the training columns: dummy columns seen only in
# train are added to test as zeros, and columns seen only in test are dropped.
train, test = train.align(test, join='left', axis=1, fill_value=0)

X = train.drop(['SalePrice', 'Id'], axis=1)
y = train['SalePrice']
# The left-join align above also creates a zero-filled 'SalePrice' column in
# test. Selecting X's columns removes it together with 'Id', so X_test has the
# same features in the same order as X. `test` itself keeps 'Id' for the
# submission file.
X_test = test[X.columns]

# Feature Scaling (optional for linear models)
# The standardisation statistics are learned from the training matrix only and
# then applied unchanged to both the training and the test matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

# Feature Selection using RandomForest
# Fit a 100-tree forest on every available column, rank the columns by the
# forest's feature importances, and keep the 50 highest-ranked ones.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)
importances = pd.Series(rf.feature_importances_, index=X.columns)
top_features = importances.nlargest(50).index

# Restrict both design matrices to the selected columns.
X_sel = X.loc[:, top_features]
X_test_sel = X_test.loc[:, top_features]

# Model Training with LassoCV
# Lasso's L1 penalty depends on the scale of each feature, so the selected
# features are standardised before the regression. Doing this inside a
# pipeline means the scaler is refit on the training part of every CV fold,
# and `model.predict` accepts unscaled feature frames.
model = make_pipeline(StandardScaler(), LassoCV(cv=5, random_state=42))
scores = cross_val_score(model, X_sel, y, scoring='neg_mean_squared_error', cv=5)
# Scores are negated MSEs; flip the sign and take the root to get per-fold RMSE.
rmse_scores = np.sqrt(-scores).tolist()
print(f'Cross-Validation RMSE: {np.mean(rmse_scores)}')

model.fit(X_sel, y)
y_pred = model.predict(X_sel)
# Take the root explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and absent from newer scikit-learn releases.
rmse_train = np.sqrt(mean_squared_error(y, y_pred))
print(f'Training RMSE: {rmse_train}')

# Final Predictions on Test Data
# The model was trained on log1p(SalePrice), so expm1 maps its output back to
# the original price scale.
log_test_preds = model.predict(X_test_sel)
test_preds = np.expm1(log_test_preds)

# Submission File
# Two columns, 'Id' and 'SalePrice', written without the DataFrame index.
submission = test[['Id']].assign(SalePrice=test_preds)
submission.to_csv('submission.csv', index=False)
print('Submission file generated!')
