In [1]:
# Install necessary libraries (if not installed)
# !pip install pandas numpy matplotlib seaborn scikit-learn xgboost lightgbm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_selection import RFE

# Load dataset (Replace with actual dataset path)
df = pd.read_csv("house_prices.csv")

# Display first few rows
print(df.head())

# Separate features and target first, so that column-type detection below
# only ever looks at feature columns (the target cannot end up in either list,
# whatever dtype it was parsed as).
X = df.drop(columns="Price")  # Features
y = df["Price"]  # Target variable

# Identify numerical and categorical feature columns.
# "number" matches every numeric width (int32, float32, ...), not only 64-bit.
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

# Handling missing values
num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

# Scaling numerical data
scaler = StandardScaler()

# Encoding categorical data (categories unseen during fit encode as all zeros)
encoder = OneHotEncoder(handle_unknown="ignore")

# Create transformers
num_pipeline = Pipeline([
    ("imputer", num_imputer),
    ("scaler", scaler),
])

cat_pipeline = Pipeline([
    ("imputer", cat_imputer),
    ("encoder", encoder),
])

# Combine transformers.
# sparse_threshold=0 forces a dense ndarray result. With the default (0.3) the
# output silently becomes a SciPy sparse matrix once the one-hot block is large
# enough, and the pd.DataFrame(...) construction further down would then fail.
preprocessor = ColumnTransformer(
    [
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ],
    sparse_threshold=0,
)

# Splitting data (done before fitting the preprocessor so that no statistics
# from the test rows leak into imputation/scaling/encoding)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Apply transformations: fit on train only, reuse the fitted state on test
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Retrieve numerical feature names (since StandardScaler does not change names)
num_features = num_cols

# Retrieve categorical feature names from OneHotEncoder.
# Look the pipeline up by name rather than by position, and skip it when there
# are no categorical columns: in that case the sub-pipeline is never fitted and
# asking it for feature names would raise.
if cat_cols:
    cat_features = preprocessor.named_transformers_["cat"].get_feature_names_out(cat_cols)
else:
    cat_features = []

# Combine all feature names (same order as the ColumnTransformer output:
# numerical block first, then the one-hot block)
transformed_feature_names = list(num_features) + list(cat_features)

# Convert transformed data back into a DataFrame for clarity.
# Keeping X_train's index lets the frame be lined up with y_train if needed.
X_train_transformed_df = pd.DataFrame(
    X_train_transformed,
    columns=transformed_feature_names,
    index=X_train.index,
)

# Train multiple models.
# Every estimator with a random component is seeded so reruns give the same metrics.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
}

# Train and evaluate models on the held-out test split
for name, model in models.items():
    model.fit(X_train_transformed, y_train)
    y_pred = model.predict(X_test_transformed)

    print(f"\n{name} Model Performance:")
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
    print("R2 Score:", r2_score(y_test, y_pred))

# Perform Recursive Feature Elimination (RFE)
# Select the top 10 features, but never ask for more than actually exist:
# with fewer than 10 transformed columns RFE cannot eliminate anything.
n_features_to_select = min(10, len(transformed_feature_names))
model = LinearRegression()
selector = RFE(model, n_features_to_select=n_features_to_select)
selector.fit(X_train_transformed_df, y_train)

# Retrieve the selected features from the transformed column names
# (selector.support_ is a boolean mask aligned with the input columns)
selected_features = [
    feature
    for feature, keep in zip(transformed_feature_names, selector.support_)
    if keep
]
print("\nSelected Features:", selected_features)


   Size       Location  Rooms  YearBuilt   Price
0  1400       New York      3       2005  350000
1  1600    Los Angeles      4       2010  420000
2  1700  San Francisco      3       2015  600000
3  1200          Miami      2       2000  250000
4  2000        Chicago      4       2012  480000

Linear Regression Model Performance:
MAE: 13902.439024390187
RMSE: 14579.165008661852
R2 Score: 0.9566220301326959

Decision Tree Model Performance:
MAE: 65000.0
RMSE: 73824.115301167
R2 Score: -0.11224489795918369

XGBoost Model Performance:
MAE: 51400.8671875
RMSE: 56208.895830282905
R2 Score: 0.35521633255926643

Selected Features: ['Size', 'Rooms', 'YearBuilt', 'Location_Chicago', 'Location_Los Angeles', 'Location_Miami', 'Location_New York', 'Location_San Francisco']
