In [None]:
# ✅ Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [None]:
# ✅ Upload CSV from Local System (For Google Colab Users)
# `google.colab` is importable only inside a Colab runtime. Anywhere else the
# import fails, so the upload prompt is skipped and `uploaded` is left as an
# empty dict; the CSV is then expected to already be in the working directory.
try:
    from google.colab import files
except ImportError:
    uploaded = {}
else:
    # Opens Colab's file picker and stores whatever `files.upload()` returns.
    uploaded = files.upload()


In [None]:
# ✅ Load Dataset
# Read the CSV from the current working directory into a DataFrame,
# then preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='Housing.csv')
df.head(n=5)


In [None]:
# ✅ Check Missing Values
# Per-column count of missing entries (`isna` is the same check as `isnull`).
df.isna().sum()


In [None]:
# ✅ One-Hot Encoding for Categorical Columns
# Expand categorical columns into indicator columns; `drop_first=True`
# omits the first level of each so the indicators are not redundant.
df_encoded = pd.get_dummies(data=df, drop_first=True)
df_encoded.head(n=5)


In [None]:
# ✅ Split Features and Target
# Target: the 'price' column. Features: every other encoded column.
y = df_encoded['price']
X = df_encoded.drop(columns='price')


In [None]:
# ✅ Split into Train and Test sets
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# ✅ Train Decision Tree Regressor
# No depth limit is set here; `fit` returns the estimator itself, so
# construction and training are chained.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# ✅ Predict and Evaluate
# Score the held-out rows with mean squared error.
y_pred_dt = dt_model.predict(X_test)
mse_dt = mean_squared_error(y_true=y_test, y_pred=y_pred_dt)
print(f"Decision Tree MSE: {mse_dt}")


In [None]:
# ✅ Visualize Decision Tree
# Draw only the top three levels (max_depth=3) of the fitted tree on a wide
# canvas so the node labels remain readable.
plt.figure(figsize=(20,10))
# Pass the column names as a plain list: some scikit-learn releases validate
# `feature_names` strictly as a list and reject a pandas Index.
plot_tree(dt_model, feature_names=list(X.columns), filled=True, max_depth=3)
plt.show()


In [None]:
# ✅ Decision Tree with max_depth=5
# Same training data as the unrestricted tree, but growth is capped at depth 5.
dt_model_limited = DecisionTreeRegressor(random_state=42, max_depth=5).fit(X_train, y_train)

# ✅ Predict and Evaluate
# Score the held-out rows with mean squared error.
y_pred_limited = dt_model_limited.predict(X_test)
mse_limited = mean_squared_error(y_true=y_test, y_pred=y_pred_limited)
print(f"Decision Tree with max_depth=5 MSE: {mse_limited}")


In [None]:
# ✅ Train Random Forest Regressor
# 100 trees with a fixed seed; `fit` returns the estimator itself, so
# construction and training are chained.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# ✅ Predict and Evaluate
# Score the held-out rows with mean squared error.
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest MSE: {mse_rf}")


In [None]:
# ✅ Feature Importance Visualization
# Label each importance from the fitted forest with its feature column name and
# order them from most to least important. The sort is chained rather than done
# with `inplace=True`, so the Series is built in a single expression and is never
# mutated after creation.
feature_importance = pd.Series(
    rf_model.feature_importances_, index=X.columns
).sort_values(ascending=False)

# Horizontal bars: importance on the x-axis, feature names on the y-axis.
plt.figure(figsize=(10,6))
sns.barplot(x=feature_importance, y=feature_importance.index)
plt.title('Feature Importance from Random Forest')
plt.show()


In [None]:
# ✅ Cross-Validation for Random Forest
# An integer `cv` gives a regressor contiguous, unshuffled folds. The hold-out
# split earlier in this notebook is shuffled, so shuffle here as well (same
# seed) to keep the two evaluations comparable and to avoid biased folds if the
# CSV rows happen to be ordered (NOTE(review): row order not verified here).
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf_model, X, y, cv=cv_splitter, scoring='neg_mean_squared_error')

# The scorer returns negated MSE (higher is better), so flip the sign for reporting.
print(f"Cross-validation MSE scores: {-cv_scores}")
print(f"Average CV MSE: {-cv_scores.mean()}")
