# 1) Data inspection

### 1-Import Libraries and Load Dataset

#### We import the pandas library for data manipulation.
#### We read the CSV file into a DataFrame called df.

In [1]:
import pandas as pd
# Read the raw housing CSV (path is relative to the working directory) into `df`.
df = pd.read_csv(filepath_or_buffer="housing.csv")

### 2-Preview Data
#### df.head() shows the first 5 rows to quickly check the data.
#### df.tail() shows the last 5 rows to ensure the data loaded correctly.
#### df.info() provides column names, data types, and non-null counts.
#### df.shape returns the number of rows and columns.

In [None]:
# Show both ends of the frame to confirm the file parsed as expected.
print("====First 5 Rows====")
print(df.head())


print("====Last 5 Rows====")
print(df.tail())


# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print(df.shape)

### 3-Check Missing Values & Check Duplicated Data
#### df.isna().sum() calculates the number of missing values in each column.
#### Helps us identify which columns need cleaning or imputation.
#### df.duplicated().sum() counts duplicate rows in the dataset.
#### Duplicate data can affect analysis and should be handled.

In [None]:
# Per-column count of missing (NaN) cells.
missing_per_column = df.isna().sum()
print("Missing Values:\n")
print(missing_per_column, "\n")

# Number of rows flagged by DataFrame.duplicated().
duplicate_rows = df.duplicated().sum()
print("Duplicated Data:", duplicate_rows)


### 4-Explore Categorical Columns
#### Selects columns with object type (categorical data).
#### value_counts().head(10) shows the 10 most frequent values for each categorical column.

In [None]:
# Keep only the object-dtype columns; these are treated as categorical here.
object_cols = df.select_dtypes(include=["object"])
for col in object_cols.columns:
    # Column name first, then its ten most frequent values.
    print(col)
    top_values = df[col].value_counts().head(10)
    print(top_values)

### 5-Unique Values per Column
#### nunique() returns the number of unique values in each column.
#### Helps understand diversity of values and detect potential categorical variables.

In [None]:
# Separator line printed after each column's report.
rule = "-" * 40
for col in df.columns:
    print(f"Columns:{col}")
    distinct_count = df[col].nunique()
    print("Unique:", distinct_count)
    print(rule)

### 6-Descriptive Statistics for Numeric Columns
#### df.describe() provides count, mean, std, min, max, and quartiles for numeric columns.
#### Useful for understanding the distribution of numeric data.

In [None]:
# Restrict to the float64/int64 columns before summarising.
numeric_col = df.select_dtypes(include=["float64", "int64"])
numeric_summary = numeric_col.describe()
print(numeric_summary)

# 2) Data Cleaning

### 1-Import Libraries and Load Dataset
#### Import libraries for data analysis and visualization.
#### Load the housing dataset into a DataFrame.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Re-read the raw CSV so this section works on a freshly loaded frame.
df = pd.read_csv(filepath_or_buffer="housing.csv")

### 2-Initial Data Exploration
#### Check missing values, duplicates, and basic statistics.

In [None]:
# Numeric columns that the outlier counts and boxplots below iterate over.
columns_to_plot = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]

print("==== DATA BEFORE CLEANING ====")

# Missing cells per column.
nulls_before = df.isna().sum()
print("\nNull values per column:")
print(nulls_before)

# Rows flagged by DataFrame.duplicated().
duplicates_before = df.duplicated().sum()
print("\nNumber of duplicate rows:", duplicates_before)

# describe() with default arguments on the whole frame.
summary_before = df.describe()
print("\nNumeric summary:")
print(summary_before)

### 3-Outlier Detection Function
#### Define a function to count outliers using IQR.
#### Print outlier counts before cleaning.

In [None]:
def detect_outliers_iqr(series):
    """Count the values of ``series`` lying outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of ``series``. Values exactly on a
    fence are not counted.

    Args:
        series: A pandas Series of numeric values.

    Returns:
        The number of out-of-fence values, as a plain ``int``.
    """
    first_q, third_q = series.quantile([0.25, 0.75])
    spread = third_q - first_q
    below_fence = series < first_q - 1.5 * spread
    above_fence = series > third_q + 1.5 * spread
    # Summing a boolean mask counts its True entries.
    return int((below_fence | above_fence).sum())

print("===== OUTLIERS BEFORE CLEANING =====")
# Separator printed after each column's line.
divider = "-" * 40
for col in columns_to_plot:
    # IQR-based outlier count for this column of the uncleaned frame.
    count = detect_outliers_iqr(df[col])
    print(f"{col}: {count} outliers")
    print(divider)

### 4-Visualize Outliers Before Cleaning
#### Boxplots show distribution and outliers visually.

In [None]:

# 3x3 grid of axes: one boxplot per entry of columns_to_plot.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))
for ax, col in zip(axes.ravel(), columns_to_plot):
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'{col} (Before Cleaning)')

# Extra padding between panels.
plt.tight_layout(pad=5.0)
plt.show()


### 5- Clean Data
#### Fill missing total_bedrooms with mean.
#### Clip numeric columns to the 1.5×IQR fences, so extreme outliers are capped rather than dropped.
#### Display stats after cleaning.

In [None]:
# Impute missing total_bedrooms with the column mean.
bedrooms_mean = df['total_bedrooms'].mean()
df['total_bedrooms'] = df['total_bedrooms'].fillna(bedrooms_mean)

# Columns whose values get clipped to their IQR fences.
num_cols_clean = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]

for col in num_cols_clean:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr

    # Values beyond a fence are replaced by that fence; no rows are dropped.
    df[col] = df[col].clip(lower=lower, upper=upper)

print("\n===== DATA AFTER CLEANING =====")

nulls_after = df.isna().sum()
print("\nNull values per column:")
print(nulls_after)

duplicates_after = df.duplicated().sum()
print("\nNumber of duplicate rows:", duplicates_after)

summary_after = df[num_cols_clean].describe()
print("\nNumeric summary (cleaned columns):")
print(summary_after)

### 6-Visualize After Cleaning & Save Dataset
#### Visualize cleaned data with boxplots.
#### Print remaining outlier counts.
#### Save cleaned DataFrame to CSV.

In [None]:
# Same 3x3 boxplot grid as before cleaning, now over the clipped columns.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))
for ax, col in zip(axes.ravel(), num_cols_clean):
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'{col} (After Cleaning)')
plt.tight_layout(pad=5.0)
plt.show()


# Re-run the IQR outlier count on the cleaned columns.
print("\n===== FINAL OUTLIERS COUNT =====")
for col in num_cols_clean:
    outliers = detect_outliers_iqr(df[col])
    print(f"{col}: {outliers} outliers")


# Write the cleaned frame without the index column; later sections read this file.
df.to_csv(path_or_buf='cleaned_housing.csv', index=False)
print("\n===== CLEANED DATA SAVED SUCCESSFULLY =====")



# 3) Visualization

### 1-Import Libraries and Load Dataset
#### Loading the cleaned dataset ensures we work with ready-to-use data.
#### Knowing the target column (median_house_value) is crucial for any predictive model.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the cleaned dataset written by the cleaning section.
df = pd.read_csv(filepath_or_buffer='cleaned_housing.csv')

n_rows, n_columns = df.shape
print("Number of rows:", n_rows)
print("Number of columns:", n_columns)
print("Target column:", "median_house_value")

### 2-Distribution of Target Variable
#### Visualizing the target helps understand its distribution.
#### Important for choosing model type (e.g., regression) and detecting skewness that may require transformation.

In [None]:
# 50-bin histogram of the target column.
target_values = df['median_house_value']
plt.hist(target_values, bins=50, color='purple')
plt.xlabel("Median House Value")
plt.ylabel("Frequency")
plt.title("Histogram of Median House Value")
plt.show()


### 3-Correlation Heatmap
#### Shows relationships between features and the target.
#### Helps identify strong predictors for the model and potential multicollinearity.

In [None]:
# Pairwise correlations are computed on the int64/float64 columns only.
numeric_features = df.select_dtypes(include=["int64", "float64"])
corr_matrix = numeric_features.corr()

plt.figure(figsize=(10, 8))
# annot=True writes each coefficient inside its cell.
sns.heatmap(data=corr_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()


### 4-Median Income vs Target
#### Directly visualizes how median_income influences the target.
#### Strong correlation here indicates median_income is a key predictor for housing value.

In [None]:
# Semi-transparent points so dense regions remain readable.
income = df['median_income']
house_value = df['median_house_value']
plt.scatter(income, house_value, alpha=0.5, color="#F0749B")
plt.xlabel("Median Income")
plt.ylabel("Median House Value")
plt.title("Median Income vs Median House Value")
plt.show()

### 5-Categorical Feature vs Target
#### Shows how categorical features like ocean_proximity affect the target.
#### Useful for feature encoding and understanding impact on predictions.

In [None]:
# One box of house values per ocean_proximity category.
sns.boxplot(data=df, x='ocean_proximity', y='median_house_value', color="teal")
plt.xlabel("Ocean Proximity")
plt.ylabel("Median House Value")
plt.title("Ocean Proximity vs Median House Value")
plt.show()


### 6-Geographical Visualization
#### Visualizes how location (longitude & latitude) influences house value.
#### Useful for models to capture spatial patterns or for feature engineering (e.g., clustering by location).

In [None]:
plt.figure(figsize=(10, 8))
# Each point is one row, placed by longitude/latitude and coloured by house value.
plt.scatter(
    df['longitude'],
    df['latitude'],
    c=df['median_house_value'],
    cmap='viridis',
    alpha=0.5,
)
plt.colorbar(label='Median House Value')
plt.title("Location vs Median House Value")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()

# 4) Prediction


### 1-Import Libraries
#### This cell imports all necessary libraries for data handling, preprocessing, modeling, and evaluation.
#### It includes pandas, numpy, scikit-learn modules, matplotlib, and XGBoost.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

### 2-Load Dataset 
#### -Load your housing dataset.
#### -y is the target (what we want to predict).
#### -X contains all features.
#### -Separate features into numeric and categorical for preprocessing.


In [None]:
df = pd.read_csv(filepath_or_buffer="cleaned_housing.csv")

# Target column the models are trained to predict.
y = df["median_house_value"]

# Feature columns, grouped by the preprocessing each group receives.
numeric_features = [
    "median_income",
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
]
categorical_features = ["ocean_proximity"]

# Feature matrix: numeric columns first, then the categorical one.
X = df[[*numeric_features, *categorical_features]]

print(X.shape)
print(X.head())


### 3-Preprocessing
#### StandardScaler → standardizes numeric features to zero mean and unit variance.
#### OneHotEncoder → converts categorical features into binary indicator columns.
#### ColumnTransformer → applies each transformation to its matching columns.

In [None]:
# Numeric branch: a single scaling step.
numeric_steps = [("scaler", StandardScaler())]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: one-hot encoding; handle_unknown="ignore" keeps
# transform() from raising on categories that were not seen during fit.
categorical_steps = [("onehot", OneHotEncoder(handle_unknown="ignore"))]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column list to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

### 4-Split Data
#### Split data into Train (60%), Validation (20%), and Test (20%) sets.
#### Validation is used for tuning models, Test is for final evaluation.

In [None]:
# First cut: 60% for training, 40% held out.
first_cut = train_test_split(X, y, test_size=0.4, random_state=42)
X_train, X_temp, y_train, y_temp = first_cut

# Second cut: halve the held-out part into validation and test (20% each overall).
second_cut = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_cv, X_test, y_cv, y_test = second_cut

### 5-Model Evaluation Function
#### Fits a model and calculates RMSE, MAE, R² for train, validation, and test sets.
#### Returns a dictionary of metrics for easy comparison.

In [None]:
def evaluate_model(model, X_train, y_train, X_val, y_cv, X_test,y_test ):
    """Fit ``model`` on the training split and score it on all three splits.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Training features and targets; the model is fitted
            on these.
        X_val, y_cv: Validation features and their targets.
        X_test, y_test: Test features and their targets.

    Returns:
        dict keyed by ``"train"``, ``"val"`` and ``"test"``; each value is a
        dict with the keys ``"rmse"``, ``"mae"`` and ``"r2"``.
    """
    model.fit(X_train, y_train)

    # Pair each split's features with its own targets explicitly. The previous
    # eval(f"y_{k}") lookup needed a variable called y_val, but the validation
    # targets arrive here as y_cv, so scoring "val" either raised NameError or
    # picked up an unrelated global of that name.
    splits = {
        "train": (X_train, y_train),
        "val": (X_val, y_cv),
        "test": (X_test, y_test),
    }

    metrics = {}
    for split_name, (features, targets) in splits.items():
        predictions = model.predict(features)
        metrics[split_name] = {
            "rmse": np.sqrt(mean_squared_error(targets, predictions)),
            "mae": mean_absolute_error(targets, predictions),
            "r2": r2_score(targets, predictions),
        }

    return metrics

## 6-Hyperparameter Tuning Function
#### Function for tuning a hyperparameter for any model.
#### Plots train vs validation RMSE to visualize bias-variance tradeoff.
#### Returns the best model with its metrics.

In [None]:

def tune_model(model_name, model_class, param_name, param_values, extra_params=None):
    """Sweep one hyperparameter and return the pipeline with the lowest validation RMSE.

    For every candidate in ``param_values`` a pipeline made of the module-level
    ``preprocessor`` and ``model_class(...)`` is fitted on the module-level
    ``X_train``/``y_train`` and scored on ``X_cv``/``y_cv``. Train and
    validation RMSE are plotted against the candidate values, the best
    candidate is reported, and that fitted pipeline is scored on the train,
    validation and test splits.

    Args:
        model_name: Name of the estimator step inside the pipeline; also used
            in the plot title.
        model_class: Estimator class, instantiated with keyword arguments.
        param_name: Name of the keyword argument being swept.
        param_values: Indexable, non-empty sequence of candidate values.
        extra_params: Optional dict of fixed keyword arguments passed to every
            candidate. On a key clash it overrides the swept value.

    Returns:
        ``(best_model, train_metrics, val_metrics, test_metrics)`` where each
        metrics entry is a ``(rmse, mae, r2)`` tuple.
    """

    def score(model, X, y):
        # (rmse, mae, r2) of `model` on one split.
        pred = model.predict(X)
        rmse = np.sqrt(mean_squared_error(y, pred))
        mae = mean_absolute_error(y, pred)
        r2 = r2_score(y, pred)
        return rmse, mae, r2

    fitted_models, train_rmse, val_rmse = [], [], []

    for val in param_values:
        params = {param_name: val}
        if extra_params:
            params.update(extra_params)

        model = Pipeline(steps=[
            ("preprocessor", preprocessor),
            (model_name, model_class(**params))
        ])
        model.fit(X_train, y_train)

        # Keep every fitted candidate so the winner can be returned as-is.
        fitted_models.append(model)
        train_rmse.append(np.sqrt(mean_squared_error(y_train, model.predict(X_train))))
        val_rmse.append(np.sqrt(mean_squared_error(y_cv, model.predict(X_cv))))

    plt.figure(figsize=(8,5))
    plt.plot(param_values, train_rmse, marker='o', label="Train RMSE")
    plt.plot(param_values, val_rmse, marker='o', label="Validation RMSE")
    # Only the parameter named "alpha" gets a logarithmic x-axis.
    plt.xscale("log" if param_name=="alpha" else "linear")
    plt.xlabel(param_name)
    plt.ylabel("RMSE")
    plt.title(f"Bias-Variance Tradeoff ({model_name})")
    plt.legend()
    plt.grid(True)
    plt.show()

    best_idx = np.argmin(val_rmse)
    best_param = param_values[best_idx]

    print(f"Best {param_name}: {best_param}")
    print(f"Train RMSE: {train_rmse[best_idx]:.2f}")
    print(f"Validation RMSE: {val_rmse[best_idx]:.2f}")

    # Return the candidate that was actually selected. Rebuilding the model
    # from {param_name: best_param} alone dropped extra_params, so the final
    # model and its reported metrics belonged to a different configuration
    # from the one whose validation RMSE won the sweep.
    best_model = fitted_models[best_idx]

    train_metrics = score(best_model, X_train, y_train)
    val_metrics = score(best_model, X_cv, y_cv)
    test_metrics = score(best_model, X_test, y_test)

    print("\nFinal Evaluation:")
    print(f"Train -> RMSE: {train_metrics[0]:.2f}, MAE: {train_metrics[1]:.2f}, R²: {train_metrics[2]:.4f}")
    print(f"Val   -> RMSE: {val_metrics[0]:.2f}, MAE: {val_metrics[1]:.2f}, R²: {val_metrics[2]:.4f}")
    print(f"Test  -> RMSE: {test_metrics[0]:.2f}, MAE: {test_metrics[1]:.2f}, R²: {test_metrics[2]:.4f}")

    print("-"*50)

    return best_model, train_metrics, val_metrics, test_metrics

### 7- Train & Tune Models
#### Tune 4 different models: Ridge, Decision Tree, Random Forest, XGBoost.
#### Finds best hyperparameters and evaluates metrics.

In [None]:
# Ridge regression: sweep the regularisation strength alpha from 0.001 to 100.
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
best_model_ridge, ridge_train, ridge_val, ridge_test = tune_model(
    "ridge", Ridge, param_name="alpha", param_values=alphas
)

# Decision tree: sweep the maximum depth.
depths = [2, 4, 6, 8, 10]
best_model_tree, tree_train, tree_val, tree_test = tune_model(
    "tree", DecisionTreeRegressor, param_name="max_depth", param_values=depths
)

# Random forest: sweep the maximum depth with a fixed seed, using all cores.
depths = [5, 10, 15, 20, 25]
rf_fixed_params = {"random_state": 42, "n_jobs": -1}
best_model_rf, rf_train, rf_val, rf_test = tune_model(
    "rf",
    RandomForestRegressor,
    param_name="max_depth",
    param_values=depths,
    extra_params=rf_fixed_params,
)

# XGBoost: sweep the maximum depth; every other setting is held fixed.
# Despite the `_rmse` suffix, the three xgb_* results are (rmse, mae, r2)
# tuples, exactly like the other models' results.
xgb_depths = [3, 5, 7, 9]
xgb_fixed_params = {
    "n_estimators": 6000,
    "learning_rate": 0.01,
    "min_child_weight": 6,
    "subsample": 0.7,
    "colsample_bytree": 0.85,
    "reg_alpha": 0.5,
    "reg_lambda": 3,
    "objective": "reg:squarederror",
    "random_state": 42,
    "n_jobs": -1,
}
best_model_xgb, xgb_train_rmse, xgb_val_rmse, xgb_test_rmse = tune_model(
    "xgb",
    XGBRegressor,
    param_name="max_depth",
    param_values=xgb_depths,
    extra_params=xgb_fixed_params,
)

## 8-Compare Models
#### Summarize all metrics in a dataframe for easy comparison.

In [None]:
# Each tuning result is an (rmse, mae, r2) tuple. Transposing the four
# per-model tuples of a split yields one list per metric, in model order:
# Ridge, Decision Tree, Random Forest, XGBoost.
train_by_metric = [list(vals) for vals in zip(ridge_train, tree_train, rf_train, xgb_train_rmse)]
val_by_metric = [list(vals) for vals in zip(ridge_val, tree_val, rf_val, xgb_val_rmse)]
test_by_metric = [list(vals) for vals in zip(ridge_test, tree_test, rf_test, xgb_test_rmse)]

results = pd.DataFrame({
    "Model": ["Ridge", "Decision Tree", "Random Forest", "XGBoost"],

    "Train RMSE": train_by_metric[0],
    "Val RMSE": val_by_metric[0],
    "Test RMSE": test_by_metric[0],

    "Train MAE": train_by_metric[1],
    "Val MAE": val_by_metric[1],
    "Test MAE": test_by_metric[1],

    "Train R2": train_by_metric[2],
    "Val R2": val_by_metric[2],
    "Test R2": test_by_metric[2],
})

print(results)

### 9-Visualize Model Performance
#### Compare Train, Validation, Test RMSE visually.
#### Compare Train, Validation, Test MAE visually.
#### Compare Train, Validation, Test R² visually.

In [None]:
# Display names, in the same row order as the `results` frame.
models = ["Ridge", "Decision Tree", "Random Forest", "XGBoost"]

# Pull each metric column out of `results` as an array (one entry per model).
train_rmse_list = results["Train RMSE"].values
val_rmse_list   = results["Val RMSE"].values
test_rmse_list  = results["Test RMSE"].values

train_mae_list = results["Train MAE"].values
val_mae_list   = results["Val MAE"].values
test_mae_list  = results["Test MAE"].values

train_r2_list = results["Train R2"].values
val_r2_list   = results["Val R2"].values
test_r2_list  = results["Test R2"].values

# One group of three bars (train / validation / test) per model.
x = np.arange(len(models))
width = 0.25

# One grouped bar chart per metric:
# (metric label, (train, val, test) values, (train, val, test) colours, y-limits or None).
# The metric label is spelled "R²"; it was previously rendered as mojibake.
charts = [
    ("RMSE", (train_rmse_list, val_rmse_list, test_rmse_list),
     ("#7c35ae", "#9b59b6", "#c37bdf"), None),
    ("MAE", (train_mae_list, val_mae_list, test_mae_list),
     ("#27ae60", "#2ecc71", "#58d68d"), None),
    ("R²", (train_r2_list, val_r2_list, test_r2_list),
     ("#37abb1", "#52fbea", "#8dfae4"), (0, 1)),
]

for metric, split_values, split_colors, y_limits in charts:
    plt.figure(figsize=(10,6))

    # Train bars sit left of the tick, validation on it, test to the right.
    bar_specs = zip((-width, 0, width), ("Train", "Validation", "Test"),
                    split_values, split_colors)
    for offset, split, values, color in bar_specs:
        plt.bar(x + offset, values, width, label=f"{split} {metric}", color=color)

    plt.xlabel('Models')
    plt.ylabel(metric)
    plt.title(f'Comparison of Models Performance ({metric})')
    plt.xticks(x, models)
    if y_limits is not None:
        # Only the R² chart pins its y-axis (to the 0-1 range).
        plt.ylim(*y_limits)
    plt.legend()
    plt.grid(axis='y')
    plt.show()

## 10- Scatter Plot 
#### To compare the actual house prices (y_test) with the predicted prices from each of your 4 models.
#### This helps you see if the predictions are close to reality.

In [None]:
# Fitted pipelines, in the same order as the `models` display names.
model_objs = [best_model_ridge, best_model_tree, best_model_rf, best_model_xgb]

# End points of the y = x reference line (a perfect prediction lies on it).
actual_min, actual_max = y_test.min(), y_test.max()

plt.figure(figsize=(12, 10))
for i, model in enumerate(model_objs):
    y_pred = model.predict(X_test)

    # 2x2 grid; subplot positions are 1-based.
    plt.subplot(2, 2, i + 1)
    plt.scatter(y_test, y_pred, alpha=0.5, color='#6a0dad')
    plt.plot([actual_min, actual_max], [actual_min, actual_max], 'r--')
    plt.title(f"{models[i]}: Actual vs Predicted")
    plt.xlabel("Actual Price")
    plt.ylabel("Predicted Price")
    plt.grid(True)

plt.tight_layout()
plt.show()

### 11- Select Best Model
#### Pick the model with lowest validation RMSE.


In [None]:

# Validation RMSE per model, in the same order as `models`.
all_val_rmse = val_rmse_list

# Position of the lowest validation RMSE; np.argmin returns a NumPy integer,
# so it is cast to a plain int before indexing the `models` list.
best_idx = int(np.argmin(all_val_rmse))
best_model_name = models[best_idx]
best_val_rmse_value = all_val_rmse[best_idx]

# Map each display name to its fitted pipeline.
model_dict = {
    "Ridge": best_model_ridge,
    "Decision Tree": best_model_tree,
    "Random Forest": best_model_rf,
    "XGBoost": best_model_xgb
}

best_model = model_dict[best_model_name]

# The emoji in this message were previously stored as mojibake.
print("\n" + "*"*95)
print(f"🎉✨ Congratulations! The best model is '{best_model_name}' with a Validation RMSE of {best_val_rmse_value:.2f}! ✨🎉")
print("*"*95 + "\n")

## 12-Interactive House Price Prediction
### Allow the user to input house details and predict the price using the selected best model (lowest validation RMSE).

In [None]:
import pandas as pd

# Greeting banner. The emoji and the apostrophe were previously stored as
# mojibake and are restored here.
print("\n✨ Hey there! ✨")
print("Let’s try something fun 🚀")
print("Enter your own house details and let the model predict the price for you 🏡💰\n")

def safe_float(prompt):
    """Prompt on stdin until the reply parses as a float, then return it.

    Args:
        prompt: Text shown to the user before each attempt.

    Returns:
        The first reply that ``float()`` accepts, as a float.
    """
    while True:
        try:
            return float(input(prompt))
        except ValueError:
            # Not a number: report it and ask again. The cross mark in this
            # message was previously stored as mojibake.
            print("❌ Please enter a valid number!")

# Ocean proximity choices: menu number -> category label passed to the model.
ocean_menu = {
    1: "<1H OCEAN",
    2: "INLAND",
    3: "NEAR BAY",
    4: "ISLAND",
    5: "NEAR OCEAN"
}

print("\nChoose Ocean Proximity:")
for k, v in ocean_menu.items():
    print(f"{k} - {v}")

# Keep asking until the reply is an integer that is a key of the menu.
# The cross marks in the two error messages were previously stored as mojibake.
while True:
    try:
        choice = int(input("Enter choice (1-5): "))
        if choice in ocean_menu:
            ocean_input = ocean_menu[choice]
            break
        else:
            print("❌ Choose a number between 1 and 5")
    except ValueError:
        print("❌ Enter numbers only!")

# Collect user input: every numeric field goes through safe_float, which
# re-prompts until the reply parses as a number.
user_data = {
    "longitude": safe_float("Longitude: "),
    "latitude": safe_float("Latitude: "),
    "housing_median_age": safe_float("Housing Median Age: "),
    "total_rooms": safe_float("Total Rooms: "),
    "total_bedrooms": safe_float("Total Bedrooms: "),
    "population": safe_float("Population: "),
    "households": safe_float("Households: "),
    "median_income": safe_float("Median Income: "),
    "ocean_proximity": ocean_input
}

# Convert to a one-row DataFrame whose column names match the training features.
user_df = pd.DataFrame([user_data])

# --- Use the full pipeline with best_model ---
# best_model = pipeline with preprocessor + best selected model
predicted_price = best_model.predict(user_df)[0]

# The emoji in the output strings below were previously stored as mojibake.
print("\n🏠 Predicted House Price:")
print(f"💵 ${predicted_price:,.2f}")

# Optional: compare to the actual price if known. This goes through safe_float
# like every other numeric prompt, so a non-numeric reply re-prompts instead of
# aborting with ValueError after the prediction has already been made.
user_real_price = safe_float("\nIf you know the real price, enter it (or 0 to skip): ")
if user_real_price > 0:
    error = abs(user_real_price - predicted_price)
    # Relative closeness; negative when the error exceeds the real price.
    accuracy = 1 - (error / user_real_price)
    print(f"\n Absolute Error: ${error:,.2f}")
    print(f"Accuracy: {accuracy*100:.2f}%")
    print("\n🎉 Prediction completed successfully! ")
else:
    print("\n🎉 Prediction completed successfully! Try changing values to see new results 🚀")
