In [73]:
import pandas as pd
from sklearn import set_config
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

In [74]:
# Ask scikit-learn transformers to return pandas DataFrames instead of numpy arrays
set_config(transform_output="pandas")

It's important that we set the line above

`set_config(transform_output="pandas")`

Because it tells scikit-learn to keep our data as pandas DataFrames instead of converting them to plain numpy arrays.

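By default, sklearn transformers (like `StandardScaler`) return numpy arrays that lose important information. (`LabelEncoder` is a special case: it encodes a single column of labels and always returns a numpy array, regardless of this setting.) For example: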

```python
# Without the config
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
print(type(X_scaled))        # <class 'numpy.ndarray'>
print(X_scaled.columns)      # ERROR! No columns attribute
```

#### What you lose:

- Column names disappear
- Index information is gone
- You just get numbers in arrays

```python
# With the config

set_config(transform_output="pandas")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
print(type(X_scaled))        # <class 'pandas.core.frame.DataFrame'>
print(X_scaled.columns)      # Works! Shows your column names
```

It makes debugging easier, we can easily retrieve column names, feature importance makes sense, our entire pipeline stays in the familiar pandas format rather than switching between pandas and numpy, and we don't have to learn "sklearn returns numpy arrays" as an extra concept - everything just stays as DataFrames.

## Getting Started

In this tutorial, we use a linear regression model for our prediction because linear regression finds the straight line that best fits through your data points - which is what we want. We assume that each feature contributes (in some way) to determining the price of a house. Hence, the linear regression model is our "go-to" approach.

The linear regression model is designed to learn a pattern for how input variables (like area_sqm, number of bedrooms, house type, location, age, etc.) relate to an outcome (like price), and it uses that pattern to make predictions.

**Here are some best scenarios to use it:**

1. When you expect roughly straight-line relationships between inputs and outputs (like in the case of the housing dataset, where we expect each feature to have a roughly linear relationship - positive or negative - with the target variable, the property price)
2. When you need to understand which factors matter most (it tells you the importance of each variable)
3. When you want a simple, fast model that's easy to interpret and explain
4. For baseline predictions before trying more complex methods

**Examples:** Predicting sales based on advertising spend, estimating delivery times based on distance and traffic, or forecasting energy usage based on temperature and building size.

Linear regression works best when the relationships between features and the target variable are reasonably straight-forward, and you value simplicity and interpretability over perfect accuracy.

Interpretability refers to the ability to understand the reasoning behind a model's decisions.

In [75]:
def load_data(path):
    """Read the housing dataset CSV found at ``path`` and return it as a pandas DataFrame"""

    housing_frame = pd.read_csv(filepath_or_buffer=path)
    return housing_frame

In [76]:
def explore_data(df):
    """Perform initial data exploration

    Prints the dataset shape, the ``DataFrame.info()`` report, each column's
    dtype, the first five rows, a missing-value summary, and basic statistics
    of the ``price`` column (the target variable).

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to explore. It must contain a ``price`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, unchanged.
    """
    print("="*50)
    print("DATASET OVERVIEW")
    print("="*50)
    print(f"Dataset Shape: {df.shape}")
    print(f"Number of properties: {len(df)}")

    print('\nDataset Highlight Information:')
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    df.info()

    print("\nColumn Information:")
    for col in df.columns:
        print(f"- {col}: {df[col].dtype}")

    print("\nFirst 5 rows:")
    print(df.head())

    print("\nMissing Values:")
    missing_values = df.isnull().sum()
    if missing_values.sum() == 0:
        print("No missing values found!")
    else:
        # Only list the columns that actually have gaps
        print(missing_values[missing_values > 0])

    print("\nPrice Statistics (Target Variable):")
    print(f"Min Price: ₦{df['price'].min():,}")
    print(f"Max Price: ₦{df['price'].max():,}")
    print(f"Average Price: ₦{df['price'].mean():,.0f}")
    print(f"Median Price: ₦{df['price'].median():,}")

    return df

In [77]:
def one_hot_encoding_categorical_variables(df):
    """Expand the nominal columns (location and house_type) into one-hot indicator columns

    The distinct values of each nominal column are printed first; the returned
    DataFrame is a new object in which those two columns are replaced by
    ``location_*`` and ``house_type_*`` indicator columns.
    """

    nominal_columns = ['location', 'house_type']

    print("One-hot encoding nominal categorical variables:")
    for name in nominal_columns:
        print(f"- {name}: {df[name].unique()}")

    encoded = pd.get_dummies(df, columns=nominal_columns, prefix=nominal_columns)
    return encoded

In [78]:
def preprocess_data(df):
    """Turn the raw housing DataFrame into an all-numeric frame ready for modelling

    Steps: drop the ``property_id`` identifier, one-hot encode the nominal
    columns, and label encode the binary ``has_garage`` / ``has_pool`` columns
    into ``*_encoded`` columns (the raw binary columns are then removed).

    Returns a tuple ``(df_processed, label_encoders)`` where ``label_encoders``
    maps each binary column name to the LabelEncoder fitted on it.
    """

    banner = "=" * 50
    print("\n" + banner)
    print("DATA PREPROCESSING")
    print(banner)

    # Work on a copy so the caller's DataFrame is never modified;
    # property_id is only an identifier and carries no predictive signal
    df_processed = df.copy().drop(columns='property_id')

    # Nominal categories become one-hot indicator columns
    df_processed = one_hot_encoding_categorical_variables(df_processed)

    binary_columns = ['has_garage', 'has_pool']
    label_encoders = {}

    print("\nLabel encoding binary categorical variables:")
    for column in binary_columns:
        raw_values = df_processed[column].unique()
        print(f"- {column}: {raw_values}")

        # Fit one encoder per column and store the zeros/ones next to the raw values
        encoder = LabelEncoder()
        df_processed[f"{column}_encoded"] = encoder.fit_transform(df_processed[column])

        # Keep the fitted encoder so new data can be encoded the same way later
        label_encoders[column] = encoder

        # Show which raw value maps to which integer code
        mapping = {raw: code for raw, code in zip(raw_values, encoder.transform(raw_values))}
        print(f"  Encoding: {mapping}")

    # The raw binary columns are now redundant
    df_processed = df_processed.drop(columns=binary_columns)

    print(f"\nProcessed dataset shape: {df_processed.shape}")
    print("Final columns:", df_processed.columns.tolist())

    return df_processed, label_encoders

In [79]:
def perform_cross_validation(model, X_train_scaled, y_train):
    """Score the model with 5-fold cross-validation (R²) on the training data and print the results"""

    fold_scores = cross_val_score(estimator=model, X=X_train_scaled, y=y_train, scoring='r2', cv=5)

    # Summarise the folds: the mean, plus twice the standard deviation as the spread
    mean_score = fold_scores.mean()
    spread = fold_scores.std() * 2

    print('\nList of 5 CV Scores')
    print(fold_scores)
    print(f"\n5-Fold Cross-Validation R² Score: {mean_score:.4f} (+/- {spread:.4f})")

When you split your data into train/test just once, you might get lucky (or unlucky) with that particular split. Maybe your test set happened to be really easy or really hard to predict.

With cross validation, you split your training data into equal pieces - in our case, 5 equal pieces, because of the argument `cv=5`. The model is then trained and tested 5 different times (again, 5 because we assigned 5 to the `cv` argument), each time holding out a different piece for testing. This produces 5 different performance scores, one per round. Lastly, the scores are averaged for a more reliable estimate.

**Trains and tests 5 different times:**
- Round 1: Train on pieces 1,2,3,4. Test on piece 5
- Round 2: Train on pieces 1,2,3,5. Test on piece 4
- Round 3: Train on pieces 1,2,4,5. Test on piece 3
- And so on...

And so when we print this:

`print(f"\n5-Fold Cross-Validation R² Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")`

1. `cv_scores.mean()` represents the average R^2 score across all 5 tests
2. `cv_scores.std() * 2` represents how much the 5 scores varied (standard deviation x 2). A small variation value means that the model is stable and reliable. A large variation value means that the model's performance is inconsistent.

Cross validation is like asking 5 different teachers to grade the same student's work instead of just one teacher - you get a fairer, more trustworthy assessment.

**For example**, if your single train/test split gave R² = 0.90, but cross-validation gives 0.75 (+/- 0.20), you know that 0.90 was probably just lucky, and your model's true performance is closer to 0.75.


**R² Score** means R-squared which can also be thought of as, the "Goodness of Fit" Measure. It helps answer the question "How well does my model explain the variation in the data?". Hence, here's how you would interpret the five CV scores:

- **R² = 1.0:** Perfect! The model explains 100% of the variation
- **R² = 0.8:** Good! The model explains 80% of why prices vary
- **R² = 0.5:** Okay. The model explains 50% of the variation
- **R² = 0.0:** The model is no better than guessing the average
- **R² < 0:** The model is actually worse than guessing the average (this is bad!)

In our case the five CV scores were all between 0.9 and 1.0, meaning that our model can explain over 90% of the variation when predicting prices.

In [80]:
def make_predictions(model, X_train_scaled, X_test_scaled, y_train, y_test):
    """Predict on the training and test sets, print R² / RMSE / MAE for both, and return the test predictions"""
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # (label, actual values, predicted values) for each data split
    splits = (
        ("Training", y_train, y_train_pred),
        ("Testing", y_test, y_test_pred),
    )

    # Calculate every metric up front so the report is printed in one go
    r2_lines = [
        f"{label} R² Score: {r2_score(actual, predicted):.4f}"
        for label, actual, predicted in splits
    ]
    rmse_lines = [
        f"{label} RMSE: ₦{np.sqrt(mean_squared_error(actual, predicted)):,.0f}"
        for label, actual, predicted in splits
    ]
    mae_lines = [
        f"{label} MAE: ₦{mean_absolute_error(actual, predicted):,.0f}"
        for label, actual, predicted in splits
    ]

    banner = "=" * 30
    print("\n" + banner)
    print("MODEL PERFORMANCE")
    print(banner)
    for line in r2_lines + rmse_lines + mae_lines:
        print(line)

    return y_test_pred

### The 2 `model.predict()` Prediction Lines

```python
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)
```

#### What they do:

- These lines ask our trained model to make predictions
- First line: "What prices do you predict for the training data?"
- Second line: "What prices do you predict for the test data?"

#### Why we predict on both:

1.  Training predictions: To see how well the model learned from the data it was taught with
2.  Test predictions: To see how well it performs on completely new, unseen data (the real test!)

Think of it like a student taking practice tests (training) vs. the final exam (testing).


### The Metric Calculation Lines

```python
train_r2 = r2_score(y_train, y_train_pred)                       # Line 1
test_r2 = r2_score(y_test, y_test_pred)                          # Line 2
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))  # Line 3
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))     # Line 4
train_mae = mean_absolute_error(y_train, y_train_pred)           # Line 5
test_mae = mean_absolute_error(y_test, y_test_pred)              # Line 6
```

#### What these metrics mean:

1. **R² Score (R-squared):**

- Tells you "How much of the price variation can my model explain?"
- Scale: usually 0 to 1 (higher is better); it can drop below 0 when the model does worse than always guessing the average
- Example: 0.85 means "I can explain 85% of why prices vary"

2. **RMSE (Root Mean Square Error):**

- Average prediction error in the same units as your target (₦ for prices)
- Lower is better
- Example: ₦50,000 RMSE means "On average, I'm off by about ₦50,000"

3. **MAE (Mean Absolute Error):**

- Average absolute difference between predicted and actual prices
- Also in same units, lower is better
- Example: ₦40,000 MAE means "Typically, I'm ₦40,000 away from the true price"


#### Why we calculate these metrics:

1. To know if our model is any good - Without metrics, we're flying blind
2. To compare different models - Which one performs better?
3. To detect overfitting - If training metrics are much better than test metrics, the model memorized rather than learned
4. To communicate results - "Our model explains 85% of the variation in house prices"

**The key insight:** We always compare training vs. testing metrics. If they're similar, great! If training is much better than testing, our model might be "cheating" by memorizing the training data.

In [81]:
def train_linear_regression(df_processed):
    """Fit a linear regression on the processed housing data and report how well it performs

    The frame is split 80/20 into train and test sets, the features are
    standardised with a scaler fitted on the training split only, and the
    fitted model is evaluated on both splits and with cross-validation.

    Returns ``(model, scaler, X_test, y_test, y_test_pred, feature_importance)``.
    """

    wide_banner = "=" * 50
    print("\n" + wide_banner)
    print("LINEAR REGRESSION TRAINING")
    print(wide_banner)

    # price is the target variable (what we want to predict); every other column is a feature
    target_variable = df_processed['price']
    features = df_processed.drop(columns='price')

    print("Features used for prediction:")
    for position, feature in enumerate(features.columns, start=1):
        print(f"{position}. {feature}")

    # Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        features, target_variable, test_size=0.2, random_state=42
    )

    print(f"\nTraining set size: {len(X_train)} samples")
    print(f"Testing set size: {len(X_test)} samples")

    # Learn the scaling parameters from the training data only, then apply
    # those same parameters to the test data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # fit() is where the learning happens: it finds the coefficient (weight) of
    # each feature and the intercept of the best-fitting line
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    # Evaluate on both splits, then get a second opinion from cross-validation
    y_test_pred = make_predictions(model, X_train_scaled, X_test_scaled, y_train, y_test)
    perform_cross_validation(model, X_train_scaled, y_train)

    # Feature importance: rank the features by the magnitude of their coefficient
    narrow_banner = "=" * 30
    print("\n" + narrow_banner)
    print("FEATURE IMPORTANCE")
    print(narrow_banner)

    coefficients = model.coef_
    importance_table = pd.DataFrame({
        'Feature': features.columns,
        'Coefficient': coefficients,
        'Abs_Coefficient': np.abs(coefficients)
    })
    feature_importance = importance_table.sort_values('Abs_Coefficient', ascending=False)

    print(feature_importance[['Feature', 'Coefficient']])

    return model, scaler, X_test, y_test, y_test_pred, feature_importance

### train_test_split() Function Parameters

1. The first argument in the train_test_split() function above takes in our feature matrix (all input variables, e.g., area_sqm, bedrooms, bathrooms, has_garage_encoded, etc.)

2. The second argument in the train_test_split() function above takes in our target variable (the prices we want to predict)
 
3. test_size=0.2: Means 20% of data goes to testing, 80% to training

4. random_state=42: Sets a seed for reproducible results (you'll get the same split every time)

### train_test_split() Function Return

The function returns 4 arrays in this specific order:

1. X_train: Training features (80% of our feature data). For example, if we have 40 houses in our dataset, the X_train variable will hold training features for 32 houses

2. X_test: Testing features (20% of our feature data). For example, if we have 40 houses in our dataset, the X_test variable will hold testing features for 8 houses

3. y_train: Training targets (80% of our price data). For example, if we have 40 houses in our dataset, the y_train variable will hold prices for those same 32 houses in X_train. It is a 1D array of the actual outcomes.

4. y_test: Testing targets (20% of our price data). For example, if we have 40 houses in our dataset, the y_test variable will hold prices for those same 8 houses in X_test

### Why this split is important?
This split is important to:
1. **Train the model** using `X_train` and `y_train`
2. **Test the model** by predicting `X_test` and comparing with `y_test`
3. **Evaluate performance** - if the model predicts well on unseen data (`X_test`), it's likely to work well on completely new houses

### The matching relationship:

The split ensures that:

1. Row i of X_train corresponds to entry i of y_train (same house)
2. Row i of X_test corresponds to entry i of y_test (same house)
3. No house appears in both training and testing sets

This way, you're truly testing on "unseen" data, which gives you a realistic estimate of how your model will perform when predicting prices for new houses not in your original dataset.


### What happens when `model.fit(X_train_scaled, y_train)` runs?

The algorithm uses mathematical optimization to find the line that minimizes the difference between predicted values and actual values across all our training data.

After it runs, our model now "knows" the relationship between inputs and outputs. It has learned coefficients like "for every 1-unit increase in feature A, the prediction increases by 0.3" and so on.

Think of fit() as the "studying" phase - the model examines all our training examples to learn the patterns, so it can make predictions on new, unseen data later.

In [82]:
def predict_new_house(model, scaler, label_encoders, house_features):
    """Predict price for a new house

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``. When it also exposes ``feature_names_in_``,
        those names define the exact columns (and their order) fed to it.
    scaler : fitted transformer
        Must expose ``transform``; applied to the aligned feature row.
    label_encoders : dict
        Maps 'has_garage' / 'has_pool' to the encoders fitted during preprocessing.
    house_features : dict
        Raw feature values of the house. Must contain 'location' and 'house_type'.

    Returns
    -------
    The predicted price, or None when a binary feature holds a value its
    encoder has never seen.
    """

    print("\n" + "="*50)
    print("PREDICTING NEW HOUSE PRICE")
    print("="*50)

    # Create a one-row DataFrame with the new house features
    new_house_df = pd.DataFrame([house_features])

    # One-hot encode location and house_type to match training data structure
    new_house_df = pd.get_dummies(new_house_df, columns=['location', 'house_type'], prefix=['location', 'house_type'])

    # Encode binary categorical variables using the same encoders as in training
    for col in ('has_garage', 'has_pool'):
        if col not in new_house_df.columns:
            continue
        try:
            encoded_value = label_encoders[col].transform([new_house_df[col].iloc[0]])[0]
        except ValueError as e:
            print(f"Error encoding {col}: {e}")
            print(f"Available values for {col}: {label_encoders[col].classes_}")
            return None
        new_house_df[col + '_encoded'] = encoded_value
        new_house_df = new_house_df.drop(col, axis=1)

    # Get the feature columns from the training data (need to match exactly)
    expected_columns = getattr(model, 'feature_names_in_', None)

    if expected_columns is None:
        # This is a limitation - in practice, you'd want to save the column names from training
        print("Warning: Cannot determine exact feature names from model. Now using approximate matching.")
        expected_columns = [
            'area_sqm', 'bedrooms', 'bathrooms', 'age_years', 'location_Abuja-Central',
            'location_Ibadan', 'location_Lagos-Island', 'location_Lagos-Mainland', 'location_Port Harcourt',
            'house_type_Apartment', 'house_type_Bungalow', 'house_type_Duplex', 'house_type_Mansion',
            'has_garage_encoded', 'has_pool_encoded'
        ]
    expected_columns = list(expected_columns)

    # A category never seen in training (e.g. a new location) yields a dummy column the
    # model does not know; it is dropped below, so tell the user instead of silently ignoring it
    unexpected_columns = [col for col in new_house_df.columns if col not in expected_columns]
    if unexpected_columns:
        print(f"Warning: ignoring features not seen during training: {unexpected_columns}")

    # Align to the training layout: missing columns are added as 0, extras are dropped,
    # and the column order matches what the scaler and model were fitted on
    new_house_df = new_house_df.reindex(columns=expected_columns, fill_value=0)

    # Scale the features
    new_house_scaled = scaler.transform(new_house_df)

    # Make prediction
    predicted_price = model.predict(new_house_scaled)[0]

    print("House Features:")
    for key, value in house_features.items():
        print(f"- {key}: {value}")

    print(f"\nPredicted Price: ₦{predicted_price:,.0f}")

    return predicted_price

In [83]:
def main(data_path='../../data/housing_data.csv'):
    """Main function to run the complete pipeline

    Loads the housing dataset, preprocesses it, trains and evaluates the
    linear regression model, and finally predicts the price of one example house.

    Parameters
    ----------
    data_path : str, optional
        Location of the housing CSV file. Defaults to the path used by this
        tutorial, so calling ``main()`` with no arguments behaves as before.
    """

    # Load dataset into a dataframe
    housing_df = load_data(data_path)

    print()
    print("🏠 HOUSING PRICE PREDICTION USING LINEAR REGRESSION")
    print('\n')

    # Preprocess data
    df_processed, label_encoders = preprocess_data(housing_df)

    # Train model using linear regression; only the fitted model and scaler are
    # needed for the prediction below, the remaining results are discarded
    model, scaler, *_ = train_linear_regression(df_processed)

    # Example house, described with the same raw columns as the dataset
    new_house = {
        'area_sqm': 160,
        'bedrooms': 3,
        'bathrooms': 2,
        'age_years': 10,
        'location': 'Lagos-Island',
        'house_type': 'Duplex',
        'has_garage': 'Yes',
        'has_pool': 'No'
    }

    predict_new_house(model, scaler, label_encoders, new_house)

In [84]:
# Run the full pipeline only when this code is executed directly, not when it is imported
if __name__ == '__main__':
    main()


🏠 HOUSING PRICE PREDICTION USING LINEAR REGRESSION



DATA PREPROCESSING
One-hot encoding nominal categorical variables:
- location: ['Lagos-Mainland' 'Lagos-Island' 'Abuja-Central' 'Ibadan' 'Port Harcourt']
- house_type: ['Apartment' 'Duplex' 'Bungalow' 'Mansion']

Label encoding binary categorical variables:
- has_garage: ['No' 'Yes']
  Encoding: {'No': np.int64(0), 'Yes': np.int64(1)}
- has_pool: ['No' 'Yes']
  Encoding: {'No': np.int64(0), 'Yes': np.int64(1)}

Processed dataset shape: (40, 16)
Final columns: ['area_sqm', 'bedrooms', 'bathrooms', 'age_years', 'price', 'location_Abuja-Central', 'location_Ibadan', 'location_Lagos-Island', 'location_Lagos-Mainland', 'location_Port Harcourt', 'house_type_Apartment', 'house_type_Bungalow', 'house_type_Duplex', 'house_type_Mansion', 'has_garage_encoded', 'has_pool_encoded']

LINEAR REGRESSION TRAINING
Features used for prediction:
1. area_sqm
2. bedrooms
3. bathrooms
4. age_years
5. location_Abuja-Central
6. location_Ibadan
7. location_L