# Feature Engineering

## Purpose
This notebook focuses on creating new features to enhance the predictive power of models. Features are derived based on insights from the Exploratory Data Analysis (EDA).



In [14]:
import pandas as pd

In [15]:
#load cleaned dataset
file_path = '/Users/ebythomas/Documents/GitHub/Melbourne_Housing_Analytics/data/processed/cleaned_housing_data.csv'
housing_data = pd.read_csv(file_path)

#display the first few rows
housing_data.head()


Unnamed: 0,sale_year,median_price,transaction_count,type_encoded,small_area_encoded
0,2000,316250.0,116,0,0
1,2000,220000.0,309,1,0
2,2000,622500.0,42,0,2
3,2000,295000.0,139,1,2
4,2000,215250.0,258,0,3


### Rolling Average Price

- This feature calculates the 3-year moving average of `median_price` for each small area.
- It helps smooth out short-term fluctuations and highlight longer-term trends.


In [16]:
# Calculate 3-year rolling average of median price
housing_data['rolling_avg_price'] = housing_data.groupby('small_area_encoded')['median_price'] \
    .transform(lambda x: x.rolling(window=3, min_periods=1).mean())

# Display the new feature
housing_data[['small_area_encoded', 'sale_year', 'median_price', 'rolling_avg_price']].head()


Unnamed: 0,small_area_encoded,sale_year,median_price,rolling_avg_price
0,0,2000,316250.0,316250.0
1,0,2000,220000.0,268125.0
2,2,2000,622500.0,622500.0
3,2,2000,295000.0,458750.0
4,3,2000,215250.0,215250.0


### Transaction Density

- This feature represents the proportion of transactions in each small area relative to the total transactions for that area.
- It highlights the activity level of each area in the housing market.


In [17]:
# Normalize transaction count by small area
area_transaction_sum = housing_data.groupby('small_area_encoded')['transaction_count'].transform('sum')
housing_data['transaction_density'] = housing_data['transaction_count'] / area_transaction_sum

# Display the new feature
housing_data[['small_area_encoded', 'transaction_count', 'transaction_density']].head()


Unnamed: 0,small_area_encoded,transaction_count,transaction_density
0,0,116,0.016445
1,0,309,0.043805
2,2,42,0.010041
3,2,139,0.03323
4,3,258,0.043165


In [18]:
housing_data.head()

Unnamed: 0,sale_year,median_price,transaction_count,type_encoded,small_area_encoded,rolling_avg_price,transaction_density
0,2000,316250.0,116,0,0,316250.0,0.016445
1,2000,220000.0,309,1,0,268125.0,0.043805
2,2000,622500.0,42,0,2,622500.0,0.010041
3,2000,295000.0,139,1,2,458750.0,0.03323
4,2000,215250.0,258,0,3,215250.0,0.043165


## Save Enhanced Dataset

The enhanced dataset with new features is saved for use in modeling.


In [19]:
# Save the enhanced dataset
housing_data.to_csv('/Users/ebythomas/Documents/GitHub/Melbourne_Housing_Analytics/data/processed/enhanced_data.csv', index=False)
print("Enhanced dataset saved to ~/processed/enhanced_data.csv")


Enhanced dataset saved to data/processed/enhanced_data.csv
