<a href="https://colab.research.google.com/github/ezinneanne/housing_dashboard/blob/housing/housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import zscore
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
# Attach Google Drive to the Colab filesystem under /content/drive so the
# dataset stored there can be read with ordinary file paths.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [4]:
# Configuration: folder on the mounted Drive that holds the dataset
DATA_DIR = '/content/drive/MyDrive/'

# Load the housing listings; the CSV is decoded as latin1
dataset_path = DATA_DIR + 'nigeria_houses_cc.csv'
data = pd.read_csv(dataset_path, encoding='latin1')

# Preview the first 5 rows
data.head()

Unnamed: 0,bedrooms,bathrooms,toilets,parking_space,title,town,state,price,price Category
0,6.0,5.0,5.0,4.0,Detached Duplex,Mabushi,Abuja,450000000,1
1,4.0,5.0,5.0,4.0,Terraced Duplexes,Katampe,Abuja,800000000,1
2,4.0,5.0,5.0,4.0,Detached Duplex,Lekki,Lagos,120000000,1
3,4.0,4.0,5.0,6.0,Detached Duplex,Ajah,Lagos,40000000,1
4,4.0,4.0,5.0,2.0,Semi Detached Duplex,Lekki,Lagos,75000000,1


In [5]:
# Basic cleanup: discard rows that contain any missing value, then rows that
# exactly duplicate an earlier row.
# Written as a non-mutating chain (no inplace=True): each step returns a new
# frame and `data` is rebound once to the final result. The surviving rows keep
# their original index labels (the index is not reset).
data = data.dropna().drop_duplicates()
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13888 entries, 0 to 24324
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bedrooms        13888 non-null  float64
 1   bathrooms       13888 non-null  float64
 2   toilets         13888 non-null  float64
 3   parking_space   13888 non-null  float64
 4   title           13888 non-null  object 
 5   town            13888 non-null  object 
 6   state           13888 non-null  object 
 7   price           13888 non-null  int64  
 8   price Category  13888 non-null  int64  
dtypes: float64(4), int64(2), object(3)
memory usage: 1.1+ MB


In [6]:
# Normalise the price column to float: render each value as text, strip any
# comma separators, then parse the remaining text as a number.
price_as_text = data['price'].astype(str)
price_without_commas = price_as_text.str.replace(",", "")
data['price'] = price_without_commas.astype(float)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13888 entries, 0 to 24324
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bedrooms        13888 non-null  float64
 1   bathrooms       13888 non-null  float64
 2   toilets         13888 non-null  float64
 3   parking_space   13888 non-null  float64
 4   title           13888 non-null  object 
 5   town            13888 non-null  object 
 6   state           13888 non-null  object 
 7   price           13888 non-null  float64
 8   price Category  13888 non-null  int64  
dtypes: float64(5), int64(1), object(3)
memory usage: 1.1+ MB


In [7]:
# Remove price outliers: keep only rows whose price z-score has an absolute
# value below PRICE_Z_THRESHOLD.
# (`zscore` is imported with the other dependencies at the top of the notebook.)
PRICE_Z_THRESHOLD = 3

price_z = np.abs(zscore(data['price']))

# .copy() makes the filtered result an independent frame, so columns added to
# `data` later are never written to a selection of the pre-filter frame, even
# if something (e.g. a notebook Out[] entry) still references that frame.
# NOTE: the z-scores are recomputed from the current `data` on every execution,
# so re-running this cell on its own output can trim additional rows.
data = data[price_z < PRICE_Z_THRESHOLD].copy()
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13880 entries, 0 to 24324
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bedrooms        13880 non-null  float64
 1   bathrooms       13880 non-null  float64
 2   toilets         13880 non-null  float64
 3   parking_space   13880 non-null  float64
 4   title           13880 non-null  object 
 5   town            13880 non-null  object 
 6   state           13880 non-null  object 
 7   price           13880 non-null  float64
 8   price Category  13880 non-null  int64  
dtypes: float64(5), int64(1), object(3)
memory usage: 1.1+ MB


In [8]:
# Inspect the frame after the outlier filter; as the cell's last expression it
# is rendered with the notebook's rich DataFrame display.
data

Unnamed: 0,bedrooms,bathrooms,toilets,parking_space,title,town,state,price,price Category
0,6.0,5.0,5.0,4.0,Detached Duplex,Mabushi,Abuja,4.500000e+08,1
1,4.0,5.0,5.0,4.0,Terraced Duplexes,Katampe,Abuja,8.000000e+08,1
2,4.0,5.0,5.0,4.0,Detached Duplex,Lekki,Lagos,1.200000e+08,1
3,4.0,4.0,5.0,6.0,Detached Duplex,Ajah,Lagos,4.000000e+07,1
4,4.0,4.0,5.0,2.0,Semi Detached Duplex,Lekki,Lagos,7.500000e+07,1
...,...,...,...,...,...,...,...,...,...
24319,8.0,8.0,9.0,4.0,Detached Duplex,Guzape District,Abuja,1.000000e+09,1
24320,3.0,4.0,4.0,5.0,Detached Duplex,Lekki,Lagos,8.000000e+07,1
24321,2.0,2.0,2.0,4.0,Block of Flats,Kabusa,Abuja,1.500000e+07,1
24322,4.0,5.0,5.0,4.0,Block of Flats,Ado-Odo/Ota,Ogun,2.500000e+07,1


In [9]:
# Create a relative price category.
# The threshold is local: the median price of each (state, town) group,
# broadcast back onto every row of that group.
town_median = data.groupby(['state', 'town'])['price'].transform('median')

# Build the result as one non-mutating chain and rebind `data` once, instead of
# assigning columns and dropping in place on a frame that an earlier cell has
# already displayed. Re-running the cell recomputes the same columns.
# NOTE(review): the medians are computed over all rows, before the train/test
# split, so a training row's label depends on test-row prices - confirm this is
# acceptable for the classification target.
data = (
    data
    .assign(
        town_median=town_median,
        # 0 = "affordable" (price at or below the town median),
        # 1 = "expensive" (price strictly above it)
        relative_price_category=(data['price'] > town_median).astype(int),
    )
    # The original 'price Category' column is superseded by the relative label;
    # errors='ignore' keeps the cell runnable when the column is already gone.
    .drop(columns=['price Category'], errors='ignore')
)
data

Unnamed: 0,bedrooms,bathrooms,toilets,parking_space,title,town,state,price,town_median,relative_price_category
0,6.0,5.0,5.0,4.0,Detached Duplex,Mabushi,Abuja,4.500000e+08,135000000.0,1
1,4.0,5.0,5.0,4.0,Terraced Duplexes,Katampe,Abuja,8.000000e+08,160000000.0,1
2,4.0,5.0,5.0,4.0,Detached Duplex,Lekki,Lagos,1.200000e+08,95000000.0,1
3,4.0,4.0,5.0,6.0,Detached Duplex,Ajah,Lagos,4.000000e+07,48000000.0,0
4,4.0,4.0,5.0,2.0,Semi Detached Duplex,Lekki,Lagos,7.500000e+07,95000000.0,0
...,...,...,...,...,...,...,...,...,...,...
24319,8.0,8.0,9.0,4.0,Detached Duplex,Guzape District,Abuja,1.000000e+09,125000000.0,1
24320,3.0,4.0,4.0,5.0,Detached Duplex,Lekki,Lagos,8.000000e+07,95000000.0,0
24321,2.0,2.0,2.0,4.0,Block of Flats,Kabusa,Abuja,1.500000e+07,35000000.0,0
24322,4.0,5.0,5.0,4.0,Block of Flats,Ado-Odo/Ota,Ogun,2.500000e+07,15000000.0,1


In [10]:
# Summarise column dtypes and non-null counts of the current frame, which now
# carries the derived town_median and relative_price_category columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13880 entries, 0 to 24324
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bedrooms                 13880 non-null  float64
 1   bathrooms                13880 non-null  float64
 2   toilets                  13880 non-null  float64
 3   parking_space            13880 non-null  float64
 4   title                    13880 non-null  object 
 5   town                     13880 non-null  object 
 6   state                    13880 non-null  object 
 7   price                    13880 non-null  float64
 8   town_median              13880 non-null  float64
 9   relative_price_category  13880 non-null  int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 1.2+ MB


In [11]:
# Prepare features and targets
# Predictor columns shared by the regression and the classification model
features = [
    'bedrooms', 'bathrooms', 'toilets', 'parking_space',  # numeric columns
    'title', 'town', 'state',                             # text columns
]
X = data[features]

# Regression target: the price itself.
# Classification target: the 0/1 label relative to the town median.
y_reg, y_clf = data['price'], data['relative_price_category']

In [12]:
# Preprocessing: standardise the numeric columns and one-hot encode the text columns
numerical_features = ['bedrooms', 'bathrooms', 'toilets', 'parking_space']
categorical_features = ['title', 'town', 'state']

numeric_step = ('num', StandardScaler(), numerical_features)
# handle_unknown='ignore': a category not seen during fit does not raise at
# transform time
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [13]:
# Modeling pipelines
# Each pipeline gets its OWN unfitted copy of `preprocessor` via clone().
# Pipeline.fit fits the step objects it was handed, so passing the same
# ColumnTransformer instance to both pipelines would let the second fit
# overwrite the fitted state the first pipeline later predicts with. That is
# only harmless while both models are trained on exactly the same rows.

# Regression: predicts the price
reg_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Classification: predicts the relative price category; class_weight='balanced'
# reweights classes inversely to their frequency in the training labels
clf_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])

In [14]:
# Train-test split: hold out 20% of the rows for evaluation.
# Both calls receive the same X and the same options; only the target differs.
split_options = dict(test_size=0.2, random_state=42)

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y_reg, **split_options)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X, y_clf, **split_options)

In [15]:
# Train models
# Fit each pipeline on its own training split: the regression pipeline on the
# price target, the classification pipeline on the relative price label.
reg_pipeline.fit(X_train_reg, y_train_reg)
# Last expression of the cell, so its return value is what the notebook displays.
clf_pipeline.fit(X_train_clf, y_train_clf)

In [16]:
# Regression: score the fitted pipeline on the held-out split
y_pred_reg = reg_pipeline.predict(X_test_reg)

regression_mse = mean_squared_error(y_test_reg, y_pred_reg)
regression_r2 = r2_score(y_test_reg, y_pred_reg)

print("Random Forest Regression MSE:", regression_mse)
print("Random Forest Regression R2:", regression_r2)

Random Forest Regression MSE: 7.891693458217243e+17
Random Forest Regression R2: 0.04370499832909103


In [17]:
# Classification: per-class precision / recall / F1 on the held-out split
y_pred_clf = clf_pipeline.predict(X_test_clf)
clf_report = classification_report(y_test_clf, y_pred_clf)
print("Classification Report:\n", clf_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.76      0.75      1474
           1       0.72      0.71      0.72      1302

    accuracy                           0.74      2776
   macro avg       0.74      0.74      0.74      2776
weighted avg       0.74      0.74      0.74      2776



In [22]:
from google.colab import files
import joblib

# Serialise each fitted pipeline into the runtime's working directory and then
# download it. Writing the file in the same cell as the download means the
# download no longer relies on a .pkl left behind by an earlier run, so the
# cell works on a fresh runtime.
MODEL_FILES = {
    "regression_model.pkl": reg_pipeline,
    "classification_model.pkl": clf_pipeline,
}

for model_path, fitted_pipeline in MODEL_FILES.items():
    joblib.dump(fitted_pipeline, model_path)
    files.download(model_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
# Export the cleaned dataset (including the derived columns) and download it.
# `files` comes from the google.colab import in the model-export cell.
cleaned_csv = "nigeria_houses_cc_cleaned.csv"
data.to_csv(cleaned_csv, index=False)  # index=False: the row index is not written
files.download(cleaned_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>