In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder, StandardScaler
from fuzzywuzzy import process  

In [2]:
# Read the raw listings into a DataFrame (swap the path for your own data source).
# Later cells read the 'rooms', 'price' and 'location' columns from this frame.
listings_path = 'properties.json'
property_data = pd.read_json(listings_path)

In [3]:
# Adjusted function to clean the 'rooms' field
def clean_rooms_info(rooms):
    """Extract room counts from a free-text listing summary.

    Parameters
    ----------
    rooms : str or any
        Raw text such as ``"4 Bedrooms 5 Bathrooms 5 Toilets 2 Parking Spaces Save"``.
        Any non-string value (None, NaN, lists, numbers) is treated as missing.

    Returns
    -------
    list
        ``[bedrooms, bathrooms, toilets, parking_spaces]``; each entry is an
        ``int`` when a count was found, otherwise ``np.nan``.
    """
    # Strings are never NA, so a single isinstance check covers None/NaN and
    # also avoids the ambiguous truth value pd.isna() yields for list-likes.
    if not isinstance(rooms, str):
        return [np.nan, np.nan, np.nan, np.nan]

    # Remove ' Save' and any surrounding whitespace
    text = rooms.replace(' Save', '').strip()

    counts = []
    for label in ('Bedroom', 'Bathroom', 'Toilet', 'Parking Space'):
        # The trailing "s?" accepts both singular and plural labels
        # ("1 Bedroom" as well as "3 Bedrooms").
        match = re.search(rf'(\d+)\s*{label}s?', text)
        counts.append(int(match.group(1)) if match else np.nan)

    return counts

# Parse every 'rooms' entry and spread the four counts into their own columns
room_columns = ['Bedrooms', 'Bathrooms', 'Toilets', 'Parking Spaces']
rooms_parsed = property_data['rooms'].apply(lambda raw: pd.Series(clean_rooms_info(raw)))
property_data[room_columns] = rooms_parsed

In [4]:
# Convert the parsed room counts to numeric dtypes; unparseable values become NaN
room_columns = ['Bedrooms', 'Bathrooms', 'Toilets', 'Parking Spaces']
for column in room_columns:
    property_data[column] = pd.to_numeric(property_data[column], errors='coerce')

# Clean the 'price' column: strip currency symbols and thousands separators,
# then coerce. Non-numeric leftovers (e.g. free text) become NaN and are
# removed by the missing-price drop below instead of raising here.
property_data['price'] = pd.to_numeric(
    property_data['price'].replace(r'[₦$,]', '', regex=True),
    errors='coerce',
)

# Rows without a location cannot be label-encoded (or matched later), so drop
# them before fitting the encoder. This is a no-op when every row has one.
property_data = property_data.dropna(subset=['location'])

# Encode categorical 'location' feature
label_encoder = LabelEncoder()
property_data['location_encoded'] = label_encoder.fit_transform(property_data['location'])

# Fill missing values in numerical columns with the column median
for column in room_columns:
    property_data[column] = property_data[column].fillna(property_data[column].median())

# Remove rows with missing price
property_data = property_data.dropna(subset=['price'])

# Prepare features (X) and target (y)
X = property_data[room_columns + ['location_encoded']]
y = property_data['price']

# Ensure no missing values in features (a column that was entirely NaN has a
# NaN median and would still contain gaps at this point)
valid_index = X.dropna().index
X_clean = X.loc[valid_index].reset_index(drop=True)
y_clean = y.loc[valid_index].reset_index(drop=True)


In [5]:
numeric_features = ['Bedrooms', 'Bathrooms', 'Toilets', 'Parking Spaces']

# Check for an empty dataset BEFORE scaling: StandardScaler rejects zero-sample
# input, so a guard placed after fit_transform could never be reached.
if X_clean.empty or y_clean.empty:
    print("\nError: The dataset is empty after preprocessing. Please check data integrity.")
else:
    # Normalize numerical features
    scaler = StandardScaler()
    X_clean_scaled = pd.DataFrame(
        scaler.fit_transform(X_clean[numeric_features]),
        columns=numeric_features,
        index=X_clean.index,  # keep row alignment explicit for the column added below
    )

    # Add back the encoded location column without normalization
    X_clean_scaled['location_encoded'] = X_clean['location_encoded']

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_clean_scaled, y_clean, test_size=0.2, random_state=42
    )

    # Train a regression model (Random Forest)
    price_model = RandomForestRegressor(n_estimators=100, random_state=42)
    price_model.fit(X_train, y_train)

    # Predict on the test set and report the error in the same units as price
    y_pred = price_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print(f"Root Mean Squared Error for Price Prediction: {rmse}")

Root Mean Squared Error for Price Prediction: 12390017276.838114


In [6]:

    # Function to find the closest location match using fuzzy matching
    def get_closest_location(input_location, location_list, threshold=80):
        """Return the entry of ``location_list`` that best matches ``input_location``.

        Parameters
        ----------
        input_location : str
            Free-text location to look up. Non-string or blank values yield None.
        location_list : sequence of str
            Candidate locations. An empty or missing collection yields None.
        threshold : int, optional
            Minimum fuzzy-match score (0-100) required to accept the best
            candidate. Defaults to 80.

        Returns
        -------
        str or None
            The best-matching candidate, or None when nothing reaches the threshold.
        """
        if not isinstance(input_location, str) or not input_location.strip():
            return None
        # len() rather than truthiness: numpy arrays have no unambiguous bool value
        if location_list is None or len(location_list) == 0:
            return None

        result = process.extractOne(input_location, location_list)
        if result is None:  # extractOne returns None when it has nothing to score
            return None

        # Index instead of unpacking: dict-like choices yield (match, score, key)
        best_match, score = result[0], result[1]
        if score >= threshold:
            return best_match
        return None

    # Content-based recommender: index every scaled listing in a 5-nearest-neighbour model
    # (fit() returns the estimator itself, so construction and fitting can be chained)
    nn_model = NearestNeighbors(n_neighbors=5, algorithm='auto').fit(X_clean_scaled)


In [7]:
 # Function to recommend similar properties with a fallback strategy
def recommend_properties_with_fuzzy_matching(example_property, tolerance=0.1, n_neighbors=5):
    """Recommend listings similar to ``example_property``.

    The example's room counts are scaled with the fitted ``scaler`` and its
    location is fuzzy-matched against the locations in ``property_data``. When
    a location match is found the search uses every column of
    ``X_clean_scaled``; otherwise the location column is left out and only the
    room counts are compared.

    Parameters
    ----------
    example_property : dict
        May contain 'Bedrooms', 'Bathrooms', 'Toilets', 'Parking Spaces'
        (each defaults to 0 when absent) and 'location'.
    tolerance : float, optional
        Currently unused; kept so existing callers keep working.
    n_neighbors : int, optional
        Maximum number of recommendations. Capped at the number of rows
        available to search. Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        Columns 'location', 'Bedrooms', 'Bathrooms', 'Toilets', 'price' for the
        nearest listings, or an empty DataFrame when nothing can be recommended.
    """
    numeric_features = ['Bedrooms', 'Bathrooms', 'Toilets', 'Parking Spaces']

    # Build a one-row frame with the same feature names the scaler was fitted on
    example_features = {name: example_property.get(name, 0) for name in numeric_features}
    example_scaled = pd.DataFrame(
        scaler.transform(pd.DataFrame([example_features], columns=numeric_features)),
        columns=numeric_features,
    )

    # Attempt to find the closest matching location; a missing or blank
    # location falls through to the location-free search instead of raising.
    requested_location = example_property.get('location')
    closest_location = None
    if isinstance(requested_location, str) and requested_location.strip():
        closest_location = get_closest_location(
            requested_location, property_data['location'].unique()
        )

    if closest_location:
        # NOTE(review): location_encoded is an unscaled label-encoder integer, so
        # its numeric distance carries no geographic meaning — confirm this
        # ranking is what is intended.
        example_scaled['location_encoded'] = label_encoder.transform([closest_location])[0]
        X_search = X_clean_scaled  # Use full feature set
    else:
        print("Location not found in dataset. Recommending properties based on other features.")
        X_search = X_clean_scaled.drop(columns=['location_encoded'])  # Exclude location

    # Query with the same column names and order the model is fitted on
    example_scaled = example_scaled[list(X_search.columns)]

    # kneighbors() cannot return more neighbours than there are fitted rows
    k = min(n_neighbors, len(X_search))
    if k < 1:
        print("No valid recommendations found.")
        return pd.DataFrame()

    nn_model_adjusted = NearestNeighbors(n_neighbors=k, algorithm='auto')
    nn_model_adjusted.fit(X_search)

    # Find the nearest neighbors (positions into X_search)
    _, indices = nn_model_adjusted.kneighbors(example_scaled)
    valid_indices = [idx for idx in indices[0] if idx < len(property_data)]

    if not valid_indices:
        print("No valid recommendations found.")
        return pd.DataFrame()

    # Return the properties based on found indices (positional lookup)
    recommended_properties = property_data.iloc[valid_indices]
    return recommended_properties[['location', 'Bedrooms', 'Bathrooms', 'Toilets', 'price']]

In [13]:
 # Example property for recommendation
example_property = {
    "location": "Ikeja",  # A partial location, not an exact match
    "Bedrooms": 4,
    "Bathrooms": 4,
    "Toilets": 4,
    "Parking Spaces": 0,
}

# Run the recommender on the example and print whatever comes back
recommended_properties = recommend_properties_with_fuzzy_matching(example_property)
display_columns = ['location', 'Bedrooms', 'Bathrooms', 'Toilets', 'price']
if recommended_properties.empty:
    print("No matching property found.")
else:
    print("\nRecommended Properties with Fuzzy Matching:")
    print(recommended_properties[display_columns])



Recommended Properties with Fuzzy Matching:
                location  Bedrooms  Bathrooms  Toilets        price
384   Ogba, Ikeja, Lagos       4.0        4.0      4.0  110000000.0
1411  Ogba, Ikeja, Lagos       4.0        4.0      4.0  160000000.0
3115  Ogba, Ikeja, Lagos       4.0        4.0      4.0  285000000.0
162   Ogba, Ikeja, Lagos       4.0        4.0      5.0  180000000.0
434   Ogba, Ikeja, Lagos       4.0        4.0      5.0   90000000.0


