In [4]:
# regression_anomaly_detection.py

# Importing necessary libraries for regression and anomaly detection
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import IsolationForest
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
import pandas as pd

# Load the preprocessed data
filepath = 'real_estate_data.csv'  # Specify the correct path to your preprocessed data
data = pd.read_csv(filepath)

# Prepare features (X) and target (y)
X = data[[
    'Full Address', 
    'Project Name', 
    'Primary Photo', 
    'First Two Photos', 
    'Floor Plan', 
    'Property Type', 
    'For Sale or For Rent', 
    # 'Asking Price (PHP)',
    # 'Rent Price (PHP)', 
    # 'Price per Sqm (PHP)', 
    # 'Down payment Php', 
    # 'Down payment %', 
    # 'Advanced Payment (months)', 
    # 'Security Deposit (months)', 
    # 'Minimum Rent duration (months)', 
    'Generic Address', 
    'Approximated GPS Pin', 
    'Size (sqm)', 
    'Property Specifications', 
    'Property Features', 
    'Property Category', 
    'Created / Updated Date', 
    'Full Description', 
    'Developer', 
    'Verified Status', 
    'Badges', 
    'More Photos', 
    'Video Tour', 
    'Related Project', 
    'Broker', 
    'Nearness to landmarks/key locations', 
    'Similar Properties', 
    'Commission PHP', 
    'Commission %', 
    'Price vs Zonal Value'
]] 

y = data['Asking Price (PHP)']  # Target variable

# Display the first few rows of features and target to confirm
print(X.head())
print(y.head())


                                      Full Address  \
0          30188 Johnson Green\nBuckland, WV 59978   
1  589 Shannon Squares\nEast Kristenland, AK 16494   
2          24228 Julia Forge\nWest Jerry, ME 05645   
3               377 Megan Way\nShawnfurt, ND 51619   
4     221 Carr Extensions\nJarvisborough, UT 82903   

                 Project Name Primary Photo           First Two Photos  \
0             Richardson-Reid   theory.jpeg       short.jpg,decide.png   
1                    Good Inc     trial.jpg        later.jpeg,pass.jpg   
2                Reynolds PLC     check.bmp  outside.jpg,scientist.bmp   
3                  Diaz-Owens      lead.gif     available.jpg,seat.bmp   
4  Dunn, Johnson and Williams    strong.png  wonder.jpeg,consider.jpeg   

           Floor Plan  Property Type For Sale or For Rent  Rent Price (PHP)  \
0     contain.odt.pdf      Townhouse             For Sale               NaN   
1       build.mp4.pdf      Townhouse             For Sale               

In [6]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((1600, 34), (400, 34), (1600,), (400,))

In [7]:
# Initialize the linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate mean squared error (MSE) to evaluate the model
mse = mean_squared_error(y_test, y_pred)
mse


ValueError: could not convert string to float: '006 Russo Parkways\nNew Alan, IA 72297'

In [None]:
# Visualizing the predicted vs actual prices
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linewidth=2)
plt.title('Predicted vs Actual Prices')
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.show()


In [None]:
# Anomaly detection using Isolation Forest
iso_forest = IsolationForest(contamination=0.05, random_state=42)
outliers = iso_forest.fit_predict(X)

# Marking anomalies as -1 and normal points as 1
data['anomaly'] = outliers

# Display rows with anomalies
anomalies = data[data['anomaly'] == -1]
anomalies.head()


In [None]:
# Visualizing anomalies in the price vs square footage scatter plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data['square_footage'], y=data['price'], hue=data['anomaly'], palette='coolwarm', legend=False)
plt.title('Price vs Square Footage with Anomalies')
plt.xlabel('Square Footage')
plt.ylabel('Price')
plt.show()
