In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

In [2]:
# Load the dataset
df = pd.read_csv('eng_pp_9523_eda_pivoted.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,price,Old/New,Detached,Semi-detached,Terraced,Flat,Others,Street,Locality,Town/City,District,County
0,0,1995,February,44500,N,0,0,1,0,0,HOWICK PARK,SUNDERLAND,SUNDERLAND,SUNDERLAND,TYNE AND WEAR
1,1,1995,January,56500,N,0,0,1,0,0,BRICK KILN CLOSE,COGGESHALL,COLCHESTER,BRAINTREE,ESSEX
2,2,1995,July,58000,N,0,0,1,0,0,RAINSBROOK DRIVE,SHIRLEY,SOLIHULL,SOLIHULL,WEST MIDLANDS
3,3,1995,June,51000,N,0,1,0,0,0,MERRY HILL,BRIERLEY HILL,BRIERLEY HILL,DUDLEY,WEST MIDLANDS
4,4,1995,March,17000,N,0,0,1,0,0,DENMAN STREET,ROTHERHAM,ROTHERHAM,ROTHERHAM,SOUTH YORKSHIRE


In [4]:
def predict_demand_cv(target_var):
  # Filter data for specific property type
  data = df[["Year", "Month"] + [target_var]]

  # Convert Month to numerical format (assuming consistent formatting)
  data["Month"] = pd.to_datetime(data["Month"], format='%B').dt.month

  # Create features
  features = ["Month"]

  # K-Fold cross-validation setup
  kfold = KFold(n_splits=10, shuffle=True, random_state=42)  # Adjust n_splits as needed

  # Model dictionary
  models = {
      "Linear Regression": LinearRegression(),
      "Random Forest": RandomForestRegressor(n_estimators=100),
  }

  # Evaluate models using KFold cross-validation
  for name, model in models.items():
    mse_scores = cross_val_score(model, data[features], data[target_var], cv=kfold, scoring="neg_mean_squared_error")
    average_mse = np.mean(mse_scores) * -1  # Convert negative MSE to positive
    print(f"{name} - Average MSE for {target_var} (KFold CV): {average_mse:.2f}")

  # Prediction and visualization can be implemented here based on the best performing model



In [18]:
# Example usage (assuming 'Detached' is the target property type)
predict_demand_cv("Detached")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Month"] = pd.to_datetime(data["Month"], format='%B').dt.month


Linear Regression - Average MSE for Detached (KFold CV): 0.18
Random Forest - Average MSE for Detached (KFold CV): 0.18


In [5]:
predict_demand_cv("Semi-detached")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Month"] = pd.to_datetime(data["Month"], format='%B').dt.month


Linear Regression - Average MSE for Semi-detached (KFold CV): 0.20
Random Forest - Average MSE for Semi-detached (KFold CV): 0.20


In [6]:
predict_demand_cv("Terraced")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Month"] = pd.to_datetime(data["Month"], format='%B').dt.month


Linear Regression - Average MSE for Terraced (KFold CV): 0.21
Random Forest - Average MSE for Terraced (KFold CV): 0.21


In [5]:
predict_demand_cv("Flat")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Month"] = pd.to_datetime(data["Month"], format='%B').dt.month


Linear Regression - Average MSE for Flat (KFold CV): 0.15
Random Forest - Average MSE for Flat (KFold CV): 0.15


In [6]:
predict_demand_cv("Others")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Month"] = pd.to_datetime(data["Month"], format='%B').dt.month


Linear Regression - Average MSE for Others (KFold CV): 0.02
Random Forest - Average MSE for Others (KFold CV): 0.02
