In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [2]:
# Read the pivoted dataset (path is relative to the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer="eng_pp_9523_eda_pivoted.csv")

In [3]:
# Columns to model: one demand model is trained per entry
target_vars = [
    "Terraced",
    "Semi-detached",
    "Detached",
    "Flat",
    "Others",
]

In [4]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct "Month" values, then overwrite the column with their integer codes.
# LabelEncoder numbers classes from 0 in sorted order of the distinct values.
# NOTE(review): if "Month" holds month names, sorted order is alphabetical, not
# calendar order - confirm against the CSV.
le = LabelEncoder()
le.fit(df["Month"])
df["Month"] = le.transform(df["Month"])

In [9]:
# Function to train, evaluate and plot a Random Forest demand model for one property type
def predict_demand(target_var, test_size=0.2, n_estimators=100, random_state=42):
    """Fit a Random Forest on ``Month`` to predict ``target_var`` and plot the test fit.

    Reads the module-level ``df``, splits it into train/test sets, fits a
    ``RandomForestRegressor`` using ``Month`` as the only feature, prints the
    train and test mean squared error, and shows a plot of actual versus
    predicted values on the test set.

    Parameters
    ----------
    target_var : str
        Name of the column in ``df`` to model.
    test_size : float, default 0.2
        Fraction of rows held out for testing (forwarded to ``train_test_split``).
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int or None, default 42
        Seed used for both the split and the forest so repeated runs give the
        same metrics and plots. Pass ``None`` for an unseeded run.

    Returns
    -------
    pandas.Series
        One prediction per distinct ``Month`` value present in ``df``,
        indexed by that value and named ``target_var``.
    """
    # "Month" is the only feature used, so only it and the target are selected.
    features = ["Month"]
    data = df[features + [target_var]]

    # Seeded split: without a seed every run reports different numbers.
    X_train, X_test, y_train, y_test = train_test_split(
        data[features],
        data[target_var],
        test_size=test_size,
        random_state=random_state,
    )

    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Report both errors so over-fitting (train << test) is visible.
    mse_test = mean_squared_error(y_test, y_pred_test)
    mse_train = mean_squared_error(y_train, y_pred_train)
    print(f"Random Forest - MSE for {target_var} (Test): {mse_test}")
    print(f"Random Forest - MSE for {target_var} (Train): {mse_train}")

    # Forecast one value per month using the Month values the model was actually
    # trained on. The notebook label-encodes "Month" before this function runs, so
    # the codes start at 0; a hard-coded range(1, 13) would be shifted against them.
    future_data = pd.DataFrame(
        {"Month": data["Month"].drop_duplicates().sort_values().to_numpy()}
    )
    future_pred = pd.Series(
        model.predict(future_data),
        index=pd.Index(future_data["Month"], name="Month"),
        name=target_var,
    )

    # train_test_split shuffles rows; order the test rows by Month so the lines
    # run left to right instead of zig-zagging across the x-axis.
    plot_frame = pd.DataFrame(
        {
            "Month": X_test["Month"].to_numpy(),
            "actual": y_test.to_numpy(),
            "predicted": y_pred_test,
        }
    ).sort_values("Month", kind="stable")

    fig, ax = plt.subplots()
    ax.plot(
        plot_frame["Month"],
        plot_frame["actual"],
        label=f"Actual {target_var} (Test)",
        marker='o',
    )
    ax.plot(
        plot_frame["Month"],
        plot_frame["predicted"],
        label=f"Predicted {target_var} (Test) - Random Forest",
    )
    ax.set_xlabel("Month")
    ax.set_ylabel(f"Demand for {target_var}")
    ax.set_title(f"Predicted Demand for {target_var} (Test) - Random Forest")
    ax.legend()
    plt.show()
    # Release the figure: this function is called in a loop, once per property type.
    plt.close(fig)

    return future_pred


In [10]:
# Raise the Agg path settings used when rendering very long line plots:
# draw paths in chunks of 20000 vertices and use a simplification threshold of 1.0
plt.rcParams.update(
    {
        'agg.path.chunksize': 20000,
        'path.simplify_threshold': 1.0,
    }
)

In [11]:
# Train, evaluate and plot one model per property type listed in target_vars
for property_type in target_vars:
    predict_demand(property_type)

KeyboardInterrupt: 