In [2]:
import sys
import os
import typing as tp
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

project_root = Path.cwd().parent  
base_data_dir = project_root / 'data'
data_dir = base_data_dir / 'processed'
plots_dir = project_root / 'results' / 'plots'

# Check if the directories exist
print(f"Data directory exists: {data_dir.exists()}")
print(f"Plots directory exists: {plots_dir.exists()}")


Data directory exists: True
Plots directory exists: True


In [3]:
state_file = data_dir / 'state_variables.csv'
if state_file.exists():
    state_df = pd.read_csv(state_file)
    print(state_df.head())
else:
    print(f"State data not found at {state_file}")

State data not found at /Users/claribelgonell/PycharmProjects/DigitalTwin_CRE_RL/data/processed/state_variables.csv


In [3]:
lease_file_boom = data_dir / 'leases_df_boom.csv'
lease_file_recession = data_dir / 'leases_df_recession.csv'
lease_file_stability = data_dir / 'leases_df_stability.csv'

In [4]:
lease_df_boom = pd.read_csv(lease_file_boom)
lease_df_recession = pd.read_csv(lease_file_recession)
lease_df_stability = pd.read_csv(lease_file_stability)


In [5]:
state_file_boom = data_dir / 'state_variables_boom.csv'
state_file_recession = data_dir / 'state_variables_recession.csv'
state_file_stability = data_dir / 'state_variables_stability.csv'

In [6]:
state_df_boom = pd.read_csv(state_file_boom)
state_df_recession = pd.read_csv(state_file_recession)
state_df_stability = pd.read_csv(state_file_stability)

In [7]:
economic_factors_boom = data_dir / 'economic_factors_boom.csv'
economic_factors_recession = data_dir / 'economic_factors_recession.csv'
economic_factors_stability = data_dir / 'economic_factors_stability.csv'

In [8]:
economic_factors_boom = pd.read_csv(economic_factors_boom)
economic_factors_recession = pd.read_csv(economic_factors_recession)
economic_factors_stability = pd.read_csv(economic_factors_stability)

In [9]:
lease_df_combined = pd.concat([lease_df_stability, lease_df_recession, lease_df_boom])
state_df_combined = pd.concat([state_df_stability, state_df_recession, state_df_boom])
econ_factors_combined = pd.concat([economic_factors_stability, economic_factors_recession, economic_factors_boom])

In [10]:
# Add TimeStep column to state_df based on index
state_df_combined['TimeStep'] = state_df_combined.index

# Add TimeStep column to leases_df based on LeaseYear and StartYear
lease_df_combined['TimeStep'] = lease_df_combined['LeaseYear'] - lease_df_combined['StartYear']

In [11]:

# Merge state and lease DataFrames
merged_state_lease_df = pd.merge(
    state_df_combined, 
    lease_df_combined, 
    on='TimeStep', 
    how='inner'
)


In [12]:
econ_factors_combined = econ_factors_combined.rename(columns={'Year': 'LeaseYear'})


In [13]:
# Merge with economic factors
merged_df = pd.merge(
    merged_state_lease_df,
    econ_factors_combined,
    on='LeaseYear',
    how='inner'
)

In [14]:
lease_df_stability['Scenario'] = 'Stability'
lease_df_recession['Scenario'] = 'Recession'
lease_df_boom['Scenario'] = 'Boom'

In [15]:
lease_df_combined2 = pd.concat([lease_df_stability, lease_df_recession, lease_df_boom])


In [16]:
# Add TimeStep column to leases_df based on LeaseYear and StartYear
lease_df_combined2['TimeStep'] = lease_df_combined2['LeaseYear'] - lease_df_combined2['StartYear']

In [17]:

# Merge state and lease DataFrames
merged_state_lease_df2 = pd.merge(
    state_df_combined, 
    lease_df_combined2, 
    on='TimeStep', 
    how='inner'
)


In [18]:
merged_state_lease_df2

Unnamed: 0,IndustryGrowth,LeaseLength,RSFOccupied,OccupancyRate,IncentivesAvailable,EconomicIndicator,VacancyRate,TimeStep,LeaseID,TenantID,StartYear,LeaseYear,FloorLevel,SuiteID,SuiteSquareFootage,RentAmount,AnnualRent,Occupied,VacantSpace,Scenario
0,0.040936,15.0,1457.256398,0.647886,2.0,0.046292,0.018670,0,1.0,1.0,1990,1990,14,14_1,1665.753811,12.00,19989.05,True,18334.25,Stability
1,0.040936,15.0,1457.256398,0.647886,2.0,0.046292,0.018670,0,2.0,1.0,1990,1990,1,1_2,4521.061098,12.00,54252.73,True,15478.94,Stability
2,0.040936,15.0,1457.256398,0.647886,2.0,0.046292,0.018670,0,3.0,1.0,1990,1990,20,20_3,2033.459200,12.00,24401.51,True,17966.54,Stability
3,0.040936,15.0,1457.256398,0.647886,2.0,0.046292,0.018670,0,4.0,2.0,1990,1990,20,20_3,2033.459200,12.00,24401.51,True,17966.54,Stability
4,0.040936,15.0,1457.256398,0.647886,2.0,0.046292,0.018670,0,5.0,2.0,1990,1990,10,10_4,3794.181084,12.00,45530.17,True,16205.82,Stability
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13078,0.047170,15.0,1499.651554,0.598730,1.0,0.057095,0.025085,14,96.0,26.0,1999,2013,7,7_3,3931.557017,28.53,112162.44,True,5046.69,Boom
13079,0.047170,15.0,1499.651554,0.598730,1.0,0.057095,0.025085,14,126.0,42.0,2001,2015,9,9_4,2059.392950,31.22,64302.68,True,12360.46,Boom
13080,0.047170,15.0,1499.651554,0.598730,1.0,0.057095,0.025085,14,152.0,50.0,2003,2017,20,20_4,3220.664682,34.17,110043.26,True,10469.88,Boom
13081,0.047170,15.0,1499.651554,0.598730,1.0,0.057095,0.025085,14,171.0,57.0,2005,2019,20,20_2,4857.568530,37.38,181577.68,True,5612.32,Boom


In [19]:
# Filter by economic scenario and include 'LeaseYear', 'VacantSpace', and 'OccupancyRate'
vacant_space_stability = merged_state_lease_df2[
    merged_state_lease_df2['Scenario'] == 'Stability'
][['LeaseYear', 'VacancyRate', 'VacantSpace', 'OccupancyRate']]

vacant_space_boom = merged_state_lease_df2[
    merged_state_lease_df2['Scenario'] == 'Boom'
][['LeaseYear', 'VacancyRate','VacantSpace', 'OccupancyRate']]

vacant_space_recession = merged_state_lease_df2[
    merged_state_lease_df2['Scenario'] == 'Recession'
][['LeaseYear', 'VacancyRate','VacantSpace', 'OccupancyRate']]

# Aggregate by year
vacant_space_stability = vacant_space_stability.groupby('LeaseYear').sum().reset_index()
vacant_space_boom = vacant_space_boom.groupby('LeaseYear').sum().reset_index()
vacant_space_recession = vacant_space_recession.groupby('LeaseYear').sum().reset_index()

# Display the updated DataFrames
print(vacant_space_stability.head())
print(vacant_space_boom.head())
print(vacant_space_recession.head())


   LeaseYear  VacancyRate  VacantSpace  OccupancyRate
0       1990     0.516788    344109.60      15.569853
1       1991     1.019855    777039.54      32.817080
2       1992     1.068692    942442.77      41.211903
3       1993     1.388951   1164364.65      56.013353
4       1994     1.741093   1423579.08      70.816927
   LeaseYear  VacancyRate  VacantSpace  OccupancyRate
0       1990     0.738269    496534.68      22.242648
1       1991     1.006133    753215.88      34.494454
2       1992     1.263444   1016526.72      50.518212
3       1993     1.673236   1332971.70      68.135407
4       1994     2.034410   1592802.69      79.866404
   LeaseYear  VacancyRate  VacantSpace  OccupancyRate
0       1990     0.590615    415297.38      17.794118
1       1991     0.710825    591710.13      25.597395
2       1992     1.402569   1053465.27      48.932600
3       1993     1.521375   1252350.45      57.615462
4       1994     1.612649   1409735.34      67.182187


In [20]:
economic_factors_stability = economic_factors_stability.rename(columns={'Year': 'LeaseYear'})
economic_factors_recession = economic_factors_recession.rename(columns={'Year': 'LeaseYear'})
economic_factors_boom = economic_factors_boom.rename(columns={'Year': 'LeaseYear'})


In [21]:
vacant_space_stability = pd.merge(vacant_space_stability, economic_factors_stability, on='LeaseYear', how='left')
vacant_space_recession = pd.merge(vacant_space_recession, economic_factors_recession, on='LeaseYear', how='left')
vacant_space_boom = pd.merge(vacant_space_boom, economic_factors_boom, on='LeaseYear', how='left')


In [22]:
import pandas as pd

def add_interaction_terms(df):
    """
    Add specified interaction terms to the original dataset.
    
    Parameters:
    - df: pandas DataFrame, original dataset with columns 'GDP Growth', 'Inflation',
          'Unemployment Rate', 'OccupancyRate', and 'VacancyRate'.
          
    Returns:
    - df: pandas DataFrame, dataset with new interaction terms added.
    """
    # Add the specified interaction terms to the dataset
    df['GDP_Inflation'] = df['GDP Growth'] * df['Inflation']
    df['GDP_Unemployment'] = df['GDP Growth'] * df['Unemployment Rate']
    df['Inflation_Unemployment'] = df['Inflation'] * df['Unemployment Rate']
    df['Occupancy_Vacancy'] = df['OccupancyRate'] * df['VacancyRate']
    df['GDP_Occupancy'] = df['GDP Growth'] * df['OccupancyRate']
    df['Inflation_Vacancy'] = df['Inflation'] * df['VacancyRate']
    
    return df



In [23]:
vacant_space_stability = add_interaction_terms(vacant_space_stability)
vacant_space_recession = add_interaction_terms(vacant_space_recession)
vacant_space_boom = add_interaction_terms(vacant_space_boom)


In [35]:
df = vacant_space_recession.copy()

In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, make_scorer

# Define the target variable and predictors
target_var = 'VacancyRate'
X = df.drop(columns=[target_var])
y = df[target_var]

# Define the models for comparison
models = {
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.01),
    'ElasticNet': ElasticNet(alpha=0.01, l1_ratio=0.5),
    'RandomForest': RandomForestRegressor(n_estimators=50, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=50, random_state=42)
}

# Set up 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the custom RMSE scorer
rmse_scorer = make_scorer(mean_squared_error, squared=False)

# Dictionary to store cross-validation RMSE results for each model
cv_rmse_results = []

# Perform 5-fold cross-validation for each model
for model_name, model in models.items():
    # Calculate RMSE for each fold using cross-validation
    cv_rmse_scores = cross_val_score(model, X, y, cv=kf, scoring=rmse_scorer)
    
    # Calculate mean and standard deviation of RMSE across folds
    mean_rmse = np.mean(cv_rmse_scores)
    std_rmse = np.std(cv_rmse_scores)
    
    # Append results to list
    cv_rmse_results.append({
        'Model': model_name,
        'Mean RMSE': mean_rmse,
        'Std Dev RMSE': std_rmse
    })

# Convert results to a DataFrame
cv_rmse_df = pd.DataFrame(cv_rmse_results)

# Export the DataFrame to CSV
cv_rmse_df.to_csv('cv_rmse_results_boom.csv', index=False)

# Display the DataFrame
print(cv_rmse_df)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


              Model  Mean RMSE  Std Dev RMSE
0             Ridge   0.126398      0.030356
1             Lasso   0.119940      0.026294
2        ElasticNet   0.123004      0.027822
3      RandomForest   0.202801      0.074425
4  GradientBoosting   0.165810      0.046227




In [37]:
# Train RandomForest and GradientBoosting models on the entire dataset to extract feature importance
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Fit the models on the entire dataset
rf_model.fit(X, y)
gb_model.fit(X, y)

# Extract feature importance from RandomForest and GradientBoosting
rf_feature_importance = rf_model.feature_importances_
gb_feature_importance = gb_model.feature_importances_

# Create a DataFrame to display feature importance for both models
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'RandomForest Importance': rf_feature_importance,
    'GradientBoosting Importance': gb_feature_importance
})

# Sort by GradientBoosting Importance for better visualization
feature_importance_df = feature_importance_df.sort_values(by='GradientBoosting Importance', ascending=False)

feature_importance_df.to_csv('feature_importance_df_recession.csv', index=False)


In [38]:
feature_importance_df

Unnamed: 0,Feature,RandomForest Importance,GradientBoosting Importance
9,Occupancy_Vacancy,0.40683,0.473074
2,OccupancyRate,0.399704,0.38497
1,VacantSpace,0.096005,0.098575
11,Inflation_Vacancy,0.045614,0.019959
0,LeaseYear,0.025774,0.01871
10,GDP_Occupancy,0.004995,0.002092
3,GDP Growth,0.004112,0.001022
5,Inflation,0.001867,0.000544
7,GDP_Unemployment,0.005289,0.000358
6,GDP_Inflation,0.006366,0.000346
