In [162]:
import pandas as pd
import numpy as np
import statsmodels.api as sm 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance


In [163]:
df = pd.read_csv("charlotte_cleaned_data.csv")
target_col = 'median_sale_price'

In [164]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11765 entries, 0 to 11764
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   period_begin             11765 non-null  object 
 1   period_end               11765 non-null  object 
 2   region                   11765 non-null  object 
 3   state_code               11765 non-null  object 
 4   median_sale_price        11765 non-null  float64
 5   median_list_price        11501 non-null  float64
 6   median_ppsf              11765 non-null  float64
 7   homes_sold               11765 non-null  int64  
 8   inventory                11481 non-null  float64
 9   new_listings             11502 non-null  float64
 10  months_of_supply         0 non-null      float64
 11  median_dom               11745 non-null  float64
 12  pending_sales            0 non-null      float64
 13  avg_sale_to_list         11759 non-null  float64
 14  sold_above_list       

In [165]:
df.head()

Unnamed: 0,period_begin,period_end,region,state_code,median_sale_price,median_list_price,median_ppsf,homes_sold,inventory,new_listings,...,inventory_mom,inventory_yoy,median_dom_mom,median_dom_yoy,price_drops_mom,price_drops_yoy,pending_sales_mom,pending_sales_yoy,new_listings_mom,new_listings_yoy
0,2015-12-01,2016-02-29,Zip Code: 29706,SC,111000.0,82500.0,66.41572,6,29.0,20.0,...,0.035714,0.611111,-47.5,-278.5,,,,,0.176471,0.666667
1,2013-02-01,2013-04-30,Zip Code: 28625,NC,125500.0,139900.0,69.139194,75,274.0,141.0,...,,,,,,,,,,
2,2021-04-01,2021-06-30,Zip Code: 28281,NC,1097500.0,340000.0,479.339833,2,5.0,3.0,...,-0.166667,0.666667,0.0,128.5,,,,,0.0,0.5
3,2021-01-01,2021-03-31,Zip Code: 28081,NC,240000.0,225000.0,135.663294,110,53.0,125.0,...,0.06,-0.417582,-9.0,-28.0,,,,,0.22549,0.0
4,2019-04-01,2019-06-30,Zip Code: 28117,NC,373500.0,386255.0,136.935276,382,496.0,430.0,...,0.066667,-0.07635,-4.0,9.0,,,,,0.007026,-0.122449


In [166]:
df['period_begin'] = pd.to_datetime(df['period_begin'])
df['year'] = df['period_begin'].dt.year
df['month'] = df['period_begin'].dt.month
df['time_index'] = (df['period_begin'] - df['period_begin'].min()).dt.days

In [167]:
target = 'median_sale_price'
numeric_features = ['median_list_price', 'median_ppsf', 'year', 'month', 'time_index']


In [168]:
df_cleaned_cols = df.dropna(axis=1, how='all')


df_final = df_cleaned_cols.dropna(subset=numeric_features)
df_final = df_final.dropna(subset=[target_col])


In [169]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11501 entries, 0 to 11764
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   period_begin             11501 non-null  datetime64[ns]
 1   period_end               11501 non-null  object        
 2   region                   11501 non-null  object        
 3   state_code               11501 non-null  object        
 4   median_sale_price        11501 non-null  float64       
 5   median_list_price        11501 non-null  float64       
 6   median_ppsf              11501 non-null  float64       
 7   homes_sold               11501 non-null  int64         
 8   inventory                11396 non-null  float64       
 9   new_listings             11501 non-null  float64       
 10  median_dom               11485 non-null  float64       
 11  avg_sale_to_list         11495 non-null  float64       
 12  sold_above_list          11501 non-nu

In [170]:
X = df[numeric_features]
y = df[target]

In [171]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [172]:
# test_size_ratio = 0.2
# test_size = max(1, int(len(X) * test_size_ratio))

# X_train = X[:-test_size]
# X_test = X[-test_size:]
# y_train = y[:-test_size]
# y_test = y[-test_size:]

In [173]:
print(X_train.shape)
print(y_train.shape)


(9412, 5)
(9412,)


In [174]:
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
    max_depth=10,
    min_samples_split=5
)

model.fit(X_train, y_train)


y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

In [None]:
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbr_model.fit(X_train, y_train)
gbr_pred_test = gbr_model.predict(X_test)

In [175]:
def evaluate_model(y_true, y_pred, set_name):
    """Calculates R-squared, RMSE, and MAE."""
    
    r2 = r2_score(y_true, y_pred)
    

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
 
    mae = mean_absolute_error(y_true, y_pred)
    
   
    return {
        'Set': set_name,
        'R-squared (R^2)': r2,
        'RMSE': rmse,
        'MAE': mae
    }

In [None]:
evaluate_model(y_test, gbr_pred_test, 'test_set')


In [176]:
evaluate_model(y_test, y_pred_test, 'test_set')

{'Set': 'test_set',
 'R-squared (R^2)': 0.8617314073810908,
 'RMSE': np.float64(60894.059703904815),
 'MAE': 22308.833724426695}

In [177]:
evaluate_model(y_train, y_pred_train, 'train_set')

{'Set': 'train_set',
 'R-squared (R^2)': 0.9500655415321211,
 'RMSE': np.float64(34156.42892855856),
 'MAE': 18983.88050751316}

In [None]:



X = df[numeric_features ]
y = df[target]

# --- Preprocessing ---
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore')),
        ('num', 'passthrough', numeric_features)
    ]
)

# --- Model ---
model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)

pipeline = Pipeline([
    ('prep', preprocess),
    ('rf', model)
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

print('Train R²:', pipeline.score(X_train, y_train))
print('Test R²:', pipeline.score(X_test, y_test))

In [None]:
# --- Permutation Importance ---
results = permutation_importance(pipeline, X_test, y_test, n_repeats=8, random_state=42, n_jobs=-1)

importances = results.importances_mean

# Map back feature names
encoded_cat = pipeline.named_steps['prep'].named_transformers_['cat'].get_feature_names_out()
feature_names = list(encoded_cat) + numeric_features

importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_df.sort_values('importance', ascending=False)

plt.figure(figsize=(10,6))
plt.barh(importance_df['feature'], importance_df['importance'])
plt.gca().invert_yaxis()
plt.title('Permutation Importance')
plt.show()

importance_df.head()