In [10]:
import mlflow
import mlflow.sklearn
import mlflow.pyfunc
from mlflow.tracking import MlflowClient

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import root_mean_squared_error
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from scipy.stats import ks_2samp, chi2_contingency

df = pd.read_parquet('./data/house_data.parquet')
df.head()

Unnamed: 0,price,city,state,bedrooms,bathrooms,area_sqft,lot_size,year_built,days_on_market,property_type,listing_agent,status,zipcode_encoded,mls_id,event_timestamp
0,554217,3,0,1,3,772,4757,1959,101,4,0,0,554217.0,104635,2000-01-01 00:00:00+00:00
1,164454,0,2,1,1,2348,3615,1969,46,0,1,2,164454.0,535721,2000-01-02 00:00:00+00:00
2,1249331,2,4,6,1,3630,9369,1990,59,4,4,0,1249331.0,900458,2000-01-03 00:00:00+00:00
3,189267,0,4,2,1,605,8804,1958,119,0,3,1,189267.0,318589,2000-01-04 00:00:00+00:00
4,465778,1,0,3,2,1711,9260,2020,26,4,3,1,465778.0,899716,2000-01-05 00:00:00+00:00


In [11]:
X = df.drop(columns = ['price', 'event_timestamp'], axis = 1)
y = df['price']
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

model.fit(X, y)
df['predicted_price'] = model.predict(X)
df.head()

Unnamed: 0,price,city,state,bedrooms,bathrooms,area_sqft,lot_size,year_built,days_on_market,property_type,listing_agent,status,zipcode_encoded,mls_id,event_timestamp,predicted_price
0,554217,3,0,1,3,772,4757,1959,101,4,0,0,554217.0,104635,2000-01-01 00:00:00+00:00,552481.67
1,164454,0,2,1,1,2348,3615,1969,46,0,1,2,164454.0,535721,2000-01-02 00:00:00+00:00,164415.03
2,1249331,2,4,6,1,3630,9369,1990,59,4,4,0,1249331.0,900458,2000-01-03 00:00:00+00:00,1249285.3
3,189267,0,4,2,1,605,8804,1958,119,0,3,1,189267.0,318589,2000-01-04 00:00:00+00:00,189299.84
4,465778,1,0,3,2,1711,9260,2020,26,4,3,1,465778.0,899716,2000-01-05 00:00:00+00:00,466585.84


In [25]:
# # To do it with the 'current data'
# cutoff_date = pd.Timestamp.now(tz='UTC') - timedelta(days=14)
# df_baseline = df[df['event_timestamp'] < cutoff_date]
# df_current = df[df['event_timestamp'] >= cutoff_date]

# Blunt way of doing it
sorted_df = df.sort_values('event_timestamp')
df_baseline = df.iloc[:-14]
df_current = df.iloc[14:]

numeric_features = ['bedrooms', 'bathrooms', 'area_sqft', 'lot_size',
                    'days_on_market']
# categorical_features = ['zipcode_encoded', 'status', 'listing_agent',
#                         'property_type', 'city', 'state']

categorical_features = ['zipcode_encoded', 'status', 'listing_agent',
                        'property_type', 'city', 'state']

drift_results = {}

# Testing numeric features with KS
for feature in numeric_features:
    stat, p_value = ks_2samp(df_baseline[feature].dropna(), df_current[feature].dropna())
    drift_results[feature] = {'test': 'KS', 'statistic': stat, 'p_value': p_value}

# Testing categorical features with chi-squared
for feature in categorical_features:
    base_counts = df_baseline[feature].value_counts()
    current_counts = df_current[feature].value_counts()
    if (len(base_counts) == 0 or len(current_counts) == 0):
        drift_results[feature] = {'test': 'Chi-squared',
                                  'statistic': None,
                                  'p_value': None}
        continue
    all_categories = base_counts.index.union(current_counts.index)
    base_frequency = base_counts.reindex(all_categories, fill_value=0)
    current_frequency = current_counts.reindex(all_categories, fill_value=0)
    contingency = np.array([base_frequency.values, current_frequency.values])
    chi2, p, _, _= chi2_contingency(contingency)
    drift_results[feature] = {'test': 'Chi-squared', 'statistic': chi2, 'p_value': p}

# Target drift
stat, p_value = ks_2samp(df_baseline['predicted_price'].dropna(), df_current['predicted_price'].dropna())
drift_results['predicted_price'] = {'test': 'KS', 'statistic': stat, 'p_value': p_value}

drift_df = pd.DataFrame.from_dict(drift_results, orient='index')
print(drift_df.sort_values('p_value'))

                        test  statistic   p_value
status           Chi-squared   0.027190  0.986497
property_type    Chi-squared   0.043707  0.999765
listing_agent    Chi-squared   0.030726  0.999883
state            Chi-squared   0.008285  0.999991
city             Chi-squared   0.001721  1.000000
bedrooms                  KS   0.001340  1.000000
days_on_market            KS   0.001340  1.000000
lot_size                  KS   0.001005  1.000000
area_sqft                 KS   0.001674  1.000000
bathrooms                 KS   0.001005  1.000000
predicted_price           KS   0.001674  1.000000
zipcode_encoded  Chi-squared  28.000000  1.000000
