<a href="https://colab.research.google.com/github/econ105/AI/blob/main/repeatedsalesindex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Sample data: property_id, sale_date, sale_price
# Your Excel has aggregate rents, not individual transactions
data = {
    'property_id': [1,1, 2,2, 3,3],
    'sale_date': ['2020-01', '2022-01', '2019-06', '2021-12', '2021-03', '2023-09'],
    'sale_price': [300000, 360000, 450000, 495000, 280000, 325000]
}
df = pd.DataFrame(data)

# Step 1: Create repeat sales pairs
df['date_num'] = pd.to_datetime(df['sale_date']).dt.to_period('M').astype(int)
pairs = df.merge(df, on='property_id', suffixes=('_1', '_2'))
pairs = pairs[pairs['date_num_1'] < pairs['date_num_2']].copy()

# Step 2: Calculate log returns
pairs['ln_return'] = np.log(pairs['sale_price_2'] / pairs['sale_price_1'])

# Step 3: Create time dummies (periods from min to max date)
min_period, max_period = pairs['date_num_1'].min(), pairs['date_num_2'].max()
periods = np.arange(min_period, max_period + 1)
n_periods = len(periods)

# Step 4: Build dummy matrix Z (one row per pair)
Z = []
for _, row in pairs.iterrows():
    dummies = np.zeros(n_periods)
    t1_idx = row['date_num_1'] - min_period
    t2_idx = row['date_num_2'] - min_period
    dummies[t1_idx] = -1
    dummies[t2_idx] = +1
    Z.append(dummies)
Z = np.array(Z)

# Step 5: OLS regression (exclude base period = period 0)
y = pairs['ln_return'].values
X = Z[:, 1:]  # Drop first period (base = 0)
model = LinearRegression(fit_intercept=False).fit(X, y)
beta = np.zeros(n_periods)
beta[1:] = model.coef_

# Step 6: Convert to index levels
index_levels = np.exp(np.cumsum(beta)) * 100
print("Repeat Sales Index:", dict(zip(periods, index_levels)))


Repeat Sales Index: {np.int64(593): np.float64(100.0), np.int64(594): np.float64(100.0), np.int64(595): np.float64(100.0), np.int64(596): np.float64(100.0), np.int64(597): np.float64(100.0), np.int64(598): np.float64(100.0), np.int64(599): np.float64(100.0), np.int64(600): np.float64(91.28709291752769), np.int64(601): np.float64(91.28709291752769), np.int64(602): np.float64(91.28709291752769), np.int64(603): np.float64(91.28709291752769), np.int64(604): np.float64(91.28709291752769), np.int64(605): np.float64(91.28709291752769), np.int64(606): np.float64(91.28709291752769), np.int64(607): np.float64(91.28709291752769), np.int64(608): np.float64(91.28709291752769), np.int64(609): np.float64(91.28709291752769), np.int64(610): np.float64(91.28709291752769), np.int64(611): np.float64(91.28709291752769), np.int64(612): np.float64(91.28709291752769), np.int64(613): np.float64(91.28709291752769), np.int64(614): np.float64(84.73185457363233), np.int64(615): np.float64(84.73185457363233), np.in