In [5]:
import pandas as pd

In [2]:
# Load the v13 "unsupervised" table; the lag features and target are derived from it below.
df = pd.read_csv(filepath_or_buffer="~/real_estate/data/v13/unsupervised_df.csv")

In [3]:
# Source columns to derive lagged features from, grouped by data provider.
features = [
    # --- Zillow ---
    "median_sqft_value",
    "percent_decreasing",
    "percent_increasing",
    "zri_sqft_value",
    "zhvi_condo",
    "zhvi_singlefam",
    # --- IRS ---
    "dependent_count_lag",
    "sum_adjusted_income_lag",
    "sum_real_estate_taxes_lag",
    "sum_mortgage_interest_paid_lag",
    "tax_return_count_lag",
    "joint_return_count_lag",
    "adult_count_lag",
    "people_count_lag",
    "mean_adjusted_income_lag",
    "mean_real_estate_taxes_hh_lag",
    "mean_mortgage_interest_hh_lag",
    "mean_adjusted_income_with_dep_lag",
    # --- Fed ---
    "90_day_treasury_bill_rate",
    "mean_hourly_earnings_adj",
    "mean_fixed_mortgage_rate",
    "total_nonfarm_payroll",
    # --- CBP ---
    "business_count_cbp_lag",
    "employment_count_cbp_lag",
    "business_100_count_cbp_lag",
    "business_500_count_cbp_lag",
    "restaurant_count_cbp_lag",
    "bar_count_cbp_lag",
]

target = "zhvi_condo"

# Containers for values pulled from neighbouring months and years.
# Every feature gets a "<name>_yr" list; only features whose name does not
# contain "_lag" also get a "<name>_mo" list.
feature_mo = {
    "{}_mo".format(name): [] for name in features if "_lag" not in name
}
feature_yr = {"{}_yr".format(name): [] for name in features}
zhvi_condo_12_months = []

In [4]:
# Collect, for every row of `df`, the previous-month and previous-year feature
# values and the zhvi_condo value twelve months ahead.
#
# A row is identified by (postal_code, month, year). Instead of re-scanning the
# whole frame with three boolean masks per row (quadratic in the number of
# rows), the first position of every key is indexed once and neighbouring
# periods are looked up in constant time. Iterating by position also removes
# the reliance on `df.index` being 0..n-1, which `df.iloc[i]` required.
#
# Output contract (consumed by the next cell):
#   * feature_mo[key] / feature_yr[key] receive one array per row, holding the
#     first matching value, or an empty array when no matching row exists;
#   * zhvi_condo_12_months receives one scalar per row, or None when the same
#     month of the following year is absent.

# Start from empty containers so re-running this cell does not append a second
# copy of every value.
for collected in list(feature_mo.values()) + list(feature_yr.values()):
    del collected[:]
del zhvi_condo_12_months[:]

postal_codes = df["postal_code"].tolist()
months = df["month"].tolist()
years = df["year"].tolist()

# Key -> position of the FIRST row carrying that key (setdefault keeps the
# earliest one), matching the "take element 0 of the matches" rule downstream.
first_position = {}
for position, row_key in enumerate(zip(postal_codes, months, years)):
    first_position.setdefault(row_key, position)

# (output key, source column values) pairs. The output key is the source
# column name followed by a 3-character "_mo" / "_yr" suffix.
mo_sources = [(key, df[key[:-3]].values) for key in feature_mo]
yr_sources = [(key, df[key[:-3]].values) for key in feature_yr]
target_values = df["zhvi_condo"].values

no_match = slice(0, 0)  # slicing with this yields an empty array

for postal_code, month, year in zip(postal_codes, months, years):
    month = int(month)
    year = int(year)

    # Keys of the neighbouring time periods; January's previous month is
    # December of the previous year.
    if month == 1:
        last_month_key = (postal_code, 12, year - 1)
    else:
        last_month_key = (postal_code, month - 1, year)
    last_year_key = (postal_code, month, year - 1)
    next_year_key = (postal_code, month, year + 1)

    last_month_pos = first_position.get(last_month_key)
    last_year_pos = first_position.get(last_year_key)
    next_year_pos = first_position.get(next_year_key)

    if last_month_pos is None:
        last_month_rows = no_match
    else:
        last_month_rows = slice(last_month_pos, last_month_pos + 1)

    if last_year_pos is None:
        last_year_rows = no_match
    else:
        last_year_rows = slice(last_year_pos, last_year_pos + 1)

    for key, values in mo_sources:
        feature_mo[key].append(values[last_month_rows])
    for key, values in yr_sources:
        feature_yr[key].append(values[last_year_rows])

    if next_year_pos is None:
        next_year_target = None
    else:
        next_year_target = target_values[next_year_pos]

    zhvi_condo_12_months.append(next_year_target)

In [6]:
# Attach the collected values to the dataframe.
# Each collected entry is a sequence of matches for one row: keep its first
# element, or None when nothing matched.
for lagged in (feature_mo, feature_yr):
    for column in list(lagged):
        first_matches = []
        for matches in lagged[column]:
            first_matches.append(matches[0] if len(matches) > 0 else None)
        lagged[column] = first_matches

# Previous-month columns are added first, then previous-year columns.
for lagged in (feature_mo, feature_yr):
    for column, values in lagged.items():
        df[column] = values

df["zhvi_condo_12_months"] = zhvi_condo_12_months

In [7]:
# Sanity check: print the (rows, columns) shape, then preview the first five rows.
print(df.shape)
df.head(n=5)

(473398, 71)


Unnamed: 0,postal_code,median_sqft_value,percent_decreasing,percent_increasing,zri_sqft_value,zhvi_condo,zhvi_singlefam,month,year,business_count_cbp_lag,...,business_count_cbp_lag_yr,mean_adjusted_income_with_dep_lag_yr,mean_adjusted_income_lag_yr,sum_mortgage_interest_paid_lag_yr,business_500_count_cbp_lag_yr,dependent_count_lag_yr,mean_hourly_earnings_adj_yr,restaurant_count_cbp_lag_yr,mean_fixed_mortgage_rate_yr,zhvi_condo_12_months
0,1001,140.0,40.72,38.95,,149500.0,197900.0,1,2010,458.0,...,,,,,,,,,,138200.0
1,1001,140.0,29.54,51.1,,149400.0,197700.0,2,2010,458.0,...,,,,,,,,,,137300.0
2,1001,141.0,24.1,59.19,,149600.0,198500.0,3,2010,458.0,...,,,,,,,,,,136700.0
3,1001,141.0,20.29,63.02,,149800.0,199600.0,4,2010,458.0,...,,,,,,,,,,136100.0
4,1001,142.0,23.19,58.79,,150400.0,200000.0,5,2010,458.0,...,,,,,,,,,,135100.0


In [8]:
# Write the frame, now including the lag features and target, to CSV without the row index.
df.to_csv(path_or_buf="~/real_estate/data/v13/supervised_df.csv", index=False)