In [1]:
import pandas as pd

In [2]:
# Load the input table that the rest of this notebook augments.
# NOTE(review): the preview printed further down shows an "Unnamed: 0" column,
# which suggests this CSV was written with its row index - confirm whether
# that column is wanted downstream.
df = pd.read_csv(
    filepath_or_buffer="~/real_estate/data/v14/unsupervised_df.csv",
)

In [4]:
# Columns whose values from neighbouring time periods are looked up below,
# grouped by the data source named in each comment.
features = [
    # Zillow
    "median_sqft_value",
    "percent_decreasing",
    "percent_increasing",
    "zri_sqft_value",
    "zhvi_condo",
    "zhvi_singlefam",
    # IRS
    "dependent_count_lag",
    "sum_adjusted_income_lag",
    "sum_real_estate_taxes_lag",
    "sum_mortgage_interest_paid_lag",
    "tax_return_count_lag",
    "joint_return_count_lag",
    "adult_count_lag",
    "people_count_lag",
    "mean_adjusted_income_lag",
    "mean_real_estate_taxes_hh_lag",
    "mean_mortgage_interest_hh_lag",
    "mean_adjusted_income_with_dep_lag",
    # Fed
    "90_day_treasury_bill_rate",
    "mean_hourly_earnings_adj",
    "mean_fixed_mortgage_rate",
    "total_nonfarm_payroll",
    # CBP
    "business_count_cbp_lag",
    "employment_count_cbp_lag",
    "business_100_count_cbp_lag",
    "business_500_count_cbp_lag",
    "restaurant_count_cbp_lag",
    "bar_count_cbp_lag",
    # Yelp
    "mean_rating",
    "review_count",
    "restaurant_review_count",
    "bar_review_count",
    "price_2_review_count",
    "price_3_review_count",
]

# Column whose future value is collected into `zhvi_condo_12_months`.
target = "zhvi_condo"

# Per-row collectors, keyed by the name of the column they will become:
#   feature_mo - "<feature>_mo", one entry per feature whose name lacks "_lag"
#   feature_yr - "<feature>_yr", one entry for every feature
# Each value starts as its own empty list and is filled by a later cell.
feature_mo = {name + "_mo": [] for name in features if "_lag" not in name}
feature_yr = {name + "_yr": [] for name in features}

# Collector for the future target value of each row.
zhvi_condo_12_months = []

In [5]:
# Collect, for every row of `df`, values from neighbouring time periods of the
# same postal code:
#   feature_mo[...]       - each source column's values in the previous month
#   feature_yr[...]       - each source column's values in the same month of
#                           the previous year
#   zhvi_condo_12_months  - "zhvi_condo" in the same month of the next year,
#                           or None when that row does not exist
# Every feature entry appended here is an array of the matching values (empty
# when the neighbouring period is missing); it is reduced to a scalar later.

# Start from empty collectors so that re-running this cell does not append a
# second copy of every row (the column assignment would then fail on length).
for collected in feature_mo.values():
    collected.clear()
for collected in feature_yr.values():
    collected.clear()
zhvi_condo_12_months.clear()

# Index the frame once: (postal_code, month, year) -> positions of matching
# rows. Each lookup below is then a dict access instead of a boolean scan over
# the whole frame for every row.
positions_by_period = {}
for position, period in enumerate(zip(df["postal_code"], df["month"], df["year"])):
    positions_by_period.setdefault(period, []).append(position)

# Shared "no match" result. It must be a list: indexing an array with an empty
# tuple would return the whole array instead of an empty selection.
no_rows = []

# Pull the source columns out once. The collector keys end in "_mo"/"_yr",
# so dropping the last three characters gives the source column name.
mo_sources = {key: df[key[:-3]].values for key in feature_mo}
yr_sources = {key: df[key[:-3]].values for key in feature_yr}
target_values = df["zhvi_condo"].values

for region_name, raw_month, raw_year in zip(df["postal_code"], df["month"], df["year"]):
    month = int(raw_month)
    year = int(raw_year)

    # Get the previous time periods; January wraps back to December of the
    # year before.
    if month == 1:
        last_month_month, last_month_year = 12, year - 1
    else:
        last_month_month, last_month_year = month - 1, year

    last_month_rows = positions_by_period.get(
        (region_name, last_month_month, last_month_year), no_rows
    )
    last_year_rows = positions_by_period.get((region_name, month, year - 1), no_rows)
    next_year_rows = positions_by_period.get((region_name, month, year + 1), no_rows)

    for key, source in mo_sources.items():
        feature_mo[key].append(source[last_month_rows])
    for key, source in yr_sources.items():
        feature_yr[key].append(source[last_year_rows])

    # First matching value twelve months ahead, or None if there is none.
    if next_year_rows:
        zhvi_condo_12_months.append(target_values[next_year_rows[0]])
    else:
        zhvi_condo_12_months.append(None)

In [6]:
# Add data to dataframe
#
# The collectors hold, per row, an array of matching values. Reduce each one to
# a single value and attach the result to `df` as a new column: the "_mo"
# columns first, then the "_yr" columns, then the future target.
def _first_or_none(matches):
    """Return the first element of `matches`, or None when it is empty.

    Anything that is not list-like (an already-reduced scalar, or None) is
    returned unchanged. This cell stores the reduced lists back into the
    collector dicts, so without this guard a second run of the cell would
    fail calling len() on a scalar.
    """
    if not pd.api.types.is_list_like(matches):
        return matches
    return matches[0] if len(matches) > 0 else None


for lookups in (feature_mo, feature_yr):
    for column_name in lookups:
        # Keep the reduced list in the dict as well as on the frame.
        lookups[column_name] = [_first_or_none(m) for m in lookups[column_name]]
        df[column_name] = lookups[column_name]

df["zhvi_condo_12_months"] = zhvi_condo_12_months

In [7]:
# Sanity check after attaching the new columns: print (rows, columns), then
# display the first five rows as the cell's output.
row_count, column_count = df.shape
print((row_count, column_count))
df.head(n=5)

(25997, 90)


Unnamed: 0,postal_code,median_sqft_value,percent_decreasing,percent_increasing,zri_sqft_value,zhvi_condo,zhvi_singlefam,month,year,mean_rating,...,mean_real_estate_taxes_hh_lag_yr,price_2_review_count_yr,people_count_lag_yr,percent_increasing_yr,sum_adjusted_income_lag_yr,bar_count_cbp_lag_yr,tax_return_count_lag_yr,bar_review_count_yr,joint_return_count_lag_yr,zhvi_condo_12_months
0,2109,582.0,16.79,75.38,,534000.0,,1,2010,5.0,...,,,,,,,,,,513900.0
1,2109,583.0,12.27,81.85,,536700.0,,2,2010,5.0,...,,,,,,,,,,509700.0
2,2109,585.0,12.35,82.61,,533200.0,,3,2010,5.0,...,,,,,,,,,,511300.0
3,2109,582.0,12.12,82.23,,526600.0,,4,2010,5.0,...,,,,,,,,,,513800.0
4,2109,578.0,17.09,74.17,,523400.0,,5,2010,5.0,...,,,,,,,,,,515500.0


In [8]:
# Write the augmented frame to disk; index=False keeps the row index out of
# the file.
df.to_csv(
    path_or_buf="~/real_estate/data/v14/supervised_df.csv",
    index=False,
)