In [1]:
import pandas as pd
import numpy as np

In [2]:
# Only these Yelp columns are used by the cells that follow; skipping the
# rest keeps the load small.
yelp_usecols = [
    "business_id", "stars", "postal_code",
    "Restaurant", "Coffee & Tea", "Bar",
    "price_range_3", "price_range_2", "price_range_1",
    "year", "month",
]

# Main table that the Yelp aggregates are merged into further down.
df = pd.read_csv("~/real_estate/data/v3/unsupervised_df.csv")
yelp_data = pd.read_csv("~/real_estate/data/yelp/yelp_20180919.csv", usecols=yelp_usecols)

In [3]:
# Preview the first five rows of the Yelp data loaded above.
yelp_data.head()

Unnamed: 0,business_id,stars,postal_code,Bar,Restaurant,Coffee & Tea,price_range_3,price_range_1,price_range_2,month,year
0,iCQpiavjjPzJ5_3gPD5Ebg,2,89109.0,0,1,0,0,1,0,2,2011
1,pomGBqfbxcqPv14c3XH-ZQ,5,89143.0,1,1,0,0,0,1,11,2012
2,jtQARsP6P-LbkyjbO1qNGg,1,89130.0,1,1,0,0,0,1,10,2014
3,elqbBhBfElMNSrjFqW3now,2,89108.0,0,1,0,0,0,1,2,2011
4,Ums3gaP2qM3W1XcA5r6SsQ,5,89149.0,0,1,0,0,1,0,9,2014


In [4]:
# One aggregation bucket = one postal code in one calendar month.
group_columns = ["postal_code", "year", "month"]

# Column selections for each kind of aggregate (keys + the values to reduce).
mean_columns = group_columns + ["stars"]
sum_columns = group_columns + ["Bar", "Restaurant", "Coffee & Tea", "price_range_3", "price_range_2"]
count_columns = group_columns + ["business_id"]


def _monthly_aggregate(columns, how, renamed):
    """Reduce the selected yelp_data columns per postal code/year/month.

    columns -- key columns plus the value columns to aggregate
    how     -- aggregation name passed to ``agg`` ("mean", "sum", "count")
    renamed -- mapping from original value-column names to output names
    """
    grouped = yelp_data[columns].groupby(by=group_columns)
    return grouped.agg(how).rename(columns=renamed)


# Average star rating per bucket.
yelp_data_mean = _monthly_aggregate(mean_columns, "mean", {"stars": "mean_rating"})

# Sums of the 0/1 category and price flags per bucket.
yelp_data_sum = _monthly_aggregate(
    sum_columns,
    "sum",
    {
        "Bar": "bar_review_count",
        "Restaurant": "restaurant_review_count",
        "Coffee & Tea": "coffee_review_count",
        "price_range_3": "price_3_review_count",
        "price_range_2": "price_2_review_count",
    },
)

# Number of non-null business_id entries per bucket.
yelp_data_count = _monthly_aggregate(count_columns, "count", {"business_id": "review_count"})

# Stitch the three aggregates side by side; they share the same grouped index.
yelp_data = pd.concat([yelp_data_mean, yelp_data_sum, yelp_data_count], axis=1, sort=False)

In [5]:
# Preview the first five rows of df; RegionName is the column that gets
# renamed to postal_code before the merge with the Yelp aggregates.
df.head()

Unnamed: 0,RegionName,median_sqft_value,percent_decreasing,percent_increasing,zhvi_1bed,zhvi_2bed,zhvi_3bed,zri_sqft_value,zhvi_condo,zhvi_singlefam,...,OR,PA,RI,SC,TN,TX,VA,WA,WI,WV
0,94109,677.0,60.67,28.87,512600.0,768100.0,1178000.0,3.176,689200.0,2493800.0,...,0,0,0,0,0,0,0,0,0,0
1,20002,305.0,41.97,44.82,286400.0,343500.0,363700.0,1.694,296400.0,371800.0,...,0,0,0,0,0,0,0,0,0,0
2,90046,501.0,70.76,20.34,341900.0,652300.0,1096300.0,2.268,440500.0,1104100.0,...,0,0,0,0,0,0,0,0,0,0
3,20009,505.0,33.92,48.06,344600.0,508900.0,684100.0,2.782,389300.0,703400.0,...,0,0,0,0,0,0,0,0,0,0
4,7030,471.0,68.06,18.35,341600.0,485200.0,889700.0,2.712,442300.0,948700.0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Bring the grouping keys (postal_code, year, month) back as ordinary columns
# so they can serve as merge keys.
yelp_data = yelp_data.reset_index()

# Rename the key to match the Yelp frame, then left-join so every row of df
# is kept; rows without a Yelp match get NaN in the Yelp columns.
df = (
    df.rename(columns={"RegionName": "postal_code"})
      .merge(yelp_data, how="left", on=["postal_code", "year", "month"])
)

In [7]:
# Order rows by postal code, then by year and month within each postal code.
df = df.sort_values(by=["postal_code","year","month"])

In [8]:
# Names of the columns produced by the Yelp aggregation (the renamed
# mean/sum/count outputs) that were merged into df.
yelp_columns = ["mean_rating","bar_review_count","restaurant_review_count","coffee_review_count",
                "price_3_review_count","price_2_review_count","review_count"]

In [9]:
# Rows with no Yelp match after the left merge hold NaN in these columns;
# replace them with 0 (no reviews, and a placeholder rating of 0).
df[yelp_columns] = df[yelp_columns].fillna(value=0)

In [10]:
# Index by (postal_code, year, month) so all rows of a single postal code
# can be selected with df.loc[code].
df = df.set_index(["postal_code","year","month"])

In [11]:
# Distinct postal codes in the main table: the first element of every
# (postal_code, year, month) index key.
_postal_code_set = {key[0] for key in df.index}
postal_codes = list(_postal_code_set)

# Distinct postal codes that have Yelp aggregates ...
yelp_postal_codes = list(set(yelp_data["postal_code"].values))
# ... restricted to those also present in the main table. Membership is tested
# against the set rather than the list, so each lookup is O(1) instead of a
# linear scan over every postal code.
yelp_postal_codes = [code for code in yelp_postal_codes if code in _postal_code_set]

In [19]:
dfs = []
yelp_columns = ["mean_rating", "bar_review_count", "restaurant_review_count", "coffee_review_count",
                "price_3_review_count", "price_2_review_count", "review_count"]


def _accumulate_yelp_history(monthly, columns):
    """Convert per-month Yelp figures into running (cumulative) figures.

    Parameters
    ----------
    monthly : pd.DataFrame
        Rows for one postal code, already in chronological order. Must
        contain "review_count" and every name listed in ``columns``.
    columns : list of str
        Yelp columns to accumulate. "mean_rating" becomes the review-weighted
        mean over all months up to and including the current one; every other
        column becomes a running sum.

    Returns
    -------
    pd.DataFrame
        A copy of ``monthly`` with the listed columns replaced. Row order,
        index labels and all other columns are unchanged.
    """
    running = monthly.copy()
    monthly_counts = monthly["review_count"]
    total_counts = monthly_counts.cumsum()

    for column in columns:
        if column == "mean_rating":
            # Sum of (monthly mean * monthly count) is the total of all star
            # ratings so far; dividing by the total count gives the weighted mean.
            rating_points = (monthly[column] * monthly_counts).cumsum()
            # where() blanks zero denominators so no 0/0 is evaluated; those
            # rows get 0, the same placeholder fillna gave review-less months.
            running[column] = (rating_points / total_counts.where(total_counts > 0)).fillna(0)
        else:
            running[column] = monthly[column].cumsum()

    return running


for i in yelp_postal_codes:

    # All rows of this postal code, oldest month first. Selecting by the first
    # index level drops postal_code, so it is added back as a column.
    subset_ = df.loc[i].reset_index().sort_values(by=["year", "month"])
    subset_["postal_code"] = i

    # Accumulate along row order. The previous row-by-row loop combined each
    # row with itself instead of the preceding row, so values were doubled
    # rather than accumulated and zero-review months produced NaN ratings.
    dfs.append(_accumulate_yelp_history(subset_, yelp_columns))

yelp_df = pd.concat(dfs)



In [27]:
# Turn postal_code/year/month back into ordinary columns.
df = df.reset_index()

# Drop the original rows of postal codes that have Yelp data; their
# cumulative versions in yelp_df take their place.
df = df.loc[~df["postal_code"].isin(yelp_postal_codes)]
# The two frames list their columns in different orders (yelp_df has
# postal_code appended last), so concat has to align them. Passing sort
# explicitly pins the alphabetical column order instead of depending on the
# pandas-version-specific default, and silences the FutureWarning about it.
df = pd.concat([df, yelp_df], sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,90_day_treasury_bill_rate,AL,AR,AZ,CA,CO,CT,DC,DE,FL,...,restaurant_review_count,review_count,total_nonfarm_payroll,year,zhvi_1bed,zhvi_2bed,zhvi_3bed,zhvi_condo,zhvi_singlefam,zri_sqft_value
0,0.14,0,0,0,0,0,0,0,0,0,...,0.0,0.0,130761.0,2010,63400.0,164900.0,192900.0,142700.0,191600.0,1.060
1,0.14,0,0,0,0,0,0,0,0,0,...,0.0,0.0,130834.0,2010,62300.0,163400.0,191200.0,139900.0,190300.0,1.064
2,0.15,0,0,0,0,0,0,0,0,0,...,0.0,0.0,130878.0,2011,61600.0,162900.0,191400.0,138200.0,190800.0,1.070
3,0.13,0,0,0,0,0,0,0,0,0,...,0.0,0.0,131060.0,2011,61400.0,162100.0,191700.0,137300.0,191000.0,1.076
4,0.10,0,0,0,0,0,0,0,0,0,...,0.0,0.0,131314.0,2011,61500.0,161500.0,191300.0,136700.0,190500.0,1.068
5,0.06,0,0,0,0,0,0,0,0,0,...,0.0,0.0,131637.0,2011,61800.0,160100.0,190200.0,136100.0,189700.0,1.068
6,0.04,0,0,0,0,0,0,0,0,0,...,0.0,0.0,131718.0,2011,61800.0,158000.0,189200.0,135100.0,189000.0,1.072
7,0.04,0,0,0,0,0,0,0,0,0,...,0.0,0.0,131952.0,2011,61600.0,156000.0,188600.0,134300.0,188500.0,1.092
8,0.04,0,0,0,0,0,0,0,0,0,...,0.0,0.0,132024.0,2011,61300.0,154800.0,188800.0,134700.0,188800.0,1.104
9,0.02,0,0,0,0,0,0,0,0,0,...,0.0,0.0,132136.0,2011,61000.0,154000.0,189400.0,135100.0,189400.0,1.114
