In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Yelp review extract, parsing only the columns listed below.
# Note: "price_range_1" is read here although the aggregation cell that
# follows does not reference it.
yelp_usecols = [
    "business_id", "stars", "postal_code",
    "Restaurant", "Coffee & Tea", "Bar",
    "price_range_3", "price_range_2", "price_range_1",
    "year", "month",
]
yelp_data = pd.read_csv("~/real_estate/data/yelp/yelp_20180919.csv", usecols=yelp_usecols)

In [3]:
# Keys identifying one (postal code, year, month) bucket
group_columns = ["postal_code", "year", "month"]

# Column subsets fed to each aggregation: the grouping keys plus the value columns
mean_columns = group_columns + ["stars"]
sum_columns = group_columns + ["Bar", "Restaurant", "Coffee & Tea", "price_range_3", "price_range_2"]
count_columns = group_columns + ["business_id"]

# Average star rating per bucket
yelp_data_mean = (
    yelp_data[mean_columns]
    .groupby(by=group_columns)
    .agg('mean')
    .rename(columns={"stars": "mean_rating"})
)

# Per-bucket sums of the category / price columns, exposed as *_review_count
# (presumably these are 0/1 indicator columns, so the sum is a count -- verify)
sum_column_names = {
    "Bar": "bar_review_count",
    "Restaurant": "restaurant_review_count",
    "Coffee & Tea": "coffee_review_count",
    "price_range_3": "price_3_review_count",
    "price_range_2": "price_2_review_count",
}
yelp_data_sum = (
    yelp_data[sum_columns]
    .groupby(by=group_columns)
    .agg('sum')
    .rename(columns=sum_column_names)
)

# Number of non-null business_id values per bucket, used as the review count
yelp_data_count = (
    yelp_data[count_columns]
    .groupby(by=group_columns)
    .agg('count')
    .rename(columns={"business_id": "review_count"})
)

# Put the three aggregates side by side on their shared
# (postal_code, year, month) index. This rebinds `yelp_data`, so the cell
# cannot be re-run without reloading the raw data first.
yelp_data = pd.concat([yelp_data_mean, yelp_data_sum, yelp_data_count], sort=False, axis=1)

In [4]:
def _year_month_label(year, month):
    """Join a year and a month into one string, zero-padding the month to two digits."""
    return str(year)+"{:02d}".format(month)


# Distinct postal codes, taken from the first level of the grouped index
postal_codes = list({key[0] for key in yelp_data.index})

# Every year/month label that occurs anywhere in the data, in ascending order
all_dates = sorted({_year_month_label(key[1], key[2]) for key in yelp_data.index})

# Earliest label observed for each postal code
min_dates = {}
for code in postal_codes:
    code_index = yelp_data.loc[code].index  # (year, month) pairs for this code
    min_dates[code] = min(_year_month_label(year, month) for year, month in code_index)

# One (postal_code, year, month) record for every label from that postal
# code's first label onward; labels are split back into integers
# (first four characters -> year, remainder -> month).
date_records = [
    (code, int(label[:4]), int(label[4:]))
    for code in postal_codes
    for label in all_dates[all_dates.index(min_dates[code]):]
]

yelp_full_date_df = pd.DataFrame({"postal_code": [record[0] for record in date_records],
                                  "year": [record[1] for record in date_records],
                                  "month": [record[2] for record in date_records]})

In [5]:
# Bring the grouping keys back as columns so they can serve as merge keys.
# This rebinds `yelp_data`; running the cell a second time would add an
# extra "index" column.
yelp_data = yelp_data.reset_index()

merge_keys = ["postal_code","year","month"]

# The outer join keeps every scaffold month, including those without reviews;
# the gaps it introduces are filled with 0 before the keys become the index again.
df = (
    pd.merge(yelp_data, yelp_full_date_df, on=merge_keys, how="outer")
    .sort_values(by=merge_keys)
    .fillna(0)
    .set_index(merge_keys)
)

In [21]:
# Spot check: select the rows for postal code 2109; what remains is indexed by (year, month).
# NOTE(review): 2109 may be a ZIP code whose leading zero was lost when parsed as a number -- confirm.
subset_ = df.loc[2109]

In [24]:
# Display the (year=2009, month=3) row of `subset_`.
subset_.loc[(2009,3)]

mean_rating                5.0
bar_review_count           0.0
restaurant_review_count    0.0
coffee_review_count        0.0
price_3_review_count       1.0
price_2_review_count       0.0
review_count               1.0
Name: (2009, 3), dtype: float64

In [29]:
# Columns converted from per-month values to running values
# ("review_count" is accumulated as well, handled separately below).
yelp_columns = ["mean_rating","bar_review_count","restaurant_review_count","coffee_review_count",
                "price_3_review_count","price_2_review_count"]


def accumulate_postal_code(monthly, postal_code):
    """Turn one postal code's per-month Yelp figures into running totals.

    Parameters
    ----------
    monthly : pandas.DataFrame
        Rows of a single postal code indexed by (year, month), containing
        ``review_count`` and every column listed in ``yelp_columns``.
    postal_code
        Value stored in the ``postal_code`` column of the result.

    Returns
    -------
    pandas.DataFrame
        A new frame in chronological order where ``review_count`` and the
        ``*_review_count`` columns are cumulative sums and ``mean_rating`` is
        the review-weighted mean of all ratings up to and including that month.
        ``mean_rating`` is NaN while the cumulative review count is still 0.
    """
    # sort_values returns a new frame, so the caller's data is never modified.
    running = monthly.sort_values(by=["year","month"])
    running["postal_code"] = postal_code

    monthly_reviews = running["review_count"]
    total_reviews = monthly_reviews.cumsum()

    for column in yelp_columns:
        monthly_values = running[column].fillna(0)
        if column == "mean_rating":
            # Running weighted mean = cumulative rating points / cumulative reviews.
            # Months without reviews contribute 0 points and 0 reviews, so the
            # previous mean carries forward.
            running[column] = (monthly_values * monthly_reviews).cumsum() / total_reviews
        else:
            running[column] = monthly_values.cumsum()

    # Overwritten last because the weighted mean needs the per-month counts.
    running["review_count"] = total_reviews
    return running


# Whole-column assignment replaces the former chained writes
# (`subset_.loc[year, month][k] = ...`), which assign into a temporary Series
# and only reach the frame when pandas happens to hand back a view.
dfs = [accumulate_postal_code(df.loc[code], code) for code in postal_codes]

yelp_df = pd.concat(dfs)

1.0 1.0 3.0
1.0 1.0 4.0
0.0 1.0 4.0
0.0 1.0 4.0
0.0 1.0 4.0


In [31]:
# Turn the index back into ordinary columns ahead of the export.
# Not idempotent: running this cell again would add an extra "index" column.
yelp_df = yelp_df.reset_index()

In [32]:
# Write the cumulative table to disk; index=False leaves the row index out of the file.
yelp_df.to_csv("~/real_estate/data/v4/yelp_unsupervised_df.csv", index=False)

In [33]:
# Preview the first rows of the exported frame.
yelp_df.head()

Unnamed: 0,year,month,mean_rating,bar_review_count,restaurant_review_count,coffee_review_count,price_3_review_count,price_2_review_count,review_count,postal_code
0,2018,2,1.0,0.0,0.0,0.0,0.0,0.0,2.0,28673.0
1,2018,3,1.0,0.0,0.0,0.0,0.0,0.0,3.0,28673.0
2,2018,4,1.0,0.0,0.0,0.0,0.0,0.0,4.0,28673.0
3,2018,5,1.0,0.0,0.0,0.0,0.0,0.0,4.0,28673.0
4,2018,6,1.0,0.0,0.0,0.0,0.0,0.0,4.0,28673.0
