This code produces the following data exports. The exports will be uploaded to Google Drive so that others can use them or combine them into the set of features:

1. For each business:
    a. Proportion of 1 and 2 star reviews pre-pandemic
    b. Proportion of 1 and 2 star reviews throughout the pandemic
    c. Proportion of 4 and 5 star reviews pre-pandemic
    d. Proportion of 4 and 5 star reviews throughout the pandemic
    
2. Tokenized reviews 

In [None]:
import json
import pandas as pd
import datetime
import re
from functools import reduce

## Calculate and export proportion of star ratings

In [None]:
## export star ratings data
def calculate_proportions(dat, include_stars):
    '''
    For each business, calculate the proportion of its reviews that gave
    one of the given star ratings.

    Arguments:
    1. dat: a data frame with one row per review and the columns
       "business_id", "review_id" and "stars"
    2. include_stars: a list of star ratings to be included in the proportion

    Returns a data frame with one row per business found in `dat`, sorted by
    business_id, with the columns:
      - business_id
      - nratings: number of reviews whose rating is in `include_stars`
      - total_reviews: number of reviews for the business
      - proportion: nratings / total_reviews

    Businesses that have reviews but none with a rating in `include_stars`
    are kept with nratings == 0 and proportion == 0.0, rather than being
    dropped from the result.
    '''
    # Non-null review ids per business: the denominator.
    total_reviews = dat.groupby("business_id")["review_id"].count()

    # Same count restricted to the requested ratings: the numerator.
    # Reindexing against the full set of businesses keeps the ones with no
    # matching reviews, giving them an explicit count of 0.
    matching = dat[dat["stars"].isin(include_stars)]
    nratings = (
        matching.groupby("business_id")["review_id"]
        .count()
        .reindex(total_reviews.index, fill_value=0)
    )

    ret = pd.DataFrame({
        "business_id": total_reviews.index,
        "nratings": nratings.to_numpy(),
        "total_reviews": total_reviews.to_numpy(),
    })
    ret["proportion"] = ret["nratings"] / ret["total_reviews"]
    return ret



In [None]:
# Load the processed review data and split it at 1 March 2020:
# reviews dated before the cutoff are "pre-covid", the rest are "covid times".
covid_start = datetime.datetime(2020, 3, 1)
reviews = pd.read_json("processed_data/yelp_team7_dataset_review.json")
pre_covid = reviews[reviews["date"] < covid_start]
covid_times = reviews[reviews["date"] >= covid_start]

In [None]:
# Star ratings that count as "low" and "high", and the two review periods.
star_ratings = {
    "low": [1, 2],
    "high": [4, 5],
}
timing = {
    "pre_covid": pre_covid,
    "covid_times": covid_times,
}

# One proportions frame per (period, rating group) pair. The value columns
# are prefixed with "<period>_<group>_" so the frames can later be merged
# on business_id without name clashes.
prop_dfs = []
for period, period_reviews in timing.items():
    for group, stars in star_ratings.items():
        label = f"{period}_{group}"
        props = calculate_proportions(dat=period_reviews, include_stars=stars)
        props.columns = [
            "business_id",
            f"{label}_nratings",
            f"{label}_total",
            f"{label}_proportion",
        ]
        prop_dfs.append(props)


In [None]:
### check the size of each: print the number of distinct business ids per frame
for frame in prop_dfs:
    print(frame["business_id"].nunique(dropna=False))

In [None]:
# Outer-join every frame in prop_dfs on business_id so that a business
# present in any frame is kept; values missing after the join become 0.
merged = reduce(
    lambda acc, nxt: acc.merge(nxt, on=["business_id"], how="outer"),
    prop_dfs,
).fillna(0)

# Change in the low / high rating proportion from pre-covid to covid times.
merged["low_delta"] = (
    merged["covid_times_low_proportion"] - merged["pre_covid_low_proportion"]
)
merged["high_delta"] = (
    merged["covid_times_high_proportion"] - merged["pre_covid_high_proportion"]
)

In [None]:
### check the merge
# Optional sanity check (left disabled): compares the number of distinct
# business ids in the raw reviews with the number in the merged frame.
# len(reviews.business_id.unique()) == len(merged.business_id.unique())
# Display (rows, columns) of the merged frame.
merged.shape

In [None]:
# Export the merged proportions and deltas. No `index` argument is given,
# so pandas' default applies and the frame's index is written as the first column.
merged.to_csv("processed_data/proportionate_star_ratings.csv")

## Calculate and export age based on first review

In [None]:
# Year of each review, and how many years before 2021 it was written.
reviews["review_year"] = reviews["date"].dt.year
reviews["age"] = 2021 - reviews["review_year"]

# A business's age is measured from the year of its earliest review.
age = reviews.groupby("business_id")[["review_year"]].min()
age["age"] = 2021 - age["review_year"]
age.head()

In [None]:
# Export the per-business age table; index=True writes the business_id
# index (from the groupby) as a column in the CSV.
age.to_csv("processed_data/business_age_on_yelp.csv", index = True)