This code produces the following data exports. The exports will be uploaded to Google Drive, where they can be used by others or combined into the set of features:

1. For each business:
    a. Proportion of 1 and 2 star reviews pre-pandemic
    b. Proportion of 1 and 2 star reviews throughout the pandemic
    c. Proportion of 4 and 5 star reviews pre-pandemic
    d. Proportion of 4 and 5 star reviews throughout the pandemic
    
2. Tokenized reviews 

In [None]:
import json
import pandas as pd
import datetime
import re
from functools import reduce

## Calculate and export proportion of star ratings

In [None]:
## export star ratings data
def calculate_proportions(dat, stars):
    """Compute, per business, the share of reviews whose rating is in ``stars``.

    Args:
        dat: DataFrame of reviews containing at least the columns
            ``business_id``, ``review_id`` and ``stars``.
        stars: list of star ratings that together make up the numerator
            (e.g. ``[1, 2]`` for low ratings).

    Returns:
        DataFrame with one row per business present in ``dat`` and the
        columns ``business_id``, ``review_id_x`` (number of reviews with a
        rating in ``stars``), ``review_id_y`` (total number of reviews) and
        ``proportion`` (``review_id_x / review_id_y``).
    """
    # Use the frame that was passed in, not the notebook-level ``reviews``
    # frame, so that pre-pandemic / pandemic subsets are actually honoured.
    totals = dat.groupby("business_id")["review_id"].count()

    # Count matching reviews per business only. Grouping by star value as
    # well would yield one row (and one proportion) per individual rating
    # instead of a single combined proportion for the requested ratings.
    matching = (
        dat[dat["stars"].isin(stars)]
        .groupby("business_id")["review_id"]
        .count()
    )

    props = totals.rename("review_id_y").to_frame()
    # Businesses without any matching review get a count of 0 (proportion
    # 0.0) rather than silently disappearing from the result.
    props.insert(0, "review_id_x", matching.reindex(props.index, fill_value=0))
    props = props.reset_index()
    props["proportion"] = props["review_id_x"] / props["review_id_y"]
    return props[["business_id", "review_id_x", "review_id_y", "proportion"]]



In [None]:
# Load the processed review records, then split them at the 1 March 2020
# cutoff: strictly earlier reviews are "pre-COVID", the rest "COVID times".
reviews = pd.read_json("processed_data/yelp_team7_dataset_review.json")
pandemic_cutoff = datetime.datetime(2020, 3, 1)
pre_covid = reviews.loc[reviews["date"] < pandemic_cutoff]
covid_times = reviews.loc[reviews["date"] >= pandemic_cutoff]

In [None]:
# Call calculate_proportions on the pre_covid subset for 1- and 2-star
# ratings. The return value is not assigned; as the last expression of the
# notebook cell it is only displayed.
calculate_proportions(dat=pre_covid, stars=[1,2])


In [None]:
# Build one frame per (period, rating band) combination and give the generic
# count/proportion columns period-specific names so they can be merged later.

# Pre-COVID, 1-2 star reviews.
pre_covid_low = calculate_proportions(dat=pre_covid, stars=[1, 2]).rename(
    columns={
        "review_id_x": "pre_covid_low",
        "review_id_y": "pre_covid_total",
        "proportion": "pre_covid_low_prop",
    }
)

# Pre-COVID, 4-5 star reviews.
pre_covid_high = calculate_proportions(dat=pre_covid, stars=[4, 5]).rename(
    columns={
        "review_id_x": "pre_covid_high",
        "review_id_y": "pre_covid_total",
        "proportion": "pre_covid_high_prop",
    }
)

# COVID times, 1-2 star reviews.
covid_low = calculate_proportions(dat=covid_times, stars=[1, 2]).rename(
    columns={
        "review_id_x": "covid_low",
        "review_id_y": "covid_total",
        "proportion": "covid_low_prop",
    }
)

# COVID times, 4-5 star reviews.
covid_high = calculate_proportions(dat=covid_times, stars=[4, 5]).rename(
    columns={
        "review_id_x": "covid_high",
        "review_id_y": "covid_total",
        "proportion": "covid_high_prop",
    }
)

In [None]:
# Outer-join the four per-business frames on business_id, one after another,
# so a business missing from any single frame is still kept.
# NOTE(review): the low and high frames of each period both carry a
# "<period>_total" column, so pandas will suffix those duplicates with
# _x / _y in the merged result.
to_merge = [pre_covid_low, pre_covid_high, covid_low, covid_high]
merged = to_merge[0]
for frame in to_merge[1:]:
    merged = pd.merge(merged, frame, on=["business_id"], how="outer")

# Change in each proportion from the pre-COVID period to COVID times.
merged["low_delta"] = merged["covid_low_prop"] - merged["pre_covid_low_prop"]
merged["high_delta"] = merged["covid_high_prop"] - merged["pre_covid_high_prop"]

### check the merge
# True when the merged frame has as many distinct businesses as the reviews.
len(reviews["business_id"].unique()) == len(merged["business_id"].unique())

In [None]:
# Write the merged per-business table to CSV. `index` is left at its default,
# so the frame's row index is written as the first (unnamed) column.
merged.to_csv("processed_data/proportionate_star_ratings.csv")

## Calculate and export age based on first review

In [None]:
# Per review: the calendar year it was written and its distance from 2021.
reviews["review_year"] = reviews["date"].dt.year
reviews["age"] = 2021 - reviews["review_year"]

# Per business: the year of its earliest review (indexed by business_id).
# NOTE(review): the per-review "age" column is not part of this aggregate;
# only the minimum review_year is kept.
age = reviews.groupby("business_id")[["review_year"]].min()
age.shape

In [None]:
# `age` comes from a groupby on business_id, so the business identifier lives
# in the index. Write the index out (as the "business_id" column); with
# index=False the file would contain only review_year and no way to tell
# which business each row belongs to.
# NOTE(review): the file name mentions star-rating proportions although the
# frame holds first-review years -- confirm the intended output path.
age.to_csv("processed_data/star_rating_proportions.csv", index=True)

