In [1]:
import pandas as pd

# import all the data to a single dataframe
reviews = pd.read_csv('csv_data/yelp_academic_dataset_review.csv')

# create a column with the month (and year) the review was made
reviews['date'] = reviews['date'].str[0:7]

# group the reviews by month and take the mean
monthly_reviews = reviews.groupby('date').mean()

In [2]:
# import data for gdp and adjust the date format
us_gdp = pd.read_csv('csv_data/us_gdp.csv')
us_gdp['date'] = us_gdp['date'].str[0:7]

# import data for price level and adjust date
price_level = pd.read_csv('csv_data/us_cpi.csv')
price_level['date'] = price_level['date'].str[0:7]

# find the monthly inflation rate (in percentage) from cpi
price_level['previous_cpi'] = price_level.cpi.shift(1)
price_level = price_level.fillna(203.1)
price_level['inflation'] = 100 * (price_level['cpi'] - price_level['previous_cpi']) / price_level['cpi']

# import data for disposable income
disposable_income = pd.read_csv('csv_data/us_disposable_income.csv')
disposable_income['date'] = disposable_income['date'].str[0:7]

In [3]:
time_series = monthly_reviews.loc[:].reset_index()
for dataframe in [us_gdp, price_level, disposable_income]:
    time_series = time_series.merge(dataframe, on='date')

In [4]:
time_series = time_series.rename(columns={"stars": "mean_rating"})
time_series = time_series[['date', 'mean_rating', 'gdp', 'cpi', 'inflation', 'disposable_income']]

time_series

Unnamed: 0,date,mean_rating,gdp,cpi,inflation,disposable_income
0,2007-01,3.889027,15478.956,203.437,0.165653,11403.7
1,2007-04,3.929793,15577.779,205.904,0.299169,11490.1
2,2007-07,3.893976,15671.605,207.603,0.177743,11500.4
3,2007-10,3.822253,15767.146,209.190,0.307376,11523.6
4,2008-01,3.861677,15702.906,212.174,0.343586,11553.4
...,...,...,...,...,...,...
56,2021-01,3.854847,19055.655,262.200,0.242563,16988.6
57,2021-04,3.714769,19368.310,266.727,0.636981,16146.9
58,2021-07,3.687040,19478.893,272.184,0.451533,15735.2
59,2021-10,3.654734,19806.290,276.590,0.859033,15444.5


In [5]:
# save the file
time_series.to_csv('processed_data/time_series.csv', index=False)