# Load data from the file

This notebook loads the data from the various source datasets and generates a single merged version: `data/us_users_ratings.csv`.

In [1]:
import pandas as pd

from data_loader import (
    get_users_df,
    get_reviews_df,
    get_beers_df,
    get_breweries_df,
    join_breweries_on_beers,
    merge_reviews,
    get_us_reviews
)

In [2]:
# Input file locations (relative paths).
# The `_ba` / `_rb` suffixes presumably stand for BeerAdvocate / RateBeer.

# Matched-beer dataset: beers, breweries and the per-site ratings.
beers_path = "../data/matched_beer_data/beers.csv"
breweries_path = "../data/matched_beer_data/breweries.csv"
reviews_path_ba, reviews_path_rb = (
    "../data/matched_beer_data/ratings_ba.txt",
    "../data/matched_beer_data/ratings_rb.txt",
)

# Per-site user tables.
users_path_ba, users_path_rb = (
    "../data/users_ba.csv",
    "../data/users_rb.csv",
)

## 1) Loading data 

- All the pre-processing is done using the functions defined in `data_loader.py`.
- The preprocessing functions are described and documented in `data_loader.py`.
- We use the reviews/ratings from both BeerAdvocate and RateBeer, together with the matched beers and breweries dataset, to make sure that the same beer carries an identical name across the two datasets.
- These two reviews/ratings datasets are then merged and joined to the breweries, beers and users datasets to obtain a single complete dataframe of beer reviews.
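- After all the processing and merging is done, the final dataframe is saved as `data/us_users_ratings.csv` to be used in the analysis. Note that the file is written with gzip compression even though its name ends in `.csv`, so it has to be read back with `compression="gzip"`.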



In [3]:
# Per-site user and review tables; the same data_loader helper is applied to
# the BA file first and the RB file second.
users_df_ba, users_df_rb = (
    get_users_df(path) for path in (users_path_ba, users_path_rb)
)
ba_df, rb_df = (
    get_reviews_df(path) for path in (reviews_path_ba, reviews_path_rb)
)

# Breweries are loaded first, then joined onto the freshly loaded beers table.
breweries_df = get_breweries_df(breweries_path)
beers_df = join_breweries_on_beers(get_beers_df(beers_path), breweries_df)

# Merge both review tables with the beers and users tables.
reviews_df = merge_reviews(ba_df, rb_df, beers_df, users_df_ba, users_df_rb)

# Auxiliary tables: the two climate tables are indexed by their key column,
# the general-style table keeps its default index.
climate_classifications = pd.read_csv("../data/climate_classified.csv").set_index("climate")
states_climate = pd.read_csv("../data/states_climate.csv").set_index("State")
general_style_df = pd.read_csv("../data/general_styles.csv")

us_users_ratings = get_us_reviews(
    reviews_df=reviews_df,
    climate_classifications=climate_classifications,
    states_climate=states_climate,
    general_style=general_style_df,
)

# Persist the merged dataframe without its index.
# NOTE: the file is gzip-compressed although it is named `.csv`; readers must
# pass `compression="gzip"` explicitly because the extension does not signal it.
us_users_ratings.to_csv("../data/us_users_ratings.csv", compression="gzip", index=False)

The following cell displays the dataframe to ensure that the data is loaded correctly.

In [4]:
# Sanity check: show the first five rows of the merged dataframe.
us_users_ratings.head(n=5)

Unnamed: 0,beer_id,date,user_name,user_id,appearance,aroma,palate,taste,overall,rating,...,brewery_nbr_beers_ba,brewery_nbr_beers_rb,user_location,user_nbr_ratings,nbr_ratings,climate,climate_scheme,climate_precipitation,climate_temperature,general_style
0,19827,1417431600,Hellpop65,hellpop65.48993,,,,,,3.25,...,5,5,Kansas,2326.0,137,Cfa,Temperate,without dry season,hot summer,Pale Ale
1,19827,1401357600,Latarnik,latarnik.52897,,,,,,3.5,...,5,5,New Jersey,3098.0,137,Cfa,Temperate,without dry season,hot summer,Pale Ale
2,19827,1393412400,RochefortChris,rochefortchris.697017,,,,,,3.5,...,5,5,North Carolina,1866.0,137,Cfa,Temperate,without dry season,hot summer,Pale Ale
3,19827,1392030000,OKCNittany,okcnittany.144868,,,,,,3.75,...,5,5,Oklahoma,1131.0,137,Cfa,Temperate,without dry season,hot summer,Pale Ale
4,19827,1390647600,jaydoc,jaydoc.265507,,,,,,3.25,...,5,5,Kansas,9987.0,137,Cfa,Temperate,without dry season,hot summer,Pale Ale
