# Preprocessing

In [None]:
import pandas as pd

In [None]:
# Load the raw reviews, parsing "date" as datetimes, and lower-case every
# string cell; non-string cells are passed through unchanged.
reviews = pd.read_csv('data/raw/reviews.csv', parse_dates=["date"]).applymap(
    lambda cell: cell.lower() if isinstance(cell, str) else cell
)
reviews.head()

In [None]:
# Load breweries (indexed by "id") and turn the free-text 'types' column into
# one 0/1 flag column per brewery type: a flag is 1 when its keyword occurs in
# the lower-cased 'types' string.
breweries = pd.read_csv('data/raw/breweries.csv', index_col="id")
brewery_type_keywords = {
    "brewery_type_bar": "bar",
    "brewery_type_beer_to_go": "to-go",
    "brewery_type_eatery": "eatery",
    "brewery_type_homebrew": "homebrew",
    "brewery_type_brewery": "brewery",
    "brewery_type_store": "store",
}
# Lower-case the column once instead of once per flag. na=False maps a missing
# 'types' value to 0 rather than failing on a float NaN, and regex=False keeps
# the match a plain substring test (same semantics as the `in` operator).
brewery_types_lower = breweries["types"].str.lower()
for flag_column, keyword in brewery_type_keywords.items():
    breweries[flag_column] = brewery_types_lower.str.contains(
        keyword, regex=False, na=False
    ).astype(int)
# The raw 'types' and 'notes' columns are not kept; 'name' is renamed so it
# does not clash with other tables' "name" columns when merged.
breweries = breweries.drop(columns=["types", "notes"])
breweries = breweries.rename(columns={"name": "brewery_name"})
# Normalise every remaining string cell to lower case.
breweries = breweries.applymap(lambda x: x.lower() if isinstance(x, str) else x)
breweries.head()

In [None]:
# Load the raw beers, discard the "state", "country" and "notes" columns,
# rename "style" to "style_detailed" and lower-case every string cell.
beers = (
    pd.read_csv('data/raw/beers.csv')
    .drop(columns=["state", "country", "notes"])
    .rename(columns={"style":"style_detailed"})
    .applymap(lambda cell: cell.lower() if isinstance(cell, str) else cell)
)
# Encode the retired marker as an integer: 1 for 't', 0 for anything else.
beers["retired"] = beers["retired"].map(lambda flag: int(flag=='t'))
beers.head()

In [None]:
# Load the beer data set (indexed by "key"), lower-case every string cell,
# rewrite column names as lower-case words joined by underscores, and drop the
# resulting "style_key" column.
aromas = (
    pd.read_csv('data/raw/beer_data_set.csv', index_col="key")
    .applymap(lambda cell: cell.lower() if isinstance(cell, str) else cell)
    .rename(columns=lambda label: "_".join(label.lower().split()))
    .drop(columns=["style_key"])
)
aromas.head()

In [None]:
# Attach each beer's brewery attributes. The inner join discards beers whose
# brewery_id has no match in the breweries index; the join key itself is
# dropped afterwards.
beers = beers.merge(
    breweries, how="inner", left_on="brewery_id", right_index=True
).drop(columns=["brewery_id"])
print(beers.shape)
beers.head()

In [None]:
# Inner-join the aromas table onto beers, matching rows on beer name, brewery
# name and ABV; beers without a match are discarded.
beer_join_keys = ["name", "brewery_name", "abv"]
aroma_join_keys = ["name", "brewery", "abv"]
beers = beers.merge(
    aromas, how="inner", left_on=beer_join_keys, right_on=aroma_join_keys
)
print(beers.shape)
beers.head()

In [None]:
from utils.beer_metastyles import metastyle_beer_dict

# Seed the meta_style column with a copy of the style values; it is
# overwritten with the mapped meta-styles further down in this cell.
beers = beers.assign(meta_style=beers['style'])

# Create a function to find the metastyle

def find_metastyle(specific_style, metastyles=None):
    """Return the meta-style whose keywords match *specific_style*.

    The mapping is scanned in insertion order and the first meta-style that
    has at least one keyword occurring as a substring of *specific_style*
    wins.

    Args:
        specific_style: The beer style to classify. Non-string values (for
            example a missing value read as NaN) are classified as 'Other'.
        metastyles: Optional mapping of meta-style name to an iterable of
            keyword substrings. Defaults to the module-level
            ``metastyle_beer_dict``.

    Returns:
        The matching meta-style name, or 'Other' when nothing matches.
    """
    # A substring test against a float NaN / None would raise TypeError.
    if not isinstance(specific_style, str):
        return 'Other'

    if metastyles is None:
        metastyles = metastyle_beer_dict

    for metastyle, beer_substyles in metastyles.items():
        if any(keyword in specific_style for keyword in beer_substyles):
            return metastyle

    return 'Other'

# Map every beer's style to its meta-style.
beers['meta_style'] = beers['style'].apply(find_metastyle)

# Keep only the output columns, grouped by topic, in their final order.
identity_columns = ["id", "name", "abv"]
style_columns = ["meta_style", "style", "style_detailed"]
profile_columns = [
    "min_ibu", "max_ibu", "alcohol", "astringency", "bitter", "body",
    "fruits", "hoppy", "malty", "salty", "sour", "spices", "sweet",
]
brewery_columns = [
    "brewery_name", "city", "state", "country",
    "brewery_type_bar", "brewery_type_beer_to_go", "brewery_type_eatery",
    "brewery_type_homebrew", "brewery_type_brewery", "brewery_type_store",
]
status_columns = ["availability", "retired", "ave_rating"]
output_columns = (
    identity_columns
    + style_columns
    + profile_columns
    + brewery_columns
    + status_columns
)
beers = beers[output_columns]

In [None]:
# Keep only the reviews of beers that are still present in `beers`, printing
# the shape before and after. Match against the beer "id" column: beers.index
# is whatever index the earlier column-based merge produced, not the beer
# identifier that reviews["beer_id"] refers to.
print(reviews.shape)
reviews = reviews[reviews["beer_id"].isin(beers["id"])]
print(reviews.shape)

In [None]:
# Add the per-beer median of each review rating as "rating_reviews_<col>".
# The medians are aggregated once with a groupby instead of re-filtering the
# whole reviews table for every beer; beers without any review get NaN.
rating_columns = ["look", "smell", "taste", "feel", "overall", "score"]
review_medians = reviews.groupby("beer_id")[rating_columns].median()
for col in rating_columns:
    beers["rating_reviews_" + col] = beers["id"].map(review_medians[col])
beers.head()

In [None]:
# Write the processed beer table to CSV without the index column.
beers_output_path = "data/processed/beers.csv"
beers.to_csv(beers_output_path, index=False)

In [None]:
# Write the filtered reviews to CSV without the index column.
reviews_output_path = "data/processed/reviews.csv"
reviews.to_csv(reviews_output_path, index=False)