In [6]:
import pandas as pd
import numpy as np
from util import *

# Content:
[1. Data Processing](#Data-Processing)

[2. Beer Style Similarities](#Beer-Style-Similarities)

- [2.1 Matching Product Beer Styles with Official Website Style Names](#Matching-Product-Beer-Styles-with-Official-Website-Style-Names)
- [2.2 Matching Product Beer Styles Manually](#Matching-Product-Beer-Styles-Manually)
- [2.3 Calculate Style Similarities](#Calculate-Style-Similarities)

## Data Processing

In [22]:
# Read every matched table and split it straight away with split_matched_data
# (a helper from util.py) into its `_rb` and `_ba` halves.
beers_m = pd.read_csv('data/m_beers.csv')
beers_rb, beers_ba = split_matched_data(beers_m)

ratings_m = pd.read_csv('data/m_ratings.csv')
ratings_rb, ratings_ba = split_matched_data(ratings_m)

users_m = pd.read_csv('data/m_users_approx.csv')
users_rb, users_ba = split_matched_data(users_m)

breweries_m = pd.read_csv('data/m_breweries.csv')
breweries_rb, breweries_ba = split_matched_data(breweries_m)

# Style reference tables; they are used later when matching style names.
styles_bab = pd.read_csv('data/beer_styles_bab.csv')
styles_cb = pd.read_csv('data/beer_styles_cb.csv')

  beers_m = pd.read_csv('data/m_beers.csv')
  ratings_m = pd.read_csv('data/m_ratings.csv')


## Beer Style Similarities

### Matching Product Beer Styles with Official Website Style Names

In [26]:
# Stack the (beer_id, style) pairs of both matched beer tables.
style_columns = ['beer_id', 'style']
product_style = pd.concat([beers_ba[style_columns], beers_rb[style_columns]], axis=0)

# Distinct product style names, as a list.
product_style_set = list(set(product_style['style']))

# Attach the style descriptions from styles_cb to the modified style names
# (left join, so unmatched names keep NaN descriptions), then de-duplicate.
m_styles = pd.read_csv('data/m_styles.csv')
styles_details = pd.merge(
    m_styles,
    styles_cb,
    how='left',
    left_on='Modified Style Name',
    right_on='style_name',
).drop_duplicates()

styles_group_1 = styles_details['Original Style Name']
styles_group_2 = styles_bab['style_name']

# One match_styles result per original style name (fuzzywuzzy-based matching
# according to the original author's note), kept in row order.
matched_styles = [match_styles(style, styles_group_2) for style in styles_group_1]

# Feed the matches into 'Modified Style Name' through replace_style_name, then
# discard the temporary match column together with the merge key.
styles_details['Modified Style Name 2'] = matched_styles
styles_details['Modified Style Name'] = styles_details.apply(replace_style_name, axis=1)
styles_details = styles_details.drop(columns=['Modified Style Name 2', 'style_name'])

# Style-attribute columns that fill_missing_values inspects: a row is only
# back-filled when every one of these is missing.
columns_to_check = [
    'Color_SRM',
    'Bitterness_IBU',
    'Alcohol_ABV',
    'Color',
    'Clarity',
    'Perceived Malt Aroma & Flavor',
    'Perceived Hop Aroma & Flavor',
    'Body',
]

def fill_missing_values(row, reference=None, columns=None):
    """Back-fill a style row whose attribute columns are all missing.

    Intended for ``DataFrame.apply(..., axis=1)``. The row is only touched
    when it has a non-null 'Modified Style Name' and *every* column in
    ``columns`` is null; the values are then taken from the first row of
    ``reference`` whose 'style_name' equals the row's 'Modified Style Name'.

    Parameters
    ----------
    row : pandas.Series
        One row of the style-details frame.
    reference : pandas.DataFrame, optional
        Lookup table with a 'style_name' column plus the attribute columns.
        Defaults to the notebook-level ``styles_bab`` frame.
    columns : list of str, optional
        Attribute columns to check and fill. Defaults to the notebook-level
        ``columns_to_check`` list.

    Returns
    -------
    pandas.Series
        The original row when nothing needs filling, otherwise a filled copy.
        The input row is never modified, because pandas does not support
        mutating the object passed to an applied function.
    """
    if reference is None:
        reference = styles_bab
    columns = list(columns_to_check if columns is None else columns)

    style_name = row['Modified Style Name']
    # Leave the row alone if it has no style name or any attribute is present.
    if pd.isnull(style_name) or not row[columns].isnull().all():
        return row

    candidates = reference[reference['style_name'] == style_name]
    if candidates.empty:
        return row

    # Every target column is known to be null here, so no per-column check is
    # needed; work on a copy so the row supplied by apply() stays untouched.
    filled = row.copy()
    source = candidates.iloc[0]
    for col in columns:
        filled[col] = source[col]
    return filled

# Row-wise pass over styles_details: each row goes through fill_missing_values.
styles_details = styles_details.apply(fill_missing_values, axis='columns')

def replace_varies(row):
    """Replace 'varies' placeholders with explicit catch-all ranges.

    For each of 'Color_SRM', 'Bitterness_IBU' and 'Alcohol_ABV', if the value
    (converted to a string, case-insensitively) contains the word 'varies',
    it is replaced by '1-100', '1-120' and '0-15+%' respectively.

    Parameters
    ----------
    row : pandas.Series or mapping
        One style row containing the three columns above.

    Returns
    -------
    Same type as ``row``
        The original row when no placeholder is present, otherwise an updated
        copy. The input is never modified, because pandas does not support
        mutating the object passed to ``DataFrame.apply``.
    """
    # Column -> range substituted when the cell says the value "varies".
    fallback_ranges = (
        ('Color_SRM', '1-100'),
        ('Bitterness_IBU', '1-120'),
        ('Alcohol_ABV', '0-15+%'),
    )
    flagged = [
        (col, value_range)
        for col, value_range in fallback_ranges
        if 'varies' in str(row[col]).lower()
    ]
    if not flagged:
        return row

    updated = row.copy()
    for col, value_range in flagged:
        updated[col] = value_range
    return updated

# Row-wise pass over styles_details: each row goes through replace_varies.
styles_details = styles_details.apply(replace_varies, axis='columns')


### Matching Product Beer Styles Manually

In [27]:
# Hand-entered attribute rows, one per 'style_name', for styles whose details
# are still missing. The fill_missing_values defined right below looks rows up
# in this frame by 'style_name'. 'Fermentation Characteristics' is None for
# every entry.
missing_values = pd.DataFrame([{'style_name': 'American IPA', 'Color_SRM': '6-12', 'Bitterness_IBU': '50-70', 'Alcohol_ABV': '6.3-7.5%', 'Color': 'Gold to Copper, Red/Brown', 'Clarity': 'Clear to Slight Haze', 
                                        'Perceived Malt Aroma & Flavor': 'Biscuit, Bready, Caramel', 'Perceived Hop Aroma & Flavor': 'Hop aroma is high and hop flavor is strong both with floral qualities and citrus-like, piney, resinous or sulfur-like American-variety hop character. Hop bitterness is medium-high to very high', 'Fermentation Characteristics': None, 'Body': 'Soft to Sticky'},
                               {'style_name': 'German-Style Maibock', 'Color_SRM': '4-9', 'Bitterness_IBU': '20-38', 'Alcohol_ABV': '6-8%', 'Color': 'Pale to Light Amber', 'Clarity': 'Clear', 
                                        'Perceived Malt Aroma & Flavor': 'A lightly toasted and/or bready malt aroma is often evident. Roasted or heavy toast/caramel malt aromas should be absent', 'Perceived Hop Aroma & Flavor': 'Hop aroma and flavor are low to medium low, deriving from noble-type hops. Hop bitterness is low', 'Fermentation Characteristics': None, 'Body': 'Soft'},
                               {'style_name': 'Fruit and Field Beer', 'Color_SRM': '5-50', 'Bitterness_IBU': '5-45', 'Alcohol_ABV': '2.5-12%', 'Color': 'Pale to Very Dark', 'Clarity': 'Varies', 
                                        'Perceived Malt Aroma & Flavor': 'Malt sweetness is very low to medium-high', 'Perceived Hop Aroma & Flavor': 'Hop bitterness is very low to medium-high', 'Fermentation Characteristics': None, 'Body': 'Varies'},
                               {'style_name': 'English-Style IPA', 'Color_SRM': '6-14', 'Bitterness_IBU': '35-63', 'Alcohol_ABV': '5-7%', 'Color': 'Gold to Copper', 'Clarity': 'Clear to Slight Haze', 
                                        'Perceived Malt Aroma & Flavor': 'Bready, Biscuit, Toast, Caramel, Toffee', 'Perceived Hop Aroma & Flavor': 'Hop aroma is medium to high, often flowery. Hop flavor and bitterness are medium to high. Earthy and herbal English-variety hop character is perceived, but may be a result of the skillful use of hops of other national origins', 'Fermentation Characteristics': None, 'Body': 'Moderate'},
                               {'style_name': 'German-Style Pilsner', 'Color_SRM': '3-4', 'Bitterness_IBU': '25-50', 'Alcohol_ABV': '4.6-5.3%', 'Color': 'Straw to Pale', 'Clarity': 'Brilliant', 
                                        'Perceived Malt Aroma & Flavor': 'A sweet, malty residual aroma can be perceived. Bready or light biscuity attributes may be present.', 'Perceived Hop Aroma & Flavor': 'Hop flavor is moderate and quite obvious, deriving from late hopping (not dry-hopping) with noble-type hops. Hop bitterness is medium to high', 'Fermentation Characteristics': None, 'Body': 'Soft'},
                               {'style_name': 'American Amber Ale', 'Color_SRM': '11-18', 'Bitterness_IBU': '25-45', 'Alcohol_ABV': '4.4-6.1%', 'Color': 'Copper to Reddish Brown', 'Clarity': 'Clear to Slight Haze', 
                                        'Perceived Malt Aroma & Flavor': 'Caramel', 'Perceived Hop Aroma & Flavor': 'Citrus-like character is acceptable', 'Fermentation Characteristics': None, 'Body': 'Mouth-Coating'},
                               {'style_name': 'Imperial India Pale Ale', 'Color_SRM': '2-9', 'Bitterness_IBU': '65-100', 'Alcohol_ABV': '7.6-10.6%', 'Color': 'Gold to Light Brown', 'Clarity': 'Clear to Hazy', 
                                        'Perceived Malt Aroma & Flavor': 'Malt character is medium to high', 'Perceived Hop Aroma & Flavor': 'Hop flavor and aroma are very high, should be fresh and lively and should not be harsh in quality, deriving from any variety of hops. Hop bitterness is very high but not harsh', 'Fermentation Characteristics': None, 'Body': 'Mouth-Coating'}])

def fill_missing_values(row, reference=None, columns=None):
    """Back-fill a style row from the manually entered ``missing_values`` table.

    This redefines the ``fill_missing_values`` helper declared earlier in the
    notebook; the only difference is the default lookup table. Intended for
    ``DataFrame.apply(..., axis=1)``. A row is only touched when it has a
    non-null 'Modified Style Name' and *every* column in ``columns`` is null;
    the values then come from the first row of ``reference`` whose
    'style_name' equals the row's 'Modified Style Name'.

    Parameters
    ----------
    row : pandas.Series
        One row of the style-details frame.
    reference : pandas.DataFrame, optional
        Lookup table with a 'style_name' column plus the attribute columns.
        Defaults to the notebook-level ``missing_values`` frame.
    columns : list of str, optional
        Attribute columns to check and fill. Defaults to the notebook-level
        ``columns_to_check`` list.

    Returns
    -------
    pandas.Series
        The original row when nothing needs filling, otherwise a filled copy.
        The input row is never modified, because pandas does not support
        mutating the object passed to an applied function.
    """
    if reference is None:
        reference = missing_values
    columns = list(columns_to_check if columns is None else columns)

    style_name = row['Modified Style Name']
    # Leave the row alone if it has no style name or any attribute is present.
    if pd.isnull(style_name) or not row[columns].isnull().all():
        return row

    # Find the corresponding row in the lookup table (missing_values by default).
    candidates = reference[reference['style_name'] == style_name]
    if candidates.empty:
        return row

    # Every target column is known to be null here, so no per-column check is
    # needed; work on a copy so the row supplied by apply() stays untouched.
    filled = row.copy()
    source = candidates.iloc[0]
    for col in columns:
        filled[col] = source[col]
    return filled

# Second back-fill pass, this time with the fill_missing_values defined just
# above (manual missing_values lookup).
styles_details = styles_details.apply(fill_missing_values, axis='columns')

# Persist the completed style table as CSV.
# NOTE(review): to_csv writes the row index by default, so the file gains an
# unnamed leading column when it is read back; confirm downstream cells before
# switching to index=False.
styles_details.to_csv('data/beer_style_details.csv')

### Calculate Style Similarities

In [35]:
# Reload the saved style table; 'Fermentation Characteristics' is dropped
# before the missing-value summary below.
styles_details = pd.read_csv('data/beer_style_details.csv').drop(columns=['Fermentation Characteristics'])

# Boolean flag per row: True when the row contains at least one NaN.
has_nan = styles_details.isna().any(axis=1)

num_rows_with_nan = has_nan.sum()
total_rows = styles_details.shape[0]
proportion_with_nan = num_rows_with_nan / total_rows

# Report the counts and the share of incomplete rows.
print(f"Number of rows with NaN values: {num_rows_with_nan:d}")
print(f"total number of rows in the DataFrame: {total_rows:d}")
print(f"Proportion of rows with NaN values: {proportion_with_nan:.2f}")

Number of rows with NaN values: 23
total number of rows in the DataFrame: 171
Proportion of rows with NaN values: 0.13
