In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
from util import *

# Content:
[1. Data Processing](#Data-Processing)

- [1.1 Convert Data into Json Format](#Convert-Data-into-Json-Format)

[2. Beer Style Similarities](#Beer-Style-Similarities)

- [2.1 Match Product Beer Styles with Official Website Style Names](#Match-Product-Beer-Styles-with-Official-Website-Style-Names)

- [2.2 Match Product Beer Styles Manually](#Match-Product-Beer-Styles-Manually)

- [2.3 Convert Style Features into Vectors](#Convert-Style-Features-into-Vectors)

    - [2.3.1 Calculate both the mean and the range for each of the numerical features](#Calculate-both-the-mean-and-the-range-for-each-of-the-numerical-features)

    - [2.3.2 Obtain vector representations for each text features](#Obtain-vector-representations-for-each-text-features)

- [2.4 Calculate Style Similarities](#Calculate-Style-Similarities)

## Data Processing

In [29]:
# Read the four matched tables, in the same order as before.
beers_m, ratings_m, users_m, breweries_m = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/m_beers.csv',
        'data/m_ratings.csv',
        'data/m_users_approx.csv',
        'data/m_breweries.csv',
    )
)

# Read the two beer-style reference tables.
styles_bab, styles_cb = (
    pd.read_csv(csv_path)
    for csv_path in ('data/beer_styles_bab.csv', 'data/beer_styles_cb.csv')
)

# split_matched_data lives in util.py; it returns a pair that is unpacked
# here into the `_rb` and `_ba` frames of each matched table.
beers_rb, beers_ba = split_matched_data(beers_m)
users_rb, users_ba = split_matched_data(users_m)
ratings_rb, ratings_ba = split_matched_data(ratings_m)
breweries_rb, breweries_ba = split_matched_data(breweries_m)

  beers_m = pd.read_csv('data/m_beers.csv')
  ratings_m = pd.read_csv('data/m_ratings.csv')


### Convert Data into Json Format

In [30]:
def min_max_scale(frame):
    """Rescale every column of `frame` linearly using its own min and max.

    Parameters
    ----------
    frame : pd.DataFrame
        Numeric columns to rescale.

    Returns
    -------
    pd.DataFrame
        `(frame - min) / (max - min)`, computed column by column.
        A column whose max equals its min yields NaN (division by zero).
    """
    lowest = frame.min()
    return (frame - lowest) / (frame.max() - lowest)


# Drop rows containing NaN from ratings_ba and renumber the index.
# `reset_index` returns a new frame, so the result has to be assigned;
# calling it without assignment leaves the index unchanged.
ratings_ba = ratings_ba.dropna().reset_index(drop=True)

# The type of all the features is 'object'. Convert some of them to float.
# Values that cannot be parsed become NaN (errors='coerce').
convert_features = ['abv', 'appearance', 'aroma', 'overall', 'palate', 'rating', 'taste']
ratings_rb[convert_features] = ratings_rb[convert_features].apply(pd.to_numeric, errors='coerce')
ratings_ba[convert_features] = ratings_ba[convert_features].apply(pd.to_numeric, errors='coerce')

# Min-max scale the numeric features, each frame with its own min/max.
ratings_rb_with_convert_features = ratings_rb[convert_features]
ratings_ba_with_convert_features = ratings_ba[convert_features]
ratings_rb[convert_features] = min_max_scale(ratings_rb_with_convert_features)
ratings_ba[convert_features] = min_max_scale(ratings_ba_with_convert_features)

# Convert the DataFrames into JSON (one record per row).
ratings_rb[convert_features].to_json('viz_data/ratings_rb.json', orient='records')
ratings_ba[convert_features].to_json('viz_data/ratings_ba.json', orient='records')

# Correlation matrices of the detailed features, also exported as JSON.
detailed_features = ['abv', 'appearance', 'aroma', 'palate', 'taste']
ratings_rb[detailed_features].corr().to_json('viz_data/ratings_rb_corr.json', orient='records')
ratings_ba[detailed_features].corr().to_json('viz_data/ratings_ba_corr.json', orient='records')

## Beer Style Similarities

### Match Product Beer Styles with Official Website Style Names

In [31]:
# Stack the (beer_id, style) pairs of both beer tables into one frame.
style_columns = ['beer_id', 'style']
product_style = pd.concat([beers_ba[style_columns], beers_rb[style_columns]])

# Distinct product style names.
product_style_set = list(set(product_style['style']))

# Style-name mapping, left-joined with the style descriptions in styles_cb
# on the modified (official website) style name.
m_styles = pd.read_csv('data/m_styles.csv')
styles_details = m_styles.merge(
    styles_cb,
    how='left',
    left_on='Modified Style Name',
    right_on='style_name',
).drop_duplicates()

styles_group_1 = styles_details['Original Style Name']
styles_group_2 = styles_bab['style_name']

# One match per original style name; match_styles comes from util.py
# (the original note says it uses fuzzywuzzy).
matched_styles = [match_styles(name, styles_group_2) for name in styles_group_1]

# Feed the matches back into 'Modified Style Name' through the util helper
# replace_style_name, then discard the two helper columns.
styles_details['Modified Style Name 2'] = matched_styles
styles_details['Modified Style Name'] = styles_details.apply(replace_style_name, axis=1)
styles_details = styles_details.drop(columns=['Modified Style Name 2', 'style_name'])

# Descriptive columns inspected when looking for rows without any style details.
columns_to_check = [
    'Color_SRM',
    'Bitterness_IBU',
    'Alcohol_ABV',
    'Color',
    'Clarity',
    'Perceived Malt Aroma & Flavor',
    'Perceived Hop Aroma & Flavor',
    'Body',
]

def fill_missing_values(row, reference=None, columns=None):
    """Fill a row that has no style details from a reference table.

    A row is filled only when it has a non-null 'Modified Style Name' and
    *all* of the checked columns are null. The values come from the first
    reference row whose 'style_name' equals the row's 'Modified Style Name'.

    Parameters
    ----------
    row : pd.Series
        One row of the style-details frame (as passed by `DataFrame.apply`).
    reference : pd.DataFrame, optional
        Lookup table with a 'style_name' column plus the checked columns.
        Defaults to the notebook-level `styles_bab`.
    columns : list of str, optional
        Columns to check and fill. Defaults to the notebook-level
        `columns_to_check`.

    Returns
    -------
    pd.Series
        The original row when nothing has to be filled, otherwise a filled
        copy. The input row is never modified, because pandas does not
        support functions that mutate the object passed in by `apply`.
    """
    lookup = styles_bab if reference is None else reference
    targets = list(columns_to_check if columns is None else columns)

    style = row['Modified Style Name']
    if pd.isnull(style) or not row[targets].isnull().all():
        return row

    candidates = lookup[lookup['style_name'] == style]
    if candidates.empty:
        return row

    filled = row.copy()
    donor = candidates.iloc[0]
    # Every target is known to be null here, so no per-column check is needed.
    for col in targets:
        filled[col] = donor[col]
    return filled

# Run fill_missing_values over styles_details one row at a time.
styles_details = styles_details.apply(fill_missing_values, axis='columns')

def replace_varies(row):
    """Turn non-numeric range descriptions into explicit 'low-high' strings.

    Rules, applied in this order:
      * 'varies' (any case) in 'Color_SRM'      -> '1-100'
      * 'varies' (any case) in 'Bitterness_IBU' -> '1-120'
      * 'varies' (any case) in 'Alcohol_ABV'    -> '0-15%'
      * '<' in 'Alcohol_ABV'                    -> '0-' + the first token after '<'
      * '--' in 'Alcohol_ABV'                   -> '0-100%'
      * '+' in 'Color_SRM'                      -> text before '+' followed by '-100'

    Parameters
    ----------
    row : pd.Series
        One row containing 'Color_SRM', 'Bitterness_IBU' and 'Alcohol_ABV'.

    Returns
    -------
    pd.Series
        A modified copy of the row; the input row is left untouched because
        pandas does not support mutating the object passed in by `apply`.
    """
    row = row.copy()

    if 'varies' in str(row['Color_SRM']).lower():
        row['Color_SRM'] = '1-100'
    if 'varies' in str(row['Bitterness_IBU']).lower():
        row['Bitterness_IBU'] = '1-120'
    if 'varies' in str(row['Alcohol_ABV']).lower():
        row['Alcohol_ABV'] = '0-15%'

    # An upper bound such as '<0.5% ...' becomes the range '0-0.5%'.
    # The bound is read from Alcohol_ABV itself (not from Color_SRM).
    abv = str(row['Alcohol_ABV'])
    if '<' in abv:
        after_sign = abv.split('<', 1)[1].split()
        row['Alcohol_ABV'] = '0-' + (after_sign[0] if after_sign else '')
    if '--' in str(row['Alcohol_ABV']):
        row['Alcohol_ABV'] = '0-100%'

    # An open-ended colour such as '40+' becomes '40-100'.
    srm = str(row['Color_SRM'])
    if '+' in srm:
        row['Color_SRM'] = srm.split('+')[0] + '-100'
    return row

# Run replace_varies over styles_details one row at a time.
styles_details = styles_details.apply(replace_varies, axis='columns')


### Match Product Beer Styles Manually

In [32]:
# Add lost data manually.
# Hand-entered style details, one dict per style, keyed by 'style_name'.
# The fill_missing_values function defined right after this table looks rows up
# by 'style_name' and copies the columns listed in columns_to_check.
# 'Fermentation Characteristics' is None for every entry.
missing_values = pd.DataFrame([{'style_name': 'American IPA', 'Color_SRM': '6-12', 'Bitterness_IBU': '50-70', 'Alcohol_ABV': '6.3-7.5%', 'Color': 'Gold to Copper, Red/Brown', 'Clarity': 'Clear to Slight Haze', 
                                        'Perceived Malt Aroma & Flavor': 'Biscuit, Bready, Caramel', 'Perceived Hop Aroma & Flavor': 'Hop aroma is high and hop flavor is strong both with floral qualities and citrus-like, piney, resinous or sulfur-like American-variety hop character. Hop bitterness is medium-high to very high', 'Fermentation Characteristics': None, 'Body': 'Soft to Sticky'},
                               {'style_name': 'German-Style Maibock', 'Color_SRM': '4-9', 'Bitterness_IBU': '20-38', 'Alcohol_ABV': '6-8%', 'Color': 'Pale to Light Amber', 'Clarity': 'Clear', 
                                        'Perceived Malt Aroma & Flavor': 'A lightly toasted and/or bready malt aroma is often evident. Roasted or heavy toast/caramel malt aromas should be absent', 'Perceived Hop Aroma & Flavor': 'Hop aroma and flavor are low to medium low, deriving from noble-type hops. Hop bitterness is low', 'Fermentation Characteristics': None, 'Body': 'Soft'},
                               {'style_name': 'Fruit and Field Beer', 'Color_SRM': '5-50', 'Bitterness_IBU': '5-45', 'Alcohol_ABV': '2.5-12%', 'Color': 'Pale to Very Dark', 'Clarity': 'Varies', 
                                        'Perceived Malt Aroma & Flavor': 'Malt sweetness is very low to medium-high', 'Perceived Hop Aroma & Flavor': 'Hop bitterness is very low to medium-high', 'Fermentation Characteristics': None, 'Body': 'Varies'},
                               {'style_name': 'English-Style IPA', 'Color_SRM': '6-14', 'Bitterness_IBU': '35-63', 'Alcohol_ABV': '5-7%', 'Color': 'Gold to Copper', 'Clarity': 'Clear to Slight Haze', 
                                        'Perceived Malt Aroma & Flavor': 'Bready, Biscuit, Toast, Caramel, Toffee', 'Perceived Hop Aroma & Flavor': 'Hop aroma is medium to high, often flowery. Hop flavor and bitterness are medium to high. Earthy and herbal English-variety hop character is perceived, but may be a result of the skillful use of hops of other national origins', 'Fermentation Characteristics': None, 'Body': 'Moderate'},
                               {'style_name': 'German-Style Pilsner', 'Color_SRM': '3-4', 'Bitterness_IBU': '25-50', 'Alcohol_ABV': '4.6-5.3%', 'Color': 'Straw to Pale', 'Clarity': 'Brilliant', 
                                        'Perceived Malt Aroma & Flavor': 'A sweet, malty residual aroma can be perceived. Bready or light biscuity attributes may be present.', 'Perceived Hop Aroma & Flavor': 'Hop flavor is moderate and quite obvious, deriving from late hopping (not dry-hopping) with noble-type hops. Hop bitterness is medium to high', 'Fermentation Characteristics': None, 'Body': 'Soft'},
                               {'style_name': 'American Amber Ale', 'Color_SRM': '11-18', 'Bitterness_IBU': '25-45', 'Alcohol_ABV': '4.4-6.1%', 'Color': 'Copper to Reddish Brown', 'Clarity': 'Clear to Slight Haze', 
                                        'Perceived Malt Aroma & Flavor': 'Caramel', 'Perceived Hop Aroma & Flavor': 'Citrus-like character is acceptable', 'Fermentation Characteristics': None, 'Body': 'Mouth-Coating'},
                               {'style_name': 'Imperial India Pale Ale', 'Color_SRM': '2-9', 'Bitterness_IBU': '65-100', 'Alcohol_ABV': '7.6-10.6%', 'Color': 'Gold to Light Brown', 'Clarity': 'Clear to Hazy', 
                                        'Perceived Malt Aroma & Flavor': 'Malt character is medium to high', 'Perceived Hop Aroma & Flavor': 'Hop flavor and aroma are very high, should be fresh and lively and should not be harsh in quality, deriving from any variety of hops. Hop bitterness is very high but not harsh', 'Fermentation Characteristics': None, 'Body': 'Mouth-Coating'}])

def fill_missing_values(row, reference=None, columns=None):
    """Fill a row that has no style details from the manually entered table.

    This redefinition replaces the earlier `fill_missing_values` in the
    notebook namespace; the only difference is the default lookup table
    (`missing_values` here instead of `styles_bab`).

    A row is filled only when it has a non-null 'Modified Style Name' and
    *all* of the checked columns are null. The values come from the first
    reference row whose 'style_name' equals the row's 'Modified Style Name'.

    Parameters
    ----------
    row : pd.Series
        One row of the style-details frame (as passed by `DataFrame.apply`).
    reference : pd.DataFrame, optional
        Lookup table with a 'style_name' column plus the checked columns.
        Defaults to the notebook-level `missing_values`.
    columns : list of str, optional
        Columns to check and fill. Defaults to the notebook-level
        `columns_to_check`.

    Returns
    -------
    pd.Series
        The original row when nothing has to be filled, otherwise a filled
        copy. The input row is never modified, because pandas does not
        support functions that mutate the object passed in by `apply`.
    """
    lookup = missing_values if reference is None else reference
    targets = list(columns_to_check if columns is None else columns)

    style = row['Modified Style Name']
    if pd.isnull(style) or not row[targets].isnull().all():
        return row

    # Find the corresponding row in the manual table.
    candidates = lookup[lookup['style_name'] == style]
    if candidates.empty:
        return row

    filled = row.copy()
    donor = candidates.iloc[0]
    # Every target is known to be null here, so no per-column check is needed.
    for col in targets:
        filled[col] = donor[col]
    return filled

# Fill the remaining empty rows from the manual table, then discard the
# 'Fermentation Characteristics' column.
styles_details = styles_details.apply(fill_missing_values, axis=1)
styles_details = styles_details.drop(columns=['Fermentation Characteristics'])

# Write the completed style details to disk. The index is written as well,
# which is why the re-read frame carries an 'Unnamed: 0' column.
styles_details.to_csv('data/beer_style_details.csv')

### Convert Style Features into Vectors

In [34]:
# Re-read the saved style details; they are the input for the style similarities.
styles_details = pd.read_csv('data/beer_style_details.csv')

# How many rows are incomplete (at least one NaN), in absolute and relative terms?
rows_with_nan_mask = styles_details.isna().any(axis=1)
num_rows_with_nan = rows_with_nan_mask.sum()
total_rows = len(styles_details)
proportion_with_nan = num_rows_with_nan / total_rows

# Report the counts.
print(f"Number of rows with NaN values: {num_rows_with_nan:d}")
print(f"total number of rows in the DataFrame: {total_rows:d}")
print(f"Proportion of rows with NaN values: {proportion_with_nan:.2f}")

# Keep only complete rows and drop the index column that came back from the CSV.
complete_rows = styles_details.dropna()
styles_details = complete_rows.drop(columns='Unnamed: 0').copy()
display(styles_details)

Number of rows with NaN values: 23
total number of rows in the DataFrame: 171
Proportion of rows with NaN values: 0.13


Unnamed: 0,Original Style Name,Modified Style Name,Color_SRM,Bitterness_IBU,Alcohol_ABV,Color,Clarity,Perceived Malt Aroma & Flavor,Perceived Hop Aroma & Flavor,Body
1,Scottish Ale,Scottish-Style Ale,6-19,9-25,2.8-5.3%,Gold to Dark Chestnut or Brown,Clear,"Caramel, Toffee",Hop aroma and flavor are not perceived. Hop bi...,Soft
2,Pilsener,Bohemian-Style Pilsener,3-7,30-45,4.1-5.1%,Straw to Light Amber,Clear to Brilliant,"Toasted, biscuit-like, and/or bready malt flav...","Hop aroma and flavor are low to medium-low, de...",Soft
3,Brown Ale,American Brown Ale,15-26,25-45,4.2-6.3%,Deep Copper to Very Dark Brown,Clear,"Caramel, Chocolate, Toast",Hop aroma and flavor are low to medium. Hop bi...,Varies
4,Scotch Ale / Wee Heavy,Scotch Ale/Wee Heavy,15-30,25-35,6.2-8%,Light Reddish Brown to Very Dark,Clear,"Dominated by a smooth, balanced sweet maltines...",Hop aroma and flavor are not perceived. Hop bi...,Mouth-Coating
5,Smoked Beer,Smoke Beer,1-100,1-120,0-15%,Varies,Varies,Varies,Varies,Varies
...,...,...,...,...,...,...,...,...,...,...
162,Scotch Ale,Scotch Ale/Wee Heavy,15-30,25-35,6.2-8%,Light Reddish Brown to Very Dark,Clear,"Dominated by a smooth, balanced sweet maltines...",Hop aroma and flavor are not perceived. Hop bi...,Mouth-Coating
164,Munich Dunkel Lager,South German-Style Dunkel Weizen,20-50,10-15,4.80%-5.40%,Copper-brown to very dark,"If served with yeast, appearance may be very c...",Distinct sweet maltiness and a chocolate-like ...,Not present,Medium to full
165,English Pale Ale,English-Style Pale Ale (ESB),5-16,20-40,4.5-5.5%,Gold to Copper,Clear to Brilliant,Residual malt and defining sweetness is medium...,Hop flavor is medium to medium-high. Hop bitte...,Moderate
166,Bock,German-Style Bock,20-30,20-30,6.3-7.5%,Dark Brown to Very Dark,Clear,High malt character with aromas of toasted or ...,Hop flavor is low. Hop bitterness is perceived...,Soft


#### Calculate both the mean and the range for each of the numerical features

In [35]:
def percent_range_to_mean_and_range(x):
    """Parse a percentage range such as '4.5-5.5%' into (mean, width) fractions.

    The string is split on '-', '%' signs are stripped from both ends of each
    part, and each part is parsed as a float.

    Parameters
    ----------
    x : str
        Percentage or percentage range, e.g. '6-8%', '4.80%-5.40%' or '5%'.

    Returns
    -------
    tuple
        (mean of the parsed numbers / 100, (max - min) / 100). The width is the
        integer 0 when the string holds a single number.
    """
    values = []
    for piece in x.split('-'):
        values.append(float(piece.strip('%')))

    # Dividing by 100 turns percentages into decimal fractions.
    mean_fraction = np.mean(values) / 100
    if len(values) > 1:
        width_fraction = (max(values) - min(values)) / 100
    else:
        width_fraction = 0
    return mean_fraction, width_fraction

def mean_and_range_of_range(x):
    """Parse a numeric range such as '6-19' into (mean, width).

    Parameters
    ----------
    x : str
        A single number ('7') or numbers separated by '-' ('6-19').

    Returns
    -------
    tuple
        (mean of the numbers, max - min). For a string without '-', the result
        is (float(x), 0).
    """
    if '-' not in x:
        return float(x), 0
    bounds = list(map(float, x.split('-')))
    return np.mean(bounds), max(bounds) - min(bounds)

# Which parser turns each range column into its (mean, width) pair.
range_parsers = {
    'Color_SRM': mean_and_range_of_range,
    'Bitterness_IBU': mean_and_range_of_range,
    'Alcohol_ABV': percent_range_to_mean_and_range,
}

# Add '<feature>_mean' and '<feature>_range' columns for every range column.
for feature, parse in range_parsers.items():
    styles_details[[f'{feature}_mean', f'{feature}_range']] = styles_details[feature].apply(
        lambda text, parse=parse: pd.Series(parse(text))
    )

# All means first, then all widths — the same column order as before.
numerical_features = [
    f'{feature}{suffix}'
    for suffix in ('_mean', '_range')
    for feature in range_parsers
]

# Min-max normalise the numeric features, widths included.
scaler = MinMaxScaler()
styles_details[numerical_features] = scaler.fit_transform(styles_details[numerical_features])


#### Obtain vector representations for each text features

In [36]:
# Load the pretrained 'glove-wiki-gigaword-50' model through gensim's downloader API.
# NOTE(review): the '50' in the name is presumably the vector dimension — confirm.
glove_model = api.load('glove-wiki-gigaword-50')

# Convert a piece of text into a single embedding vector.
def document_vector(doc, model=None):
    """Average the word vectors of every known word in `doc`.

    The text is lower-cased and split on whitespace. A token that is a
    vocabulary key is used as is. Otherwise the token is broken into its
    alphanumeric pieces (so 'caramel,' yields 'caramel' and 'medium-low'
    yields 'medium' and 'low') and the pieces found in the vocabulary are
    used. Previously such tokens were dropped entirely, which discarded most
    words of comma-separated descriptions.

    Parameters
    ----------
    doc : str
        Text to embed.
    model : optional
        Object exposing `key_to_index`, `vector_size` and list indexing
        (e.g. a gensim KeyedVectors). Defaults to the notebook-level
        `glove_model`.

    Returns
    -------
    np.ndarray
        Mean of the matched word vectors, or a zero vector of length
        `model.vector_size` when no word is known.
    """
    import re

    vectors = glove_model if model is None else model
    vocab = vectors.key_to_index

    words = []
    for token in doc.lower().split():
        if token in vocab:
            words.append(token)
            continue
        # Fall back to the alphanumeric pieces of the token.
        pieces = (piece.strip("'") for piece in re.split(r"[^a-z0-9']+", token))
        words.extend(piece for piece in pieces if piece and piece in vocab)

    if not words:
        return np.zeros(vectors.vector_size)
    return np.mean(vectors[words], axis=0)

# Text columns that together describe a style.
text_features = ['Color', 'Clarity', 'Perceived Malt Aroma & Flavor', 'Perceived Hop Aroma & Flavor', 'Body']


def _combine_text(row):
    """Join the text features of one row into a single space-separated string."""
    return ' '.join(str(row[name]) for name in text_features)


# One combined string per style ...
styles_details['combined_text'] = styles_details.apply(_combine_text, axis=1)

# ... and one embedding vector per combined string.
styles_details['text_vec'] = styles_details['combined_text'].apply(document_vector)

### Calculate Style Similarities

In [37]:
def _row_to_feature_vector(row):
    """Append the row's text embedding to its scaled numeric features."""
    return np.concatenate([row[numerical_features].values, row['text_vec']])


# One combined feature vector per style.
styles_details['combined_vector'] = styles_details.apply(_row_to_feature_vector, axis=1)

# Stack the vectors into a (styles x features) matrix and compare every pair of rows.
combined_vectors_matrix = np.vstack(styles_details['combined_vector'].values)
similarity_matrix = cosine_similarity(combined_vectors_matrix)

# Label rows and columns with the original style names.
identifiers = styles_details['Original Style Name'].values
similarity_df = pd.DataFrame(similarity_matrix, index=identifiers, columns=identifiers)

# Save the labelled similarity matrix.
similarity_df.to_csv('data/similarity_matrix.csv')