In [1]:
import pandas as pd
from IPython.display import display, HTML
pd.set_option('display.max_rows', 500)

In [2]:
df = pd.read_csv('beer_data5000.csv')
df.head(10)


FileNotFoundError: [Errno 2] No such file or directory: 'beer_data5000.csv'

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
import pandas as pd

# Load the falcon-11B tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-11b")
model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-11b")


def encode_description(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    outputs = model(**inputs)
    # Mean pooling over the last hidden states
    embeddings = torch.mean(outputs.last_hidden_state, dim=1)
    return embeddings.detach().numpy()[0]  # Convert to a 1D numpy array

# Assuming your DataFrame is named `df`
df['description_embedding'] = df['description'].apply(encode_description)


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

: 

In [67]:
import numpy as np

# Stack embeddings into a 2D array for VAE input
embeddings = np.vstack(df['description_embedding'].values)


In [68]:
import torch.nn as nn

class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(VAE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, latent_dim * 2)  # Mean and log variance for the latent space
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
            nn.Sigmoid()
        )

    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        x = self.encoder(x)
        mu, log_var = x.chunk(2, dim=-1)
        z = self.reparameterize(mu, log_var)
        return self.decoder(z), mu, log_var


In [70]:
import torch.optim as optim

input_dim = embeddings.shape[1]
latent_dim = 10  # Adjust as needed
vae = VAE(input_dim, latent_dim)
optimizer = optim.Adam(vae.parameters(), lr=0.001)

# Define loss function
def vae_loss(recon_x, x, mu, log_var):
    recon_loss = nn.functional.mse_loss(recon_x, x, reduction='sum')
    kl_divergence = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
    return recon_loss + kl_divergence

# Training loop
epochs = 10  # Adjust as needed
for epoch in range(epochs):
    vae.train()
    optimizer.zero_grad()
    data = torch.tensor(embeddings, dtype=torch.float32)
    recon_data, mu, log_var = vae(data)
    loss = vae_loss(recon_data, data, mu, log_var)
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch + 1}, Loss: {loss.item()}')
    
with torch.no_grad():
    latent_representations, _ = vae.encoder(torch.tensor(embeddings, dtype=torch.float32)).chunk(2, dim=-1)
    df['latent_representation'] = latent_representations.numpy().tolist()



Epoch 1, Loss: 44315.8515625
Epoch 2, Loss: 43265.1484375
Epoch 3, Loss: 42379.6640625
Epoch 4, Loss: 41506.29296875
Epoch 5, Loss: 40507.2109375
Epoch 6, Loss: 39547.86328125
Epoch 7, Loss: 38465.203125
Epoch 8, Loss: 37141.375
Epoch 9, Loss: 35653.19140625
Epoch 10, Loss: 34016.078125


In [75]:
def recommend_beer(prompt, df, vae, tokenizer, model, top_n=5):
    # Step 1: Encode the prompt using BERT
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, padding=True)
    outputs = model(**inputs)
    prompt_embedding = torch.mean(outputs.last_hidden_state, dim=1)

    # Step 2: Project the prompt embedding into the VAE's latent space
    with torch.no_grad():
        prompt_latent, _ = vae.encoder(prompt_embedding).chunk(2, dim=-1)
        prompt_latent = prompt_latent.squeeze(0)  # Remove batch dimension if present

    # Step 3: Calculate similarity between prompt latent and beer latent representations
    similarities = []
    for beer_latent in df['latent_representation']:
        beer_latent_tensor = torch.tensor(beer_latent, dtype=torch.float32)
        
        # Ensure beer_latent_tensor matches prompt_latent's shape for cosine similarity
        similarity = torch.cosine_similarity(prompt_latent, beer_latent_tensor, dim=0)
        similarities.append(similarity.item())

    # Step 4: Find the top N similar beers
    top_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:top_n]
    
    # Return the top N recommended beers
    return df.iloc[top_indices][['name', 'link', 'rating', 'brewery', 'location', 'description']]


In [83]:
prompt = "I'm looking for a light, refreshing beer with a fruity taste. Please not malty"
recommended_beers = recommend_beer(prompt, df, vae, tokenizer, model, top_n=5)
display(HTML(recommended_beers.to_html()))


Unnamed: 0,name,link,rating,brewery,location,description
5,Palm 8 Horse Power Blond,https://www.ratebeer.com/beer/palm-8-horse-power-blond/1097512/231465/,3.4,Brouwerij Palm (Swinkels Family Brewers),"Steenhuffel, Londerzeel, Flemish Brabant",Light amber with medium white. Grain aroma with yeasty notes. Medium bitterness with grainy touch mixed with yeasty fruitiness. Could be thicker for its strength.
57,5mans Ain't No schLager,https://www.ratebeer.com/beer/5mans-aint-no-schlager/1184490/32671/,3.3,5mans Bryggeri,"Borlänge, Dalarna","(Draught at Stockholm Beer & Whisky Festival, 5 Sep 2024) Golden colour with frothy, white head. Malty nose with bread, grass and herbs. Malty taste with notes of bread, cereal, hay and a mild herbal bitterness in the finish. Medium body, with a touch of malt sweetness. Clean and fresh. Quite nice."
102,Kalik,https://www.ratebeer.com/beer/kalik/8682/140470/,2.0,Commonwealth Brewery (Heineken),"Nassau, Bahamas","345ml bottle [courtesy of AvB - thanks a lot!]. Clear, orange, dark golden colour with small to average, frothy, half-way lasting, minimally lacing, white head. Grainy, strawy, pale malty aroma, a touch of cereals and brown bread crust. Taste is mildly bitter hoppy, minimally grainy, pale malty basis with some residual , hints of corn, a touch of cereals, minimally soapy overtones.\nMinimally oily, watery texture, smooth and soft, minimally greasy, simultaneously minimally dry palate, medium, scattered, mildly prickly carbonation.\nSimple and monotonous as expected - boring but better than nothing."
51,Monday Night Oatmeal Raisin Cookie Quad,https://www.ratebeer.com/beer/monday-night-oatmeal-raisin-cookie-quad/1237478/24501/,4.0,Monday Night Brewing,"Atlanta, Georgia","Taps coffee sable with a thin head. Aroma provides boozy, roast malt, raisin and oaty tones. Flavor supplies bourbon, roast malt, oatmeal, raisin, syrup. Texture features good body & peppy fizz. Say what you like, I LOVE oatmeal raisin cookies."
44,Monday Night Fun Size,https://www.ratebeer.com/beer/monday-night-fun-size/1237481/24501/,3.7,Monday Night Brewing,"Atlanta, Georgia","Taps coffee with a thin tan head. Aroma has roast malt, chocolate, nutty and subtle molasses tones. Flavor yields rich roast malt, chocolate, nutty and slight molasses notes. Texture features good body and lively fizz. Fun!"
