In [1]:
import pandas as pd
import numpy as np
from torch import nn
import torch
import time

from sklearn.model_selection import train_test_split

### Get Categorical One-Hot 

In [2]:
df  = pd.read_csv('data/tea_data.csv')
df.head()

Unnamed: 0,Account Id,Review Link,Tea Name,Aroma Rating,Flavor Rating,Value Rating,Total Rating,Brand,Style,Region,Caffeine,Loose Type,Product page
0,1,https://ratetea.com//review/6625/,Classic Robust Jin Jun Mei Black Tea of Fujian,10,5,5,100,Yunnan Sourcing,Black Tea,"Wuyi, Fujian, China",Caffeinated,Loose,https://yunnansourcing.com/products/classic-ro...
1,1,https://ratetea.com//review/6189/,Wu Liang Hong Mao Feng Yunnan Black Tea,10,5,5,100,Yunnan Sourcing,Yunnan Gold,"Simao, Yunnan, China",Caffeinated,Loose,https://yunnansourcing.com/products/wu-liang-h...
2,1,https://ratetea.com//review/5755/,Jingmai Mountain Wild Arbor Black Tea of Sprin...,10,5,5,100,Yunnan Sourcing,Yunnan Gold,"Lancang, Yunnan, China",Caffeinated,Loose,https://yunnansourcing.com/products/jingmai-mo...
3,1,https://ratetea.com//review/4567/,Black King,10,5,5,100,Harney and Sons,Black Tea,"Hunan, China",Caffeinated,Loose,https://www.harney.com/products/black-king?var...
4,1,https://ratetea.com//review/4203/,Ruby Black Tea (Whole Leaf),10,5,5,100,Health & Tea,Black Tea,Taiwan / Formosa,Caffeinated,Loose,https://www.healthandtea.com/product/ruby-blac...


In [4]:
cat_data = df[['Brand', 'Style', 'Region', 'Caffeine', 'Loose Type']]

In [5]:
one_hot_data = pd.get_dummies(cat_data, dtype=np.int64).to_numpy()

### Train Embeddings

In [6]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, embedding_dim = 5, div_value = 10):
        super(Autoencoder, self).__init__()
        d1 = input_dim // div_value
        d2 = d1 // div_value
        
        self.encoding = nn.Sequential(
            nn.Linear(input_dim, d1),
            nn.ReLU(),
            nn.Linear(d1, d2),
            nn.ReLU(),
            nn.Linear(d2, embedding_dim), # bottleneck
            nn.ReLU()
        )

        self.decoding = nn.Sequential(
            nn.Linear(embedding_dim, d2),
            nn.ReLU(),
            nn.Linear(d2, d1),
            nn.ReLU(),
            nn.Linear(d1, input_dim)
        )
    def forward(self, x):
        encode = self.encoding(x)
        decode = self.decoding(encode)
        return encode, decode

In [7]:
batch_size = 4

X_train, X_val = train_test_split(one_hot_data, test_size=0.1, random_state=42)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)

train_loader = torch.utils.data.DataLoader(X_train_tensor, batch_size=batch_size, shuffle=True)

In [8]:
embedding_dim = 4
epochs = 30

model = Autoencoder(input_dim=one_hot_data.shape[1], embedding_dim=embedding_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [9]:
for epoch in range(epochs):
    start_time = time.time()
    model.train()
    total_loss = 0
    for batch in train_loader:
        encode, decode = model(batch)
        loss = criterion(decode, batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    end_time = time.time()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}, Time: {end_time-start_time:.4f} sec")

Epoch 1, Loss: 5.6819, Time: 13.2228 sec
Epoch 2, Loss: 4.6692, Time: 15.0034 sec
Epoch 3, Loss: 4.4582, Time: 15.7491 sec
Epoch 4, Loss: 4.3490, Time: 16.2621 sec
Epoch 5, Loss: 4.2616, Time: 16.4592 sec
Epoch 6, Loss: 4.1822, Time: 15.4008 sec
Epoch 7, Loss: 4.1146, Time: 15.8497 sec
Epoch 8, Loss: 4.0621, Time: 15.4078 sec
Epoch 9, Loss: 4.0273, Time: 15.6067 sec
Epoch 10, Loss: 4.0034, Time: 15.8946 sec
Epoch 11, Loss: 3.9840, Time: 16.2450 sec
Epoch 12, Loss: 3.9661, Time: 16.4243 sec
Epoch 13, Loss: 3.9496, Time: 16.2223 sec
Epoch 14, Loss: 3.9345, Time: 16.6035 sec
Epoch 15, Loss: 3.9204, Time: 16.2486 sec
Epoch 16, Loss: 3.9088, Time: 16.3157 sec
Epoch 17, Loss: 3.8920, Time: 16.1428 sec
Epoch 18, Loss: 3.8826, Time: 16.7793 sec
Epoch 19, Loss: 3.8709, Time: 17.5982 sec
Epoch 20, Loss: 3.8608, Time: 18.6465 sec
Epoch 21, Loss: 3.8514, Time: 17.7480 sec
Epoch 22, Loss: 3.8426, Time: 17.1709 sec
Epoch 23, Loss: 3.8325, Time: 17.5690 sec
Epoch 24, Loss: 3.8217, Time: 17.5437 sec
E

### Get Embeddings for all Teas

In [10]:
one_hot_tensor = torch.tensor(one_hot_data, dtype=torch.float32)

In [11]:
one_hot_tensor.shape

torch.Size([5346, 709])

In [12]:
embeds = []
for tea in one_hot_tensor:
    with torch.no_grad():
        embed, _ = model(tea)
        embeds.append(embed)
embeds = np.stack(embeds)

Save Embeddings

In [13]:
pd.DataFrame(embeds).to_csv('data/content_embeddings.csv', header=False, index=False)