In [1]:
import torch 
import torch.nn as nn
from pytorch_metric_learning.losses import NTXentLoss
from torch.utils.data import Dataset, DataLoader
import random
import pandas as pd

In [None]:
class ContrastiveLearning(torch.nn.Module):
    """MLP encoder followed by a projection head for contrastive training.

    ``encoder`` maps ``input_dim`` features to an ``embedding_dim`` vector
    through hidden layers of width 1028 and 512.  ``projector`` maps that
    embedding to ``projection_dim`` values through one hidden layer of
    width 256.  Each hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout.

    Args:
        input_dim: number of input features per sample.
        embedding_dim: size of the encoder output.
        projection_dim: size of the projection-head output.
        dropout_rate: dropout probability used after every hidden layer.
    """

    def __init__(self, input_dim, embedding_dim, projection_dim, dropout_rate=0.15):
        super().__init__()

        def hidden_block(in_features, out_features):
            # One hidden layer: affine map, batch norm, ReLU, dropout.
            return [
                nn.Linear(in_features, out_features),
                nn.BatchNorm1d(out_features),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ]

        self.encoder = nn.Sequential(
            *hidden_block(input_dim, 1028),
            *hidden_block(1028, 512),
            nn.Linear(512, embedding_dim),
        )

        self.projector = nn.Sequential(
            *hidden_block(embedding_dim, 256),
            nn.Linear(256, projection_dim),
        )

    def forward(self, x):
        """Return the projection-head output for the batch ``x``."""
        return self.projector(self.encoder(x))

In [None]:
# NT-Xent contrastive loss (pytorch_metric_learning) with temperature 0.10;
# the training loop calls it as criterion(projections, labels).
criterion = NTXentLoss(temperature=0.10)

In [2]:
import pandas as pd

# Feature matrix; the first spreadsheet column becomes the row index.
data = pd.read_excel("data/synthetic_dataset.xlsx", index_col=0)

# Keep only column 4 of the clustering spreadsheet, then relabel its rows
# with the feature matrix's index so both objects share the same index.
cluster_labels = pd.read_excel("data/clusters.xlsx", index_col=0)[4]
cluster_labels.index = data.index

In [3]:
# Display the loaded feature frame (the notebook renders a truncated preview).
data

Unnamed: 0,allspice,almond,amaretto,anise,apple,applesauce,apricot,artichoke,arugula,asparagus,...,watercress,watermelon,wheat,whip,whiskey,wine,wrapper,yeast,yoghurt,yogurt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71616,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71617,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71618,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71619,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Display the per-row cluster assignment (column 4 of the clustering spreadsheet).
cluster_labels

0        0
1        0
2        0
3        1
4        0
        ..
71616    3
71617    0
71618    4
71619    1
71620    4
Name: 4, Length: 69265, dtype: int64

In [None]:
# Seed Python's and PyTorch's RNGs before any weights are created so that
# weight initialisation, DataLoader shuffling, dropout and the dataset's
# random positive sampling are repeatable across Restart & Run All
# (GPU kernels may still introduce small run-to-run differences).
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One input unit per feature column; the embedding is what downstream cells
# consume, the projection is only used by the contrastive loss.
input_dim = data.shape[1]
embedding_dim = 32
projection_dim = 8

model = ContrastiveLearning(input_dim, embedding_dim, projection_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
class ClusterContrastiveDataset(Dataset):
    """Dataset yielding ``(anchor, positive)`` rows that share a cluster label.

    For the row at position ``idx`` the positive is drawn uniformly at random
    (via the ``random`` module) from the *other* rows carrying the same label.

    Cluster membership is indexed once at construction time, so fetching an
    item costs O(1) instead of rescanning every label on each access.  The
    index is a snapshot: later mutation of ``cluster_labels`` is not seen.

    Args:
        data: pandas DataFrame of numeric features, one row per sample.
        cluster_labels: iterable of labels aligned positionally with ``data``.
    """

    def __init__(self, data, cluster_labels):
        self.data = data
        self.cluster_labels = cluster_labels

        # For every row position keep (a) the shared list of positions in its
        # cluster and (b) the row's own slot inside that list.
        self._members_of_row = []
        self._rank_in_cluster = []
        members_by_label = {}
        for position, label in enumerate(cluster_labels):
            members = members_by_label.setdefault(label, [])
            self._rank_in_cluster.append(len(members))
            members.append(position)
            self._members_of_row.append(members)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(item_tensor, positive_item_tensor)`` as float32 tensors.

        Raises:
            IndexError: if the row is the only member of its cluster, so no
                positive sample exists.
        """
        members = self._members_of_row[idx]
        n_candidates = len(members) - 1
        if n_candidates < 1:
            raise IndexError(
                f"row {idx} is the only member of its cluster; "
                "no positive sample is available"
            )

        # Uniform draw over the cluster's other members: choose one of the
        # n - 1 candidate slots and step over this row's own slot.
        slot = random.randrange(n_candidates)
        if slot >= self._rank_in_cluster[idx]:
            slot += 1
        positive_idx = members[slot]

        # Convert through the underlying NumPy row instead of handing the
        # Series itself to torch.tensor.
        item_tensor = torch.tensor(self.data.iloc[idx].values, dtype=torch.float32)
        positive_item_tensor = torch.tensor(
            self.data.iloc[positive_idx].values, dtype=torch.float32
        )

        return item_tensor, positive_item_tensor

In [None]:
# Pair every feature row with a randomly drawn row from the same cluster.
dataset = ClusterContrastiveDataset(data, cluster_labels)

# Reshuffled mini-batches of 512 pairs, loaded in the main process.
dataloader = DataLoader(
    dataset,
    batch_size=512,
    shuffle=True,
    num_workers=0,
)

In [None]:
def train(num_epochs, log_interval):
    """Train the notebook-level ``model`` with the contrastive ``criterion``.

    Uses the global ``model``, ``optimizer``, ``criterion``, ``dataloader``
    and ``device`` defined in other cells.

    Args:
        num_epochs: number of full passes over ``dataloader``.
        log_interval: print the batch loss whenever ``batch_idx`` is a
            multiple of this value; ``0`` or ``None`` disables batch logging.

    Returns:
        list[float]: the mean batch loss of every epoch, in order
        (``0.0`` for an epoch that saw no batches).
    """
    epoch_losses = []

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0

        for batch_idx, (data_i, data_j) in enumerate(dataloader):
            data_i, data_j = data_i.float().to(device), data_j.float().to(device)

            optimizer.zero_grad()

            projections_i = model(data_i)
            projections_j = model(data_j)

            # Stack anchors on top of their positives: row k and row
            # k + batch_size form a positive pair and share label k; every
            # other row in the stacked batch carries a different label.
            # NOTE(review): rows from the same cluster that land in the same
            # batch therefore get different labels here, so the loss presumably
            # treats them as negatives - confirm this is intended.
            projections = torch.cat([projections_i, projections_j], dim=0)

            batch_size = projections_i.size(0)
            labels = torch.arange(batch_size, dtype=torch.long, device=device).repeat(2)

            # Calculate the contrastive loss
            loss = criterion(projections, labels)
            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1

            loss.backward()
            optimizer.step()

            # A falsy interval skips logging instead of dividing by zero.
            if log_interval and batch_idx % log_interval == 0:
                print(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {batch_loss}')

        epoch_losses.append(total_loss / max(num_batches, 1))

    return epoch_losses

In [None]:
# Run 5 training epochs, printing the loss after every batch.
train(5,1)

In [None]:
# Serialize the whole model object (not just a state_dict) to "model.pkl"
# in the current working directory.
torch.save(model, "model.pkl")

In [None]:
# Reload the model from the path this notebook saves it to ("model.pkl");
# the notebook never creates 'model/model.pkl', so loading from there fails
# on a fresh Restart & Run All.
# - map_location="cpu": the cells below feed CPU tensors to the model.
# - weights_only=False: the file holds a fully pickled nn.Module, which the
#   weights-only loader (the default in recent PyTorch releases) refuses.
# SECURITY: unpickling can execute arbitrary code - only load files that
# were produced by this notebook or another trusted source.
model = torch.load("model.pkl", map_location="cpu", weights_only=False)

In [None]:
model.eval()

# Convert the Pandas series to a tensor, add an extra batch dimension and
# place it on the same device as the model's weights (a CPU tensor fed to a
# GPU model raises a device-mismatch error).
model_device = next(model.parameters()).device
single_sample = (
    torch.tensor(dataset.data.iloc[100].values)
    .float()
    .unsqueeze(0)
    .to(model_device)
)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    single_embedding = model.encoder(single_sample)

single_embedding

In [None]:
from finch import FINCH

# Embed every row with the trained encoder.
# - eval(): BatchNorm uses running statistics and Dropout is disabled.
# - no_grad(): no autograd graph is recorded for the full-dataset forward pass.
# - inputs are moved to the model's device to avoid a device mismatch.
model.eval()
model_device = next(model.parameters()).device
with torch.no_grad():
    embeddings = model.encoder(
        torch.tensor(dataset.data.values).float().to(model_device)
    )

# Later cells and FINCH work on host memory; .cpu() is a no-op on CPU tensors.
embeddings = embeddings.cpu()

embeddings_np = embeddings.numpy()

c, num_clust, req_c = FINCH(embeddings_np)

In [None]:
# Take column 4 of FINCH's output `c` as the new cluster assignment
# (presumably one column per partition level - verify against the FINCH docs).
# NOTE(review): the index 4 is hard-coded; this raises KeyError when `c` has
# fewer than five columns - confirm against `num_clust`.
new_clusters =  pd.DataFrame(c)[4].values

In [None]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Agreement between the reference cluster labels and the FINCH re-clustering
# of the learned embeddings, measured with ARI and NMI.
ari_score, nmi_score = (
    metric(cluster_labels.values, new_clusters)
    for metric in (adjusted_rand_score, normalized_mutual_info_score)
)

print("Adjusted Rand Index:", ari_score)
print("Normalized Mutual Information:", nmi_score)

In [None]:
# Show the embeddings as a frame indexed like the cluster labels. The tensor is
# converted to NumPy explicitly rather than relying on pandas' implicit
# handling of torch tensors, which has differed between pandas versions.
pd.DataFrame(embeddings.numpy(), index=cluster_labels.index)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project the learned embeddings to 2-D with t-SNE (fixed random_state).
# `embeddings` is the tensor produced earlier in the notebook, not a NumPy
# array. Points are colored by `new_clusters` (the FINCH assignment), not by
# the original `cluster_labels`.
embeddings_2d = TSNE(n_components=2, random_state=0).fit_transform(embeddings)

plt.figure(figsize=(10, 8))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=new_clusters, cmap='viridis', alpha=0.5)
plt.colorbar()  # To show the color scale
plt.title('t-SNE plot of the embeddings colored by cluster label')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.show()

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Baseline view: t-SNE of the raw feature matrix, colored by the reference
# cluster labels. `data` is reused from the loading cell - no cell in between
# modifies it, so re-reading the spreadsheet here was redundant work.
embeddings_2d = TSNE(n_components=2, random_state=0).fit_transform(data)

plt.figure(figsize=(10, 8))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=cluster_labels, cmap='viridis', alpha=0.5)
plt.colorbar()  # To show the color scale
plt.title('t-SNE plot of the actual data')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.show()

## Evaluation 

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the clustered recipe table (first column as index) and discard its
# "cluster_labels" column.
recipes = (
    pd.read_excel("data/clustered_data.xlsx", index_col=0)
    .drop(columns="cluster_labels")
)

In [None]:
# Load the recipe interaction log (first column as index) without its "id"
# column.
test_set = pd.read_excel("data/recipe_logs.xlsx", index_col=0).drop(columns="id")

# `users_feedback` is an alias of the same frame; its `is_accepted` column is
# the classification target.
users_feedback = test_set
labels = users_feedback.is_accepted

In [None]:
# Display the feedback log.
users_feedback

In [None]:
# Select one recipe row per feedback entry, in feedback order, by looking up
# each `recipe_id` in the recipe frame's index.
# NOTE(review): this overwrites `recipes` with its own selection. If
# recipe_id values repeat, the result has duplicate index labels and
# re-running this cell would presumably multiply rows - confirm, and
# re-run the loading cell first if it needs to be executed again.
recipes = recipes.loc[users_feedback.recipe_id.values]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the feedback rows for testing, stratified on the target
# and with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    recipes,
    labels,
    test_size=0.20,
    random_state=42,
    stratify=labels,
)

In [None]:
# Two identically configured, independent classifiers: one is fitted on the
# raw recipe features, the other on the encoder embeddings.
simple_LR, contastive_LR = (
    LogisticRegression(penalty='l2', C=0.1, n_jobs=-1, max_iter=1000)
    for _ in range(2)
)

In [None]:
# Baseline: logistic regression on the raw recipe features.
simple_LR.fit(X_train, y_train)

# Embed both splits with the trained encoder.
# - eval(): BatchNorm uses running statistics and Dropout is disabled.
# - no_grad(): inference only, no autograd graph is recorded.
# - inputs go to the model's device; results come back to the CPU because
#   scikit-learn cannot consume GPU tensors.
model.eval()
model_device = next(model.parameters()).device
with torch.no_grad():
    embeddings = model.encoder(
        torch.tensor(X_train.values).float().to(model_device)
    ).cpu()
    embedding_test = model.encoder(
        torch.tensor(X_test.values).float().to(model_device)
    ).cpu()

# Same classifier configuration, fitted on the learned embeddings.
contastive_LR.fit(embeddings, y_train)

In [None]:
# Training-set score of the classifier fitted on raw recipe features.
simple_LR.score(X_train,y_train)

In [None]:
# Training-set score of the classifier fitted on encoder embeddings.
contastive_LR.score(embeddings,y_train)

In [None]:
# Predicted labels of the raw-feature classifier on the training rows.
simple_LR.predict(X_train)

In [None]:
# Sum of the embedding classifier's predictions on the training rows
# (presumably the number of predicted acceptances, if `is_accepted` is 0/1 -
# TODO confirm).
contastive_LR.predict(embeddings).sum()

In [None]:
# Held-out score of the classifier fitted on raw recipe features.
simple_LR.score(X_test,y_test)

In [None]:
# Held-out score of the classifier fitted on encoder embeddings.
contastive_LR.score(embedding_test,y_test)