In [534]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd

## Prepare Dataset

In [535]:
# Read the clustered table and discard the cluster assignment column; every
# remaining column is fed to the autoencoder as a float32 feature.
data = pd.read_excel("data/clustered_data.xlsx", index_col=0).drop("cluster_labels", axis=1)
data_tensor = torch.tensor(data.values, dtype=torch.float32)

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
train_tensor, val_tensor = train_test_split(
    data_tensor,
    test_size=0.2,
    random_state=42,
)

# Each dataset wraps a single tensor, so every batch is a 1-tuple (inputs,).
train_dataset = TensorDataset(train_tensor)
val_dataset = TensorDataset(val_tensor)

# Only the training batches are reshuffled each epoch.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=128, shuffle=False)

## AutoEncoder

In [536]:
class EnhancedLR(nn.Module):
    """Logistic-regression head: a single linear unit followed by a sigmoid.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return the sigmoid of the linear unit, shape ``(batch, 1)``."""
        return torch.sigmoid(self.linear(x))

    def predict_proba(self, x):
        """Return class probabilities as a NumPy array of shape ``(batch, 2)``.

        Column 0 is ``1 - p`` and column 1 is ``p``, where ``p`` is the output
        of :meth:`forward`. Gradients are not tracked.
        """
        with torch.no_grad():
            positive = self.forward(x)
            # Tensor.numpy() only works on CPU tensors, so bring the result
            # back to host memory first (a no-op when already on CPU).
            return torch.cat((1 - positive, positive), dim=1).cpu().numpy()


In [537]:
class Autoencoder(nn.Module):
    """Dense autoencoder with a logistic-regression head on the embedding.

    The encoder maps ``input_dim`` features to ``embedding_dim`` through one
    hidden layer of 1024 units (ReLU + dropout). The decoder mirrors it and
    ends in a sigmoid, so reconstructions lie in (0, 1). The classifier is an
    ``EnhancedLR`` applied to the embedding.

    Args:
        input_dim: Number of input features per sample.
        embedding_dim: Size of the bottleneck embedding.
        reconstruction_loss: Callable ``(outputs, inputs) -> scalar tensor``.
        classification_loss: Callable ``(predictions, labels) -> scalar tensor``.
        dropout_rate: Dropout probability used inside the encoder.
    """

    def __init__(self, input_dim, embedding_dim, reconstruction_loss, classification_loss, dropout_rate = 0.15):
        super().__init__()

        self.input_dim = input_dim
        self.reconstruction_loss = reconstruction_loss
        self.classification_loss = classification_loss
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(1024, embedding_dim),
        )

        self.decoder = nn.Sequential(
            nn.Linear(embedding_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, input_dim),
            nn.Sigmoid(),
        )

        self.classifier = EnhancedLR(embedding_dim)

        # The training loops move every batch to self.device, so the
        # parameters must live there as well; otherwise any CUDA machine hits
        # a device-mismatch error on the first forward pass.
        self.to(self.device)

    @staticmethod
    def _mean(total, count):
        """Average ``total`` over ``count`` batches; NaN for an empty loader."""
        return total / count if count else float('nan')

    def _embed(self, x):
        """Move ``x`` to the model's device and run it through the encoder."""
        return self.encoder(x.to(self.device))

    def forward(self, x):
        """Return the reconstruction of ``x`` (encoder followed by decoder)."""
        return self.decoder(self._embed(x))

    def predict(self, x):
        """Return the classifier output for ``x``, shape ``(batch, 1)``.

        Runs in whatever mode the module is currently in and keeps gradient
        tracking, so call ``model.eval()`` first when using it for inference
        (dropout is otherwise active).
        """
        return self.classifier(self._embed(x))

    def predict_proba(self, x):
        """Return ``[1 - p, p]`` per sample as a NumPy array of shape ``(batch, 2)``.

        Inference-only: the module is switched to eval mode for the call so
        dropout does not randomise the probabilities, and the previous mode is
        restored afterwards.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                positive = self.predict(x)
        finally:
            self.train(was_training)
        # .cpu() because Tensor.numpy() rejects tensors that live on the GPU.
        return torch.cat((1 - positive, positive), dim=1).cpu().numpy()

    def unsupervised_train(self, num_epochs, train_dataloader, val_dataloader, optimizer):
        """Train encoder and decoder on the reconstruction loss only.

        Args:
            num_epochs: Number of passes over ``train_dataloader``.
            train_dataloader: Yields batches whose first element is the input tensor.
            val_dataloader: Same batch layout; evaluated after every epoch
                without gradient tracking.
            optimizer: Optimizer stepping this model's parameters.

        Prints the mean per-batch training and validation loss for each epoch.
        """
        for epoch in range(num_epochs):
            self.train()
            total_train_loss = 0

            # Training phase
            for batch in train_dataloader:
                optimizer.zero_grad()
                inputs = batch[0].to(self.device)
                outputs = self(inputs)
                loss = self.reconstruction_loss(outputs, inputs)
                loss.backward()
                optimizer.step()
                total_train_loss += loss.item()

            # Validation phase
            self.eval()
            total_val_loss = 0
            with torch.no_grad():
                for batch in val_dataloader:
                    inputs = batch[0].to(self.device)
                    outputs = self(inputs)
                    val_loss = self.reconstruction_loss(outputs, inputs)
                    total_val_loss += val_loss.item()

            avg_train_loss = self._mean(total_train_loss, len(train_dataloader))
            avg_val_loss = self._mean(total_val_loss, len(val_dataloader))

            print(f'Epoch: {epoch+1}, Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}')

    def fit(self, num_epochs, train_dataloader, optimizer, lambda_l1=0.2):
        """Jointly fine-tune the autoencoder and train the classifier.

        The optimised objective per batch is
        ``reconstruction_loss + classification_loss + lambda_l1 * L1(classifier)``.

        Args:
            num_epochs: Number of passes over ``train_dataloader``.
            train_dataloader: Yields ``(inputs, labels)`` batches.
            optimizer: Optimizer stepping this model's parameters.
            lambda_l1: Strength of the L1 penalty on the classifier parameters.

        Prints the mean per-batch total, reconstruction and classification
        loss for each epoch.
        """
        for epoch in range(num_epochs):
            self.train()
            total_train_loss = 0
            total_classification_loss = 0
            total_recon_loss = 0

            # Training phase
            for inputs, labels in train_dataloader:

                inputs, labels = inputs.to(self.device), labels.to(self.device)
                optimizer.zero_grad()

                # Encode once and share the embedding between both heads, so
                # the decoder and the classifier see the same dropout mask and
                # the encoder is not evaluated twice per batch.
                embedding = self.encoder(inputs)

                # Fine-tune the embeddings
                outputs = self.decoder(embedding)
                reconstruction_loss = self.reconstruction_loss(outputs, inputs)

                # Train the classifier
                predictions = self.classifier(embedding)
                classification_loss = self.classification_loss(predictions, labels)

                # L1 regularization: sum of absolute parameter values. Built
                # from the parameters themselves so it lives on their device.
                l1_reg = sum(param.abs().sum() for param in self.classifier.parameters())

                # Optimize both
                loss = reconstruction_loss + classification_loss + (l1_reg * lambda_l1)

                loss.backward()
                optimizer.step()

                # Accumulate (not overwrite) so the epoch averages below cover
                # every batch rather than only the last one.
                total_train_loss += loss.item()
                total_classification_loss += classification_loss.item()
                total_recon_loss += reconstruction_loss.item()

            num_batches = len(train_dataloader)
            avg_train_loss = self._mean(total_train_loss, num_batches)
            avg_class_loss = self._mean(total_classification_loss, num_batches)
            avg_recon_loss = self._mean(total_recon_loss, num_batches)

            print(f'Epoch: {epoch+1}, Training Loss: {avg_train_loss}, Recon Loss: {avg_recon_loss}, Classification Loss: {avg_class_loss}')

In [538]:
# Seed PyTorch's global RNG before any parameters are created so that weight
# initialisation and dropout are reproducible on Restart & Run All.
torch.manual_seed(42)

input_dim = data.shape[1]
embedding_dim = 32

# Both heads use binary cross-entropy: the decoder and the classifier each end
# in a sigmoid, so their outputs are valid BCELoss inputs.
reconstruction_loss = nn.BCELoss()
classification_loss = nn.BCELoss()

model = Autoencoder(input_dim, embedding_dim, reconstruction_loss, classification_loss)
# One optimizer over all parameters; it is reused for both the unsupervised
# pre-training and the supervised fit further down.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [539]:
# Pre-train encoder and decoder on reconstruction only, for 10 epochs.
model.unsupervised_train(
    num_epochs=10,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    optimizer=optimizer,
)

Epoch: 1, Training Loss: 0.5202501349978976, Validation Loss: 0.24255972603956857
Epoch: 2, Training Loss: 0.22826355530156028, Validation Loss: 0.22056570649147034
Epoch: 3, Training Loss: 0.19947504334979588, Validation Loss: 0.1940621038277944
Epoch: 4, Training Loss: 0.18265177971786922, Validation Loss: 0.18069389959176382
Epoch: 5, Training Loss: 0.17493641045358446, Validation Loss: 0.17681154112021127
Epoch: 6, Training Loss: 0.1710172494252523, Validation Loss: 0.17329390347003937
Epoch: 7, Training Loss: 0.1666908793979221, Validation Loss: 0.16795764366785684
Epoch: 8, Training Loss: 0.16045398844612968, Validation Loss: 0.16116819282372793
Epoch: 9, Training Loss: 0.15167764325936636, Validation Loss: 0.1532986064751943
Epoch: 10, Training Loss: 0.14495214654339683, Validation Loss: 0.1463757554690043


In [540]:
# Persist the whole module object. NOTE(review): this pickles the class by
# reference, so loading requires the same Autoencoder/EnhancedLR definitions
# to be importable; only load this file from a trusted source.
torch.save(obj=model, f="auto_encoder.pkl")

## Evaluate

In [541]:
from sklearn.linear_model import LogisticRegression

In [542]:
# Reload the recipe table for evaluation, again without the cluster column.
recipes = pd.read_excel("data/clustered_data.xlsx", index_col=0).drop("cluster_labels", axis=1)
# `data` is rebound to the same frame object as `recipes`.
data = recipes

In [543]:
# Display the recipe feature table (one row per recipe, one column per ingredient).
recipes

Unnamed: 0,Fresh Beans,Tomato,Sugar,Salt (Non-Iodized),Olive Oil,Dried Onion,Potato (in Shell),Beef (Low Fat),Charliston Pepper,Tomato Paste,...,Paste Types,Flour Mixture,Sesame,Arugula,Milk,Corn Starch,Fresh Basil,Carbonate,Red Onion,Cherry Tomatoes
0,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1924,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1925,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1929,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1930,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [544]:
# Build the feedback table inside this cell: previously `users_feedback` was
# only assigned by a later cell, so displaying it here raised NameError on a
# fresh kernel (Restart & Run All).
users_feedback = pd.read_excel("data/recipe_logs.xlsx", index_col=0).drop("id", axis=1)
# Keep only the feedback rows of the user being evaluated.
users_feedback = users_feedback[users_feedback.uuid == "Derin"]
users_feedback

Unnamed: 0,uuid,recipe_id,recipe,is_accepted
0,Derin,505,"{""CookingTime"": 30, ""Cuisine"": ""World"", ""Ingre...",0
1,Derin,1699,"{""CookingTime"": 10, ""Cuisine"": ""T\u00fcrkiye"",...",1
2,Derin,435,"{""CookingTime"": 45, ""Cuisine"": ""T\u00fcrkiye"",...",0
3,Derin,583,"{""CookingTime"": 20, ""Cuisine"": ""T\u00fcrkiye"",...",0
4,Derin,1714,"{""CookingTime"": 40, ""Cuisine"": ""T\u00fcrkiye"",...",1
5,Derin,1833,"{""CookingTime"": 30, ""Cuisine"": ""T\u00fcrkiye"",...",0
6,Derin,806,"{""CookingTime"": 15, ""Cuisine"": ""T\u00fcrkiye"",...",0
7,Derin,1597,"{""CookingTime"": 15, ""Cuisine"": ""T\u00fcrkiye"",...",0
8,Derin,900,"{""CookingTime"": 15, ""Cuisine"": ""World"", ""Ingre...",1
12,Derin,306,"{""CookingTime"": 10, ""Cuisine"": ""T\u00fcrkiye"",...",0


In [545]:
# Load the raw feedback log and drop its `id` column.
test_set = pd.read_excel("data/recipe_logs.xlsx", index_col=0).drop("id", axis=1)
# Restrict to a single user's feedback; `is_accepted` is the target.
users_feedback = test_set[test_set.uuid == "Derin"]
labels = users_feedback.is_accepted

In [546]:
# Keep only this user's rows (a no-op if the frame is already filtered).
is_target_user = users_feedback["uuid"] == "Derin"
users_feedback = users_feedback[is_target_user]

In [547]:
# Pick the feature rows of the recipes this user rated, in feedback order.
# .loc is label-based, so every recipe_id must exist in the recipes index.
rated_recipe_ids = users_feedback["recipe_id"].values
recipes = recipes.loc[rated_recipe_ids]

In [548]:
# train_test_split is already imported in the notebook's first cell, so the
# duplicate import that used to sit here has been removed.
# Stratify on the labels so both parts keep the accepted/rejected ratio.
# NOTE(review): test_size=0.60 leaves only 40% of the rows for training;
# confirm this is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    recipes,
    labels,
    test_size=0.60,
    random_state=42,
    stratify=labels,
)

In [549]:
# Relabel the training targets with the index of the training features so the
# two objects share the same row labels; the values themselves are untouched.
y_train.index = X_train.index

In [550]:
# Features and targets as float32 tensors; the targets get a trailing axis so
# they have shape (n, 1), matching the classifier output.
X_tensor, y_tensor = (
    torch.tensor(X_train.values, dtype=torch.float32),
    torch.tensor(y_train.values, dtype=torch.float32)[:, None],
)

# Batches are (inputs, labels) pairs, reshuffled every epoch.
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset=dataset, batch_size=128, shuffle=True)

In [551]:
# Number of feature columns in the recipe table.
input_dim = len(recipes.columns)

In [552]:
# scikit-learn baseline on the raw features: L2-penalised logistic regression
# (a small C means a strong penalty).
simple_LR = LogisticRegression(
    penalty='l2',
    C=0.1,
    n_jobs=-1,
    max_iter=1000,
)

In [560]:
# Fit the baseline on the raw training features ...
simple_LR.fit(X=X_train, y=y_train)
# ... then fine-tune the autoencoder and train its classifier head for 5
# epochs on the same rows.
model.fit(num_epochs=5, train_dataloader=dataloader, optimizer=optimizer)

Epoch: 1, Training Loss: 0.7295981645584106, Recon Loss: 0.0795564353466034, Classification Loss: 0.21470609307289124
Epoch: 2, Training Loss: 0.6816551089286804, Recon Loss: 0.07532789558172226, Classification Loss: 0.17653021216392517
Epoch: 3, Training Loss: 0.6428464651107788, Recon Loss: 0.07159875333309174, Classification Loss: 0.14710985124111176
Epoch: 4, Training Loss: 0.6092850565910339, Recon Loss: 0.06737720966339111, Classification Loss: 0.12352975457906723
Epoch: 5, Training Loss: 0.5784178972244263, Recon Loss: 0.058660365641117096, Classification Loss: 0.10706464946269989


In [554]:
# Accuracy of the scikit-learn baseline on the held-out rows.
simple_LR.score(X_test, y_test)

0.625

In [555]:
# Convert the held-out features to a float32 tensor. The guard keeps the cell
# idempotent: once X_test is a tensor, re-running the cell would otherwise
# fail because a tensor has no DataFrame-style `.values` array.
if not torch.is_tensor(X_test):
    X_test = torch.tensor(X_test.values, dtype=torch.float32)

In [561]:
# fit() leaves the model in training mode, where dropout is active; switch to
# eval mode so the predictions are deterministic.
model.eval()
with torch.no_grad():
    # .cpu() so the result can be converted to NumPy regardless of device.
    probabilities = model.predict(X_test).cpu()

# Threshold at 0.5 to obtain hard 0/1 predictions, shape (n, 1).
# NOTE: this rebinds `labels`, which earlier held the ground-truth Series.
labels = (probabilities >= 0.5).float().numpy()

In [562]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows where the thresholded prediction matches the
# ground truth.
accuracy_score(y_true=y_test, y_pred=labels)

0.625

In [563]:
# Inspect the thresholded predictions produced by the autoencoder classifier.
labels

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]], dtype=float32)