# Autoencoder In String Theory

### Dependencies

In [None]:
# Install the third-party packages this notebook imports.
# NOTE: the PyPI distribution is named "scikit-learn"; the "sklearn" alias is
# deprecated, even though the import name stays `sklearn`.
!pip3 install numpy
!pip3 install torch
!pip3 install scikit-learn
!pip3 install pandas

In [1]:
import numpy as np
import math
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

## Neural Network

In [4]:
class Net(nn.Module):
    """Fully connected autoencoder with five encoder and five decoder layers.

    ``dimensions`` supplies the layer widths: ``fc1``..``fc5`` map
    ``dimensions[0]`` to ``dimensions[5]`` (the output of ``encode``) and
    ``fc6``..``fc10`` map ``dimensions[5]`` to ``dimensions[10]``.  Every
    layer, including the last one, is followed by a leaky ReLU.
    """

    # Attribute names of the layers applied by encode() and decode(), in order.
    _ENCODER_LAYERS = ("fc1", "fc2", "fc3", "fc4", "fc5")
    _DECODER_LAYERS = ("fc6", "fc7", "fc8", "fc9", "fc10")

    def __init__(self, dimensions):
        super().__init__()
        # Layers are created in order fc1..fc10 under these exact names, so
        # the parameter keys and the initialisation order stay the same.
        layer_names = self._ENCODER_LAYERS + self._DECODER_LAYERS
        for position, layer_name in enumerate(layer_names):
            linear = nn.Linear(dimensions[position], dimensions[position + 1])
            setattr(self, layer_name, linear)

    def _apply_layers(self, layer_names, activations):
        """Apply the named linear layers in order, each followed by leaky ReLU."""
        for layer_name in layer_names:
            activations = F.leaky_relu(getattr(self, layer_name)(activations))
        return activations

    def encode(self, Layer):
        """Run ``fc1``..``fc5`` on ``Layer`` and return the result."""
        return self._apply_layers(self._ENCODER_LAYERS, Layer)

    def decode(self, Layer):
        """Run ``fc6``..``fc10`` on ``Layer`` and return the result."""
        return self._apply_layers(self._DECODER_LAYERS, Layer)

    def forward(self, Layer):
        """Encode ``Layer`` and decode the result."""
        encoded = self.encode(Layer)
        return self.decode(encoded)

## Data

In [None]:
class CustomDataset(Dataset):
    """Dataset of one-hot encoded rows read from a CSV file.

    All columns of the file are one-hot encoded, with the encoder fitted on
    the file itself, and stored as a float tensor in ``X_train``.  The raw
    values of the last column are stored in ``y_train``.
    """

    def __init__(self, file_name):
        encoder = OneHotEncoder(dtype=np.int8)
        frame = pd.read_csv(file_name)

        # Dense one-hot matrix with one row per CSV row.
        encoded = encoder.fit_transform(frame).toarray()
        last_column = frame.iloc[:, -1].values

        self.X_train = torch.tensor(encoded, dtype=torch.float)
        self.y_train = torch.tensor(last_column)

    def __len__(self):
        """Return the number of encoded rows."""
        return len(self.X_train)

    def __getitem__(self, idx):
        """Return the ``(encoded_row, last_column_value)`` pair at ``idx``."""
        features = self.X_train[idx]
        label = self.y_train[idx]
        return features, label

In [None]:
class CustomDataset1(Dataset):
    """Dataset that one-hot encodes one CSV with an encoder fitted on another.

    The encoder is fitted on ``file_name`` and then applied to ``other_file``,
    so the encoded rows of ``other_file`` use the same column layout as the
    reference file.

    Args:
        file_name: CSV file the one-hot encoder is fitted on.
        other_file: CSV file whose rows are encoded and served by the dataset.
    """

    def __init__(self, file_name, other_file):
        reference = pd.read_csv(file_name)
        other = pd.read_csv(other_file)

        encoder = OneHotEncoder(dtype=np.int8)
        encoder.fit(reference)
        encoded = encoder.transform(other).toarray()

        self.X_train = torch.tensor(encoded, dtype=torch.float)
        # Labels come from the file that was actually encoded, so that
        # X_train[i] and y_train[i] describe the same row.  They were
        # previously read from the reference file.
        self.y_train = torch.tensor(other.iloc[:, -1].values)

    def __len__(self):
        """Return the number of encoded rows."""
        return len(self.X_train)

    def __getitem__(self, idx):
        """Return the ``(encoded_row, last_column_value)`` pair at ``idx``."""
        # The method body was missing in the original cell (a syntax error).
        return self.X_train[idx], self.y_train[idx]

In [2]:
def lenght_ohe(PATH):
    """Return the summed number of distinct values over all columns of a CSV.

    ``PATH`` is passed to ``pandas.read_csv``.  The result is the total of the
    per-column ``nunique()`` counts.
    """
    table = pd.read_csv(PATH)
    return sum(table[name].nunique() for name in table.columns)

In [3]:
def lenghts_features(name_dataset):
    """Return the number of distinct values of each column of a CSV.

    ``name_dataset`` is passed to ``pandas.read_csv``.  The returned list
    follows the column order of the file.
    """
    table = pd.read_csv(name_dataset)
    return [table[name].nunique() for name in table.columns]

## Loss Function

In [None]:
def CustomLossFunction(data,output,criterion,lenghts_data):
    """Sum ``criterion`` over consecutive column groups of ``output`` and ``data``.

    The columns are walked in groups whose widths are given by
    ``lenghts_data``.  For each group, ``criterion(output_slice, data_slice)``
    is evaluated and the results are added up.  Returns ``0`` when
    ``lenghts_data`` is empty.
    """
    total = 0
    start = 0
    for width in lenghts_data:
        stop = start + width
        predicted = output[:, start:stop]
        expected = data[:, start:stop]
        total += criterion(predicted, expected)
        start = stop
    return total

## Training

In [None]:
def train(model, device, train_dataloader, val_dataloader, criterion, optimizer, scheduler):
    """Train ``model`` and save the state with the best validation accuracy.

    Runs ``epocas`` epochs, each with a training and a validation phase.  Per
    epoch one line is appended to ``"Loss_accu" + funcion + ".txt"``
    (``epoch, train_loss, train_acc, valid_loss, valid_acc``).  The model and
    optimizer state of the epoch with the highest validation accuracy are
    passed to ``save`` at the end.

    Reported metrics:
        * loss: sum of the per-batch ``CustomLossFunction`` values divided by
          the number of samples in the phase's dataset.
        * accuracy: number of feature groups whose arg-max matches between
          input and output, summed over all samples and divided by the number
          of samples (so it can exceed 1 when there are several groups).

    Relies on the module-level names ``epocas``, ``funcion`` and
    ``lenghts_data``.

    Args:
        model: autoencoder being trained.
        device: device the input batches are moved to.
        train_dataloader: loader of ``(x, y)`` training batches; ``y`` unused.
        val_dataloader: loader of ``(x, y)`` validation batches; ``y`` unused.
        criterion: loss applied to each feature group.
        optimizer: optimizer stepped after every training batch.
        scheduler: learning-rate scheduler stepped once per epoch after the
            first.
    """
    import copy  # local import so this cell does not depend on another cell

    print("Train to:", epocas, "epochs", end='\n')
    start = time.time()
    print('Start training',end='\n')

    # state_dict() returns references to the live tensors; deep copies are
    # needed to keep a real snapshot of the best epoch.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    best_epoch = 0
    best_optimizer = copy.deepcopy(optimizer.state_dict())

    # The context manager closes the log even if training is interrupted.
    with open("Loss_accu"+funcion+".txt", "w") as loggertrain:
        for epoch in range(1,epocas+1):

            print('Epoch {}/{}'.format(epoch, epocas))
            print('-' * 10)
            # Each epoch has a training and validation phase
            for phase in ['train','valid']:
                is_train = phase == 'train'
                if is_train:
                    if epoch != 1:
                        scheduler.step()
                    dataloader = train_dataloader
                else:
                    dataloader = val_dataloader
                model.train(is_train)

                running_loss = 0.0
                running_corrects = 0

                for (x, _) in dataloader:
                    data = x.to(device)

                    # Only track gradients when they are going to be used.
                    with torch.set_grad_enabled(is_train):
                        output = model(data)
                        train_loss = CustomLossFunction(data,output,criterion,lenghts_data)

                    # backward + optimize only if in training phase
                    if is_train:
                        optimizer.zero_grad()
                        train_loss.backward()
                        optimizer.step()

                    # Count, per feature group, the samples whose arg-max
                    # position is reconstructed correctly.
                    ini = 0
                    for ind in lenghts_data:
                        curr_target = output[:,ini:ini + ind]
                        curr_data = data[:,ini:ini + ind]
                        max_ind_tar = torch.max(curr_target,1)[1]
                        max_ind_dat = torch.max(curr_data,1)[1]
                        running_corrects += torch.sum(max_ind_tar == max_ind_dat)
                        ini = ind + ini

                    # statistics
                    running_loss += train_loss.item()

                epoch_loss = running_loss / len(dataloader.dataset)
                # float() turns the accumulated tensor into a plain number.
                epoch_acc = float(running_corrects) / len(dataloader.dataset)

                print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
                if is_train:
                    print('{}, {:.4f}, {:.4f}'.format(epoch,epoch_loss,epoch_acc),end=', ',file=loggertrain)
                else:
                    print('{:.4f}, {:.4f}'.format(epoch_loss,epoch_acc), file=loggertrain)

                # deep copy the model
                if not is_train and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())
                    best_optimizer = copy.deepcopy(optimizer.state_dict())
                    best_epoch = epoch

    stop = time.time()
    # Split the elapsed seconds into hours, minutes and seconds.
    hr, remainder = divmod(stop - start, 3600)
    minu, seg = divmod(remainder, 60)
    print('Training finished. Time elapsed:', '{:.0f}h {:.0f}m {:.0f}s'.format(hr,minu,seg))

    # Save the best model
    save(funcion, best_model_wts, best_optimizer, best_epoch)

## Save models

### Weight configuration

In [None]:
def save(funcion, best_model, best_optimizer, best_epoch):
    """Write a checkpoint to ``./savedModels/model-<funcion>.pt``.

    The checkpoint is a dict with the keys ``'model_dict'``, ``'optim_dict'``
    and ``'epoch'``.

    Args:
        funcion: label used in the checkpoint file name.
        best_model: model state dict to store.
        best_optimizer: optimizer state dict to store.
        best_epoch: epoch number to store.
    """
    import os  # local import so this cell does not depend on another cell

    path = "./savedModels/model-"+funcion+".pt"
    # torch.save does not create missing directories.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save({'model_dict': best_model,
                'optim_dict': best_optimizer,
                'epoch': best_epoch}, path)

### Latent space model mapping

In [None]:
def process(model, device, dataloader, path):
    """Encode every batch of ``dataloader`` and write the result to ``path``.

    The outputs of ``model.encode`` are stacked in loader order, one row per
    sample, and written with ``numpy.savetxt``.

    Args:
        model: object with ``eval()``, ``to(device)`` and ``encode(batch)``.
        device: device the model and the input batches are moved to.
        dataloader: iterable of ``(x, y)`` batches; ``y`` is ignored.
        path: destination passed to ``numpy.savetxt``.
    """
    model.eval()
    model.to(device)
    processed = []

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for (x, _) in dataloader:
            data = x.to(device)
            processed.append(model.encode(data).detach().cpu().numpy())

    count = sum(len(batch) for batch in processed)
    # Take the width from the encoder output itself.  The module-level
    # ``latent`` is only used when there is no batch to look at.
    width = processed[0].shape[1] if processed else latent

    out = np.empty([count, width])
    index = 0
    for batch in processed:
        out[index:index + len(batch)] = batch
        index += len(batch)

    np.savetxt(path, out)

### Accuracy by feature

In [None]:
def success(model, device, dataloader):
    """Write per-sample reconstruction hits to ``"Success" + funcion + ".txt"``.

    One line is written per sample: the number of feature groups whose
    arg-max position in the model output equals the arg-max position in the
    input, followed by a 0/1 flag for each group (comma separated).  Group
    widths come from the module-level ``lenghts_data``.

    Args:
        model: autoencoder; called as ``model(batch)``.
        device: device each input batch is moved to.
        dataloader: iterable of ``(x, y)`` batches; ``y`` is ignored.
    """
    model.eval()
    n_features = len(lenghts_data)

    # ``with`` closes the log even if a batch raises; no_grad avoids building
    # an autograd graph that is never used.
    with open("Success"+funcion+".txt", "w") as loggerreconstruction, torch.no_grad():
        for (x, _) in dataloader:
            data = x.to(device)
            eval_in = data.cpu().numpy()
            eval_out = model(data).cpu().numpy()

            # hits[j, k] is 1 when sample j reproduces the arg-max of group k.
            hits = np.zeros((len(eval_in), n_features), dtype=np.int8)
            ini = 0
            for k, ind in enumerate(lenghts_data):
                target_index = np.argmax(eval_in[:, ini:ini + ind], axis=1)
                data_index = np.argmax(eval_out[:, ini:ini + ind], axis=1)
                hits[:, k] = target_index == data_index
                ini = ind + ini

            for row in hits:
                print('{},{}'.format(int(row.sum()), ",".join(map(str, row.tolist()))), file=loggerreconstruction)

## Reconstruction 

### Seen data

In [None]:
def reconstruction(device, test_dataloader,dataloader_Z8,dataloader_Z12):
    """Load the saved model, write the Z8/Z12 latent codes and the success log.

    The checkpoint ``./savedModels/model-<funcion>.pt`` is loaded into a fresh
    ``Net(dimensions)`` and its stored epoch is printed.  ``process`` is then
    run on the Z8 and Z12 loaders, writing ``"latentZ8" + funcion`` and
    ``"latentZ12" + funcion``, and ``success`` is run on ``test_dataloader``.
    Uses the module-level ``funcion`` and ``dimensions``.
    """
    print('Reconstruction running')

    checkpoint_path = "./savedModels/model-" + funcion + ".pt"
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))

    model = Net(dimensions).to(device)
    model.load_state_dict(checkpoint['model_dict'])
    print(checkpoint['epoch'])

    # One latent-space file per geometry.
    latent_jobs = (
        ("latentZ8" + funcion, dataloader_Z8),
        ("latentZ12" + funcion, dataloader_Z12),
    )
    for target, loader in latent_jobs:
        process(model, device, loader, target)

    success(model, device, test_dataloader)

### Unseen data 

In [None]:
def reconstruction_other_models(device, dataloader_other_models):
    """Load the saved model and write the latent codes of the other models.

    The checkpoint ``./savedModels/model-<funcion>.pt`` is loaded into a fresh
    ``Net(dimensions)``.  ``process`` then writes the encoded batches of
    ``dataloader_other_models`` to ``"latentother" + funcion``.  Uses the
    module-level ``funcion`` and ``dimensions``.
    """
    print('Reconstruction running other models')

    checkpoint_path = "./savedModels/model-" + funcion + ".pt"
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))

    model = Net(dimensions).to(device)
    model.load_state_dict(checkpoint['model_dict'])

    target = "latentother" + funcion
    process(model, device, dataloader_other_models, target)

## Autoencoder random test

In [None]:
def proof(device,test_dataloader):
    """Interactively inspect the reconstruction of one sample.

    Loads ``./savedModels/model-<funcion>.pt`` into a fresh
    ``Net(dimensions)`` and runs the first batch of ``test_dataloader``
    through it.  For the first sample of that batch, each feature group's
    input slice and output slice are printed, followed by "Correct" when
    their arg-max positions agree.  The function waits for the user to press
    Enter after every group.  Uses the module-level ``funcion``,
    ``dimensions`` and ``lenghts_data``.

    Args:
        device: device the model and the batch are moved to.
        test_dataloader: iterable of ``(x, y)`` batches; only the first batch
            is read and ``y`` is ignored.
    """
    nombre = "./savedModels/model-"+funcion+".pt"
    # map_location matches the other loaders in this notebook, so a checkpoint
    # written on a GPU can also be opened on a CPU-only machine.
    dicc_model = torch.load(nombre, map_location=torch.device('cpu'))
    model = Net(dimensions).to(device)
    model.load_state_dict(dicc_model['model_dict'])
    model.eval()

    (x, _) = next(iter(test_dataloader))

    data = x.to(device)
    with torch.no_grad():  # inference only
        output = model(data)
    sample_in = data.cpu().numpy()[0]
    sample_out = output.cpu().numpy()[0]

    ini = 0
    for ind in lenghts_data:
        curr_target = sample_in[ini:ini + ind]
        curr_data = sample_out[ini:ini + ind]
        print(curr_target)
        print(curr_data)

        if int(np.argmax(curr_target)) == int(np.argmax(curr_data)):
            print("Correct")
        input("Pause")
        ini = ind+ini

## Phase selection

In [None]:
def main(phase="All"):
    """Build the datasets and loaders, then run the requested phase(s).

    ``phase`` selects what is executed:
        * ``"All"`` - train, reconstruction, proof
        * ``"T-R"`` - train, reconstruction
        * ``"T"``   - train
        * ``"R"``   - reconstruction
        * ``"O"``   - reconstruction of the other models
    Any other value only prepares the data.

    Uses the module-level settings ``datasetname``, ``datasetname_othermodels``,
    ``train_set``, ``val_set``, ``batchsize`` and ``dimensions``.
    """
    # Load and one-hot encode the datasets
    print('Preparing data')
    dataset = CustomDataset(datasetname)
    other_models = CustomDataset1(datasetname,datasetname_othermodels)

    # Random split into train / validation / remainder.  The remainder split
    # is not used below: the "test" loader iterates the whole dataset.
    n_samples = len(dataset)
    train_length = int(train_set * n_samples)
    val_length = int(val_set * n_samples)
    test_length = n_samples - train_length - val_length
    train_dataset, val_dataset, _ = torch.utils.data.random_split(
        dataset, (train_length, val_length, test_length))

    # Split by geometry: the first Z8_length rows go to the Z8 set and the
    # remaining rows to the Z12 set.
    Z8_length = 292053
    Z8_dataset = torch.utils.data.TensorDataset(
        dataset.X_train[:Z8_length], dataset.y_train[:Z8_length])
    Z12_dataset = torch.utils.data.TensorDataset(
        dataset.X_train[Z8_length:], dataset.y_train[Z8_length:])

    # Dataloaders for the training phase
    dataloader_train = DataLoader(
        train_dataset, batch_size=batchsize, shuffle=True, num_workers=4)
    dataloader_val = DataLoader(
        val_dataset, batch_size=batchsize, shuffle=True, num_workers=4)

    # Dataloaders for the reconstruction phase
    dataloader_test = DataLoader(
        dataset, batch_size=batchsize, shuffle=False, num_workers=4)
    dataloader_Z8 = DataLoader(Z8_dataset, batch_size=batchsize, shuffle=True)
    dataloader_Z12 = DataLoader(Z12_dataset, batch_size=batchsize, shuffle=True)

    # Dataloader for the other models
    dataloader_other_models = DataLoader(
        other_models, batch_size=batchsize, shuffle=False)

    print('Data ready')

    # Pick the device and build the model.  The seed is set after the random
    # split and before the model weights are initialised.
    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")
    model = Net(dimensions).to(device)
    if use_cuda:
        print('Using GPU')

    # Vanilla loss function, optimizer and learning-rate schedule
    criterion = nn.CrossEntropyLoss()  # not necessary
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    exp_scheduler = lr_scheduler.StepLR(optimizer, step_size=4000, gamma=0.1)

    # Each step runs for the phases listed next to it; the order below is
    # training, reconstruction, live proof, other models.
    if phase in ("All", "T-R", "T"):
        train(model, device, dataloader_train, dataloader_val, criterion, optimizer, exp_scheduler)
    if phase in ("All", "T-R", "R"):
        reconstruction(device, dataloader_test, dataloader_Z8, dataloader_Z12)
    if phase == "All":
        proof(device, dataloader_test)
    if phase == "O":
        reconstruction_other_models(device, dataloader_other_models)

## Setup and dataset selection

In [None]:

# ---- Setup ------- #
# Module-level settings read by the functions defined in the cells above.

# Name input dataset
#datasetname = './Data/Z8_Z12.csv'
datasetname = './Data/600K2_Z12-Z8.csv'


# Name other dataset
datasetname_othermodels = './Data/Z12_SU5_100models.csv'
 
# Number of epochs for train
epocas = 1010

# Parameters
train_set = 0.6
val_set = 0.3
batchsize = 32
workers = 8

# Label of final files
#funcion = str(epocas)+"e-7leaky_rely-ADAM-CrossEntropyLoss-vainilla"
funcion = "1010e-7leaky_rely-ADAM-CrossEntropyLoss-vainilla445"
#funcion = "1010e-7leaky_rely-ADAM-CrossEntropyLoss-vainilla_new_dataset_seed440"
# latent space dimension
latent = 3

# One-hot-encoding length (total and per feature).
# lenght_ohe / lenghts_features are defined in this notebook, so they are
# called directly; no `io` module is imported here, and the previous
# `io.`-prefixed calls raised NameError.
l_ohe = lenght_ohe(datasetname)
lenghts_data = lenghts_features(datasetname)

# Dimensions of layers 
dimensions = [l_ohe, 2*l_ohe, 200, 26, 13, latent, 13, 26, 200, 2*l_ohe, l_ohe]

# ----- Process ----- # 

main("R")
