In [None]:
import numpy as np
import json
from gensim.models import Word2Vec 
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
# Parallel per-example containers: entry k of each list comes from line k of the file.
questions = []
tables = []
actual_col = []
label_cols = []
one_hot_label = []

# Fixed width of every label row. Rows are padded AND truncated to this width so
# that the list is always rectangular (a table with more columns used to produce
# a ragged list that torch.tensor() rejects). The column-embedding cell uses the
# same 64-slot limit.
label_width = 64

# Read the JSON-lines training file, one example per line.
with open('data/A2_train.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
            continue
        parsed_data = json.loads(line)
        columns = list(parsed_data['table']['cols'])

        questions.append(parsed_data['question'])
        tables.append(parsed_data['table'])
        label_cols.append(parsed_data['label_col'])
        actual_col.append(columns)

        # Multi-hot target: 1.0 at every column index listed in label_col.
        used_slots = min(len(columns), label_width)
        one_hot_vector = [
            1.0 if idx in parsed_data['label_col'] else 0.0
            for idx in range(used_slots)
        ]
        # Pad with zeros up to the fixed width.
        one_hot_vector.extend([0.0] * (label_width - used_slots))
        one_hot_label.append(one_hot_vector)

# (num_examples, label_width) float tensor; the reshape keeps it 2-D even for an empty file.
one_hot_label = torch.tensor(one_hot_label, dtype=torch.float32).reshape(-1, label_width)

print('Number of questions:', len(questions))
print('Number of tables:', len(tables))
print('Number of label columns:', len(label_cols))
print('Number of actual columns:', len(actual_col))
print('Number of one-hot labels:', len(one_hot_label))

In [None]:
# Pre-trained word vectors, loaded from a local gensim KeyedVectors file.
# The path names the GloVe wiki-gigaword 100-d set, even though the variable
# is called "word2vec".
word2vec_model = gensim.models.KeyedVectors.load(
    'models/glove-wiki-gigaword-100'
)

In [None]:
class PositionalEncoding:
    """Pre-computed sinusoidal positional-encoding table.

    Even feature indices hold ``sin(pos * w_k)`` and odd indices hold
    ``cos(pos * w_k)`` with ``w_k = 10000 ** (-2k / hidden_dim)``.

    Attributes:
        hidden_dim: number of features per position.
        max_seq_len: number of positions in the table.
        positional_encoding: ``(max_seq_len, hidden_dim)`` numpy array.
    """

    def __init__(self, hidden_dim, max_seq_len=60):
        self.hidden_dim = hidden_dim
        self.max_seq_len = max_seq_len
        self.positional_encoding = self.get_positional_encoding()

    def get_positional_encoding(self):
        """Build and return the ``(max_seq_len, hidden_dim)`` encoding table."""
        pe = np.zeros((self.max_seq_len, self.hidden_dim))
        position = np.arange(0, self.max_seq_len)[:, np.newaxis]
        div_term = np.exp(np.arange(0, self.hidden_dim, 2) * (-np.log(10000.0) / self.hidden_dim))
        angles = position * div_term
        pe[:, 0::2] = np.sin(angles)
        # For an odd hidden_dim there is one fewer odd index than even index,
        # so drop the surplus frequency instead of failing to broadcast.
        pe[:, 1::2] = np.cos(angles[:, : self.hidden_dim // 2])
        return pe

    def forward(self, seq_len):
        """Return the first ``seq_len`` rows of the table.

        The result never has more than ``max_seq_len`` rows; callers with
        longer sequences must truncate their own data to match.
        """
        return self.positional_encoding[:seq_len, :]
    
def get_word2vec_embeddings(sentence, word2vec_model):
    """Look up one vector per token of ``sentence``.

    The sentence is tokenized with ``nltk.word_tokenize`` and lower-cased.
    Tokens missing from ``word2vec_model`` are represented by a zero vector.

    Args:
        sentence: text to embed. An empty (or falsy) value yields zero rows.
        word2vec_model: mapping-like object indexed by token that raises
            ``KeyError`` for unknown tokens (e.g. gensim ``KeyedVectors``).
            Its ``vector_size`` attribute is used when present, else 100.

    Returns:
        ``(num_tokens, vector_size)`` numpy array; shape ``(0, vector_size)``
        when there are no tokens.
    """
    embedding_dim = getattr(word2vec_model, 'vector_size', 100)
    if not sentence:
        return np.zeros((0, embedding_dim))

    tokens = [token.lower() for token in nltk.word_tokenize(sentence)]
    embeddings = []
    for token in tokens:
        try:
            embeddings.append(word2vec_model[token])
        except KeyError:
            # Out-of-vocabulary token. Only KeyError is swallowed so that
            # genuine failures (and KeyboardInterrupt) still surface.
            embeddings.append(np.zeros(embedding_dim))

    if not embeddings:
        # e.g. whitespace-only input: np.stack([]) would raise.
        return np.zeros((0, embedding_dim))
    return np.stack(embeddings)

def combine_embeddings(positional_embeddings, word_embeddings):
    """Add the positional term to the word term with ``+`` and return the result."""
    combined = positional_embeddings + word_embeddings
    return combined

def process_questions(questions, word2vec_model, max_len=60, embedding_dim=100):
    """Turn raw questions into one padded tensor of position-aware token vectors.

    Each question is embedded token by token, cut to ``max_len`` tokens, summed
    with the sinusoidal positional encoding, and zero-padded to ``max_len`` rows.

    Args:
        questions: iterable of question strings.
        word2vec_model: vector model forwarded to ``get_word2vec_embeddings``.
        max_len: number of token rows per question (longer ones are truncated).
        embedding_dim: width of the token vectors; must match what
            ``get_word2vec_embeddings`` returns.

    Returns:
        ``float32`` tensor of shape ``(len(questions), max_len, embedding_dim)``.
        The shape is also printed.
    """
    questions = list(questions)
    if not questions:
        # torch.stack([]) raises, so return a correctly shaped empty tensor.
        all_embeddings_tensor = torch.zeros((0, max_len, embedding_dim), dtype=torch.float32)
        print(all_embeddings_tensor.shape)
        return all_embeddings_tensor

    positional_encoder = PositionalEncoding(hidden_dim=embedding_dim, max_seq_len=max_len)

    all_embeddings = []
    for question in questions:
        # Tokenize once and truncate BEFORE adding positions: the positional
        # table has only max_len rows, so a longer question would otherwise
        # fail with a shape mismatch in the addition.
        word_embeddings = get_word2vec_embeddings(question, word2vec_model)[:max_len]
        seq_len = word_embeddings.shape[0]
        positional_embeddings = positional_encoder.forward(seq_len)
        combined_embeddings = combine_embeddings(positional_embeddings, word_embeddings)

        # Write into a zero buffer so the unused tail rows act as padding.
        padded = torch.zeros((max_len, embedding_dim), dtype=torch.float32)
        padded[:seq_len] = torch.as_tensor(combined_embeddings, dtype=torch.float32)
        all_embeddings.append(padded)

    all_embeddings_tensor = torch.stack(all_embeddings)
    print(all_embeddings_tensor.shape)
    return all_embeddings_tensor

In [None]:
class TransformerEncoder(nn.Module):
    """Stack of ``nn.TransformerEncoderLayer`` blocks over batch-first input.

    Args:
        input_dim: feature size of every token vector (``d_model``).
        hidden_dim: width of the feed-forward sub-layer (``dim_feedforward``).
        num_layers: number of stacked encoder layers.
        num_heads: attention heads per layer; must divide ``input_dim``.
        dropout: dropout probability inside each layer.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, num_heads, dropout=0.1):
        super().__init__()
        self.embedding_dim = input_dim
        # batch_first=True: this notebook feeds (batch, seq, dim) tensors and
        # pools over dim 1. With the PyTorch default (batch_first=False) the
        # first axis is read as the sequence, so attention would mix different
        # questions instead of the tokens of one question.
        self.transformer_encoder = nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(self.transformer_encoder, num_layers)

    def forward(self, input, src_key_padding_mask=None):
        """Encode ``input`` of shape ``(batch, seq, input_dim)``.

        Args:
            input: token vectors, batch first.
            src_key_padding_mask: optional ``(batch, seq)`` boolean mask,
                ``True`` at padded positions that attention should ignore.

        Returns:
            Tensor with the same shape as ``input``.
        """
        return self.transformer(input, src_key_padding_mask=src_key_padding_mask)

# Encoder hyper-parameters; later cells reuse these names.
input_dim, hidden_dim = 100, 100  # token-vector size and feed-forward width
num_layers, num_heads = 2, 2

# Stand-alone encoder instance built from the settings above.
transformer_encoder = TransformerEncoder(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    num_heads=num_heads,
)

In [None]:
# (num_questions, 60, 100) position-aware token vectors, zero-padded per question.
word_embeddings = process_questions(questions, word2vec_model)

# Switch dropout off before computing these one-off features: in train mode a
# single random dropout mask would be baked into them.
transformer_encoder.eval()
# Call the module itself rather than .forward() so nn.Module hooks are honoured.
contextual_embeddings = transformer_encoder(word_embeddings)
print("Contextual embeddings shape:", contextual_embeddings.shape)

# Pool each question to one vector by summing over the token axis.
# NOTE(review): the sum also covers the zero-padded positions.
question_embeddings = torch.sum(contextual_embeddings, dim=1)
print("Question embeddings shape:", question_embeddings.shape)

In [None]:
class TextClassifier(nn.Module):
    """Scores every table column against a question vector.

    ``transformer_encoder`` and ``fc`` are created here but are not used by
    ``forward``, which has no learnable parameters of its own.

    Args:
        input_dim: feature size of question and column vectors.
        hidden_dim: feed-forward width of the owned encoder.
        num_layers: number of layers of the owned encoder.
        num_heads: attention heads of the owned encoder.
        num_classes: output size of the ``fc`` layer.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, num_heads, num_classes):
        super().__init__()
        self.transformer_encoder = TransformerEncoder(input_dim, hidden_dim, num_layers, num_heads)
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, question_vectors, col_embeding):
        """Return a softmax distribution over each example's columns.

        Args:
            question_vectors: ``(batch, dim)`` tensor, or a sequence of
                per-example tensors.
            col_embeding: ``(batch, num_cols, dim)`` tensor, or a sequence of
                per-example ``(num_cols, dim)`` tensors.

        Returns:
            ``(batch, num_cols)`` probabilities (softmax over the last axis).
            These are probabilities, not logits.
        """
        if not torch.is_tensor(question_vectors):
            question_vectors = torch.stack(list(question_vectors))
        if not torch.is_tensor(col_embeding):
            col_embeding = torch.stack(list(col_embeding))
        # Only the first len(question_vectors) column sets are scored.
        col_embeding = col_embeding[: question_vectors.shape[0]]

        # One batched product replaces a Python loop over examples and also
        # works for an empty batch.
        columns_t = col_embeding.transpose(-1, -2)
        if question_vectors.dim() == 2:
            dot_products = torch.bmm(question_vectors.unsqueeze(1), columns_t).squeeze(1)
        else:
            # Per-token question matrices: one score row per token.
            dot_products = torch.matmul(question_vectors, columns_t)

        # Softmax along the last dimension, i.e. across columns.
        return F.softmax(dot_products, dim=-1)
           

In [None]:
# Build one (padded_length, embedding_dim) matrix of column-name vectors per table.
padded_length = 64
embedding_dim = getattr(word2vec_model, 'vector_size', 100)


def embed_column_name(column_name, word2vec_model, embedding_dim):
    """Return one vector for a column header.

    The whole lower-cased header is looked up first. If it is not in the
    vocabulary, the vectors of its whitespace-separated words that are in the
    vocabulary are averaged, so multi-word headers are not all reduced to the
    same zero vector. A header with no known word maps to zeros.
    """
    text = str(column_name).lower()
    try:
        return word2vec_model[text]
    except KeyError:
        pass

    token_vectors = []
    for token in text.split():
        try:
            token_vectors.append(word2vec_model[token])
        except KeyError:
            continue
    if token_vectors:
        return np.mean(token_vectors, axis=0)
    return np.zeros(embedding_dim)


# Fill one pre-allocated float32 array instead of converting nested Python lists
# of numpy arrays, which torch.tensor() handles very slowly. Rows beyond a
# table's real columns stay zero and act as padding; tables with more than
# padded_length columns are truncated.
column_matrix = np.zeros((len(actual_col), padded_length, embedding_dim), dtype=np.float32)
for i, col in enumerate(actual_col):
    for slot, column_name in enumerate(col[:padded_length]):
        column_matrix[i, slot] = embed_column_name(column_name, word2vec_model, embedding_dim)

# (num_tables, padded_length, embedding_dim) float32 tensor
actual_col_embeddings = torch.from_numpy(column_matrix)
print(actual_col_embeddings.shape)

In [None]:
# Per-class loss weights as a float32 tensor: the k-th of 64 slots (1-based) gets 1/k.
weights = torch.tensor([1.0 / slot for slot in range(1, 65)], dtype=torch.float32)

In [None]:
# The number of classes is the width of a label row (one slot per padded column),
# not the number of training examples that len(actual_col) reports.
num_classes = one_hot_label.shape[1]

classifier = TextClassifier(input_dim, hidden_dim, num_layers, num_heads, num_classes)
optimizer = optim.Adam(classifier.parameters(), lr=0.001)
# Class-weighted cross-entropy; `weights` holds one weight per column slot.
criterion = nn.CrossEntropyLoss(weight=weights)


In [None]:
print("Training start")

num_epochs = 1000
batch_size = 512  # tunable; mini-batches keep the encoder's activation memory bounded
log_every = 5
num_examples = word_embeddings.shape[0]

# Train the model
for epoch in range(num_epochs):
    classifier.train()
    running_loss = 0.0
    accuracy = 0

    # Visit the examples in a fresh random order every epoch.
    permutation = torch.randperm(num_examples)
    for start in range(0, num_examples, batch_size):
        batch_idx = permutation[start:start + batch_size]
        batch_labels = one_hot_label[batch_idx]

        optimizer.zero_grad()
        # Encode with the classifier's OWN encoder so that the parameters held
        # by the optimizer actually receive gradients. Features precomputed by
        # a separate encoder leave every optimized parameter without a
        # gradient, and optimizer.step() then changes nothing.
        token_states = classifier.transformer_encoder(word_embeddings[batch_idx])
        question_vectors = token_states.sum(dim=1)
        outputs = classifier(question_vectors, actual_col_embeddings[batch_idx])

        # The classifier returns probabilities, while CrossEntropyLoss applies
        # log-softmax itself. Passing log-probabilities makes that internal
        # log-softmax an identity and avoids a double softmax.
        loss = criterion(torch.log(outputs.clamp_min(1e-12)), batch_labels)
        # The graph is rebuilt for every batch, so retain_graph is not needed.
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * len(batch_idx)
        accuracy += (outputs.argmax(dim=1) == batch_labels.argmax(dim=1)).sum().item()

    # Log on the epoch counter (not on an index left over from another cell).
    if epoch % log_every == log_every - 1:
        print(f'Epoch {epoch + 1}: loss {running_loss / max(num_examples, 1)}')
        print(f'Accuracy: {accuracy / max(num_examples, 1)}')

# Leave the model in inference mode once training is finished.
classifier.eval()

In [None]:
# class CustomDataset(Dataset):
#     def __init__(self, data_list, columns_list, labels_list):
#         self.data = data_list
#         self.columns = columns_list
#         self.labels = labels_list

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         sample = {
#             'data': self.data[idx],
#             'columns': self.columns[idx],
#             'labels': self.labels[idx]
#         }
#         return sample

# # Build the dataloader
# dataset = CustomDataset(tokenized_questions, column_embeddings, train_labels)
# dataloader = DataLoader(dataset, batch_size=5000, shuffle=True)

# val_dataset = CustomDataset(val_tokenized_questions, val_column_embeddings, val_labels)
# val_dataloader = DataLoader(val_dataset, batch_size=1000, shuffle=True)

# print("Training start")

# # Train the model
# for epoch in range(1000):
#     running_loss = 0.0
#     accuracy = 0
#     classifier.train()
#     for i, data in enumerate(dataloader, 0):
#         inputs = data['data'].to(device)
#         columns = data['columns'].to(device)
#         labels = data['labels'].to(device)
#         optimizer.zero_grad()
#         outputs = classifier(inputs, columns)
#         loss = criterion(outputs, labels)
#         accuracy += (outputs.argmax(dim=1) == labels.argmax(dim=1)).sum().item()
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#         if i % 5 == 4:
#             print(f'Epoch {epoch + 1}, batch {i + 1}: loss {running_loss / 5}')
#             print(f'Accuracy: {accuracy/(25 * 1000)}')
#             running_loss = 0.0
#             accuracy = 0
#     classifier.eval()
#     with torch.no_grad():
#         val_accuracy = 0
#         for i, data in enumerate(val_dataloader, 0):
#             inputs = data['data'].to(device)
#             columns = data['columns'].to(device)
#             labels = data['labels'].to(device)
#             outputs = classifier(inputs, columns)
#             val_accuracy += (outputs.argmax(dim=1) == labels.argmax(dim=1)).sum().item()
#         print(f'Validation accuracy: {val_accuracy/len(val_questions)}')
#         if val_accuracy/len(val_questions) > 0.9:
#             break