In [None]:
import numpy as np
import json
from gensim.models import Word2Vec
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import math
import re
from unidecode import unidecode


In [None]:
# from google.colab import drive
# drive.mount('data')
# nltk.download('punkt')

In [None]:
def clean_questions(questions):
    """Normalise raw question strings before tokenisation.

    Each question is lower-cased, passed through ``unidecode`` and then has
    every character that is neither a word character nor whitespace removed.

    Args:
        questions: Iterable of question strings.

    Returns:
        list[str]: The cleaned questions, in input order.
    """
    punctuation = re.compile(r"[^\w\s]")
    return [punctuation.sub("", unidecode(raw.lower())) for raw in questions]

In [None]:
# Parallel per-example lists filled from the training split.
questions_train = []
tables_train = []
actual_col_train = []
label_cols_train = []

# The file holds one JSON object per line.
with open('data/A2_train.jsonl', 'r', encoding='utf-8') as handle:
    for record in map(json.loads, handle):
        questions_train.append(record['question'])
        table = record['table']
        tables_train.append(table)
        # Only the first entry of 'label_col' is kept as the target column.
        label_cols_train.append(record['label_col'][0])
        actual_col_train.append(list(table['cols']))

questions_train = clean_questions(questions_train)

# Sanity check: all four lists should report the same length.
for caption, items in (
    ('Number of questions:', questions_train),
    ('Number of tables:', tables_train),
    ('Number of label columns:', label_cols_train),
    ('Number of actual columns:', actual_col_train),
):
    print(caption, len(items))

# word2vec_model = gensim.models.KeyedVectors.load('models/glove-wiki-gigaword-100')

In [None]:
# Parallel per-example lists filled from the validation split.
questions_test = []
tables_test = []
actual_col_test = []
label_cols_test = []
qid_test = []

# The file holds one JSON object per line.
with open('data/A2_val.jsonl', 'r', encoding='utf-8') as handle:
    for record in map(json.loads, handle):
        questions_test.append(record['question'])
        table = record['table']
        tables_test.append(table)
        # Only the first entry of 'label_col' is kept as the target column.
        label_cols_test.append(record['label_col'][0])
        actual_col_test.append(list(table['cols']))
        qid_test.append(record['qid'])

questions_test = clean_questions(questions_test)

# Sanity check: all five lists should report the same length.
for caption, items in (
    ('Number of questions:', questions_test),
    ('Number of tables:', tables_test),
    ('Number of label columns:', label_cols_test),
    ('Number of actual columns:', actual_col_test),
    ('Number of qids is ', qid_test),
):
    print(caption, len(items))

In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional-encoding table.

    Builds a ``(max_len, embedding_dim)`` buffer ``pe`` in which even columns
    hold ``sin(pos * w_k)`` and odd columns hold ``cos(pos * w_k)``, with
    ``w_k = 10000 ** (-2k / embedding_dim)``.

    Args:
        embedding_dim: Width of each position vector (even or odd).
        max_len: Number of positions (rows) in the table.
    """

    def __init__(self, embedding_dim, max_len=60):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, embedding_dim)
        self.max_len = max_len
        self.embedding_dim = embedding_dim
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd embedding_dim there is one fewer odd column than even
        # columns, so trim the angles to fit; for even sizes this is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])
        # Registered as a buffer: follows .to(device) but is not a parameter.
        self.register_buffer('pe', pe)

    def forward(self):
        """Return the whole ``(max_len, embedding_dim)`` encoding table."""
        return self.pe

def combine_embeddings(positional_embeddings, word_embeddings):
    """Return ``positional_embeddings + word_embeddings``.

    The operands are combined with the ``+`` operator, positional term on
    the left.
    """
    combined = positional_embeddings + word_embeddings
    return combined

class Classifier(nn.Module):
    """Scores table columns against a question using a Transformer encoder.

    The question's word vectors receive sinusoidal positional encodings, are
    run through a Transformer encoder and summed over the sequence axis to a
    single question vector. Each column vector is then scored by its cosine
    similarity to that question vector.

    Args:
        embedding_dim: Width of the word/column vectors (the encoder d_model).
        hidden_dim: Passed positionally as the encoder layer's feed-forward width.
        num_layers: Number of stacked encoder layers.
        num_heads: Attention heads per layer.
        dropout: Dropout rate inside the encoder layers.
        max_len: Longest supported question length, in tokens.
    """

    def __init__(self, embedding_dim, hidden_dim, num_layers, num_heads, dropout, max_len=60):
        super(Classifier, self).__init__()
        self.pos_encoding = PositionalEncoding(embedding_dim, max_len)
        # NOTE(review): nn.TransformerEncoder appears to work on its own copies
        # of this layer, so `self.encode` itself would not run in forward().
        # It is kept so parameter names and checkpoints stay unchanged.
        self.encode = nn.TransformerEncoderLayer(embedding_dim, num_heads, hidden_dim, dropout, batch_first=True)
        self.encoder = nn.TransformerEncoder(self.encode, num_layers)

    def forward(self, text_vectors, column):
        """Score every column of every example.

        Args:
            text_vectors: Question word vectors; assumed to be
                ``(batch, seq_len, embedding_dim)`` with ``seq_len <= max_len``.
            column: Column vectors; assumed to be
                ``(batch, num_columns, embedding_dim)``.

        Returns:
            Tensor of cosine similarities between each column and the pooled
            question vector, one score per (example, column).
        """
        # Use only as many positions as the input has, so inputs shorter than
        # max_len work. A length-1 input previously broadcast against all
        # max_len rows. Inputs longer than max_len still fail with a shape
        # mismatch.
        seq_len = text_vectors.size(-2)
        position = self.pos_encoding()[:seq_len]
        final_embed = position + text_vectors
        contextual_embed = self.encoder(final_embed)
        # Pool over the sequence axis; padded positions are included too.
        question_embed = torch.sum(contextual_embed, dim=1)
        # Elementwise product of unit vectors summed over the feature axis
        # gives cosine similarity.
        mat_mul = F.normalize(column, dim=2) * F.normalize(question_embed.unsqueeze(1), dim=2)
        dot_product = torch.sum(mat_mul, dim=2)
        return dot_product

# Pretrained word vectors fetched by name through gensim's downloader.
# The embedding code below indexes `model` by token and uses torch.zeros(100)
# as its fallback/padding vector, so these vectors are presumably
# 100-dimensional. TODO: confirm.
model = gensim.downloader.load('glove-wiki-gigaword-100')

def get_one_hot_vectors(sentences, actual_cols, label_cols, num_classes=64):
    """Build per-example target rows marking the labelled column.

    Row ``i`` has 1.0 at every index ``j`` where ``actual_cols[i][j]`` equals
    ``label_cols[i]`` and 0.0 elsewhere. A column name that occurs more than
    once is therefore marked at each occurrence.

    Args:
        sentences: Used only for its length (the number of examples).
        actual_cols: Per-example list of column names.
        label_cols: Per-example target column name.
        num_classes: Width of each row. Defaults to 64, the previously
            hard-coded value.

    Returns:
        torch.Tensor of shape ``(len(sentences), num_classes)``, dtype float64.

    Raises:
        IndexError: If a matching column sits at an index >= ``num_classes``.
    """
    one_hot_vectors = torch.zeros((len(sentences), num_classes), dtype=float)
    for i in range(len(sentences)):
        target = label_cols[i]
        for j, col in enumerate(actual_cols[i]):
            if col == target:
                one_hot_vectors[i, j] = 1.0
    return one_hot_vectors


def column_embeddings(tables, actual_col, max_columns=64, embedding_dim=100):
    """Embed every table's column names as fixed-size tensors.

    Each column name is lower-cased, tokenised with ``nltk.word_tokenize`` and
    represented by the sum of its tokens' vectors from the global ``model``.
    Tokens missing from ``model`` contribute a zero vector, and a name that
    yields no tokens becomes a zero vector. Each table is zero-padded up to
    ``max_columns`` rows; tables with more columns are NOT truncated.

    Args:
        tables: Used only for its length (the number of examples).
        actual_col: Per-example list of column-name strings.
        max_columns: Minimum row count per table after padding. Defaults to 64.
        embedding_dim: Size of the zero fallback/padding vectors; must match
            the vector size of ``model``. Defaults to 100.

    Returns:
        list[torch.Tensor]: One ``(>= max_columns, embedding_dim)`` tensor per
        example.
    """
    embedded_tables = []
    for i in range(len(tables)):
        col_tensor = []
        for col_name in actual_col[i]:
            vectors = []
            for token in nltk.word_tokenize(col_name.lower()):
                try:
                    vectors.append(torch.tensor(model[token]))
                except KeyError:
                    # Out-of-vocabulary token: adds nothing to the sum.
                    vectors.append(torch.zeros(embedding_dim))
            if vectors:
                col_tensor.append(torch.sum(torch.stack(vectors, dim=0), dim=0))
            else:
                # Empty column name: torch.stack([]) would raise.
                col_tensor.append(torch.zeros(embedding_dim))
        # Zero-pad so tables with fewer columns share one shape.
        col_tensor.extend(
            torch.zeros(embedding_dim) for _ in range(max_columns - len(col_tensor))
        )
        embedded_tables.append(torch.stack(col_tensor, dim=0))
    return embedded_tables

In [None]:
# Training split: target rows and padded column-name embeddings.
train_labels = get_one_hot_vectors(
    sentences=questions_train,
    actual_cols=actual_col_train,
    label_cols=label_cols_train,
)
column_embeddings_train = column_embeddings(
    tables=tables_train,
    actual_col=actual_col_train,
)

# Validation split: same two steps.
test_labels = get_one_hot_vectors(
    sentences=questions_test,
    actual_cols=actual_col_test,
    label_cols=label_cols_test,
)
column_embeddings_test = column_embeddings(
    tables=tables_test,
    actual_col=actual_col_test,
)

In [None]:
question_vectors_train = []
for question in questions_train:
    tokens = nltk.word_tokenize(question)

    # Look up each token's word vector; out-of-vocabulary tokens are skipped.
    vectors = []
    for token in tokens:
        try:
            vectors.append(torch.tensor(model[token]))
        except KeyError:
            continue
    # Every question must be exactly 60 vectors long (the Classifier's default
    # max_len): truncate longer questions, then zero-pad shorter ones. Without
    # truncation a long question breaks batching and the positional-encoding
    # addition.
    vectors = vectors[:60]
    while len(vectors) < 60:
        vectors.append(torch.zeros(100))
    # Stack into a single (60, 100) tensor.
    vectors = torch.stack(vectors, dim=0)
    question_vectors_train.append(vectors)

In [None]:
question_vectors_test = []
for question in questions_test:
    tokens = nltk.word_tokenize(question)

    # Look up each token's word vector; out-of-vocabulary tokens are skipped.
    vectors = []
    for token in tokens:
        try:
            vectors.append(torch.tensor(model[token]))
        except KeyError:
            continue
    # Every question must be exactly 60 vectors long (the Classifier's default
    # max_len): truncate longer questions, then zero-pad shorter ones. Without
    # truncation a long question breaks batching and the positional-encoding
    # addition.
    vectors = vectors[:60]
    while len(vectors) < 60:
        vectors.append(torch.zeros(100))
    # Stack into a single (60, 100) tensor.
    vectors = torch.stack(vectors, dim=0)
    question_vectors_test.append(vectors)

In [None]:
# Model sizes: word-vector width, encoder feed-forward width, label width.
embedding_dim, hidden_dim, output_dim = 100, 256, 64
# Encoder depth and attention heads per layer.
num_layers, num_heads = 2, 1
dropout = 0.02

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Build the model and move it to the selected device.
classifier = Classifier(
    embedding_dim,
    hidden_dim,
    num_layers,
    num_heads,
    dropout,
).to(device)

# Per-class loss weights 1/1, 1/2, ..., 1/64: later column slots weigh less.
class_weights = torch.tensor([1.0 / rank for rank in range(1, 65)]).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=0.005)
classifier.train()

In [None]:
class CustomDataset(Dataset):
    """Index-aligned view over three parallel sequences.

    Item ``idx`` is a dict holding the ``idx``-th entry of each sequence under
    the keys ``'data'``, ``'columns'`` and ``'labels'``.
    """

    def __init__(self, data_list, columns_list, labels_list):
        """Store the three sequences as given (no copying, no validation)."""
        self.data, self.columns, self.labels = data_list, columns_list, labels_list

    def __len__(self):
        """Number of items, taken from the ``data`` sequence."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th entries bundled into one dict."""
        return {
            'data': self.data[idx],
            'columns': self.columns[idx],
            'labels': self.labels[idx],
        }

# Training split: wrap the tensors and batch them with shuffling.
dataset_train = CustomDataset(
    data_list=question_vectors_train,
    columns_list=column_embeddings_train,
    labels_list=train_labels,
)
dataloader_train = DataLoader(dataset=dataset_train, shuffle=True, batch_size=5000)

# Validation split: same wrapping, smaller batches.
dataset_test = CustomDataset(
    data_list=question_vectors_test,
    columns_list=column_embeddings_test,
    labels_list=test_labels,
)
dataloader_test = DataLoader(dataset=dataset_test, shuffle=True, batch_size=1000)

In [None]:
print("Training start")

# Train for up to 1000 epochs, validating after each one and stopping early
# once validation accuracy exceeds 0.9.
for epoch in range(1000):
    running_loss = 0.0
    accuracy = 0  # correct predictions since the last progress report
    seen = 0      # training examples since the last progress report
    classifier.train()
    for i, data in enumerate(dataloader_train, 0):
        inputs = data['data'].to(device)
        columns = data['columns'].to(device)
        labels = data['labels'].to(device)
        optimizer.zero_grad()
        outputs = classifier(inputs, columns)
        loss = criterion(outputs, labels)
        # A prediction is correct when the top-scoring column is the labelled one.
        accuracy += (outputs.argmax(dim=1) == labels.argmax(dim=1)).sum().item()
        seen += labels.size(0)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # Report every 5 batches, then reset the running statistics.
        if i % 5 == 4:
            print(f'Epoch {epoch + 1}, batch {i + 1}: loss {running_loss / 5}')
            # Divide by the examples actually processed rather than a fixed
            # 25 * 1000, which was only right for five full batches of 5000.
            print(f'Accuracy: {accuracy / seen}')
            running_loss = 0.0
            accuracy = 0
            seen = 0
    classifier.eval()
    with torch.no_grad():
        val_accuracy = 0  # count of correct validation predictions
        for data in dataloader_test:
            inputs = data['data'].to(device)
            columns = data['columns'].to(device)
            labels = data['labels'].to(device)
            outputs = classifier(inputs, columns)
            val_accuracy += (outputs.argmax(dim=1) == labels.argmax(dim=1)).sum().item()
        print(f'Validation accuracy: {val_accuracy/len(questions_test)}')
        if val_accuracy/len(questions_test) > 0.9:
            break