# In [None]:
import nltk 
import pandas as pd
import json
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from gensim.models import FastText
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
from gensim.models.fasttext import load_facebook_vectors
import gensim
import math
import random
import numpy as np
from unidecode import unidecode

# nltk.download('punkt')
# from google.colab import drive
# drive.mount('data')


# Load the training split: one JSON object per line of the .jsonl file.
with open('data/A2_train.jsonl', 'r', encoding='utf-8') as train_file:
    train_records = [json.loads(raw_line) for raw_line in train_file]

# Project each record onto the parallel lists used by the rest of the script.
questions_train = [record['question'] for record in train_records]
tables_train = [record['table'] for record in train_records]
# Only the first entry of 'label_col' is kept as the target column name.
label_cols_train = [record['label_col'][0] for record in train_records]
actual_col_train = [list(record['table']['cols']) for record in train_records]

# questions_train = clean_questions(questions_train)

for caption, loaded in (
    ('Number of questions:', questions_train),
    ('Number of tables:', tables_train),
    ('Number of label columns:', label_cols_train),
    ('Number of actual columns:', actual_col_train),
):
    print(caption, len(loaded))

# Load the validation split: one JSON object per line of the .jsonl file.
with open('data/A2_val.jsonl', 'r', encoding='utf-8') as val_file:
    val_records = [json.loads(raw_line) for raw_line in val_file]

# Project each record onto the parallel lists used by the rest of the script.
questions_test = [record['question'] for record in val_records]
tables_test = [record['table'] for record in val_records]
# Only the first entry of 'label_col' is kept as the target column name.
label_cols_test = [record['label_col'][0] for record in val_records]
actual_col_test = [list(record['table']['cols']) for record in val_records]
qid_test = [record['qid'] for record in val_records]

# questions_test = clean_questions(questions_test)

for caption, loaded in (
    ('Number of questions:', questions_test),
    ('Number of tables:', tables_test),
    ('Number of label columns:', label_cols_test),
    ('Number of actual columns:', actual_col_test),
    ('Number of qids is ', qid_test),
):
    print(caption, len(loaded))

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Number of token positions every question is padded to.
max_len_question = 60

class PositionalEmbedding(nn.Module):
    """Fixed (non-trainable) sinusoidal positional encodings.

    Even feature columns hold ``sin(position * frequency)`` and odd feature
    columns hold ``cos(position * frequency)``. The table is registered as a
    buffer, so it follows the module across ``.to(device)`` calls without
    being a trainable parameter.

    Args:
        embedding_dim: Width of each positional vector. Odd widths are
            supported (the last column is then a sine column).
        max_len: Number of positions in the table. When ``None`` (the
            default) the module-level ``max_len_question`` is used.
    """

    def __init__(self, embedding_dim, max_len=None):
        super(PositionalEmbedding, self).__init__()
        if max_len is None:
            max_len = max_len_question
        pos_em = torch.zeros(max_len, embedding_dim)
        # One frequency per sine column: ceil(embedding_dim / 2) entries.
        division = torch.exp(
            torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim)
        )
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        pos_em[:, 0::2] = torch.sin(position * division)
        # There are only floor(embedding_dim / 2) cosine columns; without the
        # slice an odd embedding_dim raises a shape-mismatch error here.
        pos_em[:, 1::2] = torch.cos(position * division[: embedding_dim // 2])
        self.register_buffer('pos_em', pos_em)

    def forward(self):
        """Return the full ``(max_len, embedding_dim)`` encoding table."""
        return self.pos_em


class Classifier(nn.Module):
    """Score table columns against a question with a Transformer encoder.

    The question tokens are contextualised by a Transformer encoder, summed
    into a single vector, and compared with every column vector by cosine
    similarity.

    Args:
        embedding_dim: Width of the token and column vectors.
        hidden_dim: Feed-forward width inside each encoder layer.
        num_layers: Number of stacked encoder layers.
        num_heads: Number of attention heads per layer.
        dropout: Dropout probability used inside the encoder layers.
    """

    def __init__(self, embedding_dim, hidden_dim, num_layers, num_heads, dropout):
        super(Classifier, self).__init__()
        # nn.TransformerEncoder works on its own copies of this layer; the
        # attribute is kept so the module's parameter/state_dict layout is
        # unchanged.
        self.encoder_layer = nn.TransformerEncoderLayer(
            embedding_dim, num_heads, hidden_dim, dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers)
        self.positional_embedding = PositionalEmbedding(embedding_dim)

    def forward(self, text_vectors, column):
        """Return one similarity score per column.

        Args:
            text_vectors: Question token vectors, indexed as
                ``(batch, position, feature)``.
            column: Column vectors, indexed as ``(batch, column, feature)``.

        Returns:
            Tensor of shape ``(batch, column)`` holding the cosine similarity
            between the pooled question vector and each column vector.
        """
        # Slice the positional table to the actual sequence length so inputs
        # shorter than the table do not fail on the broadcasted addition.
        pos_embedding = self.positional_embedding()[: text_vectors.size(1)]
        input_embedding = text_vectors + pos_embedding
        contextual_embedding = self.transformer_encoder(input_embedding)
        # Pool over the position axis to get one vector per question.
        question_embedding = torch.sum(contextual_embedding, dim=1)
        # Product of L2-normalised vectors summed over features == cosine similarity.
        mat_mul = torch.nn.functional.normalize(column, dim=2) * torch.nn.functional.normalize(
            question_embedding.unsqueeze(1), dim=2
        )
        dot_prod = torch.sum(mat_mul, dim=2)
        return dot_prod

model = gensim.downloader.load('glove-wiki-gigaword-100')


def embed_question(question, max_len=max_len_question):
    """Turn a question string into a fixed-size matrix of word vectors.

    The question is transliterated with ``unidecode``, lower-cased and split
    with ``nltk.word_tokenize``. Tokens missing from the embedding model are
    dropped. The result is truncated or zero-padded to exactly ``max_len``
    rows so every question can be stacked into one batch.

    Args:
        question: Raw question text.
        max_len: Number of rows in the returned matrix.

    Returns:
        Tensor of shape ``(max_len, model.vector_size)``.
    """
    tokens = nltk.word_tokenize(unidecode(question).lower())

    vectors = []
    for token in tokens:
        try:
            vectors.append(torch.tensor(model[token]))
        except KeyError:
            # Out-of-vocabulary token: skip it instead of hiding every error.
            continue
    # Truncate over-long questions; without this they yield more than max_len
    # rows and cannot be batched or combined with the positional table.
    vectors = vectors[:max_len]
    # Zero-pad short questions up to max_len rows.
    vectors.extend(torch.zeros(model.vector_size) for _ in range(max_len - len(vectors)))
    return torch.stack(vectors, dim=0)


tokenized_questions = [embed_question(question) for question in questions_train]

val_tokenized_questions = [embed_question(question) for question in questions_test]

# Every table is padded to this many column slots.
MAX_COLUMNS = 64


def embed_column_name(name):
    """Return the sum of the word vectors of a column name.

    The name is transliterated with ``unidecode``, lower-cased and split with
    ``nltk.word_tokenize``. Out-of-vocabulary tokens contribute a zero vector.
    A name that produces no tokens (for example an empty header) yields a
    zero vector instead of failing on an empty ``torch.stack``.
    """
    vectors = []
    for token in nltk.word_tokenize(unidecode(name).lower()):
        try:
            vectors.append(torch.tensor(model[token]))
        except KeyError:
            vectors.append(torch.zeros(model.vector_size))
    if not vectors:
        return torch.zeros(model.vector_size)
    return torch.sum(torch.stack(vectors, dim=0), dim=0)


def build_column_features(columns_per_table, label_columns, max_columns=MAX_COLUMNS):
    """Build padded column embeddings and one-hot labels for a data split.

    Args:
        columns_per_table: For each example, the list of column names.
        label_columns: For each example, the name of the target column.
        max_columns: Number of column slots every table is padded to.

    Returns:
        ``(embeddings, labels)`` where ``embeddings`` is a list with one
        ``(max_columns, model.vector_size)`` tensor per example and ``labels``
        is a float tensor of shape ``(len(columns_per_table), max_columns)``
        with a single 1.0 per row at the target column's index.

    Raises:
        ValueError: If a table has more than ``max_columns`` columns, or if
            the target column does not match exactly one column name.
    """
    labels = torch.zeros(len(columns_per_table), max_columns)
    embeddings = []
    for idx, (cols, label_col) in enumerate(zip(columns_per_table, label_columns)):
        if len(cols) > max_columns:
            raise ValueError(
                f'example {idx} has {len(cols)} columns, more than the {max_columns} supported'
            )
        matches = [j for j, col in enumerate(cols) if col == label_col]
        if len(matches) != 1:
            raise ValueError(
                f'example {idx}: label column {label_col!r} matches {len(matches)} columns, expected 1'
            )
        labels[idx, matches[0]] = 1.0
        rows = [embed_column_name(col) for col in cols]
        # Zero-pad so every table contributes the same number of rows.
        rows.extend(torch.zeros(model.vector_size) for _ in range(max_columns - len(rows)))
        embeddings.append(torch.stack(rows, dim=0))
    return embeddings, labels


column_embeddings, train_labels = build_column_features(actual_col_train, label_cols_train)

val_column_embeddings, val_labels = build_column_features(actual_col_test, label_cols_test)

# Hyperparameters for the column classifier.
embedding_dim = 100
hidden_dim = 256
num_layers = 2
num_heads = 1
dropout = 0.02

# Build the network and move it to the selected device.
classifier = Classifier(
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    num_heads=num_heads,
    dropout=dropout,
)
classifier = classifier.to(device)

# Adam over every classifier parameter, trained against a cross-entropy loss.
optimizer = optim.Adam(classifier.parameters(), lr=0.005)
criterion = nn.CrossEntropyLoss()
classifier.train()
class CustomDataset(Dataset):
    """Index-aligned view over three parallel sequences.

    Item ``i`` is a dict with keys ``'data'``, ``'columns'`` and ``'labels'``
    holding the ``i``-th element of the respective sequence. The dataset
    length is the length of the ``data_list`` sequence.
    """

    def __init__(self, data_list, columns_list, labels_list):
        self.data, self.columns, self.labels = data_list, columns_list, labels_list

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return dict(
            data=self.data[idx],
            columns=self.columns[idx],
            labels=self.labels[idx],
        )

# Build the dataloaders.
# Training batches are reshuffled every epoch.
dataset = CustomDataset(tokenized_questions, column_embeddings, train_labels)
dataloader = DataLoader(dataset, batch_size=5000, shuffle=True)

# Validation only counts correct predictions, so its order does not matter;
# keep it unshuffled so evaluation is deterministic and skips a needless
# permutation each epoch.
val_dataset = CustomDataset(val_tokenized_questions, val_column_embeddings, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=1000, shuffle=False)

print("Training start")

# Train the model: up to 200 epochs, stopping early once validation accuracy
# exceeds the target.
log_every = 5  # report running loss/accuracy every `log_every` batches
target_val_accuracy = 0.9
for epoch in range(200):
    running_loss = 0.0
    correct = 0
    seen = 0
    classifier.train()
    for i, data in enumerate(dataloader, 0):
        inputs = data['data'].to(device)
        columns = data['columns'].to(device)
        labels = data['labels'].to(device)
        optimizer.zero_grad()
        outputs = classifier(inputs, columns)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # Labels are one-hot rows, so argmax recovers the target column index.
        correct += (outputs.argmax(dim=1) == labels.argmax(dim=1)).sum().item()
        # Count the samples actually processed so the reported accuracy stays
        # correct for any batch size, including a smaller final batch.
        seen += labels.size(0)
        if i % log_every == log_every - 1:
            print(f'Epoch {epoch + 1}, batch {i + 1}: loss {running_loss / log_every}')
            print(f'Accuracy: {correct / seen}')
            running_loss = 0.0
            correct = 0
            seen = 0
    classifier.eval()
    with torch.no_grad():
        val_correct = 0
        val_seen = 0
        for data in val_dataloader:
            inputs = data['data'].to(device)
            columns = data['columns'].to(device)
            labels = data['labels'].to(device)
            outputs = classifier(inputs, columns)
            val_correct += (outputs.argmax(dim=1) == labels.argmax(dim=1)).sum().item()
            val_seen += labels.size(0)
    # Guard against an empty validation set instead of dividing by zero.
    val_accuracy = val_correct / val_seen if val_seen else 0.0
    print(f'Validation accuracy: {val_accuracy}')
    if val_accuracy > target_val_accuracy:
        break
