In [1]:
!pip install sequitur gdown sentence_transformers

Collecting sequitur
  Downloading sequitur-1.2.4.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting tqdm (from gdown)
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.1/77.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.30.0-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting scikit-learn (from sentence_transformers)


In [2]:
import gdown
import os


# Fetch the train / validation / test Q&A CSV files from Google Drive.
# The URLs are regular "share" links, hence fuzzy=True so that gdown extracts
# the file id from them.
QNA_CSV_URLS = {
    "./train_qna.csv": "https://drive.google.com/file/d/1l3IhqCEErRu_gls34kOdt4V6kmmeQzXE/view?usp=sharing",
    "./val_qna.csv": "https://drive.google.com/file/d/1_MmyHp7u384ZSfF1Ww0A1TCrj9Yg5ZQD/view?usp=sharing",
    "./test_qna.csv": "https://drive.google.com/file/d/1hpb5AXrgxu8GQpAkGnYNl_SnnPEWAFJ9/view?usp=sharing",
}

for output, url in QNA_CSV_URLS.items():
    downloaded = gdown.download(url=url, output=output, quiet=False, fuzzy=True)
    # Some gdown versions report a failed download by returning None instead of
    # raising; stop here rather than failing later with a missing file.
    if downloaded is None:
        raise RuntimeError(f"Download failed for {output} ({url})")

Downloading...
From: https://drive.google.com/uc?id=1l3IhqCEErRu_gls34kOdt4V6kmmeQzXE
To: /workspace/train_qna.csv
100%|██████████| 33.9M/33.9M [00:00<00:00, 37.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1_MmyHp7u384ZSfF1Ww0A1TCrj9Yg5ZQD
To: /workspace/val_qna.csv
100%|██████████| 10.2M/10.2M [00:00<00:00, 22.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1hpb5AXrgxu8GQpAkGnYNl_SnnPEWAFJ9
To: /workspace/test_qna.csv
100%|██████████| 8.20M/8.20M [00:00<00:00, 23.1MB/s]


'./test_qna.csv'

In [4]:
import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [5]:
# Run on the GPU whenever CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class MCQDataset(torch.utils.data.Dataset):
    """Per-row windows of embedded (question, choice) interactions.

    The CSV at ``datapath`` must provide the columns ``user_id``, ``question``,
    ``choice`` and ``start_time``. Every ``question`` and ``choice`` text is
    embedded once at construction time. Item ``idx`` is the chronological
    window of at most ``seq_len`` rows of the same user that ends with row
    ``idx``; each step of the window is the question embedding concatenated
    with the choice embedding.

    Args:
        datapath: Path (or file-like object) handed to ``pandas.read_csv``.
        seq_len: Maximum number of rows in a returned window.
        device: Device the sentence encoder is moved to before encoding.
            Defaults to CUDA when available, otherwise the CPU.
    """

    # Sentence encoder shared by all instances so it is only loaded once.
    _nlp_model = None

    @property
    def nlp_model(self):
        """Return the shared sentence encoder, loading it on first access."""
        # Compare against None explicitly: the truth value of a module
        # container depends on its length, not on whether it was created.
        if MCQDataset._nlp_model is None:
            # Imported lazily so the heavy dependency is only needed when the
            # encoder really has to be created.
            from sentence_transformers import SentenceTransformer
            MCQDataset._nlp_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
        return MCQDataset._nlp_model

    def __init__(self, datapath, seq_len=5, device=None):
        self.datapath = datapath
        self.seq_len = seq_len
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device

        self.df = pd.read_csv(self.datapath)

        # Embed the question and the chosen answer of every row up front.
        self.df['question_embedding'] = self._create_q_embeddings()
        self.df['answer_embedding'] = self._create_a_embeddings()

        # Pre-compute each user's chronological history so that __getitem__
        # does not have to filter and sort the whole frame for every item.
        self._build_history_index()

    def _embed_column(self, column):
        """Encode the texts of ``column`` and return one 1-D vector per row."""
        texts = self.df[column].tolist()
        embeddings = np.asarray(self.nlp_model.to(self.device).encode(texts))
        # Iterating over the first axis yields one vector per input text.
        return list(embeddings)

    def _create_q_embeddings(self):
        """Return one embedding per row for the ``question`` column."""
        return self._embed_column("question")

    def _create_a_embeddings(self):
        """Return one embedding per row for the ``choice`` column."""
        return self._embed_column("choice")

    def _build_history_index(self):
        """Index every row by its position inside its user's history.

        After this call:
          * ``_order`` holds row positions grouped by user, each group sorted
            by ``start_time`` (ties keep their file order),
          * ``_group_start[i]`` is the offset in ``_order`` at which the group
            of row ``i``'s user begins,
          * ``_rank[i]`` is the position of row ``i`` inside that group.
        """
        n_rows = len(self.df)
        # A fresh frame gets a 0..n-1 index, i.e. positional row numbers.
        meta = pd.DataFrame({
            "user_id": self.df["user_id"].to_numpy(),
            "start_time": self.df["start_time"].to_numpy(),
        })
        # mergesort is stable, so rows with equal start_time keep file order.
        meta = meta.sort_values(by="start_time", kind="mergesort")

        self._order = np.empty(n_rows, dtype=np.int64)
        self._group_start = np.empty(n_rows, dtype=np.int64)
        self._rank = np.empty(n_rows, dtype=np.int64)

        offset = 0
        # groupby keeps the (sorted) row order inside every group;
        # dropna=False keeps rows without a user id addressable as well.
        for _, group in meta.groupby("user_id", sort=False, dropna=False):
            rows = group.index.to_numpy()
            stop = offset + len(rows)
            self._order[offset:stop] = rows
            self._group_start[rows] = offset
            self._rank[rows] = np.arange(len(rows))
            offset = stop

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the window ending at row ``idx``.

        The result is a float32 tensor with one row per interaction (at most
        ``seq_len``, oldest first). The window is located through the row's
        own position, so a user who gave the same answer to the same question
        more than once gets the correct window for every occurrence.

        Raises:
            IndexError: if ``idx`` is out of range (this also terminates plain
                ``for item in dataset`` iteration).
        """
        rank = self._rank[idx]
        group_start = self._group_start[idx]
        first = max(0, rank - self.seq_len + 1)
        rows = self._order[group_start + first: group_start + rank + 1]

        window = self.df.iloc[rows]
        questions = np.stack(window['question_embedding'].to_list())
        answers = np.stack(window['answer_embedding'].to_list())
        features = np.concatenate((questions, answers), axis=1)

        return torch.tensor(features, dtype=torch.float32)

In [6]:
# One dataset per split; every item is a window of at most 10 interactions.
train_dataset, val_dataset, test_dataset = [
    MCQDataset(csv_path, seq_len=10)
    for csv_path in ('./train_qna.csv', './val_qna.csv', './test_qna.csv')
]

Downloading (…)0fe39/.gitattributes:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)83e900fe39/README.md:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading (…)e900fe39/config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading (…)900fe39/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [7]:
def _split_multi_step(dataset, desc):
    """Walk ``dataset`` once and separate the sequences longer than one step.

    Args:
        dataset: Iterable of tensors whose first dimension is the sequence
            length.
        desc: Label shown on the progress bar.

    Returns:
        ``(keep_mask, kept)`` where ``keep_mask`` has one bool per item of
        ``dataset`` (True when the item has more than one step) and ``kept``
        lists the items for which the mask is True, in order.
    """
    keep_mask, kept = [], []
    for seq in tqdm(dataset, desc=desc):
        has_history = seq.shape[0] > 1
        keep_mask.append(has_history)
        if has_history:
            kept.append(seq)
    return keep_mask, kept


train_index_to_filter, train_filtered = _split_multi_step(train_dataset, "train")
val_index_to_filter, val_filtered = _split_multi_step(val_dataset, "val")
test_index_to_filter, test_filtered = _split_multi_step(test_dataset, "test")

  0%|          | 0/145620 [00:00<?, ?it/s]

  0%|          | 0/43006 [00:00<?, ?it/s]

  0%|          | 0/32351 [00:00<?, ?it/s]

In [73]:
from statistics import mean
from tqdm.auto import tqdm
from pathlib import Path
from torch.nn import MSELoss

def get_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    name = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(name)


def instantiate_model(model, train_set, encoding_dim, **kwargs):
    """Build ``model`` with an input size taken from the last training sample.

    Args:
        model: Model class; dispatch is done on ``model.__name__``
            (``"LINEAR_AE"``, ``"LSTM_AE"`` or ``"CONV_LSTM_AE"``).
        train_set: Indexable collection of samples exposing ``.shape``.
        encoding_dim: Size of the encoding, forwarded to the constructor.
        **kwargs: Extra keyword arguments forwarded to the constructor.

    Returns:
        The instantiated model.

    Raises:
        ValueError: if the model name is not supported, or a ``CONV_LSTM_AE``
            sample is neither 3- nor 4-dimensional. (These cases used to fall
            through and return ``None``.)
    """
    name = model.__name__

    if name in ("LINEAR_AE", "LSTM_AE"):
        # The last dimension of a sample is the per-step feature size.
        return model(train_set[-1].shape[-1], encoding_dim, **kwargs)

    if name == "CONV_LSTM_AE":
        sample_shape = train_set[-1].shape
        if len(sample_shape) == 3:  # 2D elements
            return model(sample_shape[-2:], encoding_dim, **kwargs)
        if len(sample_shape) == 4:  # 3D elements
            return model(sample_shape[-3:], encoding_dim, **kwargs)
        raise ValueError(
            f"CONV_LSTM_AE expects samples with 3 or 4 dimensions, got {len(sample_shape)}"
        )

    raise ValueError(f"Unsupported model: {name}")

@torch.no_grad()
def validate_model(model, val_set, epoch, criterion, device):
    """Return the mean reconstruction loss of ``model`` over ``val_set``.

    The model is switched to eval mode and gradients are disabled for the
    whole call. ``epoch`` is only used to label the progress bar.
    """
    model.eval()

    def _sample_loss(sample):
        # Reconstruct one sample on the target device and score it.
        sample = sample.to(device)
        reconstruction = model(sample)
        return criterion(reconstruction, sample).item()

    progress = tqdm(val_set, desc=f"Val Epoch {epoch: 3d}")
    return mean([_sample_loss(sample) for sample in progress])
        

def train_model(
    model, train_set, val_set, verbose, lr, epochs, denoise, clip_value, device=None, save_path=Path("./checkpoints")
):
    """Train ``model`` as an autoencoder, checkpointing and validating every epoch.

    Each sample of ``train_set`` is one optimisation step (Adam, summed MSE
    between the sample and its reconstruction).

    Args:
        model: Module to train; moved to ``device``.
        train_set: Iterable of training tensors.
        val_set: Iterable of validation tensors, passed to ``validate_model``.
        verbose: When true, print the train/validation loss after each epoch.
        lr: Learning rate for Adam.
        epochs: Number of epochs to run.
        denoise: Accepted for signature compatibility; not used here.
        clip_value: Max gradient norm, or ``None`` to disable clipping.
        device: Target device; defaults to ``get_device()``.
        save_path: Directory that receives ``model_XXX.pt`` after every epoch.
            May be a ``str`` or a ``Path``; it is created when missing.

    Returns:
        ``(mean_losses, val_losses)``: per-epoch mean training loss and
        per-epoch validation loss.
    """
    if device is None:
        device = get_device()

    # Make sure checkpoints can be written even when this function is called
    # directly with a directory that does not exist yet.
    save_path = Path(save_path)
    save_path.mkdir(parents=True, exist_ok=True)

    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = MSELoss(reduction="sum")

    mean_losses = []
    val_losses = []
    for epoch in range(1, epochs + 1):
        model.train()

        losses = []
        for x in tqdm(train_set, desc=f"Train Epoch {epoch: 3d}"):
            x = x.to(device, non_blocking=True)

            optimizer.zero_grad()

            # Forward pass
            x_prime = model(x)
            loss = criterion(x_prime, x)

            # Backward pass
            loss.backward()

            # Gradient clipping on norm
            if clip_value is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

            optimizer.step()

            losses.append(loss.item())

        mean_loss = mean(losses)
        mean_losses.append(mean_loss)

        # Checkpoint before validating so the epoch's weights are on disk even
        # if validation is interrupted.
        torch.save(model.state_dict(), str(save_path / f"model_{epoch:03d}.pt"))

        val_loss = validate_model(model, val_set, epoch, criterion, device)
        val_losses.append(val_loss)

        if verbose:
            print(f"Epoch: {epoch}, Train loss: {mean_loss}, Validation Loss: {val_loss}")

    return mean_losses, val_losses


@torch.no_grad()
def get_encodings(model, train_set, device=None):
    """Run ``model.encoder`` over every sample of ``train_set``.

    The model is put in eval mode and gradients are disabled. Each sample is
    moved to ``device`` (``get_device()`` when omitted) before encoding.

    Returns:
        A list with one encoder output per sample, in input order.
    """
    target = get_device() if device is None else device
    model.eval()

    encodings = []
    for sample in tqdm(train_set):
        encodings.append(model.encoder(sample.to(target)))
    return encodings


######
# MAIN
######


def quick_train(
    model,
    train_set,
    val_set,
    encoding_dim,
    verbose=False,
    lr=1e-3,
    epochs=50,
    clip_value=1,
    denoise=False,
    device=None,
    save_path=Path("./checkpoints"),
    **kwargs,
):
    """Instantiate ``model``, train it and hand back its two halves.

    ``kwargs`` are forwarded to the model constructor through
    ``instantiate_model``. The checkpoint directory ``save_path`` is created
    before training starts.

    Returns:
        ``(encoder, decoder, train_losses, val_losses)``.
    """
    net = instantiate_model(model, train_set, encoding_dim, **kwargs)

    # The training loop writes one checkpoint per epoch into this directory.
    save_path.mkdir(parents=True, exist_ok=True)

    losses = train_model(
        net, train_set, val_set, verbose, lr, epochs, denoise, clip_value, device, save_path
    )
    train_losses, val_losses = losses

    return net.encoder, net.decoder, train_losses, val_losses

In [62]:
from sequitur.models import LSTM_AE

# Train on the filtered training sequences. `train_gpu` was never defined in
# this notebook (leftover kernel state); train_model moves every sample to the
# target device itself, so the filtered list can be passed directly.
encoder, decoder, train_losses, val_losses = quick_train(
    LSTM_AE,
    train_filtered,
    val_filtered,
    encoding_dim=512,
    verbose=True,
    lr=1e-3,
    epochs=50,
    denoise=False,
    h_dims=[512, 512, 512],
)

Train Epoch   1:   0%|          | 0/135267 [00:00<?, ?it/s]

Val Epoch   1:   0%|          | 0/35007 [00:00<?, ?it/s]

Epoch: 1, Train loss: 156.61422064648283, Validation Loss: 90.17015298436436


Train Epoch   2:   0%|          | 0/135267 [00:00<?, ?it/s]

Val Epoch   2:   0%|          | 0/35007 [00:00<?, ?it/s]

Epoch: 2, Train loss: 122.75223265864777, Validation Loss: 89.74077505458817


Train Epoch   3:   0%|          | 0/135267 [00:00<?, ?it/s]

Epoch: 3, Train loss: 122.94635681162087, Validation Loss: 89.98076720173303


Train Epoch   4:   0%|          | 0/135267 [00:00<?, ?it/s]

Epoch: 5, Train loss: 122.91766345850905, Validation Loss: 90.34999194363006


Train Epoch   6:   0%|          | 0/135267 [00:00<?, ?it/s]

Val Epoch  11:   0%|          | 0/35007 [00:00<?, ?it/s]

Epoch: 11, Train loss: 123.03513567070489, Validation Loss: 90.16574310107461


Train Epoch  12:   0%|          | 0/135267 [00:00<?, ?it/s]

In [None]:
import os
import subprocess
from IPython.display import FileLink, display

def download_file(path, download_file_name, working_dir='/kaggle/working/'):
    """Zip ``path`` and display a download link for the archive.

    Args:
        path: File or directory to archive (relative paths are resolved
            against ``working_dir``, which becomes the current directory).
        download_file_name: Archive name without the ``.zip`` extension.
        working_dir: Directory to switch to and in which the archive is
            written. Defaults to the Kaggle working directory.

    Side effects:
        Changes the process working directory to ``working_dir``.
    """
    os.chdir(working_dir)
    zip_name = os.path.join(working_dir, f"{download_file_name}.zip")
    # Pass the arguments as a list (no shell) so that names containing spaces
    # or shell metacharacters are handled literally.
    try:
        result = subprocess.run(
            ["zip", "-r", zip_name, path], capture_output=True, text=True
        )
    except FileNotFoundError as error:
        # The zip executable itself is missing.
        print("Unable to run zip command!")
        print(error)
        return
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(f'{download_file_name}.zip'))


download_file("./checkpoints", "out")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

fig, ax = plt.subplots()

# The first epoch is left out of the plot; the x values are the real epoch
# numbers of the remaining points (the second list entry is epoch 2).
train_epochs = list(range(2, len(train_losses) + 1))
val_epochs = list(range(2, len(val_losses) + 1))

sns.lineplot(x=train_epochs, y=train_losses[1:], label="Train Loss", ax=ax)
sns.lineplot(x=val_epochs, y=val_losses[1:], label="Validation Loss", ax=ax)
ax.set_yscale('log')
ax.set(xlabel="Epoch", ylabel="Loss", title="Autoencoder reconstruction loss");

In [71]:
import glob

# Score every saved checkpoint on the first 10000 filtered validation
# sequences. Paths are sorted and printed with their loss because glob returns
# files in arbitrary order. Note that the criterion here is mean-reduced MSE,
# so the values are not on the same scale as the summed loss logged in training.
for mpath in sorted(glob.glob("./checkpoints/*")):
    # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
    ckpt = torch.load(mpath, map_location=device)
    mloaded = LSTM_AE(val_filtered[-1].shape[-1], 512, h_dims=[512, 512, 512])
    mloaded.load_state_dict(ckpt)
    val_loss = validate_model(mloaded.to(device), val_filtered[:10000], 1, torch.nn.functional.mse_loss, device)
    print(f"{mpath}: {val_loss}")

Val Epoch   1:   0%|          | 0/10000 [00:00<?, ?it/s]

0.02002157229902223


Val Epoch   1:   0%|          | 0/10000 [00:00<?, ?it/s]

0.019925999245140703


Val Epoch   1:   0%|          | 0/10000 [00:00<?, ?it/s]

0.019988625720608978


Val Epoch   1:   0%|          | 0/10000 [00:00<?, ?it/s]

0.019891070815501736


Val Epoch   1:   0%|          | 0/10000 [00:00<?, ?it/s]

0.020066130213579163


Val Epoch   1:   0%|          | 0/10000 [00:00<?, ?it/s]

0.020145628695655615


Val Epoch   1:   0%|          | 0/10000 [00:00<?, ?it/s]

0.02007863878160715


Val Epoch   1:   0%|          | 0/10000 [00:00<?, ?it/s]

0.020226230228925123


Val Epoch   1:   0%|          | 0/10000 [00:00<?, ?it/s]

0.02002329792706296


Val Epoch   1:   0%|          | 0/10000 [00:00<?, ?it/s]

0.02014954390199855


Val Epoch   1:   0%|          | 0/10000 [00:00<?, ?it/s]

0.0200246145113837


In [76]:
# Training writes its checkpoints to ./checkpoints; fall back to the working
# directory for a copy that was placed there by hand.
ckpt_path = Path("./checkpoints/model_004.pt")
if not ckpt_path.exists():
    ckpt_path = Path("./model_004.pt")

# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
ckpt = torch.load(ckpt_path, map_location=device)
mloaded = LSTM_AE(val_filtered[-1].shape[-1], 512, h_dims=[512, 512, 512])
mloaded.load_state_dict(ckpt)
mloaded = mloaded.to(device)

# Encode the unfiltered datasets so that there is one encoding per CSV row.
train_embeddings = get_encodings(mloaded, train_dataset)
val_embeddings = get_encodings(mloaded, val_dataset)
test_embeddings = get_encodings(mloaded, test_dataset)

  0%|          | 0/145620 [00:00<?, ?it/s]

  0%|          | 0/43006 [00:00<?, ?it/s]

  0%|          | 0/32351 [00:00<?, ?it/s]

In [96]:
# Fetch the per-question pickles that the embeddings are merged into.
# The URLs are regular "share" links, hence fuzzy=True.
QNA_PICKLE_URLS = {
    "./train_qna_initial.pkl": "https://drive.google.com/file/d/124QDhAo21G4jWOA9L-LIKMjZe8PY0zE0/view?usp=sharing",
    "./val_qna_initial.pkl": "https://drive.google.com/file/d/1cnQGQ6Y--Cd3Ri_vQuOgsRmZ-BNhznoY/view?usp=sharing",
    "./test_qna_initial.pkl": "https://drive.google.com/file/d/16Kis7d7tTu9SUwBT6kZt4kfKvW1M0q6d/view?usp=sharing",
}

for output, url in QNA_PICKLE_URLS.items():
    downloaded = gdown.download(url=url, output=output, quiet=False, fuzzy=True)
    # Some gdown versions report a failed download by returning None instead of
    # raising; stop here rather than failing later with a missing file.
    if downloaded is None:
        raise RuntimeError(f"Download failed for {output} ({url})")

Downloading...
From: https://drive.google.com/uc?id=124QDhAo21G4jWOA9L-LIKMjZe8PY0zE0
To: /workspace/train_qna_initial.pkl
100%|██████████| 63.1M/63.1M [00:25<00:00, 2.52MB/s]
Downloading...
From: https://drive.google.com/uc?id=1cnQGQ6Y--Cd3Ri_vQuOgsRmZ-BNhznoY
To: /workspace/val_qna_initial.pkl
100%|██████████| 18.9M/18.9M [00:01<00:00, 16.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=16Kis7d7tTu9SUwBT6kZt4kfKvW1M0q6d
To: /workspace/test_qna_initial.pkl
100%|██████████| 14.1M/14.1M [00:01<00:00, 12.6MB/s]


'./test_qna_initial.pkl'

In [97]:
# One encoder output per CSV row, converted to a NumPy array on the CPU.
train_csv = pd.read_csv("./train_qna.csv")
train_csv['embeddings'] = [encoding.detach().cpu().numpy() for encoding in train_embeddings]

# NOTE(review): read_pickle executes code embedded in the file; only use it on
# files from a trusted source.
train_qna_pkl = pd.read_pickle("./train_qna_initial.pkl")[
    ["user_id", "multiple_responses", "question", "choices", "correct", "student_answer", "start_time"]
]
# Take the embedding of the last CSV row of every question_index group; the
# assignment aligns that Series with the pickle's index.
# NOTE(review): assumes the pickle's index holds question_index values -- confirm.
train_qna_pkl["embedding"] = (
    train_csv
    .groupby("question_index")
    .apply(lambda rows: rows.iloc[-1])
    ["embeddings"]
)
train_qna_pkl.to_pickle("./train_pkl_with_embeddings")

In [98]:
# One encoder output per CSV row, converted to a NumPy array on the CPU.
val_csv = pd.read_csv("./val_qna.csv")
val_csv['embeddings'] = [encoding.detach().cpu().numpy() for encoding in val_embeddings]

# NOTE(review): read_pickle executes code embedded in the file; only use it on
# files from a trusted source.
val_qna_pkl = pd.read_pickle("./val_qna_initial.pkl")[
    ["user_id", "multiple_responses", "question", "choices", "correct", "student_answer", "start_time"]
]
# Take the embedding of the last CSV row of every question_index group; the
# assignment aligns that Series with the pickle's index.
# NOTE(review): assumes the pickle's index holds question_index values -- confirm.
val_qna_pkl["embedding"] = (
    val_csv
    .groupby("question_index")
    .apply(lambda rows: rows.iloc[-1])
    ["embeddings"]
)
val_qna_pkl.to_pickle("./val_pkl_with_embeddings")

In [99]:
# One encoder output per CSV row, converted to a NumPy array on the CPU.
test_csv = pd.read_csv("./test_qna.csv")
test_csv['embeddings'] = [encoding.detach().cpu().numpy() for encoding in test_embeddings]

# NOTE(review): read_pickle executes code embedded in the file; only use it on
# files from a trusted source.
test_qna_pkl = pd.read_pickle("./test_qna_initial.pkl")[
    ["user_id", "multiple_responses", "question", "choices", "correct", "student_answer", "start_time"]
]
# Take the embedding of the last CSV row of every question_index group; the
# assignment aligns that Series with the pickle's index.
# NOTE(review): assumes the pickle's index holds question_index values -- confirm.
test_qna_pkl["embedding"] = (
    test_csv
    .groupby("question_index")
    .apply(lambda rows: rows.iloc[-1])
    ["embeddings"]
)
test_qna_pkl.to_pickle("./test_pkl_with_embeddings")

In [104]:
# Stack the three splits into one frame with a fresh 0..n-1 index, persist it
# and preview the first two rows.
qna_with_embeddings = pd.concat(
    [train_qna_pkl, val_qna_pkl, test_qna_pkl],
    ignore_index=True,
)
qna_with_embeddings.to_pickle("merged_qna_with_embeddings.pkl")
qna_with_embeddings.head(2)

Unnamed: 0,user_id,multiple_responses,question,choices,correct,student_answer,start_time,embedding
0,387604,True,<p>Marlyne Sahakian ist Assistenzprofessorin a...,"[<p>…wundert sich darüber, dass SchweizerInnen...","[False, False, True, False]","[False, True, True, False]",2021-10-31 18:36:44.534,"[0.24643469, -0.05258508, 0.5322496, -0.043402..."
1,387604,True,<strong>Wähle die korrekte Pluralform(en) als ...,"[die Daten, die Data, die Datume]","[True, False, False]","[True, False, False]",2021-11-09 07:57:38.255,"[0.22013251, -0.052667905, 0.59776336, -0.0211..."
