## Analysis

In [3]:
# --- Remote storage locations -------------------------------------------
# Every Drive-side path hangs off BASE_PATH.
BASE_PATH = '/content/drive/MyDrive/Health/'
DATASET_PATH = f'{BASE_PATH}dataset/'
VOCAB_PATH = f'{BASE_PATH}vocab.txt'

TEST_SET_PATH = f'{BASE_PATH}test_1000-1099.fas'

# Prefix for download links; a Drive file id is appended to it below.
URL_DRIVE = 'https://drive.google.com/uc?id='

# BASIC DATASET (disabled alternative to the K512 ids below)
# G_TRAIN_FILE = URL_DRIVE + '1-1eXBY8yHHmWDlzr-gZXAX8POor-P1xs'
# G_DEV_FILE   = URL_DRIVE + '1bo_JKi6SbcRH8l-2GhIcdl1AnWvNaOYK'
# G_TEST_FILE  = URL_DRIVE + '1-2zqIzM3FgsfTi4jJmC2wBb9KHjKk_xx'

# K512 DATASET: remote URLs of the sequence/id tables
G_TRAIN_FILE = f'{URL_DRIVE}1-ESFbZab0N7npe1Q01549CqEKLfQoCDd'
G_VALID_FILE = f'{URL_DRIVE}1-GSyPyObFCwFqa8amuizDIQbE51sPAKw'
G_TEST_FILE = f'{URL_DRIVE}1-8EQbga_UpaV3wwBxuUc7dzij_LJzQ3a'

# Local destinations for the sequence/id tables
TRAIN_FILE = 'dataset/train/train.tsv'
VALID_FILE = 'dataset/train/dev.tsv'
TEST_FILE = 'dataset/test/dev.tsv'

# ID_LABELS: remote URLs of the id/label tables
TRAIN_ID_LABELS = f'{URL_DRIVE}1_D-b0-R4ybQUqzjA8rHxjvNQxBF36V-G'
VALID_ID_LABELS = f'{URL_DRIVE}1-P13Uomv9SeBQondNdqNIKsYgWVkpbrj'
TEST_ID_LABELS = f'{URL_DRIVE}1SRO5FxufHQcjGHFTXHuXy-M_xDAkqUWm'

# Local destinations for the id/label tables
TRAIN_ID_LABELS_FILE = 'dataset/train/train_id_labels.txt'
VALID_ID_LABELS_FILE = 'dataset/train/dev_id_labels.txt'
TEST_ID_LABELS_FILE = 'dataset/test/dev_id_labels.txt'

K = 6
SPLIT_SIZE = 3584  # 512 sequences

In [4]:
!pip -q install gdown

In [5]:
from itertools import product
import random
import glob
import os
import time
import shutil

import gdown

In [6]:
# Create the local working directories; existing ones are left untouched.
for directory in ('dataset', 'dataset/train', 'dataset/test', 'output'):
    os.makedirs(directory, exist_ok=True)

In [7]:
# downloading dataset seq-id
gdown.download(G_TRAIN_FILE, TRAIN_FILE, quiet=True)
gdown.download(G_VALID_FILE, VALID_FILE, quiet=True)
gdown.download(G_TEST_FILE, TEST_FILE, quiet=True)

# downloading dataset id-label
gdown.download(TRAIN_ID_LABELS, TRAIN_ID_LABELS_FILE, quiet=True)
gdown.download(VALID_ID_LABELS, VALID_ID_LABELS_FILE, quiet=True)
gdown.download(TEST_ID_LABELS, TEST_ID_LABELS_FILE, quiet=True)

In [8]:
# Sanity check: list the contents of the local train directory.
!ls dataset/train

In [10]:
import pandas as pd

# Read the tab-separated validation sequence table through the shared
# VALID_FILE constant instead of repeating the path literal.
valid_raw = pd.read_csv(VALID_FILE, sep='\t', header=0)

# Drop only the final row, without mutating the frame that was just read.
# NOTE(review): the reason is not visible here; presumably the last record
# is incomplete. Confirm against the dataset builder.
df = valid_raw.drop(valid_raw.tail(1).index)

In [11]:
# Read the id/label table and rename its key column to "id" so it lines up
# with the key used when merging with the sequence table.
labels_df = (
    pd.read_csv(VALID_ID_LABELS_FILE, sep='\t', header=0)
    .rename(columns={"seq_id": "id"})
)

In [12]:
# Join sequences with their labels on the shared "id" key, then discard the
# key column, which is only needed for the join.
df_merged = df.merge(labels_df, on='id').drop(columns=['id'])
df_merged.head()

In [13]:
from datasets import Dataset

# Convert the merged frame to a Hugging Face Dataset.
# preserve_index=False keeps the pandas index out of the dataset altogether.
# Removing an "__index_level_0__" column afterwards only works when pandas
# happened to emit one, and raises when the frame has a default RangeIndex.
dataset = Dataset.from_pandas(df_merged, preserve_index=False)
dataset

In [14]:
import torch
from transformers import BigBirdTokenizer, BigBirdForSequenceClassification

# Single source of truth for the checkpoint shared by tokenizer and model.
MODEL_CHECKPOINT = "l-yohai/bigbird-roberta-base-mnli"
# Number of target classes. The value 2 was previously passed to the
# tokenizer, where it has no effect on the classification head.
NUM_LABELS = 2

tokenizer = BigBirdTokenizer.from_pretrained(MODEL_CHECKPOINT)

# num_labels belongs on the model: it sizes the classification head.
# NOTE(review): the checkpoint name suggests an MNLI head with a different
# label count; ignore_mismatched_sizes=True lets the head be re-initialised
# with NUM_LABELS outputs instead of failing on the shape mismatch. Confirm
# that the dataset really has NUM_LABELS classes.
model = BigBirdForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True,
)

In [15]:
def tokenize_function(examples):
    """Tokenize the "sequence" field of a batch of examples.

    Every sequence is padded to the maximum length and truncated when it is
    longer, so all encoded rows end up with the same length.
    """
    encoded = tokenizer(
        examples["sequence"],
        truncation=True,
        padding="max_length",
    )
    return encoded


# Apply the tokenizer over the whole dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [18]:
# Build small, DISJOINT train/eval subsets from a single shuffle.
# Shuffling twice with the same seed and taking range(1000) / range(100)
# made every eval row also a train row (data leakage).
TRAIN_SUBSET_SIZE = 1000
EVAL_SUBSET_SIZE = 100

shuffled_dataset = tokenized_datasets.shuffle(seed=42)
if len(shuffled_dataset) < TRAIN_SUBSET_SIZE + EVAL_SUBSET_SIZE:
    raise ValueError(
        f"Need at least {TRAIN_SUBSET_SIZE + EVAL_SUBSET_SIZE} rows for disjoint "
        f"train/eval subsets, got {len(shuffled_dataset)}"
    )

# The train subset is the same first 1000 shuffled rows as before.
small_train_dataset = shuffled_dataset.select(range(TRAIN_SUBSET_SIZE))
# The eval subset takes the next 100 rows, so it never overlaps with train.
small_eval_dataset = shuffled_dataset.select(
    range(TRAIN_SUBSET_SIZE, TRAIN_SUBSET_SIZE + EVAL_SUBSET_SIZE)
)

In [16]:
from transformers import TrainingArguments

# Disable the Weights & Biases integration BEFORE the training arguments are
# created: the set of reporting integrations is resolved at construction
# time, so setting the variable just before trainer.train() is too late on a
# fresh Restart & Run All.
os.environ["WANDB_DISABLED"] = "true"

# Default hyper-parameters; checkpoints and logs go to ./test_trainer.
training_args = TrainingArguments("test_trainer")

In [19]:
from transformers import Trainer

# Wire the model, the run configuration and the two data subsets together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

In [22]:
# Set WANDB_DISABLED in the process environment, then launch fine-tuning.
# NOTE(review): the variable is set after the trainer was constructed;
# confirm it still takes effect at this point on a fresh kernel.
os.environ.update(WANDB_DISABLED="true")
trainer.train()

In [None]:
# Take the first evaluation example and encode it as a batch of one.
LONG_SEQUENCE = small_eval_dataset[0]["sequence"]
# truncation=True mirrors the training-time tokenization, so an over-long
# sequence is cut to the tokenizer's maximum length instead of producing an
# input the model cannot accept.
inputs = tokenizer(LONG_SEQUENCE, truncation=True, return_tensors="pt")
list(inputs["input_ids"].shape)

In [None]:
# Switch to inference mode: after training the model is still in train mode,
# so dropout would make the prediction non-deterministic.
model.eval()

# Training may have moved the model to an accelerator; the freshly tokenized
# tensors must live on the same device as the model's weights.
device_inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

with torch.no_grad():
    logits = model(**device_inputs).logits

# Batch of one: argmax over the class dimension yields a single class id.
predicted_class_id = logits.argmax(dim=-1).item()
model.config.id2label[predicted_class_id]