In [None]:
import gc
import os
import pandas as pd
import pytorch_lightning as pl
import pytorch_lightning.callbacks as callbacks
import torch
import wandb

from pytorch_lightning.loggers import WandbLogger
from sklearn.model_selection import train_test_split

## Data preprocessing and augmentation

In [None]:
import preprocessing as pp

# One seed for the split and for the global RNGs (python / numpy / torch).
# NOTE(review): this only makes the augmentation reproducible if
# `pp.random_sampling_augment` draws from those global RNGs - confirm.
SEED = 42
pl.seed_everything(SEED)

# Minimum `len()` of the processed `text` field for a training row to be kept
MIN_TEXT_LENGTH = 64

# Import and read data
df = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

# Preprocess data:
#     - Filter duplicated features
#     - Impute null entries
#     - Concatenate titles and summaries
#     - Multi-hot encode labels
#     - Remove stopwords
#     - Lemmatize
#     - Filter short texts from training data
df = pp.process(df, 'train')
df_test = pp.process(df_test, 'test')
df['length'] = df['text'].apply(len)
# `reset_index(drop=True)` gives the filtered frame a clean 0..n-1 index
# (the previous bare `.reindex()` left the index untouched).
df = df[df['length'] >= MIN_TEXT_LENGTH].reset_index(drop=True)

# list of all labels: every column except the text and its helper length column
labels = [col for col in df.columns if col not in ('text', 'length')]

# stratified split into train/val sets
X, y = df['text'], df[labels]
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size = 0.2,
    shuffle = True,
    stratify = y,
    random_state = SEED
)

# Oversample minority classes
# Randomly sample tokens
# Join tokens
num_samples = 1000 # number of samples per class
train_data = y_train.join(X_train)
augmented_train = pp.random_sampling_augment(train_data, labels, num_samples)

# reformat data
X_train, y_train = augmented_train['text'], augmented_train[labels]
X_val = X_val.apply(pp.join_tokens)
X_test = df_test['text'].apply(pp.join_tokens)

# compute class weights for loss
pos_weight = pp.get_pos_weight(y_train)

# prepare dataset for dataloader
data = {
    'train': (X_train, y_train),
    'val': (X_val, y_val),
    'test': X_test,
    'predict': X_test
}

# Defining the model and training loop

In [None]:
import model

def train(
        config = None,
        data = data,
        num_labels = len(labels),
        pos_weight = pos_weight
        ):
    """Run one Weights & Biases tracked training run.

    Note: the defaults for ``data``, ``num_labels`` and ``pos_weight`` are
    evaluated once, when this ``def`` executes. Re-run this cell after
    re-running the data preparation cell, or pass the values explicitly.

    Args:
        config: Hyperparameters handed to ``wandb.init``. The keys read here
            are ``epochs``, ``weight_decay``, ``max_lr``, ``pct_start``,
            ``dropout``, ``vocab_length``, ``max_length``, ``embedding_dim``,
            ``hidden_dim`` and ``batch_size``.
        data: Dataset container forwarded unchanged to
            ``model.arXivDataModule``.
        num_labels: Number of target labels, forwarded to the classifier.
        pos_weight: Per-label positive weights; converted with
            ``torch.tensor`` and forwarded to the classifier.

    Returns:
        Tuple ``(trainer, classifier, datamodule)``. Training failures are
        reported but not raised; any object that could not be constructed
        before the failure is returned as ``None``.
    """
    import traceback  # only needed on the failure path

    # Pre-bind the results so the final `return` cannot raise
    # UnboundLocalError (and hide the real error) when setup fails early.
    trainer = classifier = datamodule = None

    with wandb.init(config=config):
        # From here on read hyperparameters through the run's config object.
        config = wandb.config

        model_config = {
            'weight_decay': config.weight_decay,
            'max_lr': config.max_lr,
            'pct_start': config.pct_start,
            'dropout': config.dropout,
            'num_labels': num_labels,
            'vocab_length': config.vocab_length,
            'max_length': config.max_length,
            'embedding_dim': config.embedding_dim,
            'hidden_dim': config.hidden_dim,
            'pos_weight': torch.tensor(pos_weight)
        }
        datamodule_config = {
            'batch_size': config.batch_size,
            'max_length': config.max_length,
            'vocab_length': config.vocab_length,
            # os.cpu_count() may return None when the count is undetermined
            'n_cores': os.cpu_count() or 1
        }

        # Release memory left over from a previous run before allocating again.
        gc.collect()
        torch.cuda.empty_cache()

        try:
            classifier = model.PLEmbeddingClassifier(config=model_config)
            datamodule = model.arXivDataModule(config=datamodule_config,data=data)
            wandb_logger = WandbLogger(log_model=True)
            trainer = pl.Trainer(
                accelerator = 'gpu',
                devices = 'auto',
                max_epochs = config.epochs,
                benchmark = True,
                precision = 16,
                log_every_n_steps = 5,
                logger = wandb_logger,
                strategy = 'ddp_notebook_find_unused_parameters_false',
                callbacks = [
                    callbacks.ModelCheckpoint(monitor='val_loss', mode='min'),
                    callbacks.ModelSummary(2),
                    callbacks.LearningRateMonitor(logging_interval = 'step')
                ]
            )
            trainer.fit(classifier, datamodule)
        except Exception:
            # Stay best-effort (the caller still gets whatever was built), but
            # show the full traceback instead of only the exception message.
            traceback.print_exc()

        return trainer, classifier, datamodule

In [None]:
# Hyperparameters for a single training run, consumed by `train` via wandb.config
train_config = dict(
    # schedule / optimisation
    epochs=6,
    weight_decay=4.695,
    max_lr=1.743e-3,
    pct_start=0.3332,
    # regularisation and model size
    dropout=1.342e-3,
    embedding_dim=1024,
    hidden_dim=256,
    # batching and tokenisation
    batch_size=512,
    max_length=32,
    vocab_length=8192,
)

# Training

In [None]:
# Close any W&B run still open from an earlier execution of this cell,
# then launch a new run with the hyperparameters defined in `train_config`.
wandb.finish()
trainer, classifier, datamodule = train(config=train_config)

In [None]:
# Write a checkpoint of the trainer's model, requesting weights only.
# NOTE(review): despite the file name, this presumably stores the trainer's
# current state rather than the best `val_loss` checkpoint tracked by the
# ModelCheckpoint callback - confirm.
trainer.save_checkpoint(
    filepath='./output/best_model.ckpt',
    weights_only=True,
)

# Inference

In [None]:
# put classifier in evaluation mode
classifier.eval()
# set up prediction dataloader
datamodule.setup(stage='predict')
prediction_DL = datamodule.predict_dataloader()
# need trainer on CPU for inference
newTrainer = pl.Trainer(accelerator='cpu')
# generate predictions: concatenate the per-batch outputs returned by predict()
preds = torch.cat(newTrainer.predict(classifier, prediction_DL))
# process predictions tensor into DataFrame, one column per label
preds_df = pd.DataFrame(preds.numpy(), columns=labels)
test_ids = pd.read_csv('./data/test.csv')
# Fail loudly on a row-count mismatch; an index-aligned assignment would
# otherwise silently produce NaN or misaligned ids.
if len(test_ids) != len(preds_df):
    raise ValueError(
        f'Prediction count ({len(preds_df)}) does not match the number of '
        f'rows in test.csv ({len(test_ids)})'
    )
preds_df['ids'] = test_ids['ids'].to_numpy()
df_sample_submission = pd.read_csv('./data/sample_submission.csv')
# reorder columns to match submission format
preds_df = preds_df[df_sample_submission.columns.tolist()]
# make sure the target directory exists even if the checkpoint cell was skipped
os.makedirs('./output', exist_ok=True)
preds_df.to_csv('./output/submission.csv', index=False)