# SCIBERT

SciBERT text classifier (spaCy `transformer` + `textcat` pipeline), fine-tuned and evaluated on the Zora data set.

In [8]:
import spacy
import pandas as pd

from spacy.cli.train import train as spacy_train

In [4]:
# Experiment identifiers: which model is fine-tuned, and which data sets are
# used for training and for testing.
model_name = 'scibert'
train_set = 'zora'
test_set = 'zora'

# Edit paths as needed:
# spaCy training config and the directory the trained pipeline is written to.
config_path = f"../cfg/spacy/{model_name}.cfg"
model_out_path = f"models/ft-{model_name}-{train_set}"

# Serialized spaCy corpora. Train and dev come from `train_set`;
# the test split comes from `test_set`.
train_path = f"data/spacy_docs/{train_set}_train.spacy"
dev_path = f"data/spacy_docs/{train_set}_dev.spacy"
test_path = f"data/spacy_docs/{test_set}_test.spacy"

In [5]:
# train
# Point the config's corpus paths at our train/dev splits instead of
# whatever the .cfg file itself specifies.
corpus_overrides = {"paths.train": train_path, "paths.dev": dev_path}

# Fine-tune on GPU 0 and write the resulting pipeline under `model_out_path`.
spacy_train(
    config_path,
    output_path=model_out_path,
    use_gpu=0,
    overrides=corpus_overrides,
)

[38;5;4mℹ Saving to output directory: models/ft-scibert-zora[0m
[38;5;4mℹ Using GPU: 0[0m
[1m


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'textcat'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  -------------  ------------  ----------  ------
  0       0           0.00          0.06        0.37    0.00
  6     200           0.00         37.32       17.16    0.17
 12     400           0.02         18.98       33.56    0.34
 18     600           0.02          5.23       29.39    0.29
 25     800           0.02          1.73       28.24    0.28
 31    1000           0.01          0.83       32.78    0.33
 37    1200           0.01          0.44       31.44    0.31
 43    1400           0.01          0.13       32.00    0.32
 50    1600           0.02          0.11       31.44    0.31
 56    1800           0.01          0.09       31.44    0.31
 62    2000           0.01          0.09       31.44    0.31
[38;5;2m✔ Saved pipeline to output directory[0m
models/ft-scibert-zora/mo

In [7]:
# test
best_model = f"models/ft-scibert-zora/model-best"
test_cats = [str(i) for i in range(1, 18)]
output_file = f"{best_model}/test_{test_set}_eval.json"

!python3 -m spacy benchmark accuracy --gpu-id 0 \
  {best_model}/ {test_path} \
  --output {output_file}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[38;5;4mℹ Using GPU: 0[0m
[1m

TOK                 100.00
TEXTCAT (macro F)   38.61 
SPEED               3911  

[1m

          P        R        F
1      0.00     0.00     0.00
2     50.00    50.00    50.00
3     75.00    92.31    82.76
4      0.00     0.00     0.00
5     77.78   100.00    87.50
6      0.00     0.00     0.00
7    100.00   100.00   100.00
8     50.00    87.50    63.64
9      0.00     0.00     0.00
10    41.67    50.00    45.45
11     0.00     0.00     0.00
12    33.33    44.44    38.10
13   100.00    58.33    73.68
14     0.00     0.00     0.00
15    72.73    88.89    80.00
16    42.86    30.00    35.29
17     0.00     0.00     0.00

[1m

     ROC AUC
1       0.87
2       0.84
3       0

In [10]:

def get_test_df(file):
    """Load a test-split TSV file into a DataFrame.

    The file is read as tab-separated UTF-8 with its first column used as the
    index. Default NA parsing is disabled, so cell values such as ``"NA"``
    are kept as literal strings rather than converted to NaN.

    Parameters
    ----------
    file : str or path-like
        Path to the TSV file. It must contain the columns ``sdg`` and
        ``abstract``.

    Returns
    -------
    pandas.DataFrame
        The loaded table with ``sdg`` and ``abstract`` cast to pandas'
        ``string`` dtype and the ``faculty`` / ``year`` columns removed
        when present.

    Raises
    ------
    KeyError
        If ``sdg`` or ``abstract`` is missing from the file.
    """
    test_df = pd.read_csv(file, sep='\t', keep_default_na=False,
                          index_col=0, encoding='utf-8')
    test_df = test_df.astype({'sdg': 'string',
                              'abstract': 'string'})
    # Non-mutating drop; errors='ignore' lets test sets that do not carry
    # the 'faculty' / 'year' metadata columns load instead of raising KeyError.
    return test_df.drop(columns=['faculty', 'year'], errors='ignore')

# Edit `test_df` path:
test_df = get_test_df(f"data/train_test/{test_set}_test_clean.tsv")

# Model inputs (abstract texts) and gold labels, in file order.
X_test, y_test = test_df['abstract'].values, test_df['sdg'].values

nlp = spacy.load(best_model)
print("Making predictions....")

# One {label: score} dict per abstract, taken from each processed doc.
spacy_probs = []
for doc in nlp.pipe(X_test):
    spacy_probs.append(doc.cats)
print("Done making predictions!")

# For every document keep the (label, score) pair with the highest score.
top_picks = [
    max(cats.items(), key=lambda pair: pair[1])
    for cats in spacy_probs
]

preds = pd.Series([label for label, _ in top_picks])
probs = pd.Series([score for _, score in top_picks])

Making predictions....
Done making predictions!


In [11]:
# Target dtype for every column of the exported predictions table.
column_types = {'abstract': 'string',
                'label': 'int',
                'prediction': 'int',
                'probability': 'float'}

# Collect inputs, gold labels and model outputs side by side, re-attach the
# index of the test frame so rows line up with it, then cast the columns.
preds_df = (
    pd.DataFrame({'abstract': X_test,
                  'label': y_test,
                  'prediction': preds,
                  'probability': probs})
    .set_axis(test_df.index, axis=0)
    .astype(column_types)
)

# Write the table as a UTF-8 TSV named after the train/test set pair.
preds_file = f"{train_set}-{test_set}_preds.tsv"
preds_df.to_csv(preds_file, sep='\t', encoding='utf-8')