In [1]:
import os

import h5py
import numpy as np
import pandas as pd
import scipy as sp
import tensorflow as tf
from sentence_transformers import SentenceTransformer

In [15]:
# Load data from the desired source. This example uses the 200k abstracts with the numbers unchanged.

def read_labelled_lines(path):
    """Read a text file into a one-column DataFrame holding its raw lines.

    Each kept row is one non-blank line of the file, stored as a string in
    column 0 (the tab between label and sentence is left in place for the next
    step to split on). Everything from a '###' marker to the end of a line is
    discarded, so lines that start with '###' -- presumably the per-abstract
    ID headers -- vanish entirely.

    The file is read with plain Python rather than ``pd.read_csv`` because
    current pandas releases raise a ValueError when the newline character is
    passed as the delimiter.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded text file.

    Returns
    -------
    pandas.DataFrame
        Frame of shape (n_lines, 1) whose only column is labelled 0.
    """
    kept_lines = []
    with open(path, encoding='utf-8') as handle:
        for raw_line in handle:
            # Mirror the old `comment='###'` option: cut the line at the marker.
            text = raw_line.rstrip('\r\n').split('###', 1)[0]
            # Blank (or whitespace-only) lines carry no sentence; skip them.
            if text.strip():
                kept_lines.append(text)
    return pd.DataFrame({0: pd.Series(kept_lines, dtype=str)})


train = read_labelled_lines('200k_abstracts/train.txt')
test = read_labelled_lines('200k_abstracts/test.txt')
print(f"train.shape: {train.shape}")
print(f"test.shape: {test.shape}")

train.shape: (2211861, 1)
test.shape: (29493, 1)


In [16]:
# Split each raw line into (label, sentence) at the first tab and drop incomplete rows.
#
# n=1 limits the split to the first tab, so the result never has more than two
# columns. Without the limit, a single sentence that happens to contain a tab
# would add a third column that is None for every other row, and dropna() would
# then throw away almost the entire dataset.
#
# NOTE: this cell overwrites `train` and `test`; run it once on freshly loaded data.

train = train[0].str.split('\t', n=1, expand=True)
train = train.dropna()  # rows without a tab have no sentence in column 1
print(train.head())

test = test[0].str.split('\t', n=1, expand=True)
test = test.dropna()  # rows without a tab have no sentence in column 1
print(test.head())

            0                                                  1
0  BACKGROUND  The emergence of HIV as a chronic condition me...
1  BACKGROUND  This paper describes the design and evaluation...
2     METHODS  This study is designed as a randomised control...
3     METHODS  The intervention group will participate in the...
4     METHODS  The program is based on self-efficacy theory a...
            0                                                  1
0  BACKGROUND  Many pathogenic processes and diseases are the...
1  BACKGROUND  It was recently demonstrated that supraphysiol...
2  BACKGROUND  In the present study , we examined the associa...
3  BACKGROUND  In addition , we compared plasma levels of MAp...
4     METHODS  A total of 192 MI patients and 140 control per...


In [17]:
# Instantiate the sentence encoder on the GPU and cap its input length.
# The cap is expressed in tokens; it was chosen to roughly match the longest sentence.

encoder_name = 'all-mpnet-base-v2'
token_limit = 300

model = SentenceTransformer(encoder_name, device='cuda')
model.max_seq_length = token_limit

In [19]:
# Map the section labels to integer class ids.
#
# Series.map() turns any label missing from the mapping into NaN without
# complaint. That also happens if this cell is run a second time, because the
# column then already holds integers. Both splits are therefore validated
# before either frame is modified, and an unknown label raises instead of
# silently producing NaN targets.

LABEL_TO_ID = {'BACKGROUND': 0, 'OBJECTIVE': 1, 'METHODS': 2, 'RESULTS': 3, 'CONCLUSIONS': 4}

encoded_labels = {}
for split_name, frame in (('train', train), ('test', test)):
    codes = frame[0].map(LABEL_TO_ID)
    unmapped = codes.isna()
    if unmapped.any():
        unknown = sorted(frame.loc[unmapped, 0].astype(str).unique())
        raise ValueError(
            f"{split_name}: labels not in LABEL_TO_ID (was this cell already run?): {unknown[:10]}"
        )
    encoded_labels[split_name] = codes.astype('int64')

train[0] = encoded_labels['train']
test[0] = encoded_labels['test']
print(test.head())

   0                                                  1
0  0  Many pathogenic processes and diseases are the...
1  0  It was recently demonstrated that supraphysiol...
2  0  In the present study , we examined the associa...
3  0  In addition , we compared plasma levels of MAp...
4  2  A total of 192 MI patients and 140 control per...


In [20]:
# Pull features (column 1: sentence text) and targets (column 0: label id) out as NumPy arrays.

X_train, y_train = np.array(train[1]), np.array(train[0])
X_test, y_test = np.array(test[1]), np.array(test[0])

In [21]:
# Turn the sentences into embeddings with the sentence_transformers model.
# The same options are used for both splits; the smaller test split goes first.

encode_options = dict(device='cuda', show_progress_bar=True)

X_test = model.encode(X_test, **encode_options)
X_train = model.encode(X_train, **encode_options)

Batches:   0%|          | 0/922 [00:00<?, ?it/s]

Batches:   0%|          | 0/69121 [00:00<?, ?it/s]

In [24]:
# Save processed data into hdf5 files.

output_path = 'sept1/200k.hdf5'

# h5py does not create missing parent directories. Make sure the target folder
# exists so the write cannot fail after the long encoding step.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with h5py.File(output_path, 'w') as f:
    # One dataset per array, stored under the array's own variable name.
    for dataset_name, values in (
        ('X_train', X_train),
        ('X_test', X_test),
        ('y_train', y_train),
        ('y_test', y_test),
    ):
        f.create_dataset(dataset_name, data=values)

In [26]:
# Load data from hdf5 files if desired.
#
# Datasets are accessed with f[name] rather than f.get(name): .get() returns
# None for a missing dataset, which np.array() would silently wrap into a 0-d
# object array, whereas indexing raises a KeyError straight away.
# The [...] selection reads each full dataset into memory as a NumPy array.

with h5py.File('sept1/200k.hdf5', 'r') as f:
    X_train = f['X_train'][...]
    X_test = f['X_test'][...]
    y_train = f['y_train'][...]
    y_test = f['y_test'][...]

print(X_train.shape, y_train.shape)

(2211861, 768) (2211861,)
