#### Author: Alexis Geslin

In [1]:
# Colab-only: mount Google Drive at /content/drive so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/cours/cs224n/project/LLM-Prop/

/content/drive/MyDrive/cours/cs224n/project/LLM-Prop


In [3]:
import re
import time
import glob
import pandas as pd
import numpy as np
import argparse
from statistics import stdev

import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from tqdm import tqdm


In [4]:
# Select the compute device once (GPU when CUDA is available, otherwise CPU)
# and report the choice between two separator rules.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    status = f'I have {torch.cuda.device_count()} devices, currently on {torch.cuda.current_device()}'
else:
    status = "You are running on CPU only"
print('-'*20, status, '-'*20, sep='\n')

--------------------
I have 1 devices, currently on 0
--------------------


In [5]:
# Load the three stop-word-free splits (train / validation / test), in that order.
np_train, np_valid, np_test = map(
    pd.read_csv,
    (
        'data/train_no_stopwords.csv',
        'data/validation_no_stopwords.csv',
        'data/test_no_stopwords.csv',
    ),
)



In [6]:
# Short tag for the embedding model; it is interpolated into the output CSV file names below.
my_model_name = 'scibert'

In [10]:
# Hugging Face checkpoint used for both the tokenizer and the encoder.
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# `.to(device)` returns the module itself, so the move can be chained onto the load.
model = AutoModel.from_pretrained(model_name).to(device)
# Bare last expression: display the architecture as the cell output.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(31090, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [7]:
# Show the (rows, columns) shape of each split: test, train, validation.
print(*(frame.shape for frame in (np_test, np_train, np_valid)))

(11531, 7) (125098, 7) (9945, 7)


In [11]:
def get_sentence_embedding(text):
    """Embed text with the globally loaded encoder using mask-aware mean pooling.

    Args:
        text: A string, or a list of strings, to embed. Inputs longer than
            512 tokens are truncated by the tokenizer.

    Returns:
        numpy.ndarray with one row per input text, holding the average of the
        last-layer hidden states over that text's real (non-padding) tokens.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: nothing to exclude, fall back to a plain mean.
        return hidden_states.mean(dim=1).cpu().numpy()

    # Weight every token by its mask (1 = real token, 0 = padding) so padding
    # positions do not leak into the average when several texts are padded
    # to a common length.
    weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    token_counts = weights.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = ((hidden_states * weights).sum(dim=1) / token_counts).cpu().numpy()
    return embeddings

def embed_batch(text_list):
    """Embed a batch of texts with mask-aware mean pooling.

    The batch is padded to its longest member, so a plain mean over the
    sequence axis would mix padding-token hidden states into every shorter
    text and make an embedding depend on its batch neighbours. Here each text
    is averaged over its own real tokens only.

    Args:
        text_list: List of strings to embed. Inputs longer than 512 tokens
            are truncated by the tokenizer.

    Returns:
        numpy.ndarray with one row per input text.
    """
    inputs = tokenizer(text_list, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: nothing to exclude, fall back to a plain mean.
        return hidden_states.mean(dim=1).cpu().numpy()

    # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
    weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    token_counts = weights.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = ((hidden_states * weights).sum(dim=1) / token_counts).cpu().numpy()
    return embeddings

def embed_dataset(dataset, batch_size=None):
    """Embed every entry of ``dataset['description']`` in batches.

    Args:
        dataset: DataFrame with a ``description`` column of texts.
        batch_size: Number of texts per forward pass. When ``None`` (default)
            it is 256 on a CUDA device and 8 otherwise; pass a smaller value
            if the GPU runs out of memory.

    Returns:
        numpy.ndarray with one embedding row per dataset row, in dataset
        order. An empty dataset yields an array with zero rows.

    Raises:
        ValueError: If ``batch_size`` is given and is smaller than 1.
    """
    sentences = dataset['description'].tolist()
    if batch_size is None:
        batch_size = 256  if device.type ==  "cuda" else 8
    elif batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    print(f'batch size: {batch_size}')

    if not sentences:
        # np.vstack([]) raises an opaque "need at least one array" error;
        # return a well-formed empty result instead.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    all_embeddings = []
    for i in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[i:i+batch_size]
        all_embeddings.append(embed_batch(batch))

    return np.vstack(all_embeddings)

In [12]:
#testing stuff

# sentence   = np_test['description'][0]
# embeddings = get_sentence_embedding(sentence)
# embeddings.shape

# sentences = np_test['description'][:10].tolist()
# batch_size = 128  if device.type ==  "cuda" else 8
# print(batch_size)
# all_embeddings = []

# for i in tqdm(range(0, len(sentences), batch_size)):
#     batch = sentences[i:i+batch_size]
#     embeddings = embed_batch(batch)
#     all_embeddings.append(embeddings)

# all_embeddings = np.vstack(all_embeddings)

# all_embeddings.shape

In [13]:
# Smoke test: embed only the first 100 training rows before launching the full runs.
dataset_to_embed = np_train.head(100)
print(dataset_to_embed.shape)

# Expect one embedding row per input row.
all_embeddings = embed_dataset(dataset_to_embed)
print('\n',all_embeddings.shape)

(100, 7)
batch size: 256


100%|██████████| 1/1 [00:01<00:00,  1.56s/it]


 (100, 768)





## Embedding the validation dataset

In [14]:
# Confirm which model tag will be used in the output file names of the cells below.
print(my_model_name)

scibert


In [15]:
dataset_to_embed = np_valid
print(dataset_to_embed.shape)

# Recover which split this is *before* the long embedding run, so that an
# unrecognised frame stops here instead of being saved under a stale
# `dataset_used` left over from a previously executed cell.
# `DataFrame.equals` treats NaNs in matching positions as equal, unlike `==`.
dataset_used = None
for split_name, split_frame in (("train", np_train), ("valid", np_valid), ("test", np_test)):
    if dataset_to_embed.head(3).equals(split_frame.head(3)):
        dataset_used = split_name
        break
if dataset_used is None:
    raise ValueError("dataset_to_embed does not match the train, valid or test split")

#actually embedding:
all_embeddings = embed_dataset(dataset_to_embed)
df = pd.DataFrame(all_embeddings)
print(df.shape, dataset_used)

#saving (no index, no header: the file holds one embedding row per dataset row)
df.to_csv(f'./embeddings/{my_model_name}_{dataset_used}_{df.shape[0]}.csv',index=False,header = False)

(9945, 7)
batch size: 256


100%|██████████| 39/39 [01:51<00:00,  2.85s/it]


(9945, 768) valid


## Embedding the test dataset

In [16]:
dataset_to_embed = np_test
print(dataset_to_embed.shape)

# Recover which split this is *before* the long embedding run, so that an
# unrecognised frame stops here instead of being saved under a stale
# `dataset_used` left over from a previously executed cell.
# `DataFrame.equals` treats NaNs in matching positions as equal, unlike `==`.
dataset_used = None
for split_name, split_frame in (("train", np_train), ("valid", np_valid), ("test", np_test)):
    if dataset_to_embed.head(3).equals(split_frame.head(3)):
        dataset_used = split_name
        break
if dataset_used is None:
    raise ValueError("dataset_to_embed does not match the train, valid or test split")

#actually embedding:
all_embeddings = embed_dataset(dataset_to_embed)
df = pd.DataFrame(all_embeddings)
print(df.shape, dataset_used)

#saving (no index, no header: the file holds one embedding row per dataset row)
df.to_csv(f'./embeddings/{my_model_name}_{dataset_used}_{df.shape[0]}.csv',index=False,header = False)

(11531, 7)
batch size: 256


100%|██████████| 46/46 [02:09<00:00,  2.82s/it]


(11531, 768) test


## Embedding the train dataset

In [17]:
dataset_to_embed = np_train
print(dataset_to_embed.shape)

# Recover which split this is *before* the long embedding run, so that an
# unrecognised frame stops here instead of being saved under a stale
# `dataset_used` left over from a previously executed cell.
# `DataFrame.equals` treats NaNs in matching positions as equal, unlike `==`.
dataset_used = None
for split_name, split_frame in (("train", np_train), ("valid", np_valid), ("test", np_test)):
    if dataset_to_embed.head(3).equals(split_frame.head(3)):
        dataset_used = split_name
        break
if dataset_used is None:
    raise ValueError("dataset_to_embed does not match the train, valid or test split")

#actually embedding:
all_embeddings = embed_dataset(dataset_to_embed)
df = pd.DataFrame(all_embeddings)
print(df.shape, dataset_used)

#saving (no index, no header: the file holds one embedding row per dataset row)
df.to_csv(f'./embeddings/{my_model_name}_{dataset_used}_{df.shape[0]}.csv',index=False,header = False)

(125098, 7)
batch size: 256


100%|██████████| 489/489 [24:30<00:00,  3.01s/it]


(125098, 768) train
