#### Author: Alexis Geslin

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/cours/cs224n/project/LLM-Prop/

/content/drive/MyDrive/cours/cs224n/project/LLM-Prop


In [3]:
import re
import time
import glob
import pandas as pd
import numpy as np
import argparse
from statistics import stdev

import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from tqdm import tqdm


In [4]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('-'*20)
    print(f'I have {torch.cuda.device_count()} devices, currently on {torch.cuda.current_device()}')
    print('-'*20)
else:
    print('-'*20)
    print("You are running on CPU only")
    print('-'*20)
    device = torch.device("cpu")

--------------------
I have 1 devices, currently on 0
--------------------


In [5]:
np_train = pd.read_csv('data/train_no_stopwords.csv')
np_valid = pd.read_csv('data/validation_no_stopwords.csv')
np_test = pd.read_csv('data/test_no_stopwords.csv')



In [6]:
my_model_name = 'e5small'

In [7]:
model_name = "intfloat/e5-small-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [8]:
#print the shapes
print(np_test.shape, np_train.shape, np_valid.shape)

(11531, 7) (125098, 7) (9945, 7)


In [9]:
def get_sentence_embedding(text):
    """Convert text to numerical embeddings."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Mean pooling of last hidden state
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings

def embed_batch(text_list):
    inputs = tokenizer(text_list, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    #print the values of the inputs
    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings

def embed_dataset(dataset):
  sentences = dataset['description'].tolist()
  batch_size = 256  if device.type ==  "cuda" else 8
  print(f'batch size: {batch_size}')
  all_embeddings = []

  for i in tqdm(range(0, len(sentences), batch_size)):
      batch = sentences[i:i+batch_size]
      embeddings = embed_batch(batch)
      all_embeddings.append(embeddings)

  all_embeddings = np.vstack(all_embeddings)

  return all_embeddings

In [10]:
#testing stuff

# sentence   = np_test['description'][0]
# embeddings = get_sentence_embedding(sentence)
# embeddings.shape

# sentences = np_test['description'][:10].tolist()
# batch_size = 128  if device ==  "cuda" else 8
# print(batch_size)
# all_embeddings = []

# for i in tqdm(range(0, len(sentences), batch_size)):
#     batch = sentences[i:i+batch_size]
#     embeddings = embed_batch(batch)
#     all_embeddings.append(embeddings)

# all_embeddings = np.vstack(all_embeddings)

# all_embeddings.shape

In [15]:
def test_embeddings_seq(text):
    """Convert text to numerical embeddings."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Mean pooling of last hidden state
    embeddings = outputs.last_hidden_state.cpu().numpy()
    return embeddings

sentence   = np_test['description'][1]
embeddings = test_embeddings_seq(sentence)
embeddings.shape

(1, 512, 384)

In [12]:
#testing
dataset_to_embed = np_train[:10]
print(dataset_to_embed.shape)

all_embeddings = embed_dataset(dataset_to_embed)
print('\n',all_embeddings.shape)

(10, 7)
batch size: 256


100%|██████████| 1/1 [00:00<00:00, 21.34it/s]


 (10, 384)





## Embedding the validation dataset

In [None]:
print(my_model_name)

e5large


In [None]:
dataset_to_embed = np_valid
print(dataset_to_embed.shape)

#actually embedding:
all_embeddings = embed_dataset(dataset_to_embed)
df = pd.DataFrame(all_embeddings)

#recovering metadata
if (dataset_to_embed[:3] == np_train[:3]).all().all():
  dataset_used ="train"
elif (dataset_to_embed[:3] == np_valid[:3]).all().all():
  dataset_used ="valid"
elif (dataset_to_embed[:3] == np_test[:3]).all().all():
  dataset_used ="test"
print(df.shape, dataset_used)

#saving
df.to_csv(f'./embeddings/{my_model_name}_{dataset_used}_{df.shape[0]}.csv',index=False,header = False)

(9945, 7)
batch size: 64


100%|██████████| 156/156 [07:13<00:00,  2.78s/it]


(9945, 1024) valid


## Embedding the test dataset

In [None]:
dataset_to_embed = np_test
print(dataset_to_embed.shape)

#actually embedding:
all_embeddings = embed_dataset(dataset_to_embed)
df = pd.DataFrame(all_embeddings)

#recovering metadata
if (dataset_to_embed[:3] == np_train[:3]).all().all():
  dataset_used ="train"
elif (dataset_to_embed[:3] == np_valid[:3]).all().all():
  dataset_used ="valid"
elif (dataset_to_embed[:3] == np_test[:3]).all().all():
  dataset_used ="test"
print(df.shape, dataset_used)

#saving
df.to_csv(f'./embeddings/{my_model_name}_{dataset_used}_{df.shape[0]}.csv',index=False,header = False)

(11531, 7)
batch size: 256


100%|██████████| 46/46 [08:27<00:00, 11.03s/it]


(11531, 1024) test


## Embedding the train dataset

In [None]:
dataset_to_embed = np_train
print(dataset_to_embed.shape)

#actually embedding:
all_embeddings = embed_dataset(dataset_to_embed)
df = pd.DataFrame(all_embeddings)

#recovering metadata
if (dataset_to_embed[:3] == np_train[:3]).all().all():
  dataset_used ="train"
elif (dataset_to_embed[:3] == np_valid[:3]).all().all():
  dataset_used ="valid"
elif (dataset_to_embed[:3] == np_test[:3]).all().all():
  dataset_used ="test"
print(df.shape, dataset_used)

#saving
df.to_csv(f'./embeddings/{my_model_name}_{dataset_used}_{df.shape[0]}.csv',index=False,header = False)

(125098, 7)
batch size: 256


100%|██████████| 489/489 [1:32:24<00:00, 11.34s/it]


(125098, 1024) train
