In [1]:
import json
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

In [2]:
# Run configuration shared by the cells below.
device = "cuda:0"  # device string handed to SentenceTransformer and to every encode() call
model_ckpt = "meta-llama/Llama-2-7b-hf"  # checkpoint identifier passed to SentenceTransformer

In [3]:
# Read the Hugging Face read token from a local JSON file so the secret is
# never written into the notebook itself. The lookup below expects the file
# to be shaped like {"huggingface": {"read_token": "..."}}.
# The encoding is given explicitly: without it open() decodes with the
# platform's locale default, which is not guaranteed to be UTF-8.
with open("./data/access_tokens.json", encoding="utf-8") as f:
    hf_key = json.load(f)["huggingface"]["read_token"]

In [4]:
# Wrap the checkpoint named in `model_ckpt` in a SentenceTransformer on the
# configured device, authenticating with the Hugging Face token. The log
# output of this cell shows the library creating a new model with MEAN pooling.
encoder = SentenceTransformer(
    model_ckpt,
    device=device,
    use_auth_token=hf_key,
)
# Cast the model to half precision.
encoder = encoder.half()

No sentence-transformers model found with name /root/.cache/torch/sentence_transformers/meta-llama_Llama-2-7b-hf. Creating a new one with MEAN pooling.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Use the tokenizer's end-of-sequence token as its padding token.
# NOTE(review): presumably the tokenizer ships without a pad token and the
# batched encode() calls further down need one — confirm.
encoder.tokenizer.pad_token = encoder.tokenizer.eos_token

In [6]:
# Smoke test: embed a single short sentence and get the result back as a
# NumPy array.
sentence = "I love reading."
embeds = encoder.encode(
    sentences=[sentence],
    device=device,
    convert_to_numpy=True,
)

In [7]:
# Display the shape of the smoke-test embedding array
# (the recorded output is (1, 4096)).
embeds.shape

(1, 4096)

In [8]:
# Load the training split, keep only the "statement" column, and turn that
# column into a plain Python list to feed to the encoder.
train = pd.read_csv("./data/train_data.csv", encoding="utf-8")
train = train[["statement"]]
train_sentences = train["statement"].tolist()

In [9]:
# Load the validation split, keep only the "statement" column, and turn that
# column into a plain Python list to feed to the encoder.
valid = pd.read_csv("./data/valid_data.csv", encoding="utf-8")
valid = valid[["statement"]]
valid_sentences = valid["statement"].tolist()

In [10]:
# Embed every training statement; the result comes back as a NumPy array
# and a progress bar is shown while batches are processed.
train_features = encoder.encode(
    sentences=train_sentences,
    device=device,
    convert_to_numpy=True,
    show_progress_bar=True,
)
# Same call for the validation statements.
valid_features = encoder.encode(
    sentences=valid_sentences,
    device=device,
    convert_to_numpy=True,
    show_progress_bar=True,
)

Batches:   0%|          | 0/381 [00:00<?, ?it/s]

Batches:   0%|          | 0/96 [00:00<?, ?it/s]

In [11]:
# Display the shapes of both feature arrays as a tuple
# (the recorded output is ((12170, 4096), (3042, 4096))).
train_features.shape, valid_features.shape

((12170, 4096), (3042, 4096))

In [12]:
# Persist both feature arrays as .npy files under ./data.
np.save(file="./data/train_features_llama.npy", arr=train_features)
np.save(file="./data/valid_features_llama.npy", arr=valid_features)