In [4]:
import os
os.environ['CUDA_VISIBLE_DEVICES']='0'
import copy
import logging
import numpy as np
import pandas as pd
from datetime import datetime,timedelta,date
import time
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader,RandomSampler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM, AutoModel, AutoModelForCausalLM
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from scipy.stats import pearsonr, spearmanr
from transformers import BertConfig, BertTokenizer,BertTokenizerFast
from torch.nn.utils.rnn import pad_sequence
from torch.types import Number
from typing import List,Union,Dict
import tqdm
import re
import json
from dataclasses import dataclass
from pprint import pprint
#os.chdir('/home/stops/Work_space/NLP_work/Qwen_LM_train')
os.chdir('/home/stops/Work_space/NLP_work/Med_assit_chatglm')

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(levelname)s - %(message)s')
logger.info('Starting')


def show_df(df):
    print(df.shape)
    print(df.head(2))
    print(df.isnull().sum())



2024-06-17 17:54:40,748 - INFO - Starting


In [1]:
###############################################################
## BCE-1 :Based on BCEmbedding
###############################################################


from BCEmbedding import EmbeddingModel

# list of sentences
sentences = ['头疼', '发烧']

## XLMRobertaModel, Layers:12 , model-weight:1G
bce_embedding_model_path="/home/stops/Work_space/NLP_models/bce-embedding-base_v1"
# init embedding model
model = EmbeddingModel(model_name_or_path=bce_embedding_model_path)

# extract embeddings
embeddings = model.encode(sentences)


06/17/2024 17:56:20 - [INFO] -BCEmbedding.models.EmbeddingModel->>>    Loading from `/home/stops/Work_space/NLP_models/bce-embedding-base_v1`.
06/17/2024 17:56:21 - [INFO] -BCEmbedding.models.EmbeddingModel->>>    Execute device: cuda;	 gpu num: 1;	 use fp16: False;	 embedding pooling type: cls;	 trust remote code: False
Extract embeddings: 100%|██████████| 1/1 [00:00<00:00,  8.51it/s]


In [2]:

embeddings.shape


(2, 768)

In [3]:

from BCEmbedding import RerankerModel

# your query and corresponding passages
query = '头有点痛'
passages = ['头疼', '发烧']

# construct sentence pairs
sentence_pairs = [[query, passage] for passage in passages]

## XLMRobertaForSequenceClassification, Layers:12 , model-weight:1G
bce_ranker_model_path="/home/stops/Work_space/NLP_models/bce-reranker-base_v1"

# init reranker model
model = RerankerModel(model_name_or_path=bce_ranker_model_path)

# method 0: calculate scores of sentence pairs
scores = model.compute_score(sentence_pairs)

# method 1: rerank passages
rerank_results = model.rerank(query, passages)


06/17/2024 17:57:52 - [INFO] -BCEmbedding.models.RerankerModel->>>    Loading from `/home/stops/Work_space/NLP_models/bce-reranker-base_v1`.
06/17/2024 17:57:52 - [INFO] -BCEmbedding.models.RerankerModel->>>    Execute device: cuda;	 gpu num: 1;	 use fp16: False
Calculate scores: 100%|██████████| 1/1 [00:00<00:00, 58.91it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [4]:
rerank_results

{'rerank_passages': ['头疼', '发烧'],
 'rerank_scores': [0.6026140451431274, 0.4403996467590332],
 'rerank_ids': [0, 1]}

In [5]:
###############################################################
## BCE-2 :Based on transformers
###############################################################

from transformers import AutoModel, AutoTokenizer

# list of sentences
sentences = ['头疼', '发烧']

# init model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(bce_embedding_model_path)
model = AutoModel.from_pretrained(bce_embedding_model_path)

device = 'cuda'  # if no GPU, set "cpu"
model.to(device)

# get inputs
inputs = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")
inputs_on_device = {k: v.to(device) for k, v in inputs.items()}

# get embeddings
outputs = model(**inputs_on_device, return_dict=True)
embeddings = outputs.last_hidden_state[:, 0]  # cls pooler
embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)  # normalize




In [6]:
embeddings.shape

torch.Size([2, 768])

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# init model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(bce_ranker_model_path)
model = AutoModelForSequenceClassification.from_pretrained(bce_ranker_model_path)

device = 'cuda'  # if no GPU, set "cpu"
model.to(device)

# get inputs
inputs = tokenizer(sentence_pairs, padding=True, truncation=True, max_length=512, return_tensors="pt")
inputs_on_device = {k: v.to(device) for k, v in inputs.items()}

# calculate scores
scores = model(**inputs_on_device, return_dict=True).logits.view(-1,).float()
scores = torch.sigmoid(scores)




In [8]:
print(scores)



tensor([0.6026, 0.4404], device='cuda:0', grad_fn=<SigmoidBackward0>)


In [10]:
###############################################################
## BCE-3 :Based on sentence_transformers
###############################################################

from sentence_transformers import SentenceTransformer
# init embedding model
## New update for sentence-trnasformers. So clean up your "`SENTENCE_TRANSFORMERS_HOME`/maidalun1020_bce-embedding-base_v1" or "～/.cache/torch/sentence_transformers/maidalun1020_bce-embedding-base_v1" first for downloading new version.
model = SentenceTransformer(bce_ranker_model_path)

# extract embeddings
embeddings = model.encode(sentences, normalize_embeddings=True)


06/17/2024 18:08:15 - [INFO] -datasets->>>    PyTorch version 2.3.0 available.
06/17/2024 18:08:15 - [INFO] -sentence_transformers.SentenceTransformer->>>    Use pytorch device_name: cuda
06/17/2024 18:08:15 - [INFO] -sentence_transformers.SentenceTransformer->>>    Load pretrained SentenceTransformer: /home/stops/Work_space/NLP_models/bce-reranker-base_v1
Some weights of XLMRobertaModel were not initialized from the model checkpoint at /home/stops/Work_space/NLP_models/bce-reranker-base_v1 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
embeddings.shape

(2, 768)

In [12]:
from sentence_transformers import CrossEncoder
# init reranker model
model = CrossEncoder(bce_ranker_model_path, max_length=512)

# calculate scores of sentence pairs
scores = model.predict(sentence_pairs)

06/17/2024 18:08:51 - [INFO] -sentence_transformers.cross_encoder.CrossEncoder->>>    Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
scores

array([0.60261405, 0.44039965], dtype=float32)

In [19]:
###############################################################
## BCE-4 : RAG Used in langchain
###############################################################

from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy

query = 'apples'
passages = [
        'I like apples', 
        'I like oranges', 
        'Apples and oranges are fruits'
    ]
  
# init embedding model
model_name = bce_embedding_model_path
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'batch_size': 64, 'normalize_embeddings': True}

embed_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
  )

# example #1. extract embeddings
query_embedding = embed_model.embed_query(query)
passages_embeddings = embed_model.embed_documents(passages)

# example #2. langchain retriever example
faiss_vectorstore = FAISS.from_texts(passages, embed_model, distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT)

retriever = faiss_vectorstore.as_retriever(search_type="similarity", search_kwargs={"score_threshold": 0.5, "k": 3})

related_passages = retriever.get_relevant_documents(query)


06/17/2024 18:13:59 - [INFO] -sentence_transformers.SentenceTransformer->>>    Load pretrained SentenceTransformer: /home/stops/Work_space/NLP_models/bce-embedding-base_v1
06/17/2024 18:14:00 - [INFO] -faiss.loader->>>    Loading faiss with AVX2 support.
06/17/2024 18:14:00 - [INFO] -faiss.loader->>>    Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
06/17/2024 18:14:00 - [INFO] -faiss.loader->>>    Loading faiss.
06/17/2024 18:14:00 - [INFO] -faiss.loader->>>    Successfully loaded faiss.
  warn_deprecated(


In [20]:
related_passages

[Document(page_content='I like apples'),
 Document(page_content='Apples and oranges are fruits')]

In [22]:
###############################################################
## BCE-5 : Used in llama_index
###############################################################

from llama_index.embeddings import HuggingFaceEmbedding
from llama_index import VectorStoreIndex, ServiceContext, SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.llms import OpenAI

query = 'apples'
passages = [
        'I like apples', 
        'I like oranges', 
        'Apples and oranges are fruits'
    ]

# init embedding model
model_args = {'model_name': bce_embedding_model_path, 'max_length': 512, 'embed_batch_size': 64, 'device': 'cuda'}
embed_model = HuggingFaceEmbedding(**model_args)

# example #1. extract embeddings
query_embedding = embed_model.get_query_embedding(query)
passages_embeddings = embed_model.get_text_embedding_batch(passages)


volcengine_api_key_info={'api_key': '73ccb572-0f77-486e-83f0-d6aa9fba0d6e',
'endpoint_id':'ep-20240516065014-825qc','model_name':'Doubao-pro-32k'}


llm = OpenAI( api_key=f"{volcengine_api_key_info['api_key']}",
                  api_base="https://ark.cn-beijing.volces.com/api/v3",
                  model=f"{volcengine_api_key_info['endpoint_id']}" )


# example #2. rag example
#llm = OpenAI(model='gpt-3.5-turbo-0613', api_key=os.environ.get('OPENAI_API_KEY'), api_base=os.environ.get('OPENAI_BASE_URL'))
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)

documents = SimpleDirectoryReader(input_files=["/home/stops/Work_space/Soft/Comp_en_llama2.pdf"]).load_data()
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
nodes = node_parser.get_nodes_from_documents(documents[0:36])
index = VectorStoreIndex(nodes, service_context=service_context)
query_engine = index.as_query_engine()
response = query_engine.query("What is llama?")



ImportError: cannot import name 'HuggingFaceEmbedding' from 'llama_index.embeddings' (/home/stops/miniconda3/envs/llm_env/lib/python3.10/site-packages/llama_index/embeddings/__init__.py)