In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from keras.layers import Dense
from keras.models import Model
from keras.src.callbacks import LambdaCallback
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from transformers import BertConfig, BertModel, BertTokenizer



In [2]:
 # 加载训练集和测试集 
# Number of leading rows kept from every split. The same cap is applied to the
# features and the labels so that the X/y rows stay aligned; change it here only.
N_SAMPLES = 10

# Features: only the 'ChatGPT回答' text column is used.
X_train = pd.read_csv(r"data/X_train_minmaxscaler.csv")['ChatGPT回答'].iloc[:N_SAMPLES]
X_test = pd.read_csv(r"data/X_test_minmaxscaler.csv")['ChatGPT回答'].iloc[:N_SAMPLES]

# Labels: every column of the label files is kept.
y_train = pd.read_csv(r"data/y_train_minmaxscaler.csv").iloc[:N_SAMPLES]
y_test = pd.read_csv(r"data/y_test_minmaxscaler.csv").iloc[:N_SAMPLES]

In [3]:
# Pick the torch device used for the BERT forward passes.
# `torch` is imported in the notebook's top import cell.
if torch.cuda.is_available():
    # With several GPUs, use GPU 1; otherwise use the first (only) GPU.
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f"cuda:{gpu_index}")
    print(f"Using device: {device}")
else:
    device = torch.device("cpu")
    print("Using device: CPU")

Using device: CPU


In [4]:
# Print the installed PyTorch version string.
print(torch.__version__)

2.2.2+cpu


In [5]:
# Load the BERT config, tokenizer and encoder from a single local checkpoint
# directory. The directory is named once so the three loads cannot drift apart.
BERT_DIR = "../Bert_model/Bert/"

config = BertConfig.from_json_file(BERT_DIR + "config.json")
tokenizer = BertTokenizer.from_pretrained(BERT_DIR)
# NOTE: per the transformers documentation, `from_pretrained` returns the model
# in evaluation mode, so dropout is inactive during embedding extraction.
bert = BertModel.from_pretrained(BERT_DIR, config=config)

# Move the encoder to the selected device. As the last expression of the cell,
# this also displays the module's repr as the cell output.
bert.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [6]:
def get_word_embeddings(text, model, tokenizer):
    """Encode ``text`` and return the hidden state at position 0 as its embedding.

    The text is passed to the tokenizer unchanged. A BERT tokenizer already
    wraps its input in the ``[CLS]``/``[SEP]`` special tokens
    (``add_special_tokens`` defaults to ``True``); concatenating the literal
    letters "CLS"/"SEP" onto the text would only add junk word pieces.

    Args:
        text: The string to encode. Inputs longer than 512 tokens are truncated.
        model: Encoder called as ``model(**encoded_inputs)``; element ``0`` of
            its output is indexed as ``[:, 0, :]``, so it must be a 3-D tensor.
        tokenizer: Callable returning a mapping of PyTorch tensors when invoked
            with ``return_tensors='pt'``.

    Returns:
        The 2-D tensor ``outputs[0][:, 0, :]``: the position-0 ([CLS]) vector
        of each encoded sequence, i.e. one row when a single string is given.
    """
    encoded_inputs = tokenizer(
        text, return_tensors='pt', padding=True, truncation=True, max_length=512
    )

    # Place the inputs on the same device as the model's weights instead of
    # relying on a notebook-level global; fall back to CPU for a model that
    # has no parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    encoded_inputs = {k: v.to(target_device) for k, v in encoded_inputs.items()}

    # Inference only: no gradients are needed to extract embeddings.
    with torch.no_grad():
        outputs = model(**encoded_inputs)

    # Use the vector of the leading [CLS] token as the whole-sentence vector.
    return outputs[0][:, 0, :]

In [6]:
# Encode every training text and stack the per-text [CLS] vectors row-wise.
# The vectors are collected first and concatenated once, which also covers a
# single-row input and avoids re-copying the growing tensor on each step.
# Iterating the Series yields its values in order, independent of index labels.
X_train_embedding = torch.cat(
    [get_word_embeddings(text, bert, tokenizer) for text in X_train],
    dim=0,
)

In [7]:
# Encode every test text and stack the per-text [CLS] vectors row-wise.
# The vectors are collected first and concatenated once, which also covers a
# single-row input and avoids re-copying the growing tensor on each step.
# Iterating the Series yields its values in order, independent of index labels.
X_test_embedding = torch.cat(
    [get_word_embeddings(text, bert, tokenizer) for text in X_test],
    dim=0,
)

In [10]:
# Save the sentence embeddings to the current working directory:
# first the training matrix, then the test matrix.
torch.save(obj=X_train_embedding, f='train_embedding.pt')
torch.save(obj=X_test_embedding, f='test_embedding.pt')

In [11]:
# # Load previously saved embeddings (note: the save cell writes to the working directory, not ../data/)
# X= torch.load('../data/train_embedding.pt')
# 
# X_test_embedding = torch.load('../data/test_embedding.pt')

In [12]:
# Display the shape of the concatenated training embeddings as the cell output.
X_train_embedding.shape

torch.Size([10, 768])