# 预处理
## 文本预处理
> Bert

验证 BERT 模型是否已成功下载，并能从本地路径正常加载

In [4]:
import torch
from transformers import BertTokenizer, BertModel

# Local directory that holds the BERT model files
BERT_PATH = '../modules/models/BERT'

# Load the BERT tokenizer and model from the local directory
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)
model = BertModel.from_pretrained(BERT_PATH)
# Use CUDA when it is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the selected device; as the last expression of the cell
# its return value (the model) is what the notebook displays below.
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [15]:
def load_data(file_path):
    """Parse a dataset file whose records each span four consecutive lines.

    Within a record the lines are, in order: the sentence text, a second
    piece of text, an integer label, and an image ID. (In the sample output
    the sentence carries a ``$T$`` placeholder and the second line looks like
    the phrase it stands for -- TODO confirm against the dataset description.)

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A ``(sentences, labels, img_ids)`` tuple of three parallel lists.
        Each sentence is the stripped first and second lines joined by a
        single space, each label is an ``int``, and each image ID is the
        stripped fourth line.

    Raises:
        ValueError: If a label line cannot be converted by ``int``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()

    sentences = []
    labels = []
    img_ids = []
    total = len(raw_lines)

    start = 0
    while start < total:
        # A trailing group of fewer than four lines is reported and dropped.
        if start + 4 > total:
            print(f"Skipping incomplete record at line {start}")
            break

        text, extra, raw_label, image_name = (
            entry.strip() for entry in raw_lines[start:start + 4]
        )
        # Convert the label before storing anything for this record.
        label_value = int(raw_label)

        sentences.append(f"{text} {extra}")
        labels.append(label_value)
        img_ids.append(image_name)
        start += 4

    return sentences, labels, img_ids

# Read the training split, then preview the first two records with their image IDs
train_path = "../src_data/data_baseline/twitter2015/train.txt"  # replace with the path to your training set
sentences, labels, img_ids = load_data(train_path)

print(f"Number of records: {len(img_ids)}")
# Preview at most the first two records. Slicing (instead of indexing 0 and 1)
# keeps the preview from raising IndexError when fewer than two were loaded.
for img_id, sentence, label in zip(img_ids[:2], sentences[:2], labels[:2]):
    print(f"IMGID: {img_id}")
    print("Sentence:", sentence)
    print("Labels:", label)
    print()


Number of records: 3179
IMGID: 1860693.jpg
Sentence: RT @ ltsChuckBass : $T$ is everything # MCM Chuck Bass
Labels: 1

IMGID: 1860693.jpg
Sentence: RT @ ltsChuckBass : Chuck Bass is everything $T$ # MCM
Labels: 0



In [6]:
# Build per-token BERT embeddings for a collection of sentences
def get_bert_embeddings(sentences):
    """Encode each sentence separately and collect the final hidden states.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.
    NOTE: ``model`` is looked up when this function is called, and a later
    cell in this notebook rebinds ``model`` to a ViT model -- call this
    function while ``model`` still refers to the BERT model.

    Args:
        sentences: Iterable of sentences; each one is tokenized on its own
            and truncated to at most 512 tokens.

    Returns:
        A list with one CPU tensor per sentence, holding the model's
        ``last_hidden_state`` for that sentence
        ([batch_size, seq_len, hidden_dim]).
    """
    def embed_one(text):
        # Tokenize a single sentence into PyTorch tensors.
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Run the model on the target device and keep the last hidden layer.
        hidden = model(
            encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        ).last_hidden_state
        # Results are moved to the CPU before being collected.
        return hidden.cpu()

    model.eval()
    with torch.no_grad():
        return [embed_one(text) for text in sentences]

# Compute per-token embeddings for every loaded sentence
sentence_embeddings = get_bert_embeddings(sentences)
# Print the shape of the first sentence's embedding tensor
print(sentence_embeddings[0].shape)  # (batch_size, seq_len, hidden_dim); sentences are encoded one at a time

torch.Size([1, 24, 1024])


## 图片预处理
> ViT

In [12]:
from transformers import ViTImageProcessor, ViTModel
from PIL import Image

# Local directory that holds the ViT model files
Vit_PATH = '../modules/models/Vit'

# Sanity check: push a single image through ViT and inspect the output size
image_path = '../img_data/twitter2015/0.jpg'
image = Image.open(image_path).convert("RGB")
processor = ViTImageProcessor.from_pretrained(Vit_PATH)
# NOTE: this rebinds the global `model`, which get_bert_embeddings() reads at
# call time; re-run the BERT loading cell before calling that function again.
model = ViTModel.from_pretrained(Vit_PATH)
inputs = processor(images=image, return_tensors="pt")

# Inference only: skip autograd bookkeeping, as get_bert_embeddings() does,
# so no computation graph is kept alive for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
last_hidden_states.size()

torch.Size([1, 197, 768])

In [13]:
import os

# Paths to the local ViT model directory and to the folder of images
Vit_PATH = '../modules/models/Vit'
image_folder = '../img_data/twitter2015'

# Load every image in a folder and preprocess them into one batch tensor
def preprocess_images(folder_path, processor):
    """Preprocess all images found directly in ``folder_path`` into one batch.

    Files are visited in sorted name order so the row order of the returned
    tensor is reproducible; ``os.listdir`` on its own returns entries in
    arbitrary order. Only names ending in ``.png``, ``.jpg`` or ``.jpeg``
    (case-insensitive) are considered. A file that fails to load or to be
    processed is reported on stdout and skipped.

    Args:
        folder_path: Directory to scan (sub-directories are not traversed).
        processor: Callable invoked as
            ``processor(images=image, return_tensors="pt")`` whose result
            provides a ``"pixel_values"`` tensor.

    Returns:
        The per-image ``pixel_values`` tensors concatenated along dim 0,
        i.e. (batch_size, 3, height, width) per the original comment.

    Raises:
        ValueError: If no image could be processed.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    pixel_values = []  # one pixel_values tensor per successfully processed image
    for file_name in sorted(os.listdir(folder_path)):
        # Skip anything that does not look like an image file.
        if not file_name.lower().endswith(image_suffixes):
            continue
        image_path = os.path.join(folder_path, file_name)
        try:
            # The context manager closes the underlying file once the RGB
            # copy has been made.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
            inputs = processor(images=image, return_tensors="pt")
            pixel_values.append(inputs["pixel_values"])
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort
            # the whole batch.
            print(f"Error processing {file_name}: {e}")
    if not pixel_values:
        raise ValueError("No valid images found in the folder!")
    # Concatenate along the batch dimension.
    return torch.cat(pixel_values, dim=0)

# Preprocess every image in the folder with the processor loaded earlier
batch_inputs = preprocess_images(image_folder, processor)
print("Batch input shape:", batch_inputs.shape)  # print the shape of the batch tensor


Batch input shape: torch.Size([8288, 3, 224, 224])
