# 预处理
## 文本预处理
> BERT

验证 BERT 是否成功下载

In [1]:
from transformers import BertModel,BertTokenizer
import os


# Directory from which both the BERT tokenizer and the BERT model are loaded.
BERT_PATH = '../modules/models/BERT'

# Load the tokenizer from the local directory.
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

# Smoke test: tokenize a sample sentence and print the resulting tokens.
print(tokenizer.tokenize('I have a good time, thank you.'))

# Load the model from the same directory.
bert = BertModel.from_pretrained(BERT_PATH)

print('load bert model over')

  from .autonotebook import tqdm as notebook_tqdm


['i', 'have', 'a', 'good', 'time', ',', 'thank', 'you', '.']
load bert model over


In [2]:
import torch
from transformers import BertTokenizer, BertModel

# Path of the local model directory
BERT_PATH = '../modules/models/BERT'

# Load the local BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)
model = BertModel.from_pretrained(BERT_PATH)
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Last expression of the cell, so the notebook echoes the returned module.
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [3]:
def load_data(file_path):
    """Parse a dataset file made of consecutive 4-line records.

    Each record is laid out as: sentence text, extra text, integer label,
    image ID. The two text lines are joined with a single space.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A tuple ``(sentences, labels, img_ids)`` of three parallel lists.

    Raises:
        ValueError: If the label line of a record is not an integer.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()

    sentences, labels, img_ids = [], [], []
    record_size = 4

    for start in range(0, len(raw_lines), record_size):
        record = raw_lines[start:start + record_size]
        # A short slice means the file ended in the middle of a record.
        if len(record) < record_size:
            print(f"Skipping incomplete record at line {start}")
            break

        text, extra_text, label_text, image_id = (part.strip() for part in record)
        label_value = int(label_text)

        sentences.append(f"{text} {extra_text}")
        labels.append(label_value)
        img_ids.append(image_id)

    return sentences, labels, img_ids

# Load the training split and preview the first two records with their image IDs.
train_path = "../src_data/data_baseline/twitter2015/train.txt"  # replace with your training-set path
sentences, labels, img_ids = load_data(train_path)

print(f"Number of records: {len(img_ids)}")
# Preview at most two records; slicing avoids an IndexError when the file
# holds fewer than two records.
for img_id, sentence, label in zip(img_ids[:2], sentences[:2], labels[:2]):
    print(f"IMGID: {img_id}")
    print("Sentence:", sentence)
    print("Labels:", label)
    print()


Number of records: 3179
IMGID: 1860693.jpg
Sentence: RT @ ltsChuckBass : $T$ is everything # MCM Chuck Bass
Labels: 1

IMGID: 1860693.jpg
Sentence: RT @ ltsChuckBass : Chuck Bass is everything $T$ # MCM
Labels: 0



In [4]:
from transformers import BertTokenizer, BertModel
import torch

# Load the BERT model and tokenizer.
# BERT_PATH is not assigned in this cell; it relies on the value set by an earlier cell.
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)
model = BertModel.from_pretrained(BERT_PATH)
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function that produces the token embeddings
def get_bert_embeddings(sentences):
    """Run every sentence through BERT on its own and collect the hidden states.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        sentences: Iterable of inputs handed to the tokenizer one at a time
            (plain strings in this notebook); each is truncated to at most
            512 tokens.

    Returns:
        A list with one CPU tensor per input, holding the model's
        ``last_hidden_state`` for that input.
    """
    model.eval()
    collected = []
    with torch.no_grad():
        for text in sentences:
            # Tokenize a single sentence into PyTorch tensors.
            encoded = tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )
            ids = encoded['input_ids'].to(device)
            mask = encoded['attention_mask'].to(device)

            # Forward pass; keep the final layer's hidden states on the CPU.
            hidden = model(ids, attention_mask=mask).last_hidden_state
            collected.append(hidden.cpu())

    return collected

# Compute the embeddings for every loaded sentence
sentence_embeddings = get_bert_embeddings(sentences)

# Print the shape of the first sentence's embedding tensor
print(sentence_embeddings[0].shape)  # (1, seq_len, hidden_dim): each sentence is encoded in its own forward pass


torch.Size([1, 24, 1024])


## 图片预处理
> ViT

In [5]:
import torch
from vit_pytorch import ViT

# Build a ViT classifier from explicit hyperparameters.
# NOTE(review): no checkpoint is loaded anywhere in this view, so the weights
# appear to be randomly initialised — confirm before using the outputs as features.
v = ViT(
    image_size = 256,
    patch_size = 32,
    num_classes = 1000,
    dim = 1024,
    depth = 6,
    heads = 16,
    mlp_dim = 2048,
    dropout = 0.1,
    emb_dropout = 0.1
)

# Random tensor standing in for a batch of one 3-channel 256x256 image.
img = torch.randn(1, 3, 256, 256)

preds = v(img) # (1, 1000)

In [6]:
import os
from PIL import Image
import torch
from torchvision import transforms

# Define the preprocessing steps
preprocess = transforms.Compose([
    transforms.Resize((256, 256)),  # resize the image to the model's input size
    transforms.ToTensor(),         # convert to a tensor
    transforms.Normalize(           # normalise with the ImageNet mean and standard deviation
        mean=[0.485, 0.456, 0.406], 
        std=[0.229, 0.224, 0.225]
    )
])

# Load a local image
def load_image(image_path):
    """Open an image file and return it as a preprocessed, batched tensor.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The output of the module-level ``preprocess`` pipeline with a leading
        batch dimension of size 1 added.
    """
    # The context manager releases the underlying file handle promptly, even
    # when decoding or conversion raises (e.g. on a corrupt file).
    with Image.open(image_path) as img:
        rgb_img = img.convert("RGB")  # make sure the image is in RGB mode
    img_tensor = preprocess(rgb_img)  # apply the preprocessing pipeline
    return img_tensor.unsqueeze(0)  # add the batch dimension

# Process a folder of images in one go
def process_images_in_folder(folder_path, model):
    """Run ``model`` on every image file directly inside ``folder_path``.

    Only names ending in .png, .jpg or .jpeg (case-insensitive) are handled.
    A failure on one file is printed and that file is left out of the result.

    Args:
        folder_path: Directory whose entries are scanned.
        model: Callable applied to each preprocessed image tensor.

    Returns:
        A dict mapping file name to the model output with the batch
        dimension squeezed away.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    outputs = {}
    for entry in os.listdir(folder_path):
        # Skip anything that is not an image file.
        if not entry.lower().endswith(image_suffixes):
            continue
        full_path = os.path.join(folder_path, entry)
        try:
            batch = load_image(full_path)
            # Inference only, so gradient tracking is switched off.
            with torch.no_grad():
                prediction = model(batch)
            outputs[entry] = prediction.squeeze(0)
        except Exception as e:
            print(f"Error processing {entry}: {e}")
    return outputs

# Test run
image_folder = "../img_data/twitter2015"  # replace with the path of your image folder
v.eval()  # put the model in evaluation mode (dropout off, batchnorm in inference behaviour)
results = process_images_in_folder(image_folder, v)

# Print the output shape for the first few images
for file_name, output in list(results.items())[:5]:  # show only the first 5
    print(f"Image: {file_name}, Output shape: {output.shape}")


Image: 49805.jpg, Output shape: torch.Size([1000])
Image: 8700.jpg, Output shape: torch.Size([1000])
Image: 478494.jpg, Output shape: torch.Size([1000])
Image: 1092215.jpg, Output shape: torch.Size([1000])
Image: 654966.jpg, Output shape: torch.Size([1000])


In [7]:
# Echo the top-level `preds` assigned by the random-input ViT call; the `preds`
# inside process_images_in_folder is local to that function.
preds

tensor([[ 2.8593e-02,  3.0752e-01, -3.1113e-01,  5.1795e-01, -9.4753e-02,
         -5.7106e-01,  5.4270e-01, -7.7945e-02,  1.2067e+00, -5.3213e-01,
         -1.4911e-01,  9.8372e-01, -2.1954e-01, -2.1041e-02,  3.6621e-01,
          1.7244e-01,  1.0807e+00,  7.2316e-01, -1.4327e+00,  5.1862e-01,
         -1.0203e+00,  1.5916e-01,  8.0286e-02,  1.8359e-01, -1.1854e-01,
          1.3533e+00,  5.6979e-02,  4.4181e-01, -3.9340e-01,  2.9487e-01,
         -5.6481e-01, -2.9753e-01,  1.2856e+00,  7.5437e-01, -4.4568e-01,
         -1.0121e+00, -1.4310e+00,  6.2169e-01, -2.7196e-01,  1.5859e-01,
         -5.4618e-02, -9.3260e-01, -6.0375e-01, -8.4579e-01, -6.9165e-01,
         -6.4110e-01,  9.8860e-01, -8.2532e-02,  4.3525e-01, -1.3493e-01,
          7.2447e-01,  8.1049e-01, -2.3911e-01, -4.4767e-01, -6.8853e-01,
          9.5735e-01,  4.2351e-01, -5.9532e-01, -3.8985e-01,  1.6900e-02,
         -1.7104e-01, -1.0015e+00,  1.3534e-01, -5.1001e-01,  2.5451e-01,
          5.7262e-01, -9.1649e-01, -1.