In [None]:
!pip install transformers
!pip install spacy
!pip install pandas 
!pip install torch 
!pip install datasets #



In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForQuestionAnswering, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
import string
from tqdm import tqdm


In [None]:
# Read the question/answer CSV from the Kaggle input directory, then print a
# summary of its columns, dtypes and non-null counts.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/layoutlm/medquad.csv")
df.info()

In [None]:
# Preview the first 15 rows.
df.head(n=15)

In [None]:
# Show the full answer text of the row labelled 0.
df.loc[0, 'answer']

In [None]:
# Show every row that contains at least one missing value.
print(df.loc[df.isna().any(axis=1)])
# Drop the rows with missing values.
df = df.dropna()
# Check whether any missing values remain (per-column counts).
missing_values = df.isna().sum()
print(missing_values)

In [None]:
# Choose the checkpoint, build a question-answering pipeline from it, and load
# a standalone model/tokenizer pair for the manual cells further down.
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/roberta-base-squad2"

# Standalone objects used by preprocessing, training and manual inference.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# The pipeline is created from the checkpoint *name*, not from `model` above,
# so it is built independently of the `model` object.
nlp = pipeline('question-answering', tokenizer=model_name, model=model_name)

# Smoke-test the pipeline on a small hand-written example.
QA_input = dict(
    question='why is model conversion important',
    context='The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.',
)

res = nlp(QA_input)


In [None]:
import torch

def _find_answer_token_span(context, answer, sequence_ids, offsets):
    """Map the character span of ``answer`` inside ``context`` to token indices.

    Args:
        context: The context string that was passed to the tokenizer as the
            second sequence of the pair.
        answer: The answer text to locate verbatim inside ``context``.
        sequence_ids: Per-token sequence id for one encoded example; context
            tokens are the ones marked ``1``.
        offsets: Per-token ``(char_start, char_end)`` pairs for the same
            example, indexed like ``sequence_ids``.

    Returns:
        ``(start_token, end_token)`` as inclusive indices into the encoded
        pair. ``(0, 0)`` is returned when the answer is empty, does not occur
        in the context, or is not fully covered by the (possibly truncated)
        context tokens.
    """
    char_start = context.find(answer) if answer else -1
    if char_start < 0:
        return 0, 0
    char_end = char_start + len(answer)

    context_tokens = [i for i, seq_id in enumerate(sequence_ids) if seq_id == 1]
    if not context_tokens:
        return 0, 0
    first_tok, last_tok = context_tokens[0], context_tokens[-1]

    # The answer must lie entirely inside the context tokens that survived
    # truncation; otherwise there is no valid span to point at.
    if offsets[first_tok][0] > char_start or offsets[last_tok][1] < char_end:
        return 0, 0

    # Walk forward to the last token starting at or before the answer start.
    start_tok = first_tok
    while start_tok <= last_tok and offsets[start_tok][0] <= char_start:
        start_tok += 1
    # Walk backward to the first token ending at or after the answer end.
    end_tok = last_tok
    while end_tok >= first_tok and offsets[end_tok][1] >= char_end:
        end_tok -= 1
    return start_tok - 1, end_tok + 1


def preprocess(df, tokenizer):
    """Encode question/context pairs and compute answer-span token labels.

    Each row's ``question`` and ``focus_area`` are encoded as a pair, padded
    and truncated to 512 tokens. The answer's character span in the context is
    then mapped to token indices of the *encoded pair* using the tokenizer's
    offset mapping, so the labels account for the question and special tokens
    that precede the context.

    Rows whose answer cannot be located (not found verbatim, empty, or cut off
    by truncation) are labelled ``(0, 0)``, i.e. the first token of the
    sequence.

    NOTE(review): the context column is ``focus_area``; confirm that this
    column is the intended context and actually contains the answer text,
    otherwise every label falls back to ``(0, 0)``.

    Args:
        df: DataFrame with string columns ``question``, ``focus_area`` and
            ``answer``.
        tokenizer: A tokenizer that supports ``return_offsets_mapping=True``
            and whose output exposes ``sequence_ids(i)`` (presumably a
            Hugging Face "fast" tokenizer - verify for other tokenizers).

    Returns:
        ``(start_positions, end_positions, encodings)``: two ``torch.long``
        tensors with one entry per row, and the tokenizer output with the
        helper ``offset_mapping`` entry removed.
    """
    questions = df['question'].tolist()
    contexts = df['focus_area'].tolist()
    answers = df['answer'].tolist()

    # Encode the input pairs.
    encodings = tokenizer(questions,
                          contexts,
                          padding='max_length',
                          max_length=512,
                          truncation=True,
                          return_offsets_mapping=True,
                          return_tensors="pt")
    # Offsets are only needed for label alignment; removing them keeps the
    # returned encoding limited to the entries the model consumes.
    offset_mapping = encodings.pop("offset_mapping")

    start_positions = []
    end_positions = []
    for row, (context, answer) in enumerate(zip(contexts, answers)):
        offsets = offset_mapping[row]
        if hasattr(offsets, "tolist"):
            # Plain Python ints are much cheaper to compare than 0-d tensors.
            offsets = offsets.tolist()
        token_start, token_end = _find_answer_token_span(
            context, answer, encodings.sequence_ids(row), offsets)
        start_positions.append(token_start)
        end_positions.append(token_end)

    # Convert to tensors (explicit dtype so an empty frame still yields longs).
    start_positions = torch.tensor(start_positions, dtype=torch.long)
    end_positions = torch.tensor(end_positions, dtype=torch.long)

    return start_positions, end_positions, encodings


# Run the preprocessing over the whole frame and print the three results, one
# after another, for inspection.
start_positions, end_positions, encodings = preprocess(df, tokenizer)
print(start_positions, end_positions, encodings, sep="\n")


In [None]:
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Store the labels on the encoding so all four tensors live in one mapping.
encodings['start_positions'] = start_positions
encodings['end_positions'] = end_positions

input_ids, attention_mask, start_positions, end_positions = (
    encodings[key]
    for key in ('input_ids', 'attention_mask', 'start_positions', 'end_positions')
)

# Split the data into training and validation sets (80/20, fixed seed).
# train_test_split returns a (train, val) pair per input, in input order.
(train_input_ids, val_input_ids,
 train_attention_mask, val_attention_mask,
 train_start_positions, val_start_positions,
 train_end_positions, val_end_positions) = train_test_split(
    input_ids, attention_mask, start_positions, end_positions,
    random_state=42, test_size=0.2)

# Wrap each split in a TensorDataset; the tensor order here is the order in
# which batches are unpacked by the training and validation loops.
train_dataset = TensorDataset(
    train_input_ids, train_attention_mask,
    train_start_positions, train_end_positions)
val_dataset = TensorDataset(
    val_input_ids, val_attention_mask,
    val_start_positions, val_end_positions)

# Create the DataLoaders; only the training data is shuffled.
batch_size = 32
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

In [None]:
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
# Fine-tune `model` on `train_dataloader` with AdamW, reporting per-epoch mean
# loss and exact-match accuracy (a sample counts as correct only when both the
# predicted start and the predicted end index equal the labels).
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 2
for epoch in range(num_epochs):
    # Training mode.
    model.train()
    epoch_loss = 0
    total_correct = 0
    total_samples = 0

    # tqdm displays a progress bar for the epoch.
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # Batch-local names: unpacking into `input_ids`, `start_positions`, ...
        # would overwrite the full-dataset tensors of the same name created by
        # the split cell and make that cell unsafe to re-run.
        (batch_input_ids, batch_attention_mask,
         batch_start_positions, batch_end_positions) = batch

        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass; passing the labels makes the model return a loss.
        outputs = model(input_ids=batch_input_ids,
                        attention_mask=batch_attention_mask,
                        start_positions=batch_start_positions,
                        end_positions=batch_end_positions)
        loss = outputs.loss

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Accumulate the loss.
        epoch_loss += loss.item()

        # Compute prediction accuracy for this batch.
        start_pred = outputs.start_logits.argmax(dim=1)
        end_pred = outputs.end_logits.argmax(dim=1)
        correct = ((start_pred == batch_start_positions)
                   & (end_pred == batch_end_positions)).sum().item()
        total_correct += correct
        total_samples += batch_input_ids.size(0)

    # Print the mean loss (per batch) and accuracy (per sample) of the epoch.
    avg_epoch_loss = epoch_loss / len(train_dataloader)
    train_accuracy = total_correct / total_samples
    print(f"Epoch {epoch+1}, Training Loss: {avg_epoch_loss}, Training Accuracy: {train_accuracy}")


In [None]:
# Validation: compute the mean loss of `model` over the validation loader.
# `epoch` and `num_epochs` are the values left behind by the training cell, so
# the reported epoch number is that of the last completed training epoch.
model.eval()
valid_loss = 0
# No gradients are needed for evaluation.
with torch.no_grad():
    for batch in tqdm(val_dataloader, desc=f"Validation Epoch {epoch+1}/{num_epochs}"):
        # Batch-local names so the full-dataset tensors are not overwritten.
        (batch_input_ids, batch_attention_mask,
         batch_start_positions, batch_end_positions) = batch

        # Forward pass; passing the labels makes the model return a loss.
        outputs = model(input_ids=batch_input_ids,
                        attention_mask=batch_attention_mask,
                        start_positions=batch_start_positions,
                        end_positions=batch_end_positions)
        loss = outputs.loss

        # Accumulate the loss.
        valid_loss += loss.item()

# Print the mean validation loss (per batch).
avg_valid_loss = valid_loss / len(val_dataloader)
print(f"Epoch {epoch+1}, Validation Loss: {avg_valid_loss}")

In [None]:
# Answering a question, method 1: hand the question/context pair to the
# `nlp` pipeline and print its best answer together with its score.
QA_input = dict(
    question='What is (are) Glaucoma ?',
    context='Glaucoma is a group of diseases that can damage the eyes optic nerve and result in vision loss and blindness. While glaucoma can strike anyone, the risk is much greater for people over 60. How Glaucoma Develops  There are several different types of glaucoma. Most of these involve the drainage system within the eye. At the front of the eye there is a small space called the anterior chamber. A clear fluid flows through this chamber and bathes and nourishes the nearby tissues.',
)
# Use the pipeline to obtain the answer.
res = nlp(QA_input)
print("Answer:", res['answer'])
print("Score:", res['score'])


In [None]:
# Method 2: run the model directly and decode the highest-scoring span.
QA_input = {
    'question': 'What is glaucoma?',
    'context': 'Glaucoma is a group of diseases that can damage the eyes optic nerve and result in vision loss and blindness. While glaucoma can strike anyone, the risk is much greater for people over 60.'
}
inputs = tokenizer(QA_input['question'], QA_input['context'], return_tensors='pt')
# The training cell leaves the model in train mode; switch to eval mode so
# the prediction is deterministic (dropout disabled).
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
start_logits = outputs.start_logits
end_logits = outputs.end_logits
start_index = torch.argmax(start_logits)
# +1 turns the inclusive end token into an exclusive slice bound.
end_index = torch.argmax(end_logits) + 1

answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_index:end_index]))
# QA_input is a dict, so the question is looked up by key (indexing it with 0
# raised KeyError).
print("question:", QA_input['question'])
print("Answer:", answer)

**question-answering**

In [None]:
# Two example question/context pairs used by the cells below.
QA_input = [
    dict(
        question='why is conversation important?',
        context='The option to convert models between FARM and transformers gives freedom to the user between frameworkers',
    ),
    dict(
        question='How many programming languages does BLOOM support?',
        context='BLOOM has 176 billion parameters and can generate text in 46 languages',
    ),
]


In [None]:
#method2
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Load the tokenizer and question-answering model from the checkpoint name.
# NOTE(review): this rebinds `model`, which the training cell earlier in the
# notebook fine-tuned - confirm that replacing it with the checkpoint's
# weights is intended here.
model_name = 'deepset/roberta-base-squad2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [None]:
# Encode the first example as a (question, context) pair and run the model.
first_example = QA_input[0]
inputs0 = tokenizer(first_example['question'],
                    first_example['context'],
                    return_tensors="pt")
# Inference only: skip gradient tracking, as the earlier manual-inference cell
# does, so no autograd graph is kept alive with the outputs.
with torch.no_grad():
    output0 = model(**inputs0)


In [None]:
# Encode the second example as a (question, context) pair and run the model.
second_example = QA_input[1]
inputs1 = tokenizer(second_example['question'],
                    second_example['context'],
                    return_tensors="pt")
# Inference only: skip gradient tracking, as the earlier manual-inference cell
# does, so no autograd graph is kept alive with the outputs.
with torch.no_grad():
    output1 = model(**inputs1)


In [None]:

# Decode the highest-scoring answer span for the first example: take the
# argmax of the start and end logits and slice the input ids inclusively.
answer_start_idx = output0.start_logits.argmax()
answer_end_idx = output0.end_logits.argmax()
answer_tokens = inputs0.input_ids[0][answer_start_idx:answer_end_idx + 1]
answer = tokenizer.decode(answer_tokens)
print("ques:{}\nanswer:{}".format(QA_input[0]['question'], answer))

In [None]:
# Decode the highest-scoring answer span for the second example.
answer_start_idx = torch.argmax(output1.start_logits)
answer_end_idx = torch.argmax(output1.end_logits)
# The span indices come from `output1`, so they must index the ids of the
# second example (`inputs1`); slicing `inputs0` decoded text from the wrong
# example.
answer_tokens = inputs1.input_ids[0, answer_start_idx:answer_end_idx + 1]
answer = tokenizer.decode(answer_tokens)
print("ques:{}\nanswer:{}".format(QA_input[1]['question'], answer))