In [1]:
from transformers import pipeline
import pandas as pd
import numpy as np

# Transformers pipeline으로 QA task

In [2]:
# Build the question-answering pipeline with an explicitly pinned checkpoint.
# Calling pipeline('question-answering') with no model makes the library pick
# a default and print a "No model was supplied" warning; naming the checkpoint
# that warning reports keeps this cell reproducible if the default changes.
qa = pipeline(
    'question-answering',
    model='distilbert-base-cased-distilled-squad',
)
qa

No model was supplied, defaulted to distilbert-base-cased-distilled-squad (https://huggingface.co/distilbert-base-cased-distilled-squad)


Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/249M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

<transformers.pipelines.question_answering.QuestionAnsweringPipeline at 0x7f8878471850>

In [3]:
# Short biography that the QA models read their answers from.
# Assembled from adjacent string literals; the leading and trailing newline
# are kept so the character offsets reported by the pipeline stay the same.
context = (
    "\n"
    "Youn Yuh-jung (born June 19, 1947) is a South Korean actress, "
    "whose career in film and television spans over five decades. "
    "Her accolades include an Academy Award."
    "\n"
)

context

'\nYoun Yuh-jung (born June 19, 1947) is a South Korean actress, whose career in film and television spans over five decades. Her accolades include an Academy Award.\n'

In [4]:
# Ask about the subject's occupation; the result is a dict with
# score, start, end and answer.
qa(
    context=context,
    question="What is her job?",
)

{'score': 0.7455785274505615, 'start': 54, 'end': 61, 'answer': 'actress'}

In [5]:
# Ask which award the passage mentions.
qa(
    context=context,
    question="What did she win?",
)

{'score': 0.6411081552505493,
 'start': 149,
 'end': 162,
 'answer': 'Academy Award'}

In [6]:
# Ask for the birth date given in the passage.
qa(
    context=context,
    question="When was she born?",
)

{'score': 0.966539740562439, 'start': 21, 'end': 34, 'answer': 'June 19, 1947'}

# BERT 모형 지정해서 pipeline 만들기

In [7]:
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering, QuestionAnsweringPipeline

## fine-tuning 안 된 모형(기본 BERT 모형)

In [8]:
# Load the plain pretrained BERT checkpoint and its tokenizer.
# This checkpoint has no trained QA head: the load log reports that
# 'qa_outputs' is newly initialized.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = TFAutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

Some layers of TFBertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Wrap the model and tokenizer loaded above in a QA pipeline.
qa = QuestionAnsweringPipeline(
    tokenizer=tokenizer,
    model=model,
)

In [10]:
# Same occupation question, now through the pipeline built on
# bert-base-uncased. Its QA head was newly initialized (see the load
# warning), so a near-zero score and an arbitrary span are expected.
qa(
    context=context,
    question="What is her job?",
)

{'score': 0.0014940481632947922,
 'start': 34,
 'end': 68,
 'answer': ') is a South Korean actress, whose'}

In [11]:
# Award question against the current pipeline.
qa(
    context=context,
    question="What did she win?",
)

{'score': 0.0015136413276195526,
 'start': 34,
 'end': 68,
 'answer': ') is a South Korean actress, whose'}

In [12]:
# Birth-date question against the current pipeline.
qa(
    context=context,
    question="When was she born?",
)

{'score': 0.0015169153921306133,
 'start': 34,
 'end': 68,
 'answer': ') is a South Korean actress, whose'}

## fine-tuning 된 모형

In [15]:
# DistilBERT: about 40% fewer parameters and 60% faster than BERT, while
# keeping roughly 97% of BERT's performance.
# The checkpoint name indicates it was distilled on SQuAD (a QA dataset).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = TFAutoModelForQuestionAnswering.from_pretrained(
    "distilbert-base-cased-distilled-squad"
)

Some layers from the model checkpoint at distilbert-base-cased-distilled-squad were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-cased-distilled-squad and are newly initialized: ['dropout_76']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Rebuild the pipeline around the model and tokenizer loaded above,
# then ask the occupation question.
qa = QuestionAnsweringPipeline(
    tokenizer=tokenizer,
    model=model,
)
qa(
    context=context,
    question="What is her job?",
)

{'score': 0.7455775141716003, 'start': 54, 'end': 61, 'answer': 'actress'}

In [17]:
# Award question against the current pipeline.
qa(
    context=context,
    question="What did she win?",
)

{'score': 0.6411081552505493,
 'start': 149,
 'end': 162,
 'answer': 'Academy Award'}

In [18]:
# Birth-date question against the current pipeline.
qa(
    context=context,
    question="When was she born?",
)

{'score': 0.966539740562439, 'start': 21, 'end': 34, 'answer': 'June 19, 1947'}

# 수동으로(pipeline 말고) fine-tuning 된 모형으로 추론하기

In [19]:
import tensorflow as tf

In [20]:
question = "What did she win?"
# Encode the (question, context) pair as TensorFlow tensors, with the
# special tokens requested explicitly.
inputs = tokenizer(
    question,
    context,
    add_special_tokens=True,
    return_tensors="tf",
)

inputs
# id 101 is [CLS] (start of the sequence)
# id 102 is [SEP] (segment separator)

{'input_ids': <tf.Tensor: shape=(1, 48), dtype=int32, numpy=
array([[  101,  1327,  1225,  1131,  1782,   136,   102,  1192,  1179,
        10684,  1324,   118,   179,  4380,   113,  1255,  1340,  1627,
          117,  3138,   114,  1110,   170,  1375,  3947,  3647,   117,
         2133,  1578,  1107,  1273,  1105,  1778, 15533,  1166,  1421,
         4397,   119,  1430,   170, 14566, 19872,  1511,  1126,  2127,
         1698,   119,   102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 48), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1]], dtype=int32)>}

In [21]:
# Turn the single encoded row back into text to see where the special
# tokens were inserted.
tokenizer.decode(inputs["input_ids"][0].numpy())

'[CLS] What did she win? [SEP] Youn Yuh - jung ( born June 19, 1947 ) is a South Korean actress, whose career in film and television spans over five decades. Her accolades include an Academy Award. [SEP]'

In [23]:
# Forward pass through the QA model. The returned object exposes
# start_logits and end_logits; in the recorded output each has one score
# per input token (shape (1, 48), matching input_ids).
outputs = model(inputs)
outputs

TFQuestionAnsweringModelOutput(loss=None, start_logits=<tf.Tensor: shape=(1, 48), dtype=float32, numpy=
array([[-3.2587442 , -4.1159697 , -6.1125183 , -5.7539062 , -6.088905  ,
        -3.9149244 , -4.392663  , -2.031015  , -5.905589  , -3.8264418 ,
        -6.5413895 , -6.29468   , -5.6275845 , -5.9331937 , -5.1946497 ,
        -3.7381017 , -2.6977105 , -3.754214  , -5.7128425 , -2.1804988 ,
        -4.863438  , -4.760378  , -3.5709014 , -0.47888836, -2.6021464 ,
        -1.3134303 , -6.2036543 , -5.0356054 , -4.192493  , -6.555415  ,
        -2.4697018 , -7.352499  , -3.1218576 , -5.6631985 , -4.4715033 ,
        -3.9809296 , -5.1598773 , -2.9622214 ,  2.0888405 , -0.66598433,
        -5.2861366 , -4.834057  , -2.0916882 ,  8.230591  ,  8.933437  ,
         1.8657206 , -2.8914979 , -4.3925595 ]], dtype=float32)>, end_logits=<tf.Tensor: shape=(1, 48), dtype=float32, numpy=
array([[-0.70956326, -3.6142883 , -6.2601013 , -5.975941  , -6.0352697 ,
        -4.777359  , -4.572341  , -4.985

In [24]:
# Index of the token with the highest start score.
start = tf.math.argmax(outputs.start_logits, axis=-1).numpy()[0]
start
# In the recorded run this is the token at index 44.

44

In [29]:
# Index of the token with the highest end score, plus one so that it can be
# used directly as an exclusive slice bound.
end = 1 + tf.math.argmax(outputs.end_logits, axis=-1).numpy()[0]
end
# In the recorded run the best end token is at index 45, so the bound is 46.

46

In [30]:
# The answer spans the tokens from `start` up to (not including) `end`,
# i.e. indices 44 and 45 in the recorded run.
tokenizer.decode(inputs["input_ids"][0][start:end])

'Academy Award'

# 한국어 QA task

## pipeline 이용해서

In [None]:
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering, QuestionAnsweringPipeline
import tensorflow as tf

In [33]:
# Load the Korean checkpoint beomi/kcbert-base and its tokenizer.
# from_pt=True converts the PyTorch weights into the TensorFlow model.
# The load log reports that qa_outputs.weight / qa_outputs.bias are newly
# initialized, i.e. this checkpoint carries no trained QA head.
tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-base")
model = TFAutoModelForQuestionAnswering.from_pretrained(
    "beomi/kcbert-base",
    from_pt=True,
)

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

Some weights or buffers of the TF 2.0 model TFBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Set up the QA pipeline around the model and tokenizer loaded above.
qa = QuestionAnsweringPipeline(
    tokenizer=tokenizer,
    model=model,
)

In [35]:
# Korean passage for the Korean QA example, written as adjacent string
# literals; the resulting value is a single line, identical to the original.
context = (
    "윤여정은 1966년 연극 배우로 연기 경력을 시작하였고, "
    "2021년 영화 《미나리》의 순자 역으로 아카데미 여우조연상을 수상했다."
)

In [None]:
# Korean question posed to the pipeline in the next cell.
question = "윤여정의 직업은?"

In [37]:
# Run the Korean question against the Korean passage.
# The recorded score is near zero; the load log for this checkpoint shows
# the QA head was newly initialized, so the extracted span is not meaningful.
qa(
    context=context,
    question=question,
)

{'score': 0.0025997136253863573, 'start': 42, 'end': 50, 'answer': '미나리》의 순자'}