In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Hugging Face Hub id shared by the tokenizer and the classifier so the two
# can never drift apart. (Judging by the repo name this is a KcBERT-Base
# checkpoint fine-tuned for profanity detection -- confirm on the model card.)
KCBERT_CHECKPOINT = "yoyomo/KcBERT-Base-finetuned-yok"

tokenizer = AutoTokenizer.from_pretrained(KCBERT_CHECKPOINT)

# output_attentions=True asks the model to include attention weights in its
# outputs; inference_fn2 reads them to pick out the most attended tokens.
model = AutoModelForSequenceClassification.from_pretrained(
    KCBERT_CHECKPOINT,
    output_attentions=True,
)


---

In [None]:
def _merge_salient_wordpieces(token_attention_pairs, threshold):
    """Join consecutive above-threshold WordPiece tokens into word strings.

    A token is kept when its score is strictly greater than ``threshold`` and
    it is not ``[CLS]``/``[SEP]``. Kept ``##`` continuation pieces are appended
    to the word being built; any skipped token ends the current word so that
    pieces belonging to different words are never glued together.
    """
    special_tokens = ("[CLS]", "[SEP]")
    words = []
    current_word = ""
    for token, score in token_attention_pairs:
        if not (score > threshold) or token in special_tokens:
            # A skipped token is a boundary: flush whatever was collected.
            if current_word:
                words.append(current_word)
                current_word = ""
            continue
        if token.startswith("##"):
            current_word += token[2:]
        else:
            if current_word:
                words.append(current_word)
            current_word = token
    if current_word:
        words.append(current_word)
    return words


def inference_fn2(sentence, layer_index=10, head_index=10, attention_threshold=0.02):
    """Classify ``sentence`` and list the tokens the model attends to most.

    The sentence is encoded as a pair with itself
    (``[CLS] sentence [SEP] sentence [SEP]``). The first segment alone is fed
    to the classifier to obtain class probabilities. When the class-0
    ("negative") probability is the larger one, the full pair is run again and
    the attention that ``[CLS]`` pays to the second segment, in one chosen
    layer/head, is used to select noteworthy tokens.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        sentence: Text to classify.
        layer_index: Index into the model's attention tuple. The default of 10
            was presumably picked by inspecting heads in bertviz -- confirm.
        head_index: Attention head index inside that layer.
        attention_threshold: Tokens whose ``[CLS]`` attention is strictly
            above this value are reported.

    Returns:
        dict with keys ``sentence``, ``prediction``, ``positive_data``,
        ``negative_data``, ``positive_width``, ``negative_width`` and
        ``hate_words`` (the string form of the list of selected words; the
        list is empty when class 1 is at least as probable as class 0).
    """
    text_a = sentence
    text_b = sentence

    # The sentence is encoded twice, so cap the pair at the model's position
    # limit instead of failing inside the embedding layer on long inputs.
    # Inputs that already fit are encoded exactly as before.
    max_length = getattr(model.config, "max_position_embeddings", None)
    input_encodings = tokenizer.encode_plus(
        text_a,
        text_b,
        truncation=max_length is not None,
        max_length=max_length,
        return_tensors='pt'
    )

    with torch.no_grad():
        device = torch.device("cpu")
        # nn.Module.to() moves the module in place, so no global rebinding
        # is needed.
        model.to(device)

        input_ids = input_encodings['input_ids'].to(device)
        token_type_ids = input_encodings['token_type_ids'].to(device)
        attention_mask = input_encodings['attention_mask'].to(device)

        # Locate the first [SEP]; everything after it is text_b.
        sep_token_index = (input_ids == tokenizer.sep_token_id).nonzero(as_tuple=True)[1][0].item()
        start_index_b = sep_token_index + 1

        # Classify on "[CLS] text_a [SEP]" only.
        output = model(
            input_ids=input_ids[:, :start_index_b],
            attention_mask=attention_mask[:, :start_index_b],
        )
        preds = output.logits.softmax(dim=1)

        # Class probabilities (index 1 = not profanity, index 0 = profanity,
        # as implied by the labels below).
        positive_prob = round(preds[0][1].item(), 4)
        negative_prob = round(preds[0][0].item(), 4)

        pred = "욕설아닌문장" if preds[0].argmax().item() == 1 else "욕설문장 (negative)"

        # Extract highly attended words only for negative predictions; the
        # second forward pass is skipped when its result would be discarded.
        combined_tokens = []
        if negative_prob > positive_prob:
            outputs = model(
                input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                output_attentions=True,
            )
            cls_token_index = 0
            # Attention from [CLS] to every position for the chosen layer/head.
            cls_attention = outputs.attentions[layer_index][0, head_index, cls_token_index, :]

            # Keep only the text_b part and pair each token with its score.
            attention_scores_b = cls_attention[start_index_b:]
            tokens_b = tokenizer.convert_ids_to_tokens(input_ids[0, start_index_b:])
            token_attention_pairs = list(zip(tokens_b, attention_scores_b.cpu().numpy()))

            combined_tokens = _merge_salient_wordpieces(
                token_attention_pairs, attention_threshold
            )

    return {
        'sentence': text_a,
        'prediction': pred,
        'positive_data': f"욕설 아님 {positive_prob}",
        'negative_data': f" {negative_prob}",
        # Fixed two-decimal formatting avoids float artefacts such as
        # "56.779999999999994%" in the width strings.
        'positive_width': f"{positive_prob * 100:.2f}%",
        'negative_width': f"{negative_prob * 100:.2f}%",
        'hate_words' : f"{combined_tokens}"
    }


---

In [None]:
from flask import Flask, request, jsonify


In [None]:
def app_start(inference_fn, is_colab=True):
    """Build the Flask application that serves the demo page and the API.

    Routes:
        ``GET /``      returns the contents of ``yokpage.html`` (resolved
                       relative to the current working directory).
        ``POST /api``  passes the parsed JSON request body to ``inference_fn``
                       and returns its result as a JSON response.

    Args:
        inference_fn: Callable that takes the parsed JSON body and returns a
            JSON-serialisable object.
        is_colab: Accepted for backward compatibility; currently unused.

    Returns:
        The configured ``Flask`` application (not yet running).
    """
    app = Flask(__name__)

    @app.route('/')
    def index():
        # Read with an explicit encoding: the platform default is not UTF-8
        # in every environment, which would break non-ASCII page content.
        # NOTE(review): assumes yokpage.html is saved as UTF-8 -- confirm.
        with open('yokpage.html', 'r', encoding='utf-8') as file:
            return file.read()

    @app.route('/api', methods=['POST'])
    def api():
        query_sentence = request.json
        output_data = inference_fn(query_sentence)
        return jsonify(output_data)

    return app


In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Never hardcode the authtoken in the notebook. Read it from the environment
# and fall back to an interactive prompt so it is not saved in the file.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_auth_token)

# Tunnel to port 5000, the port Flask's app.run() listens on by default.
public_url = ngrok.connect(5000)
print("ngrok URL:", public_url)


In [None]:

# Wire the classifier defined above into the Flask application.
app = app_start(inference_fn=inference_fn2, is_colab=True)

# Start Flask's built-in server (default host/port) when this cell is run as
# the main module, which is the case inside a notebook kernel.
if __name__ == "__main__":
    app.run()

