## API

In [5]:
import os
import re

import gradio as gr
import pandas as pd
import torch
from transformers import AutoTokenizer

from model import Bert,BertCNNv1,BertCNNv2

In [6]:
# Noise patterns removed by preprocess(), applied strictly in this order.
# Every match is replaced by a single space.
_NOISE_PATTERNS = (
    # URLs
    r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b(\/\S+)*',
    # symbols that carry little meaning: @ | + [ ]
    r'[@|\+\[\]]',
    # bullet characters
    r'•',
    # runs of two or more '=', "'", ':' or '"'
    r'={2,}|\'{2,}|\:{2,}|\"{2,}',
    # timestamps such as "12:34, 5 January 2010 (UTC)"
    r'(\d+:)?\d+:\d+,\s?\d+\s?\w+\s?\d{4}\s\(UTC\)',
    # timestamps such as "12:34, January 5, 2010 (UTC)"
    r'(\d+:)?\d+:\d+,\s?\w+\s?\d+,\s?\d{4}\s\(UTC\)',
    # dotted number groups that look like IP addresses
    r'\d+\.\d+\.\d+\.\d+',
)


def preprocess(text):
    """Clean a raw comment string before it is tokenized.

    Leading and trailing double quotes are stripped first; then each pattern
    in ``_NOISE_PATTERNS`` (URLs, stray symbols, bullets, repeated
    punctuation, UTC timestamps, IP-like numbers) is replaced by one space.

    Args:
        text: The raw comment text.

    Returns:
        The cleaned text. Whitespace is not collapsed, so each removed span
        leaves a single space behind.
    """
    cleaned = text.strip('"')
    for pattern in _NOISE_PATTERNS:
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned

In [7]:
# Tokenizer instances keyed by checkpoint name. Building a tokenizer is slow,
# so it is done once per name instead of on every prediction request.
_TOKENIZER_CACHE = {}

# Label names paired positionally with the model's six outputs.
# NOTE(review): order is assumed to match the training setup - confirm.
_TOXICITY_LABELS = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']


def _get_tokenizer(name='bert-base-cased'):
    """Return the tokenizer for ``name``, loading it only on first use."""
    if name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[name] = AutoTokenizer.from_pretrained(name)
    return _TOKENIZER_CACHE[name]


def toxic_classification(text,kernel):
    """Score one comment with the selected classifier.

    Args:
        text: Raw comment text; it is cleaned with ``preprocess`` first.
        kernel: Which model to use: ``'Bert'`` (module-level ``model1``) or
            ``'Bert-CNN'`` (module-level ``model2``).

    Returns:
        A ``pandas.DataFrame`` with a ``type`` column (the six label names)
        and a ``confidence`` column (sigmoid scores formatted as percentage
        strings with two decimals).

    Raises:
        ValueError: If ``kernel`` is not one of the supported names.
    """
    # Resolve the model lazily so only the requested global is touched.
    if kernel == 'Bert':
        model = model1
    elif kernel == 'Bert-CNN':
        model = model2
    else:
        raise ValueError(
            "Unknown kernel %r; expected 'Bert' or 'Bert-CNN'" % (kernel,))
    model.eval()

    tokenizer = _get_tokenizer()
    cleaned = preprocess(text)

    encoded_text = tokenizer(cleaned,
                             # Always pad to the full length; otherwise short
                             # texts are too short for the convolution layers.
                             padding = 'max_length',
                             truncation = True,
                             max_length = 256,
                             return_tensors = 'pt')

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(encoded_text['input_ids'],
                       encoded_text['attention_mask'],
                       encoded_text['token_type_ids'])

    # Single-comment batch: keep the first (only) row of the output.
    # NOTE(review): assumes the model returns a (batch, 6) tensor - confirm.
    probabilities = torch.sigmoid(logits[0]).detach().numpy().tolist()
    confidence = [format(round(p,4),'.2%') for p in probabilities]
    return pd.DataFrame({'type':_TOXICITY_LABELS, 'confidence':confidence})

In [8]:
# Directory holding the fine-tuned checkpoints. Set the TOXIC_MODEL_DIR
# environment variable to point elsewhere; the default is the original location.
MODEL_DIR = os.environ.get('TOXIC_MODEL_DIR', r'E:\0kaggle\model')

# Load both classifiers once, up front. toxic_classification looks them up
# through these module-level names on every request.
model1 = Bert.load(os.path.join(MODEL_DIR, 'bert.bin'))
model2 = BertCNNv1.load(os.path.join(MODEL_DIR, 'bertCNN_v1.bin'))

# UI components: free-text comment, model selector, and a table of scores.
# NOTE(review): gr.inputs / gr.outputs look like the older Gradio component
# namespaces - confirm the installed Gradio version before upgrading.
text=gr.inputs.Textbox(lines=7,label='Comment Text')
radio=gr.inputs.Radio(choices=['Bert','Bert-CNN'],type='value',default='Bert',label='Classifier Kernel')
df=gr.outputs.Dataframe()

interface = gr.Interface(fn=toxic_classification,
                         inputs=[text,radio],outputs=df,
                         examples=[
                             ["Hi! I am back again! Last warning! Stop undoing my edits or die!",'Bert'],# toxic and threat
                             ["Why can't you believe how fat Artie is? Did you see him on his recent appearence on the Tonight Show with Jay Leno? He looks absolutely AWFUL! If I had to put money on it, I'd say that Artie Lange is a can't miss candidate for the 2007 Dead pool! Kindly keep your malicious fingers off of my above comment, . Everytime you remove it, I will repost it!!!",'Bert'],# toxic
                             ["And it looks like it was actually you who put on the speedy to have the first version deleted now that I look at it.",'Bert'],# non-toxic
                         ])
# NOTE(review): share=True also serves the demo on a public URL (see the cell
# output); use share=False to keep it local-only.
interface.launch(share=True)

loading best model from E:\0kaggle\model\bert.bin


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


loading best model from E:\0kaggle\model\bertCNN_v1.bin


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Running on local URL:  http://127.0.0.1:7861/
Running on public URL: https://48509.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


(<fastapi.applications.FastAPI at 0x1c84161d748>,
 'http://127.0.0.1:7861/',
 'https://48509.gradio.app')

## 实验

In [14]:
def toxic_classification_2(text):
    """Score one comment with the plain Bert classifier (``model1``).

    Convenience wrapper for the experiment cells: it runs the same pipeline
    as ``toxic_classification`` with the kernel fixed to ``'Bert'``, instead
    of keeping a second copy of that code.

    Args:
        text: Raw comment text.

    Returns:
        The ``pandas.DataFrame`` produced by ``toxic_classification``: a
        ``type`` column with the six label names and a ``confidence`` column
        with percentage strings.
    """
    return toxic_classification(text, 'Bert')

In [16]:
%%time
# Measure the wall-clock time of one end-to-end prediction (cleaning,
# tokenization and model forward pass) on a single sample comment.
example = "Hi! I am back again! Last warning! Stop undoing my edits or die!"
toxic_classification_2(example)

Wall time: 12.9 s


Unnamed: 0,type,confidence
0,toxic,98.86%
1,severe_toxic,0.55%
2,obscene,2.06%
3,threat,74.48%
4,insult,5.18%
5,identity_hate,0.11%
