## 测试基于mT5的中文语料微调的问题生成预训练模型的效果

In [None]:
!pip install transformers
!pip install question_generation
!pip install torch

In [None]:
!pip install torch



In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("algolet/mt5-base-chinese-qg")

model = AutoModelForSeq2SeqLM.from_pretrained("algolet/mt5-base-chinese-qg")

Downloading:   0%|          | 0.00/428 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/7.93M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/731 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.17G [00:00<?, ?B/s]

In [None]:
import torch
model.eval()
text = "在一个寒冷的冬天，赶集完回家的农夫在路边发现了一条冻僵了的蛇。他很可怜蛇，就把它放在怀里。当他身上的热气把蛇温暖以后，蛇很快苏醒了，露出了残忍的本性，给了农夫致命的伤害——咬了农夫一口。农夫临死之前说：“我竟然救了一条可怜的毒蛇，就应该受到这种报应啊！”"
text = "question generation: " + text
inputs = tokenizer(text,
                   return_tensors='pt',
                   truncation=True,
                   max_length=512)
with torch.no_grad():
  outs = model.generate(input_ids=inputs["input_ids"],
                        attention_mask=inputs["attention_mask"],
                        max_length=128,
                        no_repeat_ngram_size=4,
                        num_beams=4)
question = tokenizer.decode(outs[0], skip_special_tokens=True) 
questions = [q.strip() for q in  question.split("<sep>") if len(q.strip()) > 0]
print(questions)

['在寒冷的冬天,农夫在哪里发现了一条可怜的蛇?', '农夫是如何看待蛇的?', '当农夫遇到蛇时,他做了什么?']


In [None]:
from question_generation import pipeline

# gpu版本
qg = pipeline("question-generation", device="cuda") 

texts = ["在一个寒冷的冬天，赶集完回家的农夫在路边发现了一条冻僵了的蛇。他很可怜蛇，就把它放在怀里。当他身上的热气把蛇温暖以后，蛇很快苏醒了，露出了残忍的本性，给了农夫致命的伤害——咬了农夫一口。农夫临死之前说：“我竟然救了一条可怜的毒蛇，就应该受到这种报应啊！”"]
qg(texts)

[['在寒冷的冬天,农夫在哪里发现了一条可怜的蛇?', '农夫是如何看待蛇的?', '当农夫遇到蛇时,他做了什么?']]

In [None]:
qa = pipeline("question-answering") 
texts = "在一个寒冷的冬天，赶集完回家的农夫在路边发现了一条冻僵了的蛇。他很可怜蛇，就把它放在怀里。当他身上的热气把蛇温暖以后，蛇很快苏醒了，露出了残忍的本性，给了农夫致命的伤害——咬了农夫一口。农夫临死之前说：“我竟然救了一条可怜的毒蛇，就应该受到这种报应啊！”"
qa({
       'question': '在寒冷的冬天,农夫在哪里发现了一条可怜的蛇?',
       'context': texts
   })

{'answer': '路边', 'end': 20, 'score': 1.0, 'start': 18}

## 使用目前的测试数据进行测试

In [None]:
text_1 = '史记是我国历史上第一部纪传体通史。全书分本纪、列传、表、书、世家等五大部分,共52.65万字。书中记载了从远古到汉初3000年中各代帝王的情况、重大历史事件(本纪)和重要历史人物(列传)。因为它是以“本纪”和“列传”为最主要的内容,所以称为“纪传体”史书。史记的作者是西汉的史官司马迁。司马迁的父亲就是一位历史学家。司马迁从小喜欢读历史书,10岁就能背诵许多古文,20岁他开始到全国各地旅游,了解、调查了祖国的自然环境、风俗人情、历史文化遗产等,为他以后写史记提供了丰富的资料。'
qg(text_1)

['史记中记载了哪些内容?',
 '史记的作者是谁?',
 '20岁司马迁开始到全国各地旅游,了解、调查了祖国的自然环境、风俗人情、历史文化遗产等,为他以后写史记提供了丰富的资料。']

In [None]:
qa({
       'question': '史记的作者是谁?',
       'context': text_1
   })

{'answer': '司马迁', 'end': 142, 'score': 0.9999998807907104, 'start': 139}

In [None]:
text_2 = '甲骨文是刻在龟甲兽骨上文字的简称。主要是商代王室刻在ト问用过的龟甲兽骨上的记录，是公元前1300多年到公元前1100多年之间通行的字体。1899年首次在河南安阳殷墟（殷王朝都城遗址）发现，字数大约有3500个。'
qg(text_2)

['甲骨文是刻在什么地方的文字的简称?', '1899年第一次在河南安阳殷墟发现甲骨文的字数是多少?', '在殷王朝都城遗址发现的甲骨文数量是多少?']

In [None]:
qa([
      {
      'question': '甲骨文是刻在什么地方的文字的简称?',
       'context': text_2
       },
      {
       'question': '1899年第一次在河南安阳殷墟发现甲骨文的字数是多少?',
       'context': text_2
      },
      {
       'question': '在殷王朝都城遗址发现的甲骨文数量是多少?',
       'context': text_2
       }])

[{'answer': '龟甲兽骨', 'end': 10, 'score': 0.999996542930603, 'start': 6},
 {'answer': '3500', 'end': 103, 'score': 0.9999992847442627, 'start': 99},
 {'answer': '3500', 'end': 103, 'score': 0.9999984502792358, 'start': 99}]

In [None]:
!pip install farm

In [None]:
from farm.evaluation.squad_evaluation import compute_f1, compute_exact 
 
label = "6000 hours"
pred = "about 6000 hours"
print(f"EM: {compute_exact(label, pred)}")
print(f"F1: {compute_f1(label, pred)}")

EM: 0
F1: 0.8


In [None]:
label = "6000 hours"
pred = "about  dollars"
print(f"EM: {compute_exact(label, pred)}")
print(f"F1: {compute_f1(label, pred)}")

EM: 0
F1: 0


## 测试使用mT5的微调模型对知识图谱数据进行问题生成的效果

In [None]:
!pip install transformers
!pip install question_generation
!pip install torch

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 27.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 58.8 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 72.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 59.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
 

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("algolet/mt5-base-chinese-qg")

model = AutoModelForSeq2SeqLM.from_pretrained("algolet/mt5-base-chinese-qg")

Downloading:   0%|          | 0.00/428 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/7.93M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/731 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.17G [00:00<?, ?B/s]

### 生成问题

In [None]:
from question_generation import pipeline

# gpu版本
qg = pipeline("question-generation", device="cuda") 

texts = ["卡宾枪弹、原产地、美国", '卡宾枪弹、生产公司、温彻斯特公司', '底特律、面积、370.2平方公里','王羲之、所处时代、东晋', '范宽、主要作品、溪山行旅图']
for text in texts:
  print(qg(text))


['卡宾枪弹的原产地是哪里?', '卡宾的枪弹原产地是什么?', '美国哪个州的卡宾枪枪弹生产?']
['卡宾枪弹制造公司位于哪个城市?', '哪个公司生产卡宾枪?', '温彻斯特的卡宾枪厂叫什么名字?']
['底特律有多少平方公里?', '有多少平方公里的底特律?', '哪个城市有底特律的面积?']
['王羲之、王羲之,王羲之是哪个时代的人?', '谁是王羲之的祖先?', '东晋的王羲之是哪一年死的?']
['范宽主要的作品是什么?', '范宽的主要作品是什么?', '“溪山行旅图”的作者是谁?']


### 使用文本、问题进行答案生成

In [None]:
qa = pipeline("question-answering") 
qa([
      {
      'question': '卡宾枪弹的原产地是哪里?',
       'context': "卡宾枪弹、原产地、美国"
       },
      {
       'question': '底特律有多少平方公里?',
       'context': '底特律、面积、370.2平方公里'
      },
      {
       'question': '范宽主要的作品是什么?',
       'context': '范宽、主要作品、溪山行旅图'
       }])

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/669 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/263k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

[{'answer': '美国', 'end': 11, 'score': 0.9999997615814209, 'start': 9},
 {'answer': '370.2平方公里', 'end': 16, 'score': 0.9999971389770508, 'start': 7},
 {'answer': '溪山行旅图', 'end': 13, 'score': 0.9999985694885254, 'start': 8}]

## 对文本数据预处理，然后验证问题生成效果

In [None]:
import pandas as pd
import csv
import json

from question_generation import pipeline

# gpu版本
qg = pipeline("question-generation", device="cpu") 

data_list = []

with open('./drive/MyDrive/NLP/NLP_Corpus/Chinese_culture.csv', 'r', encoding='utf-8-sig') as csvfile:
    csvReader = csv.DictReader(csvfile)
    for row in csvReader:
        context = row['line']
        print(qg(context), '\n.')

['中国传统绘画又被称为什么?', '中国传统的绘画又称什么?', '国画历史悠久,在东周墓葬中出土过最早的帛画作是什么?', '在五代十国以后,中国文人艺术家得到了很高的社会地位,为什么国画题材多以王宫贵族肖像或生活记录等?'] 
.
['《女史箴图》长卷的作者是谁?', '顾恺之的画线条连绵流畅,如“春蚕吐丝”?', '谁写的《女士箴图》?', '谁是顾恺之多才、工诗赋,善书法,被时人称为“才绝、画绝、痴绝”?'] 
.
['宋太宗太平兴国五年(980年)全国有多少户?', '宋徽宗崇宁元年(1102年)全国有几户', '元丰(1078年—1085年)年间,全国有多少户'] 
.
['太极拳与形意拳和八卦掌并称中国三大内家拳?', '张三丰对太极拳的贡献是什么?', '谁考证了太极拳创始者?'] 
.
['王羲之在书法史上取得的成就影响巨大,被后人誉为古今之冠,受到后人誉为什么?', '为什么王羲之曾受录为正一道士,颇受正一符影响?', '《兰亭集序》等帖,皆为后人临摹?'] 
.
['临沂市是哪个省下辖的地级市?', '沂山、蒙山、尼山三大山脉分布于北部,南部为平原,还有什么河流?', '临沂的地势是怎样的?'] 
.
['三皇五帝时期,黄河泛滥时,谁负责治水?', '大禹治水是什么时候发生的?', '谁是舜帝的儿子?'] 
.
['中秋节从嫦娥奔月这个传说而来的节日是什么?', '中国传统节日是从什么而来的?', '古代民间传说叫什么名字?'] 
.
['《庄子·齐物论》的十日是指什么?', '十日的说法最早可见于哪本书?', '《山海经·海外西经》中的十日指的是什么?', '谁派遣神射手羿射下其中九个太阳?'] 
.
['民间认为织女聪明美丽、多才多艺,在七月七晚间向织女乞求智巧,可以除去笨拙,变得眼明手巧,故又叫什么?', '七夕传统节日是从牛郎织女的故事而来的?', '什么是七夕的传统节日?'] 
.
['猪肉是什么时候祭祀的?', '猪肉的社会风俗是什么?', '什么节日是猪肉的祭祀?', '什么是猪肉?'] 
.
['赛龙舟是哪个地区的地方民俗活动?', '2010年亚洲运动会首次成为亚运会中的正式比赛项目是什么时候?', '现时龙舟竞赛已发展为一项水上体育运动,在世界各地有哪些地方亦有定期举行?', '现在龙舟竞赛的简称是什么?'] 
.
['屈

In [None]:
with open('./drive/MyDrive/NLP/NLP_Corpus/Life_and_art.csv', 'r', encoding='utf-8-sig') as csvfile:
    csvReader = csv.DictReader(csvfile)
    for row in csvReader:
        context = row['line']
        print(qg(context), '\n.')

['中国古代货币或古罗马苏币是什么?', '古钱币指的是什么?', '“古钱币”这个词的起源是什么?'] 
.
['在旅行时购买或收集并带回家中的物品,只要有纪念意义,都叫什么?', '除了纪念品之外,还有什么其他形式的纪念品?', '纪念品是保存记忆而取得的物品吗?'] 
.
['川渝火锅的调味是由什么组成的?', '川渝麻辣火锅的主要菜品与北方火锅差异大,主要以什么为主要原料?', '四川辣椒最辣的辣椒是什么?'] 
.
['寿喜烧又被称为什么?', '寿喜燒一般被视为冬季传统料理,常在什么时间食用?', '什么是寿喜烧?'] 
.
['《美食侦探王》是哪个国家的漫画作品?', '台湾纬来日本台于哪一年播出《大胃王神探》?', '2006年由原作改编的电视剧叫什么名字?'] 
.
['古希腊饮食的基础是什么?', '哪一种酒精是古希腊的饮食的基础?', '古古希腊主要的谷物是什么?'] 
.
['茴香的叶子是什么颜色的?', '茴香是什么类型的植物?', '什么类型的草本植物可以生长到2.5米的高度?'] 
.
['大英博物馆的缘起可追溯到哪一世纪?', '英国通过在海外殖民扩张收获了大量文物,使得大英博物馆在伦敦的蒙塔古宫落成?', '汉斯·斯隆爵士去世前立下遗嘱,将他所收藏的珍贵物什全部捐赠给国家,从而奠定了大英博物馆的基础?'] 
.
['世界扑克大赛和世界扑克巡回赛的主赛事项目是什么?', '美国多数赌场内最受欢迎的扑克牌游戏是什么?', '“无限注德州扑克”是什么类型的游戏?'] 
.
['腓尼基人是哪个民族的邻居?', '希伯来字母和拉丁字母同源于腓尼基人的哪一类字母?', '在全盛期,腓尼基控制了什么贸易?'] 
.
['傣族的民间传统舞蹈是什么?', '南亚孔雀舞的傣族版本是什么?', '亚洲的传统民族舞蹈模仿了什么?', '在亚洲的不同地区发展出不同传统的孔雀舞?'] 
.
['西双版纳傣族自治州的首府在哪里?', '东面与缅甸相连,东南与哪个国家接壤?', '该州的总面积是多少平方公里?', '自治州首府是哪里?'] 
.
['《英雄》是何时在中国大陆上映的?', '2002年12月14日在中国大陆上映,中国票房是多少?', '电影《英雄》的主题曲演唱者是谁?'] 
.
['红河哈尼梯田的代表作是什么?', '哈尼的梯田分布在什么地方?', 

In [None]:
qa = pipeline("question-answering") 

with open('./drive/MyDrive/NLP/NLP_Corpus/KG-test.csv', 'r', encoding='utf-8-sig') as csvfile:
    csvReader = csv.DictReader(csvfile)
    for row in csvReader:
        context = row['content']
        questions = qg(context)
        for q in questions:
            print('question: ', q)
            answer = qa({
                'question': str(q),
                'context': str(row)
            })
            print('answer: ', answer, '\n')

question:  卡宾枪弹的原产地是哪里?
answer:  {'answer': '美国', 'start': 35, 'end': 37, 'score': 1.0} 

question:  卡宾的枪弹原产地是什么?
answer:  {'answer': '美国', 'start': 35, 'end': 37, 'score': 1.0} 

question:  美国哪个州的卡宾枪枪弹生产?
answer:  {} 

question:  卡宾枪弹的弹壳有多长?
answer:  {'answer': '33mm', 'start': 35, 'end': 39, 'score': 0.9999985694885254} 

question:  什么类型的枪弹有直径33mm的弹壳?
answer:  {'answer': '卡宾枪弹', 'start': 26, 'end': 30, 'score': 1.0} 

question:  枪弹弹的直径是多少?
answer:  {} 

question:  卡宾枪弹制造公司位于哪个城市?
answer:  {'answer': '温彻斯特', 'start': 36, 'end': 40, 'score': 1.0} 

question:  哪个公司生产卡宾枪?
answer:  {'answer': '温彻斯特公司', 'start': 36, 'end': 42, 'score': 1.0} 

question:  温彻斯特的卡宾枪厂叫什么名字?
answer:  {'answer': '温彻斯特公司', 'start': 36, 'end': 42, 'score': 1.0} 

question:  《八爪女》的出品时间是?
answer:  {'answer': '1983年', 'start': 39, 'end': 44, 'score': 1.0} 

question:  八爪女的出品日期是?
answer:  {'answer': '1983年', 'start': 39, 'end': 44, 'score': 1.0} 

question:  第一批八爪女出品时间?
answer:  {'answer': '1983年', 'start': 39, 'end': 

## 测试基于知识图谱自带标签筛选的数据

In [None]:
!pip install transformers
!pip install question_generation
!pip install torch

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("algolet/mt5-base-chinese-qg")

model = AutoModelForSeq2SeqLM.from_pretrained("algolet/mt5-base-chinese-qg")

from question_generation import pipeline

# gpu版本
qg = pipeline("question-generation", device="cpu") 

qa = pipeline("question-answering") 

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/669 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/263k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

In [None]:
with open('./drive/MyDrive/NLP/NLP_Corpus/kg_base-label.csv', 'r', encoding='utf-8-sig') as csvfile:
    csvReader = csv.DictReader(csvfile)
    for row in csvReader:
        context1 = row['实体']
        context2 = row['属性']
        context3 = row['值']
        full_context = context1 + '的'+ context2 +  '是：'+  context3
        with open('./kg-base-label.txt', 'a') as test_file:
            test_file.write(full_context)
            print('full_context: ', full_context)
            questions = qg(full_context)
            for q in questions:
                test_file.write(q)
                print('question: ', q)
                answer = qa({
                    'question': str(q),
                    'context': str(row)
                })
                test_file.write(str(answer))
                test_file.write('\n')
                print('answer: ', answer, '\n')


full_context:  电解质的描述是：电解质是溶于水溶液中或在熔融状态下就能够导电的化合物。
question:  电解质溶于水溶液中或在熔融状态下会产生什么?
answer:  {'answer': '导电', 'start': 84, 'end': 86, 'score': 0.9999998807907104} 

question:  什么类型的电解质会产生电解质?
answer:  {} 

question:  在溶融状态下,电解质能导电什么?
answer:  {'answer': '水溶液', 'start': 70, 'end': 73, 'score': 0.9999912977218628} 

full_context:  电解质的中文名是：电解质
question:  电解质的中文名是什么?
answer:  {'answer': '电解质', 'start': 35, 'end': 38, 'score': 0.9999974966049194} 

question:  在电解质中,电解质是什么意思?
answer:  {} 

question:  什么是电解质?
answer:  {} 

full_context:  电解质的外文名是：Electrolyte
question:  电解质的外文名是什么?
answer:  {'answer': '电解质', 'start': 35, 'end': 38, 'score': 0.9999980926513672} 

question:  什么外文名是电解质?
answer:  {'answer': "OrderedDict([('实体序号', ''), ('实体', '电解质", 'start': 0, 'end': 38, 'score': 0.9995280504226685} 

question:  电解体的外文名字是什么?
answer:  {'answer': '电解质', 'start': 35, 'end': 38, 'score': 1.0} 

full_context:  电解质的属性是：化合物
question:  电解质属性的属性是什么?
answer:  {} 

question:  什么属性是电解质的属性?
answer:  {} 

que

## 问题生成的json数据格式设计的测试

In [8]:
!pip install transformers
!pip install question_generation
!pip install torch

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 31.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.0 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 63.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 71.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 62.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existin



In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("algolet/mt5-base-chinese-qg")

model = AutoModelForSeq2SeqLM.from_pretrained("algolet/mt5-base-chinese-qg")

from question_generation import pipeline

# gpu版本
qg = pipeline("question-generation", device="cpu") 

qa = pipeline("question-answering") 

Downloading:   0%|          | 0.00/428 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/7.93M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/731 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.17G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/669 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/263k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

### 测试新的数据填充格式

In [16]:
import json
import csv

filename = './drive/MyDrive/Database/qa-results/split0-diliweizhi-100.json'


with open(filename, 'r') as file:
    test_file = json.load(file)

# test_file.append(x)
# test_file[0]['qa_bank'].append(y)

with open('./drive/MyDrive/Database/split0-diliweizhi.csv', 'r', encoding='utf-8-sig') as csvfile:
    csvReader = csv.DictReader(csvfile)
    i = 0
    for row in csvReader:
        y = {
            '_id':'',
            'text':'',
            'qa_pair':[]}
        i = i + 1
        if i < 101:
            context1 = row['实体']
            context2 = row['属性']
            context3 = row['值']
            full_context = context1 + '的'+ context2 +  '是：'+  context3
            with open(filename, 'w') as file:
                y['_id'] = i
                questions = qg(full_context)
                for q in questions:
                    z = {
                        'q':'',
                         'a':''
                    }
                    y['text'] = full_context
                    answer = qa({
                        'question': str(q),
                        'context': str(row)
                    })
                    print(answer)
                    if answer == {}:
                        pass
                    else:
                        z['q'] = q
                        z['a'] = answer['answer']
                        y['qa_pair'].append(z)
                test_file.append(y)
                json.dump(test_file, file, ensure_ascii=False)
        else: 
            break

{'answer': '浙江省温州市雁荡山', 'start': 63, 'end': 72, 'score': 1.0}
{'answer': '浙江省温州市雁荡山', 'start': 63, 'end': 72, 'score': 1.0}
{'answer': '雁荡山', 'start': 69, 'end': 72, 'score': 1.0}
{'answer': '云步桥', 'start': 64, 'end': 67, 'score': 1.0}
{'answer': '云步桥', 'start': 64, 'end': 67, 'score': 1.0}
{'answer': '云步桥', 'start': 64, 'end': 67, 'score': 0.9999997615814209}
{'answer': '浙江省温州乐清', 'start': 62, 'end': 69, 'score': 1.0}
{'answer': '浙江省温州乐清', 'start': 62, 'end': 69, 'score': 1.0}
{'answer': '浙江省温州乐清', 'start': 62, 'end': 69, 'score': 1.0}
{'answer': '浙江', 'start': 63, 'end': 65, 'score': 1.0}
{'answer': '浙江省温州', 'start': 63, 'end': 68, 'score': 1.0}
{'answer': '浙江', 'start': 63, 'end': 65, 'score': 1.0}
{'answer': '香港', 'start': 38, 'end': 40, 'score': 1.0}
{}
{'answer': '新界', 'start': 35, 'end': 37, 'score': 1.0}
{'answer': '鄂西北', 'start': 67, 'end': 70, 'score': 1.0}
{}
{'answer': '鄂西', 'start': 67, 'end': 69, 'score': 1.0}
{'answer': '内蒙古中西部', 'start': 64, 'end': 70, 'score': 1.0}
{'a

In [22]:
import re

test_string = 'miahsusd-200'
new_domain = re.match('[(a-zA-Z)]+', test_string).group(0)
print(new_domain)


miahsusd


In [None]:
answer = {}
if answer == {}:
    print('empty')
else:
    print(answer)

empty


In [None]:
answer = { "answer": "水溶液",
        "start": 70,
        "end": 73,
        "score": 0.9999912977218628}

print(answer['answer'])

水溶液


In [None]:
import json
import csv

filename = 'test8.json'

x = {
    'domain_label':'',
     'qa_bank': [
                 {
                  'id':'',
                  'text':'',
                  'qa_pair':[
                             {

                             }
                  ]
                 }
     ]
}

with open(filename, 'r') as file:
    test_file = json.load(file)

test_file.append(x)

with open('./drive/MyDrive/NLP/NLP_Corpus/small-kg_base-label.csv', 'r', encoding='utf-8-sig') as csvfile:
    csvReader = csv.DictReader(csvfile)
    i = 0
    for row in csvReader:
        i = i + 1
        context1 = row['实体']
        context2 = row['属性']
        context3 = row['值']
        full_context = context1 + '的'+ context2 +  '是：'+  context3
        with open('./test8.json', 'w') as file:
            test_file[0]['domain_label'] = context2
            test_file[0]['qa_bank'][0]['id'] = i
            questions = qg(full_context)
            for q in questions:
                test_file[0]['qa_bank'][0]['text'] = full_context
                answer = qa({
                    'question': str(q),
                    'context': str(row)
                })
                test_file[0]['qa_bank'][0]['qa_pair'][0][q] = answer
            json.dump(test_file, file, ensure_ascii=False)

In [None]:
with open('test7.json', encoding='utf-8') as fh:
    data = json.load(fh)

print(data)

[{'domain_label': '别称', 'qa_bank': [{'id': 9, 'text': '碳酸氢钠的别称是：小苏打，重碳酸钠，酸式碳酸钠，重曹，Baking Soda', 'qa_pair': [{'电解质溶于水溶液中或在熔融状态下会产生什么?': {'answer': '导电', 'start': 84, 'end': 86, 'score': 0.9999998807907104}, '什么类型的电解质会产生电解质?': {}, '在溶融状态下,电解质能导电什么?': {'answer': '水溶液', 'start': 70, 'end': 73, 'score': 0.9999912977218628}, '电解质的中文名是什么?': {'answer': '电解质', 'start': 35, 'end': 38, 'score': 0.9999974966049194}, '在电解质中,电解质是什么意思?': {}, '什么是电解质?': {}, '电解质的外文名是什么?': {'answer': '电解质', 'start': 35, 'end': 38, 'score': 0.9999980926513672}, '什么外文名是电解质?': {'answer': "OrderedDict([('实体序号', ''), ('实体', '电解质", 'start': 0, 'end': 38, 'score': 0.9995280504226685}, '电解体的外文名字是什么?': {'answer': '电解质', 'start': 35, 'end': 38, 'score': 1.0}, '电解质属性的属性是什么?': {}, '什么属性是电解质的属性?': {}, '化合物的属性是什么': {}, '弱电解质的分类': {}, '强电解质是什么': {'answer': '强电解质、弱电解质', 'start': 63, 'end': 72, 'score': 0.9947358965873718}, '弱的电解质是啥': {}, '弱 电解质叫什么': {}, '碳酸氢钠的化学式是什么?': {'answer': 'OrderedDict', 'start': 0, 'end': 11, 'score': 0.999999

In [None]:
filename = 'test4.json'

x = {
    'domain_label':'',
     'qa_bank': [
                 {
                  'id':'',
                  'text':'',
                  'qa_pair':[
                             {

                             }
                  ]
                 }
     ]
}

with open(filename, 'r') as file:
    test_file = json.load(file)

test_file.append(x)
test_file[0]['domain_label'] = '自然科学'
test_file[0]['qa_bank'][0]['id'] = 1
test_file[0]['qa_bank'][0]['text'] = '为什么会打雷下雨，不知道的奥秘万万千千。'
test_file[0]['qa_bank'][0]['qa_pair'][0]['q1'] = 'a1'

with open(filename, 'w') as file:
    json.dump(test_file, file)

## 测试JSON append数据

In [None]:
import json

filename = 'test.json'
data_append = {'name':'yang'}

with open(filename, 'r') as file:
    data = json.load(file)

data.append(data_append)

with open(filename, 'w') as file:
    json.dump(data, file)

In [None]:
data2_append = {'hometown': 'lianyungang'}
with open(filename, 'r+') as file:
    data = json.load(file)
    data.append(data2_append)
    file.seek(0)
    json.dump(data, file)

In [None]:
import json

filename = 'test9.json' 

lst = [{'alice': 24, 'bob': 27}]

# Write the initial json object (list of dicts)
with open(filename, mode='w') as file:
    json.dump(lst, file)

# Append the new dict to the list and overwrite whole file
with open(filename, mode='w') as f:
    lst.append({'carl':33})
    json.dump(lst, f)

In [None]:
filename = 'test9.json'

x = {
    "qa_bank":{

    }
}

with open(filename, 'r') as file:
    data = json.load(file)

data.append(x)

with open(filename, 'w') as file:
    json.dump(data, file, ensure_ascii=False)

In [None]:
data[0]["q2"] = 'what is your name?'


with open(filename, 'w') as file:
    json.dump(data, file)

In [None]:
print(data)

[{'qa_bank': {}, 'q1': 'how old are you?', 'q2': 'what is your name?'}]


In [None]:
data[0]['qa_bank']= {'q1':'test1'}

In [None]:
print(data[0]['qa_bank'])

test1


In [None]:
with open(filename, 'w') as file:
    json.dump(data, file)

In [None]:
data[0]['qa_bank']['q2'] = 'test'

In [None]:
with open(filename, 'w') as file:
    json.dump(data, file)

### 设计问题集的JSON文件格式


In [None]:
import json
filename = 'test1.json'

x = {
    'domain_label':'',
     'qa_bank': []
}

y = {
    'id':'',
    'text':'',
    'qa_pair':[
                {
                    'q':'',
                    'a':''
                }
    ]
}


with open(filename, 'r') as file:
    data = json.load(file)

data.append(x)
data[0]['qa_bank'].append(y)

print(data)
data[0]['domain_label'] = '自然科学'
data[0]['qa_bank'][0]['id'] = 1
data[0]['qa_bank'][0]['text'] = '为什么会打雷下雨，不知道的奥秘万万千千。'
data[0]['qa_bank'][0]['qa_pair'][0]['q1'] = 'a1'

with open(filename, 'w') as file:
    json.dump(data, file, ensure_ascii=False )

[{'domain_label': '', 'qa_bank': [{'id': '', 'text': '', 'qa_pair': [{}]}]}]


In [6]:
y = {
    'id':'',
    'text':'',
    'qa_pair':[]
}

z = {
    'q':'',
     'a':''
}

y['id'] = 2
y['text'] = '天天都会有好天气是不可能的。'
z['q'] = '今天天气如何？'
z['a'] = '今天天气不好，在下雨'
y['qa_pair'].append(z)
# data[0]['qa_bank'].append(y)
print(y)

{'id': 2, 'text': '天天都会有好天气是不可能的。', 'qa_pair': [{'q': '今天天气如何？', 'a': '今天天气不好，在下雨'}]}


In [7]:
z = {
    'q':'',
     'a':''
}
z['q'] = '你今天身体还好吗？'
z['a'] = '我今天好像一点头晕。'
y['qa_pair'].append(z)

print(y)

{'id': 2, 'text': '天天都会有好天气是不可能的。', 'qa_pair': [{'q': '今天天气如何？', 'a': '今天天气不好，在下雨'}, {'q': '你今天身体还好吗？', 'a': '我今天好像一点头晕。'}]}


In [None]:
y = {
    'id':'',
    'text':'',
    'qa_pair':[
                {

                }
    ]}
data[0]['qa_bank'].append(y)
print(data)

[{'domain_label': '自然科学', 'qa_bank': [{'id': 1, 'text': '为什么会打雷下雨，不知道的奥秘万万千千。', 'qa_pair': [{'q1': 'a1'}]}, {'id': 1, 'text': '为什么会打雷下雨，不知道的奥秘万万千千。', 'qa_pair': [{'q1': 'a1'}]}, {'id': '', 'text': '', 'qa_pair': [{}]}]}]


In [None]:

data[0]['qa_bank'][1]['id'] = 2
data[0]['qa_bank'][1]['text'] = '天天都会有好天气是不可能的。'
data[0]['qa_bank'][1]['qa_pair'][0]['q2'] = 'a2'

with open(filename, 'w') as file:
    json.dump(data, file, ensure_ascii=False )

### 测试csv文件读取及不同column拼接

In [None]:
import csv

with open('./drive/MyDrive/NLP/NLP_Corpus/kg_base-label.csv', 'r', encoding='utf-8-sig') as csvfile:
    csvReader = csv.DictReader(csvfile)
    for row in csvReader:
        context1 = row['实体']
        context2 = row['属性']
        context3 = row['值']
        full_context = context1 + '、'+ context2 +  '、'+  context3
        with open('./test2.text', 'a') as test_file:
            test_file.write(full_context)
            test_file.write('\n')

## 尝试分割大的csv文件

In [None]:
import pandas as pd
import numpy as np

chunksize = 50000
i=0
df = pd.read_csv("./drive/MyDrive/Database/ownthink_v2.csv")

In [None]:
chunksize = 50000
i=0

file_num = len(df) // chunksize
print(file_num)

for i in range(10, 100, 1):
    df_file = df[i*chunksize:i*chunksize+49999]
    df_file.to_csv(f'./drive/MyDrive/Database/split/split-ownthink-{i}.csv', encoding='utf-8-sig')

In [None]:
print(df.head())

     实体   属性                                      值
0    胶饴   描述                        别名: 饴糖、畅糖、畅、软糖。
1    词条   描述  词条（拼音：cí tiáo）也叫词目，是辞书学用语，指收列的词语及其释文。
2    词条   标签                                     文化
3  红色食品   描述                红色食品是指食品为红色、橙红色或棕红色的食品。
4  红色食品  中文名                                   红色食品


## 使用ownthink知识图谱筛选的数据进行问题生成测试

- 属性值：描述
    - 3650条
- 属性值：地理位置
    - 213条

In [None]:
!pip install transformers
!pip install question_generation
!pip install torch

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Yangdf/mt5-base-chinese-qg")

model = AutoModelForSeq2SeqLM.from_pretrained("Yangdf/mt5-base-chinese-qg")

from question_generation import pipeline

# gpu版本
qg = pipeline("question-generation", device="cpu") 

qa = pipeline("question-answering") 

Downloading:   0%|          | 0.00/420 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/15.6M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/714 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.17G [00:00<?, ?B/s]

In [None]:
import csv

with open('./drive/MyDrive/Database/split0-miaoshu.csv', 'r', encoding='utf-8-sig') as csvfile:
    csvReader = csv.DictReader(csvfile)
    for row in csvReader:
        context1 = row['实体']
        context2 = row['属性']
        context3 = row['值']
        full_context = context1 + '的'+ context2 +  '是：'+  context3
        with open('./split0-miaoshu.txt', 'a') as test_file:
            test_file.write(full_context)
            test_file.write("\n")
            print('full_context: ', full_context)
            questions = qg(full_context)
            for q in questions:
                test_file.write(q)
                print('question: ', q)
                answer = qa({
                    'question': str(q),
                    'context': str(row)
                })
                test_file.write(str(answer))
                test_file.write("\n")
                print('answer: ', answer, '\n')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
answer:  {'answer': '陕西', 'start': 77, 'end': 79, 'score': 0.999998927116394} 

question:  吴宓是哪个县人?
answer:  {'answer': '泾阳县', 'start': 80, 'end': 83, 'score': 1.0} 

question:  谁是吴宓的父亲?
answer:  {} 

full_context:  按揭[商业术语]的描述是：“按揭”一词是英文“Mortgage”的粤语音译，指以房地产等实物资产或有价证券、契约等作抵押，获得银行贷款并依合同分期付清本息，贷款还清后银行归还抵押物。
question:  按揭指的是什么?
answer:  {'answer': '房地产', 'start': 96, 'end': 99, 'score': 1.0} 

question:  按揭是什么意思?
answer:  {'answer': '贷款', 'start': 121, 'end': 123, 'score': 0.9999998807907104} 

question:  贷款还清后银行归还什么?
answer:  {} 

full_context:  命题的描述是：在现代哲学、数学、逻辑学、语言学中，命题是指一个判断（陈述）的语义（实际表达的概念），这个概念是可以被定义并观察的现象。
question:  在现代哲学、数学、逻辑学、语言学和语言学中,命题是什么?
answer:  {} 

question:  什么是命题的定义?
answer:  {} 

question:  什么类型的概念可以被定义并观察?
answer:  {'answer': '命题', 'start': 36, 'end': 38, 'score': 1.0} 

full_context:  极光[自然现象]的描述是：极光（Aurora），是一种绚丽多彩的发光现象，其发生是由于太阳带电粒子流（太阳风）进入地球磁场，在地球南北两极附近地区的高空，夜间出现的灿烂美丽的光辉。
question:  极光是什么类型的发光现象?
an

KeyboardInterrupt: ignored

In [None]:
!transformers-cli login


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        
Username: yangdafu123@gmail.com
Password: 
ERROR:root:HfApi.login: This method is deprecated in favor of `set_access_token`.
Login successful
Your token: WkTlJJhLZYliTNwiVhyaQnJKDxHxCmBSlYAeTspLMFTaQoZcZQxNmtZAuiIBYVRyTtmzsTiYngWnRoxqYrCdBJvaEpIcqXviIykrcVjorMEzcdhbVBitmeKFupDUcWmR 

Your token has been saved to /root/.huggingface/token


In [None]:
!transformers-cli upload ./path/to/pretrained_model/

usage: transformers-cli <command> [<args>]
Transformers CLI tool: error: invalid choice: 'upload' (choose from 'convert', 'download', 'env', 'run', 'serve', 'login', 'whoami', 'logout', 'repo', 'add-new-model', 'add-new-model-like', 'lfs-enable-largefiles', 'lfs-multipart-upload')


In [None]:
!ls -a

.   .config  .ipynb_checkpoints  split0-diliweizhi-qa.txt
..  drive    sample_data


In [None]:
from transformers import AutoModel

In [None]:
!apt-get install git-lfs=2.13.3

Reading package lists... Done
Building dependency tree       
Reading state information... Done
E: Version '2.13.3' for 'git-lfs' was not found


**Clone repository. First install git lfs. Pulling all checkpoints may take a while**

In [None]:
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
!sudo apt-get install git-lfs
!git lfs install

Detected operating system as Ubuntu/bionic.
Checking for curl...
Detected curl...
Checking for gpg...
Detected gpg...
Running apt-get update... done.
Installing apt-transport-https... done.
Installing /etc/apt/sources.list.d/github_git-lfs.list...done.
Importing packagecloud gpg key... done.
Running apt-get update... done.

The repository is setup! You can now install packages.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 40 not upgraded.
Need to get 6,800 kB of archives.
After this operation, 15.3 MB of additional disk space will be used.
Get:1 https://packagecloud.io/github/git-lfs/ubuntu bionic/main amd64 git-lfs amd64 3.1.2 [6,800 kB]
Fetched 6,800 kB in 1s (5,646 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/per

In [None]:
model.push_to_hub("mt5-base-chinese-qg", use_temp_dir=True)

Cloning https://huggingface.co/Yangdf/mt5-base-chinese-qg into local empty directory.


Upload file pytorch_model.bin:   0%|          | 32.0k/2.17G [00:00<?, ?B/s]

To https://huggingface.co/Yangdf/mt5-base-chinese-qg
   435354c..5e92dab  main -> main



'https://huggingface.co/Yangdf/mt5-base-chinese-qg/commit/5e92dabb48dfa5caff54f550d6cb2ff968cd2491'

In [None]:
tokenizer.push_to_hub("mt5-base-chinese-qg", use_temp_dir=True)

Cloning https://huggingface.co/Yangdf/mt5-base-chinese-qg into local empty directory.


Download file pytorch_model.bin:   0%|          | 8.48k/2.17G [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/2.17G [00:00<?, ?B/s]

Adding files tracked by Git LFS: ['tokenizer.json']. This may take a bit of time if the files are large.


Upload file tokenizer.json:   0%|          | 32.0k/15.6M [00:00<?, ?B/s]

To https://huggingface.co/Yangdf/mt5-base-chinese-qg
   5e92dab..cd9134e  main -> main



'https://huggingface.co/Yangdf/mt5-base-chinese-qg/commit/cd9134e579d36a1c38d8671e84769442ee2a26ee'