In [1]:
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import operator

def cleanData(name):    #句子切分
    setlast = jieba.cut(name, cut_all=False)
    seg_list = [i.lower() for i in setlast]
    return " ".join(seg_list)


def calculateSimilarity(sentence, doc):  # 根据句子和句子，句子和文档的余弦相似度
    if doc == []:
        return 0
    vocab = {}
    for word in sentence.split():
        vocab[word] = 0  # 生成所在句子的单词字典，值为0

    docInOneSentence = '';
    for t in doc:
        docInOneSentence += (t + ' ')  # 所有剩余句子合并
        for word in t.split():
            vocab[word] = 0  # 所有剩余句子的单词字典，值为0

    cv = CountVectorizer(vocabulary=vocab.keys())

    docVector = cv.fit_transform([docInOneSentence])
    sentenceVector = cv.fit_transform([sentence])
    return cosine_similarity(docVector, sentenceVector)[0][0]


def TextRank_sent(sentence):
  texts = [sentence]  # 读行
  texts = [i[:-1] if i[-1] == '\n' else i for i in texts]

  sentences = []
  clean = []
  originalSentenceOf = {}

  # Data cleansing
  for line in texts:
      parts = line.split('。')[:-1]  # 句子拆分
      #   print (parts)
      for part in parts:
          cl = cleanData(part)  # 句子切分
          #       print (cl)
          sentences.append(part)  # 原本的句子
          clean.append(cl)  # 干净有重复的句子
          originalSentenceOf[cl] = part  # 字典格式
  setClean = set(clean)  # 干净无重复的句子

  # calculate Similarity score each sentence with whole documents
  scores = {}
  for data in clean:
      temp_doc = setClean - set([data])  # 在除了当前句子的剩余所有句子
      score = calculateSimilarity(data, list(temp_doc))  # 计算当前句子与剩余所有句子的相似度
      scores[data] = score  # 得到相似度的列表
      # print score

  # calculate MMR
  n = 10 * len(sentences) / 100  # 摘要的比例大小
  alpha = 0.7
  summarySet = []
  while n > 0:
      mmr = {}
      # kurangkan dengan set summary
      for sentence in scores.keys():
          if not sentence in summarySet:
              mmr[sentence] = alpha * scores[sentence] - (1 - alpha) * calculateSimilarity(sentence,
                                                                                          summarySet)  # 公式
      selected = max(mmr.items(), key=operator.itemgetter(1))[0]
      summarySet.append(selected)
      #   print (summarySet)
      n -= 1


  summary = ""
  summary+"sasa"
  for sentence in summarySet:
    summary = summary + originalSentenceOf[sentence].lstrip('')
  return summary

In [2]:
! pip install datasets

Collecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/86/27/9c91ddee87b06d2de12f134c5171a49890427e398389f07f6463485723c3/datasets-1.9.0-py3-none-any.whl (262kB)
[K     |█▎                              | 10kB 14.8MB/s eta 0:00:01[K     |██▌                             | 20kB 21.2MB/s eta 0:00:01[K     |███▊                            | 30kB 24.6MB/s eta 0:00:01[K     |█████                           | 40kB 26.8MB/s eta 0:00:01[K     |██████▎                         | 51kB 27.8MB/s eta 0:00:01[K     |███████▌                        | 61kB 29.8MB/s eta 0:00:01[K     |████████▊                       | 71kB 24.9MB/s eta 0:00:01[K     |██████████                      | 81kB 26.0MB/s eta 0:00:01[K     |███████████▎                    | 92kB 26.5MB/s eta 0:00:01[K     |████████████▌                   | 102kB 27.0MB/s eta 0:00:01[K     |█████████████▊                  | 112kB 27.0MB/s eta 0:00:01[K     |███████████████                 | 122kB 27

In [4]:
from datasets import load_dataset
dataset = load_dataset('json', data_files='/content/drive/MyDrive/Summarization/nlpcc_data.json', field='data')
def flatten(example):
    return {
        "document": example["content"],
        "summary": example["title"],
        "id":"0"
    }
dataset = dataset["train"].map(flatten, remove_columns=["title", "content"]) # , remove_columns=["title", "content"]
from datasets import dataset_dict
import datasets

train_data_txt, validation_data_txt = dataset.train_test_split(test_size=0.1).values()
train_data_txt, test_data_tex = train_data_txt.train_test_split(test_size=0.1).values()
# 装载数据
dd = datasets.DatasetDict({"train":train_data_txt,"validation": validation_data_txt,"test":test_data_tex }) 

raw_datasets = dd



Using custom data configuration default-de599cbfef392bb5


Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-de599cbfef392bb5/0.0.0/793d004298099bd3c4e61eb7878475bcf1dc212bf2e34437d85126758720d7f9...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-de599cbfef392bb5/0.0.0/793d004298099bd3c4e61eb7878475bcf1dc212bf2e34437d85126758720d7f9. Subsequent calls will reuse this data.


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))




In [5]:
pred_list = []
for text in raw_datasets["test"]["document"][0:100]:
    pred_list.append(TextRank_sent(text))
gold_list = list(raw_datasets["test"]["summary"][0:100])

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.028 seconds.
Prefix dict has been built successfully.


In [7]:
! pip install lawrouge

Collecting lawrouge
  Downloading https://files.pythonhosted.org/packages/cc/9c/cc411fd95b5fdf1924b2336f33d1eb304b65c029d77666315d4d8ff8ba0b/lawrouge-2.0.0.tar.gz
Building wheels for collected packages: lawrouge
  Building wheel for lawrouge (setup.py) ... [?25l[?25hdone
  Created wheel for lawrouge: filename=lawrouge-2.0.0-cp37-none-any.whl size=9291 sha256=a6c6d9a90746b1b66dc2b81e66d5f69fffc0e1f86d193c827c80d96f72e769e5
  Stored in directory: /root/.cache/pip/wheels/a5/b9/f5/1cc0dcd988dce81a890faa06afcfa03f69bdac0bc847dd6197
Successfully built lawrouge
Installing collected packages: lawrouge
Successfully installed lawrouge-2.0.0


In [11]:
import lawrouge
rouge = lawrouge.Rouge()
score_s = 0
for i in range(len(gold_list)):
    score = rouge.get_scores([pred_list[i]], [gold_list[i]], avg=0)
    score_s += score[0]["rouge-1"]["r"]
score_ave = score_s/len(gold_list)
print('weighted score: ', score_ave)

weighted score:  0.524947547381317
