In [1]:
import os
import re
import pandas as pd
from glob import glob

## THUCNews

Topics: 体育  娱乐  家居  彩票  房产  教育  时尚  时政  星座  游戏  社会  科技  股票  财经

### Data Statistics

* Get total 23161 articles and 453149 sentences on topic 科技

### Garbage Cases

#### Author name related

```txt
英国记者用相机拍摄阅兵
　　去年的国庆60周年大阅兵想必大家还记忆犹新，通过电视直播，建国以来几十年的发展变化全都呈现在了观众眼前，据说央视也为了此次直播准备了很久，期间彩排的几次其也是次次到场，而最后的直播画面也被做成集锦，播出了很长时间，但是近期笔者发现了一部由英国记者用数码相机拍摄的国庆阅兵场面。


60家网媒编辑记者广东采访 
　　本报讯(记者/杨大正)昨日,改革开放30周年全国重点网络媒体广东行在广州拉开序幕。来自人民网、新华网、新浪、网易等60多家全国重点网络媒体的百余名网络编辑、记者将对广州、肇庆、佛山、东莞、深圳等5个城市进行为期一周的采访报道。 
```

Author name in the last line

* usually can be remove by the `remove_too_short` > 4, except some of the cases

```txt
(任秋凌)(杨孝文)
```

In [2]:
# data count
for item in os.listdir('data/THUCNews'):
    if os.path.isdir(os.path.join('data/THUCNews', item)):
        print(item, len(os.listdir(os.path.join('data/THUCNews', item))))

科技 162929
时政 63086
家居 32586
社会 50849
股票 154398
时尚 13368
彩票 7588
娱乐 92632
财经 37098
教育 41936
游戏 24373
星座 3578
房产 20050
体育 131604


In [3]:
# THUCNews constants

topics_to_select = [
    {
        'label': 'pos',
        'topic': '科技'
    },
    {
        'label': 'neg',
        'topic': '体育'
    }
]

data_to_split = [
    {
        'type': 'train',
        'amount': None, # the rest
    },
    {
        'type': 'dev',
        'amount': 2000,
#         'amount': 20, # debug
    },
    {
        'type': 'test',
        'amount': 500,
#         'amount': 5, # debug
    }
    
]

In [4]:
def get_word_list(sentence):
    """
    https://blog.csdn.net/weixin_44208569/article/details/90315208
    
    TODO: this will remove punctuation but we want them
    """
    to_split = re.compile('[\\W]*') # string except for what we want
    chinese = re.compile(r'([\u4e00-\u9fa5])') # Chinese characters

    word_list = []
    # TODO: need some debug here
    try:
        for string in to_split.split(sentence):
            if chinese.search(string):
                for char in chinese.split(string):
                    word_list.append(char)
            else:
                word_list.append(string)
    except:
        import ipdb; ipdb.set_trace()

    return [word for word in word_list if len(word.strip()) > 0]  # remove empty string



def single_news_process(text, drop_empty_line=True, strip=True, remove_author=True, remove_dummy_word_title=True, remove_too_short=5):
    """
    * Seperate sentences
    * Seperate paragraph?! (currently we don't do this, if we want to do this maybe we might need to mark the sentence ID of seperation)
    
    reference: https://github.com/blmoistawinde/HarvestText/blob/73c28ab6549d8a16392fca9803823eaa94221100/harvesttext/harvesttext.py#L711
    """
    
    text = re.sub('([。！？\?!])([^”’])', r"\1\n\2", text)
    text = re.sub('(\.{6})([^”’])', r"\1\n\2", text)
    text = re.sub('(\…{2})([^”’])', r"\1\n\2", text)
    text = re.sub('([。！？\?!][”’])([^，。！？\?])', r'\1\n\2', text)
    text = text.rstrip()
    sentences = text.split("\n")
    if strip:
        sentences = [sent.strip() for sent in sentences]
        
    if drop_empty_line:
        to_include = max(remove_too_short, 0)
        sentences = [sent for sent in sentences if len(sent.strip()) > to_include]
        
    if remove_author:
        author_keywords = ['记者 ', '记者：', '作 者', '作者：', '□ ', '策划/', '策划/']
        author_index = min(3, len(sentences)) # only look the first few sentences
        i = 1 # usually author information start from second sentence
        while i < min(3, len(sentences)):
            delete = False
            for keyword in author_keywords:
                if keyword in sentences[i]:
                    del sentences[i]
                    delete = True
                    break
            if not delete:
                i += 1
        
    if remove_dummy_word_title and len(sentences) > 0:
        dummy_word = ['(组图)', '(图)']
        for keyword in dummy_word:
            if sentences[0][-len(keyword):] == keyword:
                sentences[0] = sentences[0][:-len(keyword)]
                

    return sentences

In [5]:
def get_topic_of_THUCNews(topic, base_dir = 'data/THUCNews', process_fn = lambda x: x,
                          filter_garbage = True, drop_too_short = 10, keep_structure=False, seperate_char=True,
                          verbose=False):
    all_articles = []
    
    garbage = re.compile(r'(【ZOL-七天在线|【四川行情】|【3C168 中关村湖南】|【IT168|[参考价格])')
    article_count = 0
    for article_path in glob(os.path.join(base_dir, topic, '*.txt')):
        with open(article_path, 'r') as stream:
            article = stream.read()
        
        if filter_garbage:
            if garbage.search(article):
                continue
        
        article_sents = process_fn(article)
        if drop_too_short > 0 and drop_too_short > len(article_sents):
            continue
        
        if article_sents:
            article_count += 1
            
            if seperate_char:
                # seperate characters
                article_sents = [' '.join(sent) for sent in article_sents]
                # TODO: combine english but keep punctuation
                # article_sents = [' '.join(get_word_list(sentences)) for sent in article_sents]

            if keep_structure:
                # single article single list
                all_articles.append(article_sents)
            else:
                # all sentences in one list
                all_articles.extend(article_sents)
                
        # debug
        # if article_count > 57:
        #     break
        
    if verbose:
        if keep_structure:
            print('Get total', article_count, 'articles and', sum([len(sents) for sents in all_articles]), 'sentences on topic', topic)
        else:
            print('Get total', article_count, 'articles and', len(all_articles), 'sentences on topic', topic)

    return all_articles

In [6]:
# get_topic_of_THUCNews('科技', process_fn=single_news_process, verbose=True)

In [7]:
data = {item['label']: get_topic_of_THUCNews(item['topic'], process_fn=single_news_process, verbose=True) for item in topics_to_select}

Get total 21953 articles and 423614 sentences on topic 科技
Get total 31960 articles and 708573 sentences on topic 体育


In [8]:
# data

In [9]:
# split data
# maybe shuffle the data?!
# import random
# random.seed(87)

splitted_data = {}

for label, sents in data.items():
    for_the_rest_set = None
    total_indices = list(range(len(sents)))
    data_temp = {}
    start_index = 0
    for data_set in data_to_split:
        data_temp[data_set['type']] = []
        if data_set['amount'] is None:
            for_the_rest_set = data_set['type']
        else:
            # sample_indices = random.sample(total_indices, data_set['amount'])
            # for index in sample_indices:
            #     data_temp[data_set['type']].append(sents[index])
            #     total_indices.remove(index)
            data_temp[data_set['type']] = sents[start_index:start_index + data_set['amount']]
            start_index = data_set['amount']
                
    if for_the_rest_set:
        # for index in total_indices:
        #     data_temp[for_the_rest_set].append(sents[index])
        data_temp[for_the_rest_set] = sents[start_index:]
    
    splitted_data[label] = data_temp


In [10]:
for label, sub_data in splitted_data.items():
    print(label)
    for data_set, sents in sub_data.items():
        print(data_set, len(sents))

pos
train 423114
dev 2000
test 500
neg
train 708073
dev 2000
test 500


In [11]:
# Write data

for label, sub_data in splitted_data.items():
    for data_set, sents in sub_data.items():
        with open(os.path.join('data/THUCNews', f'{data_set}.{label}'), 'w') as stream:
            for sent in sents:
                stream.write(sent + '\n')

In [12]:
!cp 'data/THUCNews/test.pos' 'evaluator/THUCNews.refs.1'
!cp 'data/THUCNews/test.neg' 'evaluator/THUCNews.refs.0'

## Preparing Evaluator

* [Preperation for Evaluator for New Dataset - HackMD](https://hackmd.io/NgYXPtOqRCWKHV33L1NofQ?view)


### Fasttext classifier

* [How does FastText classifier work under the hood? | by Amjad Abu-Rmileh | Towards Data Science](https://towardsdatascience.com/fasttext-bag-of-tricks-for-efficient-text-classification-513ba9e302e7#:~:text=FastText%2C%20by%20Facebook%20Research%2C%20is,representations%20of%20words%20and%20sentences.)
* [Text classification · fastText](https://fasttext.cc/docs/en/supervised-tutorial.html)
* [Cheatsheet · fastText](https://fasttext.cc/docs/en/cheatsheet.html)

In [13]:
# Generate training files

with open('data/THUCNews_data_train.txt', 'w') as stream:
    for label, sub_data in splitted_data.items():
        for sent in sub_data['train']:
            stream.write(f'__label__{label} {sent}\n')

In [14]:
# Train classifier

import fasttext


model = fasttext.train_supervised('data/THUCNews_data_train.txt')
model.save_model('evaluator/acc_THUCNews.bin')

In [15]:
# Quick evaluation

with open('data/THUCNews_data_valid.txt', 'w') as stream:
    for label, sub_data in splitted_data.items():
        for sent in sub_data['dev']:
            stream.write(f'__label__{label} {sent}\n')

#(#sample, precision at one, recall at one)
model.test('data/THUCNews_data_valid.txt')

(4000, 0.9415, 0.9415)

### kenlm for perplexity evaluation

> (make sure you have execute `setup.sh` before to get the kenlm executable)

related links:

* [使用KenLM训练n-gram语言模型 （中文）_benbenls的博客-CSDN博客_kenlm 中文](https://blog.csdn.net/benbenls/article/details/102898960)

In [16]:
with open('data/THUCNews_lm_data.txt', 'w') as stream:
    for label, sub_data in splitted_data.items():
        for sent in sub_data['train']:
            stream.write(sent + '\n')

In [17]:
!kenlm/build/bin/lmplz -o 5 <data/THUCNews_lm_data.txt >data/THUCNews.arpa

=== 1/5 Counting and sorting n-grams ===
Reading /tf/t-dawli/TextStyleTransfer/ChineseStyleTransformer/data/THUCNews_lm_data.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 47797171 types 5683
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:68196 2:21103661056 3:39569367040 4:63310983168 5:92328517632
Statistics:
1 5683 D1=0.468584 D2=1.195 D3+=1.47069
2 1230755 D1=0.614144 D2=1.07658 D3+=1.52494
3 8110204 D1=0.737731 D2=1.11805 D3+=1.42726
4 19161533 D1=0.827161 D2=1.16742 D3+=1.4093
5 28392852 D1=0.790367 D2=1.35546 D3+=1.5094
Memory estimate for binary LM:
type      MB
probing 1139 assuming -p 1.5
probing 1303 assuming -r models -p 1.5
trie     491 without quantization
trie     254 assuming -q 8 -b 8 quantization 
trie     433 assuming -a 22 array pointer compression
trie     196 assuming -a 22

In [18]:
!kenlm/build/bin/build_binary data/THUCNews.arpa evaluator/ppl_THUCNews.binary

Reading data/THUCNews.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
SUCCESS
