<a href="https://colab.research.google.com/github/d61h6k4/notebooks/blob/master/TelegramsContest/DataClustering/Preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Environment setup: upgrade pip itself, then install the two third-party
# packages this notebook imports (TensorFlow and python-dateutil).
# NOTE(review): none of these installs is version-pinned, so a fresh run may
# resolve different releases than the ones that produced the saved outputs --
# consider pinning once the intended versions are confirmed.
!pip install --upgrade pip
!pip install tensorflow-gpu
!pip install python-dateutil

Requirement already up-to-date: pip in /usr/local/lib/python3.6/dist-packages (19.3.1)


In [0]:
import datetime
import html
import html.parser
import pathlib
import pprint

import dateutil.parser
import tensorflow as tf

In [4]:
# Download the sample archive (cached by Keras) and unpack it next to the
# downloaded file.
SAMPLE_ARCHIVE_URL = "https://zenodo.org/record/3544758/files/DataClusteringSample0107.tar.gz?download=1"
data_clustering_sample_archive = tf.keras.utils.get_file(
    "data_clustering_sample.tar.gz",
    SAMPLE_ARCHIVE_URL,
    extract=True,
    archive_format='tar',
)

# The HTML pages are matched three levels below the directory that holds the
# downloaded archive.
sample_html_glob = pathlib.Path(data_clustering_sample_archive).parent / '*/*/*.html'
data_clustering_sample = tf.data.Dataset.list_files(str(sample_html_glob))

Downloading data from https://zenodo.org/record/3544758/files/DataClusteringSample0107.tar.gz?download=1


In [0]:
def read_file(file_path):
    """Return the entire contents of the file at ``file_path``.

    Thin wrapper around ``tf.io.read_file`` so the read step can be passed
    by name to ``Dataset.map``.
    """
    file_contents = tf.io.read_file(file_path)
    return file_contents

In [0]:
class SampleParser(html.parser.HTMLParser):
    """Extract article metadata and body text from a single HTML document.

    Feed the document with ``feed()`` (followed by ``close()``) and read the
    results from the instance attributes:

    * ``url``, ``site_name``, ``title``, ``description`` -- values of the
      matching ``og:*`` ``<meta>`` tags. Each keeps its ``NO_*`` placeholder
      when the tag is absent, and becomes ``"NO_CONTENT"`` when the tag is
      present but carries no usable ``content`` value.
    * ``published_timestamp`` -- ``article:published_time`` as integer epoch
      seconds; stays ``0`` when the tag is absent or cannot be parsed.
    * ``header`` -- concatenated text found inside ``<h1>``/``<h2>``.
    * ``text`` -- list of text chunks found inside ``<p>``.
    * ``author`` -- most recent text chunk of an ``<a>`` inside ``<address>``.
    * ``not_text`` -- stripped text chunks that sit inside ``<article>`` but
      in none of the above.

    State accumulates across ``feed()`` calls, so use one instance per
    document.
    """

    # Tags whose "currently open" state is tracked, mapped to the boolean
    # attribute that records it. These are flags, not depth counters: the
    # first closing tag clears the flag even if the elements were nested.
    _TAG_FLAGS = {
        "article": "article_in",
        "h1": "header_in",
        "h2": "header_in",
        "address": "address_in",
        "p": "paragraph_in",
    }

    # <meta property=...> values copied verbatim into the named attribute.
    _META_FIELDS = {
        "og:url": "url",
        "og:site_name": "site_name",
        "og:title": "title",
        "og:description": "description",
    }

    def __init__(self):
        super().__init__()

        # Extracted fields, pre-filled with sentinel defaults.
        self.url = "NO_URL"
        self.site_name = "NO_SITE_NAME"
        self.published_timestamp = 0
        self.title = "NO_TITLE"
        self.description = "NO_DESCRIPTION"
        self.header = ""
        self.text = []
        self.author = "UNK_AUTHOR"
        self.not_text = []

        # "Inside this element" flags driven by the start/end tag handlers.
        self.header_in = False
        self.address_in = False
        self.paragraph_in = False
        self.author_in = False
        self.article_in = False

    def handle_starttag(self, tag, attrs):
        """Record metadata from ``<meta>`` tags and raise the matching flag."""
        if tag == "meta":
            self.parse_meta(attrs)
        elif tag in self._TAG_FLAGS:
            setattr(self, self._TAG_FLAGS[tag], True)
        elif tag == "a" and self.address_in:
            # Only links inside <address> are treated as the author name.
            self.author_in = True

    def handle_endtag(self, tag):
        """Lower the flag that belongs to the closing tag, if any."""
        if tag in self._TAG_FLAGS:
            setattr(self, self._TAG_FLAGS[tag], False)
        elif tag == "a" and self.author_in:
            self.author_in = False

    def handle_data(self, data):
        """Route a text chunk to exactly one field, by flag priority.

        Priority is header > paragraph > author > remaining article text.
        """
        if self.header_in:
            self.header += data
        elif self.paragraph_in:
            self.text.append(data)
        elif self.author_in:
            self.author = data
        elif self.article_in:
            self.not_text.append(data.strip())

    def parse_meta(self, attrs):
        """Store the ``content`` of a recognised ``<meta property=...>`` tag.

        Args:
            attrs: list of ``(name, value)`` pairs as passed to
                ``handle_starttag``; ``value`` is ``None`` for attributes
                written without a value.
        """
        tag_property = "NO_PROPERTY"
        tag_content = "NO_CONTENT"
        for name, value in attrs:
            if value is None:
                # Valueless attribute (e.g. ``<meta property=... content>``):
                # treat it as missing so the fields always stay strings.
                continue
            if name == "property":
                tag_property = value
            elif name == "content":
                tag_content = value

        if tag_property == "article:published_time":
            try:
                # NOTE(review): a date without timezone information is
                # interpreted in the machine's local timezone by timestamp().
                self.published_timestamp = int(dateutil.parser.parse(tag_content).timestamp())
            except (ValueError, OverflowError):
                # Missing or malformed date: keep the current value (0 by
                # default) instead of aborting the parse of the whole document.
                pass
        elif tag_property in self._META_FIELDS:
            setattr(self, self._META_FIELDS[tag_property], tag_content)


class ParseFile(object):
    """Callable that turns one raw HTML string tensor into parsed fields.

    Intended to be wrapped by ``tf.py_function``. A fresh ``SampleParser`` is
    built for every call: the parser accumulates ``header``/``text``/
    ``not_text`` and keeps the last metadata it saw, so sharing one instance
    would leak content from earlier documents into later ones.
    """

    def __call__(self, raw_html_text):
        """Parse one document.

        Args:
            raw_html_text: eager string tensor holding the UTF-8 encoded HTML.

        Returns:
            Tuple ``(url, site_name, published_timestamp, title, description,
            header, text, author, not_text)`` where ``text`` and ``not_text``
            are the collected chunks joined with single spaces.

        Raises:
            UnicodeDecodeError: if the bytes are not valid UTF-8.
        """
        sample_parser = SampleParser()
        sample_parser.feed(raw_html_text.numpy().decode('utf-8'))
        # Flush text the parser is still buffering at end of input.
        sample_parser.close()
        return (sample_parser.url,
                sample_parser.site_name,
                sample_parser.published_timestamp,
                sample_parser.title,
                sample_parser.description,
                sample_parser.header,
                ' '.join(sample_parser.text),
                sample_parser.author,
                ' '.join(sample_parser.not_text))

In [0]:
def preprocess_raw_html(file_body):
    """Parse a raw HTML string tensor into a dict of feature tensors.

    The Python-side parsing is done by ``ParseFile`` through
    ``tf.py_function``; every field is returned as ``tf.string`` except
    ``published_timestamp``, which is ``tf.uint32``.

    Args:
        file_body: scalar string tensor with the contents of one HTML file.

    Returns:
        Dict with keys ``url``, ``site_name``, ``published_timestamp``,
        ``title``, ``description``, ``header``, ``text``, ``not_text`` and
        ``author``.
    """
    # One dtype per element of the tuple returned by ParseFile.__call__.
    parsed_dtypes = (tf.string, tf.string, tf.uint32, tf.string, tf.string,
                     tf.string, tf.string, tf.string, tf.string)
    parsed = tf.py_function(ParseFile(), inp=[file_body], Tout=parsed_dtypes)
    (url, site_name, published_timestamp, title, description,
     header, text, author, not_text) = parsed

    features = {}
    features["url"] = url
    features["site_name"] = site_name
    features["published_timestamp"] = published_timestamp
    features["title"] = title
    features["description"] = description
    features["header"] = header
    features["text"] = text
    features["not_text"] = not_text
    features["author"] = author
    return features

In [9]:
# Preview: push the first 20 files through the read -> parse pipeline and
# pretty-print each resulting feature dict, separated by blank lines.
parsed_samples = data_clustering_sample.map(read_file).map(preprocess_raw_html)
for sample in parsed_samples.take(20):
    pprint.pprint(sample)
    print('\n\n')

{'author': <tf.Tensor: id=2007, shape=(), dtype=string, numpy=b'UNK_AUTHOR'>,
 'description': <tf.Tensor: id=2008, shape=(), dtype=string, numpy=b'HONG KONG: The Chinese Communist Party said it would "perfect" the system for choosing the leader of Hong Kong after months of street protests\xc2\xa0and ...'>,
 'header': <tf.Tensor: id=2009, shape=(), dtype=string, numpy=b"China says it will 'perfect' system for choosing Hong Kong leader">,
 'not_text': <tf.Tensor: id=2010, shape=(), dtype=string, numpy=b"   05 Nov 2019, 20:23     Advertisement  Advertisement   READ:\xc2\xa0Chinese leadership says it will ensure Hong Kong's stability and prosperity         READ:\xc2\xa0Xi voices 'high degree of trust' in Hong Kong leader over unrest    READ:\xc2\xa0Hong Kong's Carrie Lam to discuss helping people to live and work in mainland China          READ:\xc2\xa0Scores injured, one critical in chaotic weekend of Hong Kong protests                Source: Agencies/ga/nh ">,
 'published_timestamp': <tf