<a href="https://colab.research.google.com/github/d61h6k4/notebooks/blob/master/TelegramsContest/DataClustering/Preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install -q --upgrade pip
!pip install -q tensorflow-gpu==2.0.0
!pip install -q python-dateutil
!pip install -q tensorflow-text==2.0.0
!pip install -q tensorflow-hub                               
!pip install -q numpy scipy scikit-learn numba
!pip install -q umap-learn
!pip install -q matplotlib
!pip install -q tqdm

In [0]:
import datetime
import dateutil.parser
import html
import html.parser
import pathlib
import pprint
import tqdm
import umap

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_text as tf_text
import tensorflow_hub as tf_hub

In [0]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'fivethirtyeight' style sheet to every figure created from here on.
plt.style.use('fivethirtyeight')

In [0]:
# Fetch the sample archive through Keras' get_file helper and ask it to
# extract the tarball; the returned value is used below as a filesystem path.
data_clustering_sample_archive = tf.keras.utils.get_file(
    fname="data_clustering_sample.tar.gz",
    origin="https://zenodo.org/record/3544758/files/DataClusteringSample0107.tar.gz?download=1",
    extract=True,
    archive_format='tar',
)

# Dataset of file paths: every *.html file two directory levels below the
# folder that holds the downloaded archive.
data_clustering_sample = tf.data.Dataset.list_files(
    str(pathlib.Path(data_clustering_sample_archive).parent / '*/*/*.html')
)

In [0]:
def read_file(file_path):
    """Load the whole content of ``file_path`` with ``tf.io.read_file``.

    Args:
        file_path: Path of the file to read, passed to TensorFlow unchanged.

    Returns:
        Whatever ``tf.io.read_file`` returns for that path.
    """
    file_body = tf.io.read_file(file_path)
    return file_body

In [0]:
class SampleParser(html.parser.HTMLParser):
    """HTML parser that collects article metadata and text from one document.

    After feeding a document, the results are available as attributes:

    * ``url``, ``site_name``, ``title``, ``description`` -- taken from the
      ``og:url``, ``og:site_name``, ``og:title`` and ``og:description``
      ``<meta property=...>`` tags.
    * ``published_timestamp`` -- integer timestamp parsed from the
      ``article:published_time`` meta tag (``0`` when absent or unparseable).
    * ``header`` -- concatenated text found inside ``<h1>``/``<h2>``.
    * ``text`` -- list of text chunks found inside ``<p>``.
    * ``author`` -- text of an ``<a>`` opened inside ``<address>``.
    * ``not_text`` -- stripped text chunks inside ``<article>`` that matched
      none of the above.

    Call :meth:`reset_state` before feeding each new document.
    """

    # Tags whose start/end simply switch an "inside this element" flag on/off.
    _TAG_FLAGS = {
        "article": "article_in",
        "h1": "header_in",
        "h2": "header_in",
        "address": "address_in",
        "p": "paragraph_in",
    }

    # <meta property=...> values whose content is stored verbatim on an attribute.
    _META_FIELDS = {
        "og:url": "url",
        "og:site_name": "site_name",
        "og:title": "title",
        "og:description": "description",
    }

    def __init__(self):
        super().__init__()

        self.reset_state()

    def reset_state(self):
        """Forget everything about the previous document.

        Resets both the extracted fields and the underlying ``HTMLParser``
        buffer, so that unfinished markup at the end of one document (e.g. a
        truncated tag) cannot be glued onto the start of the next one.
        """
        # Clear the inherited parser's pending input and tokenizer state.
        self.reset()

        self.url = "NO_URL"
        self.site_name = "NO_SITE_NAME"
        self.published_timestamp = 0
        self.title = "NO_TITLE"
        self.description = "NO_DESCRIPTION"
        self.header = ""
        self.text = []
        self.author = "UNK_AUTHOR"
        self.not_text = []

        self.header_in = False
        self.address_in = False
        self.paragraph_in = False
        self.author_in = False
        self.article_in = False

    def handle_starttag(self, tag, attrs):
        """Record entry into a tracked element, or parse a ``<meta>`` tag."""
        if tag == "meta":
            self.parse_meta(attrs)
        elif tag in self._TAG_FLAGS:
            setattr(self, self._TAG_FLAGS[tag], True)
        elif tag == "a" and self.address_in:
            # Only links nested in <address> are treated as the author name.
            self.author_in = True

    def handle_endtag(self, tag):
        """Record leaving a tracked element."""
        if tag in self._TAG_FLAGS:
            setattr(self, self._TAG_FLAGS[tag], False)
        elif tag == "a":
            self.author_in = False

    def handle_data(self, data):
        """Route a text chunk to exactly one field.

        Precedence when several flags are set: header, paragraph, author,
        then the generic in-article bucket.
        """
        if self.header_in:
            self.header += data
        elif self.paragraph_in:
            self.text.append(data)
        elif self.author_in:
            self.author = data
        elif self.article_in:
            self.not_text.append(data.strip())

    def parse_meta(self, attrs):
        """Extract a metadata field from the attributes of a ``<meta>`` tag.

        Args:
            attrs: List of ``(name, value)`` pairs as supplied by
                ``HTMLParser``; ``value`` is ``None`` for valueless attributes.
        """
        tag_property = "NO_PROPERTY"
        tag_content = "NO_CONTENT"
        for name, value in attrs:
            if value is None:
                # A valueless attribute (e.g. `<meta content>`) carries no
                # usable string; treat it like a missing attribute so no
                # field ends up holding None.
                continue
            if name == "property":
                tag_property = value
            elif name == "content":
                tag_content = value

        if tag_property == "article:published_time":
            try:
                self.published_timestamp = int(dateutil.parser.parse(tag_content).timestamp())
            except (ValueError, OverflowError):
                # Missing or malformed date: keep the current value (0 by
                # default) instead of aborting the parse of the document.
                pass
        elif tag_property in self._META_FIELDS:
            setattr(self, self._META_FIELDS[tag_property], tag_content)


class ParseFile(object):
    """Callable that parses the raw bytes of one HTML document into a tuple.

    A single ``SampleParser`` is created once and reused for every call; its
    state is reset at the start of each call.
    """

    def __init__(self):
        self.__sample_parser = SampleParser()

    def __call__(self, raw_html_text):
        """Parse one document.

        Args:
            raw_html_text: Object whose ``.numpy()`` returns the document as
                UTF-8 encoded ``bytes``.

        Returns:
            Tuple ``(url, site_name, published_timestamp, title, description,
            header, text, author, not_text)`` where ``text`` and ``not_text``
            are the collected chunks joined with single spaces.
        """
        parser = self.__sample_parser
        parser.reset_state()
        # Invalid byte sequences become U+FFFD instead of raising, so one
        # badly encoded file does not abort the whole run.
        parser.feed(raw_html_text.numpy().decode('utf-8', errors='replace'))
        # Flush whatever the parser is still buffering (input is complete)
        # before the results are read.
        parser.close()
        return (parser.url,
                parser.site_name,
                parser.published_timestamp,
                parser.title,
                parser.description,
                parser.header,
                ' '.join(parser.text),
                parser.author,
                ' '.join(parser.not_text))

In [0]:
def preprocess_raw_html(file_body):
    """Parse one raw HTML document into a dictionary of named fields.

    The parsing itself is done by a ``ParseFile`` instance wrapped in
    ``tf.py_function``.

    Args:
        file_body: Raw content of one HTML file, forwarded to ``ParseFile``.

    Returns:
        Dict with keys ``url``, ``site_name``, ``published_timestamp``,
        ``title``, ``description``, ``header``, ``text``, ``not_text`` and
        ``author``. ``title`` and ``description`` are reshaped to ``(1,)``.
    """
    text_type = tf.string
    # Order matches the tuple returned by ParseFile: two strings, the
    # timestamp, then six more strings.
    output_types = (text_type, text_type, tf.uint32) + (text_type,) * 6

    parsed = tf.py_function(ParseFile(), inp=[file_body], Tout=output_types)
    (url, site_name, published_timestamp, title, description,
     header, text, author, not_text) = parsed

    title = tf.reshape(title, (1,))
    description = tf.reshape(description, (1,))

    return dict(
        url=url,
        site_name=site_name,
        published_timestamp=published_timestamp,
        title=title,
        description=description,
        header=header,
        text=text,
        not_text=not_text,
        author=author,
    )

In [0]:
def preprocess_text(doc):
    """Pick the part of a parsed document that is passed on for embedding.

    Args:
        doc: Mapping of parsed fields; must contain the key ``"description"``.

    Returns:
        The value stored under ``"description"``.
    """
    description = doc["description"]
    return description

In [0]:
# Load version 2 of the multilingual Universal Sentence Encoder from TF Hub.
# NOTE(review): tensorflow_text is imported above but never referenced directly;
# presumably it registers ops this model needs -- confirm before removing it.
embed = tf_hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/2")

def transform_embed(doc):
    """Run ``doc`` through the module-level ``embed`` model.

    Args:
        doc: Input passed unchanged to ``embed``.

    Returns:
        The ``"outputs"`` entry of the model's result.
    """
    encoder_result = embed(doc)
    return encoder_result["outputs"]

In [0]:
# Pipeline applied per file: path -> raw content -> parsed fields ->
# description -> embedding.
processed_data = (
    data_clustering_sample
    .map(read_file)
    .map(preprocess_raw_html)
    .map(preprocess_text)
    .map(transform_embed)
)

In [0]:
# Upper bound on the number of documents to embed.
N = 500000
# Row width of the buffer; every record's first row is written into one such
# row, so this presumably equals the encoder's embedding size -- TODO confirm.
EMBEDDING_DIM = 512

sample = np.empty((N, EMBEDDING_DIM))

n_filled = 0
for i, record in tqdm.tqdm_notebook(enumerate(processed_data.take(N))):
    sample[i] = record.numpy()[0]
    n_filled = i + 1

# np.empty does not initialise memory: when the dataset yields fewer than N
# records, the rows that were never written contain arbitrary values. Drop
# them so that only real embeddings are passed on.
sample = sample[:n_filled]

In [0]:
# Reduce the collected embeddings with UMAP using its default settings.
reducer = umap.UMAP()
embeddings = reducer.fit_transform(sample)

# One axes on a 14x7 figure (nrows=1 / ncols=1 are the plt.subplots defaults);
# the first two columns of the projection are drawn as a scatter plot.
fig, axs = plt.subplots(figsize=(14, 7))
axs.scatter(embeddings[:, 0], embeddings[:, 1])
plt.show()