# Preprocesser

In [1]:
import nbimporter

In [2]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.probability import FreqDist
from tqdm import tqdm

from scraper_data_reader import ReaderScrapedData
from utils_os import UtilsOS
from constants import Constants

import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
import tensorflow_hub as hub

import logging
logging.getLogger('tensorflow').disabled = True #OPTIONAL - to disable outputs from Tensorflow

Importing Jupyter notebook from scraper_data_reader.ipynb
Importing Jupyter notebook from utils_os.ipynb
Importing Jupyter notebook from scraper_config_reader.ipynb
Importing Jupyter notebook from constants.ipynb


W0503 23:04:24.951239 4676937152 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [3]:
class Preprocesser:
    """Base class for a preprocessing step that reads one sample field and writes another.

    Concrete steps subclass this and override :meth:`apply`.
    """

    def __init__(self, on_field, produce_field, verbose=False):
        """Remember which field to read, which field to write, and the verbosity flag.

        Args:
            on_field: name of the field the step reads from each sample.
            produce_field: name of the field the step writes to each sample.
            verbose: stored as ``self._verbose`` for subclasses to consult.
        """
        self._verbose = verbose
        self._on_field, self._produce_field = on_field, produce_field

    def apply(self, dataset):
        """Process ``dataset``; not implemented on the base class.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()

In [4]:
class PreprocesserCleanText(Preprocesser):
    """Removes HTML markup, URLs and code-like call expressions from a text field.

    For every sample, the text stored under ``on_field`` is cleaned and the result is
    written under ``produce_field`` (the two may be the same key).
    """

    # Patterns are identical for every sample, so they are compiled once.
    _RE_HTML_COMMENT = re.compile(r'<!--.*?-->', flags=re.DOTALL)
    _RE_HTML_TAG = re.compile('<.*?>')
    _RE_SPACES = re.compile(' +')
    _RE_URL = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', flags=re.MULTILINE)
    _RE_CODE = re.compile(r'(\w+(\.\w+)*\([^\)]*\))', flags=re.MULTILINE)  # matches a.b.c(d)

    def __init__(self, on_field, produce_field, verbose=False):
        super().__init__(on_field, produce_field, verbose)
        self.name = "PreprocesserCleanText"

    def _clean_html(self, raw_html):
        """Return ``raw_html`` without HTML comments and tags.

        Comments are removed first: stripping tags first would cut a comment such as
        ``<!-- <b>x</b> -->`` at its first ``>`` and leave ``x -->`` in the text.
        """
        cleantext = self._RE_HTML_COMMENT.sub('', raw_html)
        cleantext = self._RE_HTML_TAG.sub('', cleantext)
        return cleantext

    def _remove_newlines(self, content):
        """Replace every newline with a single space."""
        return content.replace("\n", " ")

    def _remove_extra_white_spaces(self, content):
        """Collapse runs of spaces into one and strip leading/trailing whitespace."""
        content = self._RE_SPACES.sub(' ', content)
        return content.strip()

    def _remove_urls(self, content):
        """Remove ``http(s)://`` followed by host-like characters.

        NOTE: the pattern does not include ``/``, so the path part of a URL is left in the text.
        """
        return self._RE_URL.sub('', content)

    def _remove_code(self, content):
        """Remove call-like expressions such as ``a.b.c(d)``."""
        return self._RE_CODE.sub('', content)

    def _remove_backslashes(self, content):
        """Delete newline and tab characters. Not called by :meth:`apply`."""
        return content.replace("\n", "").replace("\t", "")

    def _remove_alt_html(self, content):
        """Keep only the text that precedes the first ``&lt`` occurrence."""
        return content.split("&lt")[0]

    def apply(self, dataset):
        """Clean ``on_field`` of every sample in place and store it in ``produce_field``.

        Args:
            dataset: iterable of dict-like samples.

        Returns:
            The cleaned text of the last sample, kept for backward compatibility, or
            ``None`` when ``dataset`` is empty (this case used to raise ``UnboundLocalError``).
        """
        content = None
        for sample in tqdm(dataset):
            content = sample[self._on_field]
            content = self._clean_html(content)
            content = self._remove_newlines(content)
            content = self._remove_urls(content)
            content = self._remove_code(content)
            content = self._remove_alt_html(content)
            # Normalise whitespace last so the gaps left by the removals above are collapsed too.
            content = self._remove_extra_white_spaces(content)
            sample[self._produce_field] = content
        return content

In [5]:
class PreprocesserTokenizer(Preprocesser):
    """Turns a text field into a list of stemmed tokens.

    While tokenizing, a stem dictionary (stem -> {surface form: count}) is accumulated over
    the whole dataset and written to ``path_to_stem_dictionary`` at the end of :meth:`apply`.
    """

    _token_blacklist = ["was", "wasn", "did", "didn", "you", "your", "isn", "wouldn", "doesn", "don"]

    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    _RE_DIGITS = re.compile(r"\d+")

    def __init__(self, path_to_stem_dictionary, on_field, produce_field, verbose=False):
        super().__init__(on_field, produce_field, verbose)
        self.name = "PreprocesserTokenizer"
        self._path_to_stem_dictionary = path_to_stem_dictionary

        # Initialize stemmer and punctuation
        self._stemmer = nltk.stem.snowball.EnglishStemmer()
        self._punctuation = set(string.punctuation)

    def _add_to_stem_dictionary(self, stemmed_word, word, stem_dictionary):
        """Adds a stemmed_word -> word instance to the stem_dictionary"""
        counts = stem_dictionary.setdefault(stemmed_word, {})
        counts[word] = counts.get(word, 0) + 1

    def _tokenize(self, sample, stem_dictionary):
        """Tokenize, normalise and stem ``sample``.

        Args:
            sample: the text to tokenize.
            stem_dictionary: dict updated in place with stem -> {word: count} entries.

        Returns:
            List of stemmed tokens longer than two characters and not in the blacklist.
        """
        # splits "I am Fabio" into ["I", "am", "Fabio"]. It's a little smarter than a .split(" ")
        tokenList = nltk.word_tokenize(sample)

        # Lower-case the tokens; tokens that are entirely upper case are dropped here
        textList = [word.lower() for word in tokenList if not word.isupper()]

        # Remove punctuation: first whole tokens that are a single punctuation character,
        # then punctuation characters embedded in the remaining tokens
        textList = [word for word in textList if word not in self._punctuation]
        textList = ["".join(c for c in word if c not in self._punctuation) for word in textList]

        # Convert digits into NUM
        textList = [self._RE_DIGITS.sub("NUM", word) for word in textList]

        # Stem words and record which surface form produced which stem
        stemmedTextList = [self._stemmer.stem(word) for word in textList]
        for sw, w in zip(stemmedTextList, textList):
            self._add_to_stem_dictionary(sw, w, stem_dictionary)

        # Remove short words (this also covers '' and ' ') and blacklisted tokens
        return [
            word for word in stemmedTextList
            if len(word) > 2 and word not in PreprocesserTokenizer._token_blacklist
        ]

    def apply(self, dataset):
        """Tokenize ``on_field`` of every sample into ``produce_field`` and save the stem dictionary."""
        stem_dictionary = {}

        for sample in tqdm(dataset):
            sample[self._produce_field] = self._tokenize(sample[self._on_field], stem_dictionary)

        # Save stem dictionary
        UtilsOS.write_to_json(stem_dictionary, self._path_to_stem_dictionary)
        if self._verbose:
            print("Saved stem dictionary")

In [6]:
class PreprocesserReadTime(Preprocesser):
    """Estimates how long a text takes to read from its word count."""

    def __init__(self, wps, on_field, produce_field, verbose=False):
        """
        Args:
            wps: reading speed used as the divisor of the word count
                (presumably words per second, going by the name).
            on_field: name of the sample field holding the text.
            produce_field: name of the sample field receiving the estimate.
            verbose: stored by the base class.
        """
        super().__init__(on_field, produce_field, verbose)
        self.name = "PreprocesserReadTime"
        self._wps = wps

    def _get_read_time(self, text):
        """Return ``number of words / wps`` for ``text``.

        Words are split on any run of whitespace, so an empty string counts as zero
        words and consecutive spaces do not inflate the count.
        """
        num_of_words = len(text.split())
        return num_of_words / self._wps

    def apply(self, dataset):
        """Write the read-time estimate of ``on_field`` into ``produce_field`` for every sample."""
        for sample in dataset:
            sample[self._produce_field] = self._get_read_time(sample[self._on_field])

In [7]:
class PreprocesserTFIDF(Preprocesser):
    """Computes per-sample TF-IDF weights from a token list and a precomputed idf table."""

    def __init__(self, path_to_wiki_tfidf, on_field, produce_field, verbose=False):
        """
        Args:
            path_to_wiki_tfidf: CSV file read with ``pd.read_csv``; it must provide the
                columns ``term``, ``idf`` and ``logidf``.
            on_field: name of the sample field holding the token list.
            produce_field: name of the sample field receiving the result dictionary.
            verbose: when true, prints the number of rows in the idf table.
        """
        super().__init__(on_field, produce_field, verbose)
        self.name = "PreprocesserTFIDF"

        # Read wikipedia idf
        self._idf = pd.read_csv(path_to_wiki_tfidf)
        self._idf = self._idf.set_index('term')
        if self._verbose:
            print("Number of words considered in wikipedia: {0}".format(self._idf.shape[0]))

    def apply(self, dataset):
        """From tokenized text to TF-IDF vector (as Python dictionary)

        The value stored in ``produce_field`` is ``DataFrame.to_dict()`` of a frame indexed
        by token with columns ``freq``, ``idf``, ``logidf``, ``tfidf`` and ``logtfidf``.
        An empty token list yields empty inner dictionaries instead of raising.
        """
        # The idf columns do not change between samples, so select them once.
        idf_columns = self._idf[['idf', 'logidf']]

        for sample in tqdm(dataset):
            tokens = sample[self._on_field]

            # Token frequencies as a one-column frame indexed by token. Naming the column
            # explicitly keeps the frame well-formed when there are no tokens at all.
            freq = FreqDist(tokens)
            freqDF = pd.DataFrame({'freq': list(freq.values())}, index=list(freq.keys()))

            # Merge freqDF with idf data frame
            freqit = freqDF.join(idf_columns)

            # Tokens missing from the idf table get the largest idf seen in this sample;
            # if no token of the sample is known, fall back to the table-wide maximum.
            for column in ('idf', 'logidf'):
                known = freqit[column].dropna()
                fill_value = known.max() if len(known) > 0 else idf_columns[column].max()
                freqit.loc[pd.isnull(freqit[column]), column] = fill_value

            # Create tfidf columns
            freqit['tfidf'] = freqit['freq'] * freqit['idf']
            freqit['logtfidf'] = freqit['freq'] * freqit['logidf']

            sample[self._produce_field] = freqit.to_dict() # To dictionary

In [8]:
def grouper(n_elements_in_batch, l):
    """Split the sliceable ``l`` into consecutive slices of at most ``n_elements_in_batch`` items.

    The last slice holds whatever remains and may be shorter than the others.
    """
    batches = []
    for start in range(0, len(l), n_elements_in_batch):
        stop = start + n_elements_in_batch
        batches.append(l[start:stop])
    return batches

In [9]:
class PreprocesserBERT(Preprocesser):
    """Embeds token lists with a TensorFlow Hub text module.

    NOTE: despite the class name, the module loaded in ``_load_bert`` is the one published
    at the ELMo URL ``https://tfhub.dev/google/elmo/2``.
    """

    def __init__(self, on_field, produce_field, verbose=False):
        super().__init__(on_field, produce_field, verbose)
        self.name = "PreprocesserBERT"
        self._bert = self._load_bert()

    def _load_bert(self):
        """Instantiate the hub module used to embed texts."""
        return hub.Module("https://tfhub.dev/google/elmo/2")

    def _from_texts_to_vectors(self, texts):
        """Embed ``texts`` and return the evaluated "default" output of the module.

        A new session is opened, and variables and tables are initialised, on every call.
        """
        outputs = self._bert(texts, signature="default", as_dict=True)
        embeddings = outputs["default"]

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            vectors = sess.run(embeddings)

        # one vector per input text; per the original author each vector has length 1024
        return vectors

    def apply(self, dataset):
        """Embed ``on_field`` (a token list, joined with spaces) of every sample, 50 samples at a time."""
        for batch in tqdm(grouper(50, dataset)):
            joined_texts = [" ".join(item[self._on_field]) for item in batch]
            batch_vectors = self._from_texts_to_vectors(joined_texts)
            for item, vector in zip(batch, batch_vectors):
                item[self._produce_field] = vector

In [10]:
class PreprocesserMain:
    """Runs a sequence of preprocessers over a dataset and saves every sample as JSON."""

    def __init__(self, preprocessers, path_to_preprocessed, verbose=False):
        """
        Args:
            preprocessers: ordered iterable of objects exposing ``name`` and ``apply(dataset)``.
            path_to_preprocessed: directory that receives one ``<index>.json`` file per sample.
            verbose: when true, prints a line before and after each preprocesser.
        """
        self._preprocessers = preprocessers
        self._path_to_preprocessed = path_to_preprocessed
        self._verbose = verbose

    def run_preprocessing(self, dataset):
        """Apply every configured preprocesser to ``dataset`` in order, then write the samples.

        Args:
            dataset: sequence of samples; the preprocessers modify the samples in place.
        """
        # Create target directory
        UtilsOS.directory_maybe_create(self._path_to_preprocessed)

        # Preprocessing: use the list given to the constructor, not a module-level global
        for preprocesser in self._preprocessers:
            if self._verbose:
                print("Starting preprocesser {0}".format(preprocesser.name))
            preprocesser.apply(dataset)
            if self._verbose:
                print("Finished preprocesser {0}".format(preprocesser.name))

        # Save data (tqdm wraps the dataset itself so the progress bar knows the total)
        for i, sample in enumerate(tqdm(dataset)):
            UtilsOS.write_to_json(sample, self._path_to_preprocessed + "/{0}.json".format(i))

# Run preprocessing

In [11]:
if __name__ == "__main__":
    # Load the scraped data; it is indexed by website and then by domain, and every
    # entry is flattened into a single list of samples.
    data = ReaderScrapedData.read_data(Constants.path_to_scraper_config, Constants.path_to_articles)
    dataset_nested = [
        data[website][domain]
        for website in data.keys()
        for domain in data[website].keys()
    ]
    dataset = [el for subl in dataset_nested for el in subl]
    print("Read {0} articles".format(len(dataset)))

    # Reading speed handed to PreprocesserReadTime (presumably 200 words per minute, ~3.33 per second)
    wps = 200 / 60

    # Preprocessers run in list order: later steps read fields produced by earlier ones.
    preprocessers = [
        PreprocesserCleanText(on_field="content", produce_field="content"),
        PreprocesserCleanText(on_field="title", produce_field="title"),
        PreprocesserTokenizer(Constants.path_to_stem_dictionary, on_field="content", produce_field="content_tokenized"),
        PreprocesserReadTime(wps=wps, on_field="content", produce_field="read_time"),
        PreprocesserTFIDF(Constants.path_to_wiki_tfidf, on_field="content_tokenized", produce_field="tfidf"),
        #PreprocesserBERT(on_field="content_tokenized", produce_field="bert_vector"),
    ]

    # Run the pipeline and write the preprocessed samples to disk
    preprocesser = PreprocesserMain(preprocessers, Constants.path_to_preprocessed, verbose=True)
    preprocesser.run_preprocessing(dataset)

  1%|▏         | 64/4443 [00:00<00:06, 637.30it/s]

Read 4443 articles
Starting preprocesser PreprocesserCleanText


100%|██████████| 4443/4443 [00:05<00:00, 759.27it/s] 
100%|██████████| 4443/4443 [00:00<00:00, 34236.73it/s]
  0%|          | 0/4443 [00:00<?, ?it/s]

Finished preprocesser PreprocesserCleanText
Starting preprocesser PreprocesserCleanText
Finished preprocesser PreprocesserCleanText
Starting preprocesser PreprocesserTokenizer


100%|██████████| 4443/4443 [02:00<00:00, 36.93it/s]


Finished preprocesser PreprocesserTokenizer
Starting preprocesser PreprocesserReadTime


  0%|          | 1/4443 [00:00<12:38,  5.86it/s]

Finished preprocesser PreprocesserReadTime
Starting preprocesser PreprocesserTFIDF


100%|██████████| 4443/4443 [09:38<00:00,  7.33it/s]
16it [00:00, 141.00it/s]

Finished preprocesser PreprocesserTFIDF


4443it [00:31, 140.70it/s]
