# Preprocesser

In [1]:
import nbimporter

In [2]:
import numpy as np
import pandas as pd
from alphabet_detector import AlphabetDetector
import re
import string
import nltk
from nltk.probability import FreqDist

from scraper_data_reader import ReaderScrapedData
from utils_os import UtilsOS

import warnings
warnings.filterwarnings('ignore')

Importing Jupyter notebook from scraper_data_reader.ipynb
Importing Jupyter notebook from utils_os.ipynb
Importing Jupyter notebook from scraper_config_reader.ipynb


In [3]:
class Preprocesser:
    """Filters, cleans and vectorizes scraped articles.

    Every article is a dict-like object with at least the keys "title" and
    "content". `run_preprocessing` is the entry point; the underscore-prefixed
    methods are the individual pipeline steps.
    """

    # Patterns are compiled once and shared by all instances.
    _HTML_COMMENT_RE = re.compile(r'(<!--.*?-->)', flags=re.DOTALL)
    _HTML_TAG_RE = re.compile(r'<.*?>')
    _URL_RE = re.compile(r'https?://\S+')
    _CODE_RE = re.compile(r'(\w+(\.\w+)*\([^\)]*\))', flags=re.MULTILINE)  # matches a.b.c(d)
    _DIGITS_RE = re.compile(r'\d+')

    # Stemmed tokens that are always discarded.
    _TOKEN_BLACKLIST = ("was", "wasn", "did", "didn", "you", "your", "isn", "wouldn", "doesn")

    # Reading speed used for the read-time estimate: 200 words per minute.
    _WORDS_PER_SECOND = 200 / 60  # = 3.33

    def __init__(self, verbose=False):
        """
        Args:
            verbose: when True, progress and summary messages are printed.
        """
        self._verbose = verbose

    def _ok_title(self, title, ad):
        """Returns a truthy value when the title has 2 to 20 space-separated
        chunks and `ad` reports that it only contains LATIN characters.

        Chunks come from `title.split(" ")`, so consecutive spaces produce
        empty chunks that are counted as well.
        """
        num_words = len(title.split(" "))
        return 2 <= num_words <= 20 and ad.only_alphabet_chars(title, "LATIN")

    def _ok_content(self, content, ad):
        """Returns a truthy value when the content has at least 100
        space-separated chunks and `ad` reports that it only contains LATIN
        characters.

        Chunks come from `content.split(" ")`: newlines do not separate
        chunks and consecutive spaces produce empty ones.
        """
        num_words = len(content.split(" "))
        return num_words >= 100 and ad.only_alphabet_chars(content, "LATIN")

    def _filter_articles(self, dataset, ad):
        """Returns a new list with the samples whose title and content both
        pass `_ok_title` / `_ok_content`.

        A sample failing both checks is counted in both "Dropped" counters,
        so the two counters may add up to more than the total dropped.
        """
        kept = []
        not_ok_title = 0
        not_ok_content = 0
        for sample in dataset:
            title_ok = self._ok_title(sample["title"], ad)
            content_ok = self._ok_content(sample["content"], ad)
            if title_ok and content_ok:
                kept.append(sample)
            if not title_ok:
                not_ok_title += 1
            if not content_ok:
                not_ok_content += 1

        if self._verbose:
            print("Prev length: {0}".format(len(dataset)))
            print("New length: {0}".format(len(kept)))
            print("Dropped total: {0}".format(len(dataset) - len(kept)))
            print("\tDropped title: {0}".format(not_ok_title))
            print("\tDropped content: {0}".format(not_ok_content))

        return kept

    def _clean_html(self, raw_html):
        """Removes HTML comments and tags from `raw_html`.

        Comments go first: a comment that contains '>' would otherwise be cut
        at that character by the tag pattern and leave its tail in the text.
        """
        without_comments = self._HTML_COMMENT_RE.sub('', raw_html)
        return self._HTML_TAG_RE.sub('', without_comments)

    def _remove_newlines(self, content):
        """Replaces every "\\n" with a single space."""
        return content.replace("\n", " ")

    def _remove_extra_white_spaces(self, content):
        """Collapses runs of spaces into one space and strips both ends."""
        return re.sub(' +', ' ', content).strip()

    def _remove_urls(self, content):
        """Removes http:// and https:// URLs.

        Only the URL itself (up to the next whitespace) is removed. A greedy
        `.*` would delete the rest of the line, which after newline removal
        is the rest of the article.
        """
        return self._URL_RE.sub('', content)

    def _remove_code(self, content):
        """Removes call-like snippets of the form a.b.c(d)."""
        return self._CODE_RE.sub('', content)

    def _remove_alt_html(self, content):
        """Keeps only the text that precedes the first "&lt" in `content`."""
        return content.split("&lt")[0]

    def _clean_text(self, content):
        """Applies all text cleaning steps and returns the cleaned string.

        Whitespace is normalized last so that the gaps left by removed URLs
        and code snippets do not survive as double or trailing spaces.
        """
        content = self._clean_html(content)
        content = self._remove_newlines(content)
        content = self._remove_urls(content)
        content = self._remove_code(content)
        content = self._remove_alt_html(content)
        content = self._remove_extra_white_spaces(content)
        return content

    def _add_to_stem_dictionary(self, stemmed_word, word, stem_dictionary):
        """Adds a stemmed_word -> word instance to the stem_dictionary"""
        word_counts = stem_dictionary.setdefault(stemmed_word, {})
        word_counts[word] = word_counts.get(word, 0) + 1

    def _clean_tokens(self, tokenList, stem_dictionary, token_blacklist, stemmer, punctuation):
        """Normalizes and stems tokens; returns the list of kept stems.

        Args:
            tokenList: iterable of string tokens.
            stem_dictionary: dict updated in place with stem -> {word: count}
                for every token that reaches the stemming step.
            token_blacklist: container of stems to discard.
            stemmer: object exposing `stem(word)`.
            punctuation: container of punctuation characters.
        """
        # Lower-case the tokens. Tokens that are entirely upper case are
        # discarded here rather than lower-cased.
        words = [word.lower() for word in tokenList if not word.isupper()]

        # Drop tokens found in `punctuation`, then strip punctuation
        # characters from inside the remaining tokens.
        words = [word for word in words if word not in punctuation]
        words = ["".join(c for c in word if c not in punctuation) for word in words]

        # Convert digits into NUM
        words = [self._DIGITS_RE.sub("NUM", word) for word in words]

        cleaned = []
        for word in words:
            stemmed = stemmer.stem(word)
            self._add_to_stem_dictionary(stemmed, word, stem_dictionary)
            # The length check also removes blank stems ('' and ' ').
            if len(stemmed) > 2 and stemmed not in token_blacklist:
                cleaned.append(stemmed)
        return cleaned

    def _from_sample_to_tfidf(self, sample, stem_dictionary, token_blacklist, idf,
                              stemmer, ad, punctuation):
        """From text string to TF-IDF vector (as Python dictionary)

        Args:
            sample: text to vectorize.
            stem_dictionary, token_blacklist, stemmer, punctuation: forwarded
                to `_clean_tokens`.
            idf: DataFrame indexed by term with the columns 'idf' and 'logidf'.
            ad: unused; kept so existing callers do not break.

        Returns:
            Dict with the keys 'freq', 'idf', 'logidf', 'tfidf' and
            'logtfidf', each mapping token -> value. All five are empty dicts
            when no token survives cleaning.
        """
        # Tokenize; word_tokenize is a little smarter than a .split(" ")
        tokens = self._clean_tokens(nltk.word_tokenize(sample), stem_dictionary,
                                    token_blacklist, stemmer, punctuation)
        if not tokens:
            # Nothing to weight; the frame operations below fail on empty input.
            return {column: {} for column in ('freq', 'idf', 'logidf', 'tfidf', 'logtfidf')}

        # Word frequencies as a one-column data frame indexed by token
        freqDF = pd.DataFrame.from_dict(FreqDist(tokens), orient='index')
        freqDF.columns = ['freq']

        # Left join with the idf table on the token
        freqit = freqDF.join(idf[['idf', 'logidf']])

        # Tokens missing from the idf table get the largest weight found among
        # this sample's known tokens; if none is known, the table maximum.
        for column in ('idf', 'logidf'):
            known = freqit[column].dropna()
            fallback = known.max() if not known.empty else idf[column].max()
            freqit[column] = freqit[column].fillna(fallback)

        # Create tfidf columns
        freqit['tfidf'] = freqit['freq'] * freqit['idf']
        freqit['logtfidf'] = freqit['freq'] * freqit['logidf']

        return freqit.to_dict()  # To dictionary

    def _get_read_time(self, text, wps):
        """Returns the number of space-separated chunks in `text` divided by
        `wps` (words per second)."""
        num_of_words = len(text.split(" "))
        return num_of_words / wps

    def run_preprocessing(self, dataset,
                          idf_path="../resources/wiki-30k-10-IDF.csv",
                          output_dir="../preprocessed/",
                          stem_dictionary_path='../stemmer/stem_dictionary.json'):
        """Filters, cleans and vectorizes `dataset`, writing results to disk.

        The samples that pass the filter are modified in place: "title" and
        "content" are replaced by their cleaned versions and the keys "tfidf"
        and "read_time" are added. Each sample is written to
        `<output_dir>/<index>.json` and the stem dictionary to
        `stem_dictionary_path`.

        Args:
            dataset: list of samples with the keys "title" and "content".
            idf_path: CSV file with the columns 'term', 'idf' and 'logidf'.
            output_dir: directory receiving one JSON file per sample.
            stem_dictionary_path: JSON file receiving the stem dictionary.
        """
        import os  # local import: the notebook's top import cell does not provide it

        stem_dictionary = {}

        # Read wikipedia idf
        idf = pd.read_csv(idf_path).set_index('term')
        if self._verbose:
            print("Number of words considered in wikipedia: {0}".format(idf.shape[0]))

        # Initialize stemmer, alphabet detector and punctuation
        stemmer = nltk.stem.snowball.EnglishStemmer()
        ad = AlphabetDetector()
        punctuation = set(string.punctuation)

        # Preprocessing
        dataset = self._filter_articles(dataset, ad)
        for i, sample in enumerate(dataset):
            sample["title"] = self._clean_text(sample["title"])
            sample["content"] = self._clean_text(sample["content"])
            sample["tfidf"] = self._from_sample_to_tfidf(sample["content"],
                                                         stem_dictionary, self._TOKEN_BLACKLIST, idf,
                                                         stemmer, ad, punctuation)
            sample["read_time"] = self._get_read_time(sample["content"], self._WORDS_PER_SECOND)

            # Save data
            UtilsOS.write_to_json(sample, os.path.join(output_dir, str(i) + '.json'))

            if self._verbose and i % 50 == 0:
                print(".. Processed articles: " + str(i) + "/" + str(len(dataset)))

        # Save stem dictionary
        UtilsOS.write_to_json(stem_dictionary, stem_dictionary_path)
        if self._verbose:
            print("Saved stem dictionary")

# Run preprocessing

In [5]:
if __name__ == "__main__":
    # Load the scraped articles; the result is indexed as data[website][domain].
    data = ReaderScrapedData.read_data("scraper_configs.json")

    # One list of articles per (website, domain) pair, then flattened into a
    # single list of articles.
    dataset_nested = [
        articles
        for domains in data.values()
        for articles in domains.values()
    ]
    dataset = [article for articles in dataset_nested for article in articles]
    print("Read {0} articles".format(len(dataset)))

    # Filter, clean and vectorize the articles, printing progress.
    preprocesser = Preprocesser(verbose=True)
    preprocesser.run_preprocessing(dataset)

Read 3191 articles
Number of words considered in wikipedia: 87709
Prev length: 3191
New length: 2617
Dropped total: 574
	Dropped title: 161
	Dropped content: 428
.. Processed articles: 0/2617
.. Processed articles: 50/2617
.. Processed articles: 100/2617
.. Processed articles: 150/2617
.. Processed articles: 200/2617
.. Processed articles: 250/2617
.. Processed articles: 300/2617
.. Processed articles: 350/2617
.. Processed articles: 400/2617
.. Processed articles: 450/2617
.. Processed articles: 500/2617
.. Processed articles: 550/2617
.. Processed articles: 600/2617
.. Processed articles: 650/2617
.. Processed articles: 700/2617
.. Processed articles: 750/2617
.. Processed articles: 800/2617
.. Processed articles: 850/2617
.. Processed articles: 900/2617
.. Processed articles: 950/2617
.. Processed articles: 1000/2617
.. Processed articles: 1050/2617
.. Processed articles: 1100/2617
.. Processed articles: 1150/2617
.. Processed articles: 1200/2617
.. Processed articles: 1250/2617
.. 

In [4]:
# Reload the scraped articles (indexed as data[website][domain]) and flatten
# them into a single list.
data = ReaderScrapedData.read_data("scraper_configs.json")
dataset_nested = [
    articles
    for domains in data.values()
    for articles in domains.values()
]
dataset = [article for articles in dataset_nested for article in articles]

In [7]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from sklearn import preprocessing

#!python -m spacy download en_core_web_md #you will need to install this on first load
import spacy
from spacy.lang.en import English
from spacy import displacy
nlp = spacy.load('en_core_web_md')
from IPython.display import HTML
import logging
logging.getLogger('tensorflow').disabled = True #OPTIONAL - to disable outputs from Tensorflow

Collecting en_core_web_md==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz#egg=en_core_web_md==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz (120.8MB)
[K    100% |████████████████████████████████| 120.9MB 3.9MB/s 
[?25hInstalling collected packages: en-core-web-md
  Running setup.py install for en-core-web-md ... [?25ldone
[?25hSuccessfully installed en-core-web-md-2.0.0
[33mYou are using pip version 19.0.3, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m

[93m    Linking successful[0m
    /anaconda3/envs/education/lib/python3.5/site-packages/en_core_web_md -->
    /anaconda3/envs/education/lib/python3.5/site-packages/spacy/data/en_core_web_md

    You can now load the model via spacy.load('en_core_web_md')



In [8]:
# Load the ELMo module from TensorFlow Hub.
# NOTE(review): the recorded run of this cell failed with
# "module 'tensorflow' has no attribute 'init_scope'" - possibly a
# tensorflow / tensorflow_hub version mismatch; confirm the installed versions.
url = "https://tfhub.dev/google/elmo/2"
embed = hub.Module(url)

AttributeError: module 'tensorflow' has no attribute 'init_scope'

In [None]:
# Split the first article into sentences with the spaCy pipeline.
text = dataset[0]["content"]
doc = nlp(text)

# Keep sentence spans of more than one token. `Span.text` is used instead of
# the deprecated `Span.string`; after strip() both give the same string.
sentences = [sent.text.strip() for sent in doc.sents if len(sent) > 1]

len(sentences)

In [None]:
# Apply the hub module to the sentences and keep its "default" output.
embeddings = embed(sentences, signature="default", as_dict=True)["default"]

In [None]:
%%time
# Evaluate `embeddings` inside a TensorFlow session and time the whole cell.
with tf.Session() as sess:
  # Run the variable and table initializers before evaluating the graph.
  sess.run(tf.global_variables_initializer())
  sess.run(tf.tables_initializer())
  # x holds the evaluated value of `embeddings`.
  x = sess.run(embeddings)