In [1]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize
import spacy

import pandas as pd
import swifter

import numpy as np
import math
import random

import os
import sys
import re

import heapq
import itertools
import collections

import pickle

from matplotlib import pyplot as plt
import seaborn as sns


  return f(*args, **kwds)


In [2]:
import mistune
import mistune.renderers

In [3]:
from tqdm import tqdm
# Hook tqdm into pandas so `progress_apply`-style methods become available.
# NOTE(review): no `progress_apply` call is visible in this section — confirm it is still needed.
tqdm.pandas()

In [4]:
from pandarallel import pandarallel
# Start pandarallel with progress bars and a fixed pool of 32 workers.
# NOTE(review): the worker count is hard-coded — confirm it suits the host.
# The cells visible here use swifter rather than pandarallel's apply methods.
pandarallel.initialize(progress_bar=True,nb_workers=32)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
# spaCy pipeline used further down to find named entities in the policy text.
nlp = spacy.load("en_core_web_lg")
# Multiply spaCy's `max_length` setting by five — presumably so that very long
# policies are not rejected; TODO confirm the memory headroom on the host.
nlp.max_length *= 5

# Load data

In [6]:
# Load the data
# NOTE: unpickling can execute arbitrary code — only load this file from a trusted source.
with open("../../data/deduped_policy_text_v11no_html_with_links_and_emails.pickle", "rb") as f:
    df_all = pd.read_pickle(f)
# Small sample for dry runs. The seed is fixed so the sample is identical after
# every kernel restart, and the size is capped so a corpus with fewer than
# 1000 rows does not raise.
df_sample = df_all.sample(n=min(1000, len(df_all)), random_state=0)

In [7]:
# Pick the frame every later cell works on: swap in the commented line to do a
# quick dry run on the random sample instead of the full corpus.
# Note that `df` is an alias, not a copy, so columns added to `df` below also
# appear on the frame it points at.
#df = df_sample
df = df_all

# Clean up

In [8]:
# Tokens that may legitimately close a sentence. nltk's word_tokenize rewrites a
# closing double quote (") as '' (two apostrophes), so that form has to be
# listed as well; otherwise every sentence ending in a quotation is judged
# incomplete.
valid_end_punct = set((".","!","?",'"',"'","''"))
def strip_incomplete_sentences(text,flag_components=False):
    """Drop (or flag) a trailing sentence fragment from ``text``.

    The text is split into sentences with nltk; only the LAST sentence is
    inspected. It counts as complete when its final token is one of
    ``valid_end_punct``.

    Args:
        text: Text to check.
        flag_components: When true, an incomplete final sentence is kept but
            wrapped in ``_cut_`` markers; when false it is removed.

    Returns:
        The sentences re-joined with single spaces, or ``""`` when nltk finds
        no sentence in ``text``.
    """
    sentences = nltk.tokenize.sent_tokenize(text)
    if len(sentences) == 0:
        return ""
    words = nltk.tokenize.word_tokenize(sentences[-1])
    # An empty token list is treated like a fragment instead of raising IndexError.
    if not words or words[-1] not in valid_end_punct:
        if flag_components:
            sentences[-1] = "_cut_" + sentences[-1] + "_cut_"
        else:
            sentences = sentences[:-1]
    return " ".join(sentences)

class StraightTextRenderer(mistune.renderers.BaseRenderer):
    """Mistune renderer that flattens Markdown into newline-separated plain text.

    With ``flag_components`` false, only prose-like content is kept: paragraphs
    and list rows are reduced to their complete sentences, while headings,
    HTML, code spans, tables, images and struck-through text are replaced by a
    newline or an empty string. With ``flag_components`` true, most elements
    are instead emitted wrapped in ``_name_`` markers so the structure of the
    parse can be inspected.
    """

    def __init__(self,flag_components):
        """Store the mode flag; see the class docstring for its effect."""
        # Chain to the base initializer so that whatever state BaseRenderer
        # sets up on its instances also exists on this one.
        super().__init__()
        self.flag_components = flag_components

    def text(self, text):
        """Plain text passes through unchanged."""
        return text

    def link(self, link, text=None, title=None):
        """Keep only the link's visible text; a bare link becomes the word "link"."""
        if text is None:
            return "link"
        else:
            return text

    def image(self, src, alt="", title=None):
        """Images are dropped entirely."""
        return ""

    def emphasis(self, text):
        """Emphasis markup is removed, the text is kept."""
        return text

    def strong(self, text):
        """Strong markup is removed, the text is kept."""
        return text

    def codespan(self, text):
        """Inline code is replaced by a newline unless flagging is on."""
        if self.flag_components:
            return "\n_codespan_%s_codespan\n" % text
        else:
            return "\n"

    def linebreak(self):
        """A hard line break becomes a newline (flagged in debug mode)."""
        if self.flag_components:
            return "\n_line break_\n"
        else:
            return "\n"

    def inline_html(self, html):
        """Inline HTML is replaced by a newline unless flagging is on."""
        if self.flag_components:
            return '\n_inline-html_%s_inline-html_\n' % html
        else:
            #HTML isn't prose
            return "\n"

    def paragraph(self, text):
        """Keep each line of the paragraph, minus any trailing sentence fragment."""
        if text == '': return text
        paragraphs = text.split('\n')
        # Each line is checked on its own, so a fragment at the end of any
        # line is removed (or flagged), not only one at the paragraph's end.
        paragraphs = (strip_incomplete_sentences(para,flag_components=self.flag_components) for para in paragraphs)
        text = "\n".join(paragraphs)
        if self.flag_components:
            return "\n_paragraph_\n" + text + "\n_paragraph_\n"
        else:
            return text + "\n"

    def heading(self, text, level):
        """Headings are replaced by a newline unless flagging is on."""
        if self.flag_components:
            return '\n_heading %d_ %s\n' % (level,text)
        else:
            #Headings aren't prose
            return "\n"

    def newline(self):
        """Blank lines become a single newline (flagged in debug mode)."""
        if self.flag_components:
            return '\n_newline_\n'
        else:
            return "\n"

    def thematic_break(self):
        """Horizontal rules become a newline (flagged in debug mode)."""
        if self.flag_components:
            return '\n_thematic-break_\n'
        else:
            return "\n"

    def block_text(self, text):
        """Block text is kept as-is, followed by a newline."""
        if self.flag_components:
            return '\n_block-text_%s_block-text_\n' % text
        else:
            return "%s\n" % text

    def block_code(self, code, info=None):
        """Code blocks are treated as paragraphs in normal mode, flagged in debug mode."""
        if self.flag_components:
            if not code.strip():
                return "\n"
            else:
                return '\n_block-code_%s_block-code_\n' % code
        else:
            #This stuff usually isn't code, treat it as a paragraph
            return self.paragraph(code)

    def block_quote(self, text):
        """Block quotes keep their already-rendered content."""
        if self.flag_components:
            return '\n_block-quote_%s_block-quote_\n' % text
        else:
            return "%s\n" % text

    def block_html(self, html):
        """Block-level HTML is replaced by a newline unless flagging is on."""
        if self.flag_components:
            return "\n_block-html_%s_block-html\n" % html
        else:
            #HTML isn't prose
            return  "\n"

    def block_error(self, html):
        """Parser error blocks are replaced by a newline unless flagging is on."""
        if self.flag_components:
            return "\n_block-error_%s_block-error\n" % html
        else:
            #Errors aren't prose
            return "\n"

    def list(self, text, ordered, level, start=None):
        """Keep a list only when every one of its rows is made of complete sentences."""
        if text == '': return text
        paragraphs = text.split('\n')
        paragraphs = [strip_incomplete_sentences(para,flag_components=self.flag_components) for para in paragraphs]
        text = "\n".join(paragraphs)
        if self.flag_components:
            return "\n_list %s %d_\n%s\n_list_\n" % (ordered, level, text)
        else:
            #Lists are inconsistent in how many sentences they represent
            #Rule: If all of the rows are sentences, then we'll keep them
            #If any stripped list item is empty, return empty for everything
            if any((not list_item.strip() for list_item in paragraphs)):
                return "\n"
            else:
                return text + "\n"

    def list_item(self, text, level):
        """Each list item becomes its own line."""
        return "%s\n" % text

    def strikethrough(self, text):
        """Struck-through text is dropped in both modes."""
        return ""

    def table(self, text):
        """Tables are replaced by a newline unless flagging is on."""
        if self.flag_components:
            return '\n_table_%s_table_\n' % (text)
        else:
            return "\n"

    def table_cell(self, content, align=None, is_head=False):
        """Cell content is discarded; debug mode leaves only a marker."""
        if self.flag_components:
            return '\n_cell_\n'
        else:
            return ""

    def table_head(self, content):
        """Header content is discarded; debug mode leaves only a marker."""
        if self.flag_components:
            return '\n_head_\n'
        else:
            return ""

    def table_row(self, content):
        """Rows are dropped in normal mode and wrapped in markers in debug mode."""
        if self.flag_components:
            return '_row_%s_row_\n' % content
        else:
            return ""

    def table_body(self, content):
        """The table body is dropped in normal mode and wrapped in markers in debug mode."""
        if self.flag_components:
            return '_body_%s_body_\n' % content
        else:
            return ""
    
# Two parsers over the same renderer class: `markdown` produces the cleaned
# text, `markdown_debug` produces the marker-annotated variant.
markdown = mistune.create_markdown(renderer=StraightTextRenderer(False))
markdown_debug = mistune.create_markdown(renderer=StraightTextRenderer(True))

#Install mistune plugins
import mistune.plugins
# Same installation order as before: table then strikethrough, first on the
# regular parser and then on the debug one.
for _parser in (markdown, markdown_debug):
    mistune.plugins.plugin_table(_parser)
    mistune.plugins.plugin_strikethrough(_parser)
del _parser

def clean(policy_text):
    """Render ``policy_text`` through the regular `markdown` parser."""
    cleaned_text = markdown(policy_text)
    return cleaned_text

def clean_debug(policy_text):
    """Render ``policy_text`` through the `markdown_debug` parser."""
    flagged_text = markdown_debug(policy_text)
    return flagged_text

In [9]:
# Run every policy through `clean` and keep the result in a new column.
df["policy_text_cleaned"] = df["policy_text"].swifter.apply(clean)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=910546.0, style=ProgressStyle(descript…




# Remove website-specific terms

#### Regex substitutions

In [10]:
# "truste" in any letter case, but only when it is not part of a longer run of
# letters (so "trusted" is left alone).
truste_regex = re.compile(r"(?<![a-z])truste(?![a-z])",flags=re.IGNORECASE)
#https://emailregex.com/
email_regex = re.compile(r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])",flags=re.IGNORECASE)
#https://gist.github.com/gruber/8891611
# The URL pattern needs the same top-level-domain alternation twice; it is
# spelled out once here so the two uses cannot drift apart. The pattern that
# results is character-for-character the one from the gist above.
_TLD_ALTERNATION = r"com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw"
url_regex = re.compile(
    # Alternative 1: explicit scheme, or a host name followed by "/".
    r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:" + _TLD_ALTERNATION + r")/)"
    r"(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+"
    r"(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’])"
    # Alternative 2: a bare domain name that is not part of an e-mail address.
    r"|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:" + _TLD_ALTERNATION + r")\b/?(?!@)))",
    flags=re.IGNORECASE)

In [11]:
def truste_sub(text):
    """Replace every `truste_regex` match in ``text`` with "TrustArc"."""
    replaced = truste_regex.sub("TrustArc",text)
    return replaced

def email_sub(text):
    """Replace every `email_regex` match in ``text`` with the token "email_sub"."""
    replaced = email_regex.sub("email_sub",text)
    return replaced

def url_sub(text):
    """Replace every `url_regex` match in ``text`` with the token "url_sub"."""
    replaced = url_regex.sub("url_sub",text)
    return replaced

# Three passes in a fixed order: brand name, then e-mail addresses, then URLs.
df["policy_text_cleaned"] = (
    df["policy_text_cleaned"]
    .swifter.apply(truste_sub)
    .swifter.apply(email_sub)
    .swifter.apply(url_sub)
)


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=910546.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=910546.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=910546.0, style=ProgressStyle(descript…




#### NER

In [None]:
# First pass -- find entities

# Entity labels whose mentions are collected for replacement.
TAGS_TO_SWAP = [
    "ORG",
    "PERSON",
    #    "FAC",
    #"WORK_OF_ART",
]

def get_entities(text):
    """Return the lower-cased text of every entity in ``text`` whose label is in TAGS_TO_SWAP.

    Args:
        text: Document to run through the `nlp` pipeline.

    Returns:
        A set of lower-cased entity strings (empty when nothing qualifies).
    """
    doc = nlp(text)
    # The result is a set, so the order in which entities are visited is
    # irrelevant — no sorting of doc.ents is needed.
    return {entity.text.lower() for entity in doc.ents if entity.label_ in TAGS_TO_SWAP}

# Longest entities first; ties are broken alphabetically so the order (and
# anything printed from it) is the same on every run instead of depending on
# set iteration order.
entities = sorted(
    set(itertools.chain.from_iterable(df.policy_text_cleaned.swifter.apply(get_entities))),
    key=lambda entity: (-len(entity), entity),
)
len(entities)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=910546.0, style=ProgressStyle(descript…

In [None]:
# Strings that must never be treated as an entity, even if the tagger found them.
blacklist = ["email","url","number"]
# Keep an entity only when it is NOT blacklisted and contains at least one
# letter. (The earlier `x in blacklist or ...` test kept blacklisted strings,
# which made the blacklist a no-op.)
entities_filtered = [
    entity for entity in entities
    if entity not in blacklist and any(map(str.isalpha,entity))
]
print(len(entities_filtered))
print(entities_filtered)

In [None]:
# One alternation over every kept entity, matched on word boundaries and
# case-insensitively. With no entities at all, "\b(?:)\b" would match at every
# word boundary, so a never-matching pattern is used instead.
if entities_filtered:
    entity_re = re.compile(r"\b(?:%s)\b" % "|".join(map(re.escape,entities_filtered)),re.IGNORECASE)
else:
    entity_re = re.compile(r"(?!)")
# A run of digits, optionally with a decimal part, that is not glued to a letter.
num_regex = re.compile(r"(?<![a-z])\d+(\.\d+)?(?![a-z])",flags=re.IGNORECASE)

def entity_sub(text):
    """Replace every `entity_re` match in ``text`` with the token "ENTITY"."""
    return entity_re.sub("ENTITY",text)

def num_sub(text):
    """Replace every `num_regex` match in ``text`` with the token "NUMBER"."""
    return num_regex.sub("NUMBER",text)

# Write back to `policy_text_cleaned` — the column the n-gram stage reads. The
# previous target, "policy_tex_cleaned", was a typo that created an unused
# column, so the entity/number substitutions never reached the n-grams.
df["policy_text_cleaned"] = df.policy_text_cleaned.swifter.apply(entity_sub).map(num_sub)

# N-Grams

In [None]:
def get_ngrams(text,n=2):
    """Collect the distinct word n-grams (or whole sentences) occurring in ``text``.

    Args:
        text: Document to split into sentences with nltk.
        n: Size of the n-grams. ``None`` means "return the sentences themselves".

    Returns:
        A set of interned strings: space-joined n-grams built from the
        alphanumeric tokens of each sentence, or the sentences when ``n`` is None.
        N-grams never span a sentence boundary.
    """
    sentences = nltk.tokenize.sent_tokenize(text)
    if n is None:
        return {sys.intern(sentence) for sentence in sentences}
    found = set()
    for sentence in sentences:
        # Punctuation and other non-alphanumeric tokens are dropped before
        # the n-grams are formed.
        tokens = [token for token in nltk.tokenize.word_tokenize(sentence) if token.isalnum()]
        found.update(sys.intern(" ".join(gram)) for gram in nltk.ngrams(tokens,n))
    return found
        
# One Counter per `year_season` group. Every document contributes a SET of
# n-grams, so each count is the number of documents containing the phrase.
counters = {
    ys: collections.Counter(itertools.chain.from_iterable(sub_df["policy_text_cleaned"].swifter.apply(get_ngrams)))
    for ys, sub_df in df.groupby("year_season")
}

In [None]:
# Group labels in sorted order; this fixes the column order of every trend list.
ys_values = sorted(counters)

In [None]:
# Every phrase seen in any group. Iterating a Counter yields each distinct key
# once, whereas Counter.elements() repeats every phrase `count` times — the
# resulting set is the same, but far fewer items are walked.
phrases = set(itertools.chain.from_iterable(counters.values()))
# phrase -> list of counts, one per entry of `ys_values` (0 where the phrase is absent).
trends = {
    phrase: [counters[ys][phrase] for ys in ys_values] for phrase in phrases
}

In [None]:
# `trends` now holds everything needed; release the intermediates.
del phrases
# Empty the per-group counters before dropping the dict itself.
counters.clear()
del counters

# Rank

In [None]:
def rank_phrases(key,topN=50):
    """Return the ``topN`` highest-scoring phrases from the global `trends`.

    Args:
        key: Callable applied to a phrase's list of counts to produce its score.
        topN: How many entries to return.

    Returns:
        A list of ``(score, phrase)`` tuples, best first; ties on score are
        ordered by the phrase itself.
    """
    scored = ((key(counts),phrase) for phrase, counts in trends.items())
    return heapq.nlargest(topN,scored)

rank_phrases(max)