In [1]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize
import spacy

import pandas as pd
import swifter

import numpy as np
import math
import random

import os
import sys
import re

import heapq
import itertools
import collections

import pickle

from matplotlib import pyplot as plt
import seaborn as sns


  return f(*args, **kwds)


In [2]:
import mistune
import mistune.renderers

In [3]:
from tqdm import tqdm
# Register tqdm's progress_* methods on pandas objects; progress_map is used
# by the cleaning step later in this notebook.
tqdm.pandas()

In [4]:
from pandarallel import pandarallel
# Initialise pandarallel with a progress bar and a hard-coded pool of 32 workers.
# NOTE(review): 32 presumably matches the machine this was run on - adjust for other hosts.
pandarallel.initialize(progress_bar=True,nb_workers=32)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
# Load the "en_core_web_lg" spaCy pipeline.
nlp = spacy.load("en_core_web_lg")
# Raise the pipeline's max_length to five times the value it has right after loading.
# NOTE(review): presumably needed because some policy texts exceed the default limit - confirm.
nlp.max_length *= 5

# Load data

In [6]:
#Load the data
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
with open("../data/deduped_policy_text_v11no_html_with_links_and_emails.pickle", "rb") as f:
    df_all = pd.read_pickle(f)
# Small development sample. The seed is fixed so the same rows are drawn on
# every run, and the size is capped so a frame with fewer than 1000 rows does
# not raise.
df_sample = df_all.sample(min(1000, len(df_all)), random_state=0)

In [7]:
# Choose the working frame: swap the two assignments below to run on the
# small sample instead of the full data set.
#df = df_sample
# Plain assignment, not a copy: df and df_all name the same DataFrame object,
# so columns added to df later also appear on df_all.
df = df_all

In [8]:
# Display the site_url column as a quick look at the loaded frame.
df.site_url

237698                        http://army.mil
1043500                      http://femina.in
892937                   http://jiffstore.com
1118180          http://couponconnections.com
337273     http://memory-improvement-tips.com
                          ...                
1221991                 http://bigseminar.com
340360                   http://linksgain.net
892528      http://jewelrymakingmagazines.com
79317                      http://nypress.com
547901                 http://timeforkids.com
Name: site_url, Length: 910546, dtype: object

# Clean up

In [9]:
# Tokens that may legitimately end a complete sentence.
# NLTK's word_tokenize rewrites a closing double quote as the token "''", so
# the raw '"' entry alone never matches a sentence that ends with a quotation;
# "''" is listed explicitly to cover that case.
valid_end_punct = set((".","!","?",'"',"'","''"))
def strip_incomplete_sentences(text,flag_components=False):
    """Remove (or flag) a trailing sentence that lacks end punctuation.

    The text is split into sentences with NLTK. Only the final sentence is
    inspected: it counts as complete when its last word token is one of
    ``valid_end_punct``.

    Args:
        text: The text to examine.
        flag_components: When True, an incomplete final sentence is kept but
            wrapped in ``_cut_`` markers; when False it is dropped.

    Returns:
        The sentences re-joined with single spaces, or "" when the text
        contains no sentences.
    """
    sentences = nltk.tokenize.sent_tokenize(text)
    if not sentences:
        return ""
    words = nltk.tokenize.word_tokenize(sentences[-1])
    # A final sentence without any word tokens is treated as incomplete
    # rather than raising IndexError on words[-1].
    if not words or words[-1] not in valid_end_punct:
        if flag_components:
            sentences[-1] = "_cut_" + sentences[-1] + "_cut_"
        else:
            sentences = sentences[:-1]
    return " ".join(sentences)

class StraightTextRenderer(mistune.renderers.BaseRenderer):
    """mistune renderer that flattens parsed Markdown into plain prose text.

    With ``flag_components`` False, markup that is not prose (headings, HTML,
    code spans, tables, ...) is replaced by newlines and incomplete trailing
    sentences are dropped. With ``flag_components`` True nothing is dropped;
    instead each component is wrapped in ``_name_`` markers so the parse can
    be inspected.
    """

    def __init__(self,flag_components):
        """Store the rendering mode.

        Args:
            flag_components: True for the labelled debug output, False for
                the cleaned prose output.
        """
        # Run the base initialiser so that whatever state BaseRenderer sets
        # up for itself exists on this instance as well.
        super().__init__()
        self.flag_components = flag_components

    def finalize(self, data):
        """Join rendered fragments into one string.

        NOTE(review): released mistune 2.0.x is believed to call
        ``renderer.finalize`` to combine rendered tokens and to leave it
        unimplemented on BaseRenderer - confirm against the installed
        version. The method is simply unused on versions that never call it.
        """
        return ''.join(data)

    def text(self, text):
        """Plain text is passed through unchanged."""
        return text

    def link(self, link, text=None, title=None):
        """Render a link as its visible text, or the word "link" if it has none."""
        if text is None:
            return "link"
        else:
            return text

    def image(self, src, alt="", title=None):
        """Images contribute no text."""
        return ""

    def emphasis(self, text):
        """Emphasised text is kept without its markup."""
        return text

    def strong(self, text):
        """Strong text is kept without its markup."""
        return text

    def codespan(self, text):
        """Inline code is labelled in debug mode, otherwise replaced by a newline."""
        if self.flag_components:
            return "\n_codespan_%s_codespan\n" % text
        else:
            return "\n"

    def linebreak(self):
        """A hard line break becomes a newline (labelled in debug mode)."""
        if self.flag_components:
            return "\n_line break_\n"
        else:
            return "\n"

    def inline_html(self, html):
        """Inline HTML is labelled in debug mode, otherwise replaced by a newline."""
        if self.flag_components:
            return '\n_inline-html_%s_inline-html_\n' % html
        else:
            #HTML isn't prose
            return "\n"

    def paragraph(self, text):
        """Render a paragraph, cleaning each of its lines separately.

        Every newline-separated line is passed through
        ``strip_incomplete_sentences`` before the lines are re-joined.
        """
        if text == '': return text
        paragraphs = text.split('\n')
        paragraphs = (strip_incomplete_sentences(para,flag_components=self.flag_components) for para in paragraphs)
        text = "\n".join(paragraphs)
        if self.flag_components:
            return "\n_paragraph_\n" + text + "\n_paragraph_\n"
        else:
            return text + "\n"

    def heading(self, text, level):
        """Headings are labelled in debug mode, otherwise replaced by a newline."""
        if self.flag_components:
            return '\n_heading %d_ %s\n' % (level,text)
        else:
            #Headings aren't prose
            return "\n"

    def newline(self):
        """A blank line becomes a newline (labelled in debug mode)."""
        if self.flag_components:
            return '\n_newline_\n'
        else:
            return "\n"

    def thematic_break(self):
        """A thematic break becomes a newline (labelled in debug mode)."""
        if self.flag_components:
            return '\n_thematic-break_\n'
        else:
            return "\n"

    def block_text(self, text):
        """Block text is kept, followed by a newline (labelled in debug mode)."""
        if self.flag_components:
            return '\n_block-text_%s_block-text_\n' % text
        else:
            return "%s\n" % text

    def block_code(self, code, info=None):
        """Render a code block.

        In debug mode a whitespace-only block becomes a newline and any other
        block is labelled. Otherwise the content goes through ``paragraph``.
        """
        if self.flag_components:
            if not code.strip():
                return "\n"
            else:
                return '\n_block-code_%s_block-code_\n' % code
        else:
            #This stuff usually isn't code, treat it as a paragraph
            return self.paragraph(code)

    def block_quote(self, text):
        """Quoted text is kept, followed by a newline (labelled in debug mode)."""
        if self.flag_components:
            return '\n_block-quote_%s_block-quote_\n' % text
        else:
            return "%s\n" % text

    def block_html(self, html):
        """Block HTML is labelled in debug mode, otherwise replaced by a newline."""
        if self.flag_components:
            return "\n_block-html_%s_block-html\n" % html
        else:
            #HTML isn't prose
            return  "\n"

    def block_error(self, html):
        """Parser errors are labelled in debug mode, otherwise replaced by a newline."""
        if self.flag_components:
            return "\n_block-error_%s_block-error\n" % html
        else:
            #Errors aren't prose
            return "\n"

    def list(self, text, ordered, level, start=None):
        """Render a list from the concatenated output of its items.

        Debug mode labels the list and cleans every line with the incomplete
        sentence flagged rather than dropped. Otherwise the list is kept only
        when every item still has text after incomplete sentences are
        stripped; if any item ends up empty the whole list is replaced by a
        newline.
        """
        if text == '': return text
        if self.flag_components:
            paragraphs = [strip_incomplete_sentences(para,flag_components=True) for para in text.split('\n')]
            text = "\n".join(paragraphs)
            return "\n_list %s %d_\n%s\n_list_\n" % (ordered, level, text)
        #Lists are inconsistent in how many sentences they represent
        #Rule: If all of the rows are sentences, then we'll keep them
        #If any stripped list item is empty, return empty for everything
        # list_item() ends every item with "\n", so splitting on "\n" always
        # produces blank strings. They are separators, not items, and must be
        # skipped; otherwise the rule above would reject every list.
        rows = [row for row in text.split('\n') if row.strip()]
        kept = [strip_incomplete_sentences(row,flag_components=False) for row in rows]
        if not kept or any((not list_item.strip() for list_item in kept)):
            return "\n"
        return "\n".join(kept) + "\n"

    def list_item(self, text, level):
        """Each list item is emitted on its own line."""
        return "%s\n" % text

    def strikethrough(self, text):
        """Struck-through text is removed."""
        return ""

    def table(self, text):
        """Tables are labelled in debug mode, otherwise replaced by a newline."""
        if self.flag_components:
            return '\n_table_%s_table_\n' % (text)
        else:
            return "\n"

    def table_cell(self, content, align=None, is_head=False):
        """Cell content is discarded; debug mode leaves a marker."""
        if self.flag_components:
            return '\n_cell_\n'
        else:
            return ""

    def table_head(self, content):
        """Header content is discarded; debug mode leaves a marker."""
        if self.flag_components:
            return '\n_head_\n'
        else:
            return ""

    def table_row(self, content):
        """Rows are labelled in debug mode, otherwise removed."""
        if self.flag_components:
            return '_row_%s_row_\n' % content
        else:
            return ""

    def table_body(self, content):
        """Table bodies are labelled in debug mode, otherwise removed."""
        if self.flag_components:
            return '_body_%s_body_\n' % content
        else:
            return ""
    
# Two parsers built on the same renderer class: the first produces the cleaned
# prose, the second the labelled debug output.
markdown = mistune.create_markdown(renderer=StraightTextRenderer(False))
markdown_debug = mistune.create_markdown(renderer=StraightTextRenderer(True))

#Install mistune plugins
import mistune.plugins
# Table first, then strikethrough, on each parser in turn.
for _parser in (markdown, markdown_debug):
    for _install_plugin in (mistune.plugins.plugin_table, mistune.plugins.plugin_strikethrough):
        _install_plugin(_parser)
# Do not leave the loop variables behind in the notebook namespace.
del _parser, _install_plugin

def clean(policy_text):
    """Convert Markdown policy text to plain prose with the non-debug parser."""
    cleaned_text = markdown(policy_text)
    return cleaned_text

def clean_debug(policy_text):
    """Render Markdown policy text with the debug parser, which labels each component."""
    labelled_text = markdown_debug(policy_text)
    return labelled_text

In [11]:
# Run clean() over every policy_text value, with a tqdm progress bar, and store
# the result in a new policy_text_cleaned column. The recorded run over the
# full frame took more than four hours.
df["policy_text_cleaned"] = df.policy_text.progress_map(clean)

100%|██████████| 910546/910546 [4:19:10<00:00, 58.55it/s]   


In [13]:
# Output directory for the per-policy text files.
# NOTE(review): hard-coded absolute path on a specific machine; change it when
# running elsewhere.
directory = "/n/fs/scratch/rbamos/coh-metrix_files"
os.makedirs(directory,exist_ok=True)
# One file per policy, named "<year_season>-<index>.txt". Iterating the two
# needed columns directly avoids building a Series for every row.
for index, year_season, cleaned_text in zip(df.index, df.year_season, df.policy_text_cleaned):
    fn = f"{directory}/{year_season}-{index}.txt"
    # Write-only mode with an explicit encoding, so the output does not depend
    # on the locale of the machine running the notebook.
    with open(fn,"w",encoding="utf-8") as f:
        f.write(cleaned_text)

In [None]:
# Remove the name df. df_all was bound to the same DataFrame in an earlier
# cell, so the object stays referenced until df_all is deleted as well.
del df 

In [None]:
%%javascript
// NOTE(review): presumably ends this notebook's session once the run has finished - confirm.
Jupyter.notebook.session.delete();