### 1. Read in raw scraped files from directory
### 2. Strip HTML
### 3. Extract entities (using spacy)
### 3. Remove special characters / unicode
#### Created by Jackie Weiser

In [64]:
import os, re, string
from BeautifulSoup import BeautifulSoup
from bs4 import BeautifulSoup
from bs4.element import Comment

In [16]:
# spacy is used for Part of Speech tagging and Named Entity Recognition
# spacy is a non-standard python library which can be installed using 'pip install spacy' from the command line
# language models can be downloaded by running 'python -m spacy download <language>' from the command line
import spacy
language = 'en'
nlp_model  = spacy.load('en')
    
def get_multilingual_entities(text):
    
    doc = nlp_model(text)
    labels = [{ent.text:ent.label_} for ent in doc.ents]

    return labels

In [6]:
path_to_docs = './'
output_clean = 'clean_scraped_text/clean_scraped_text.txt'

In [7]:
def stripTags(text):
    scripts = re.compile(r'<script.*?/script>')
    css = re.compile(r'<style.*?/style>')
    tags = re.compile(r'<.*?>')

    text = scripts.sub('', text)
    text = css.sub('', text)
    text = tags.sub('', text)

    return text

def punctuation_remove(text):
    """
    Mutates and returns text where all punctuation are replaced
    """
    chars = re.escape(string.punctuation)
    return re.sub(r'['+chars+']', ' ',text)

def doublespace_remove(text):
    return re.sub(' +',' ',text)

In [8]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)

In [9]:
webfiles = [webfile for webfile in os.listdir(path_to_docs) if '.' in webfile]
htmlfiles = [{webfile:htmlfile} for htmlfile in os.listdir(path_to_docs + webfile) if htmlfile.endswith('.html') for webfile in webfiles]

In [54]:
characters_to_replace = ['\u']
htmldict = {}
for webfile in webfiles[0:10]:
    if not webfile in ['.gitignore','.DS_Store', '.git', '.ipynb_checkpoints','RawUrls.txt','README.md','SPLC_Strip_HTML.ipynb']:
        htmlfiles = [htmlfile for htmlfile in os.listdir(path_to_docs + webfile) if htmlfile.endswith('.html')]
        for htmlfile in htmlfiles:
            htmldict[webfile] = {}
            htmldict[webfile][htmlfile] = {}
            with open(path_to_docs + webfile + '/' + htmlfile, "r") as myfile:
                result = myfile.read()
            htmldict[webfile][htmlfile]['text'] = text_from_html(result)
            entities = get_multilingual_entities(htmldict[webfile][htmlfile]['text'])
            for char in characters_to_replace:
                htmldict[webfile][htmlfile]['text'] = htmldict[webfile][htmlfile]['text'].encode('ascii','replace').lower().replace(char," ")
            htmldict[webfile][htmlfile]['text'] = punctuation_remove(htmldict[webfile][htmlfile]['text'])
            htmldict[webfile][htmlfile]['text'] = doublespace_remove(htmldict[webfile][htmlfile]['text'])
            htmldict[webfile][htmlfile]['entities'] = entities
            with open(output_clean, "a") as myfile:
                myfile.write(str(htmldict[webfile][htmlfile]))

In [60]:
htmldict.keys()

['americanfreepress.net',
 'active-democracy.com',
 'addr.ws',
 'americanvikings.us',
 'americanvikings.com']

In [65]:
htmldict['americanfreepress.net']['index.html']['entities'][0:10]

[{u'                       ': u'GPE'},
 {u'Terms Of Service': u'ORG'},
 {u' ': u'NORP'},
 {u'ABOUT  HELP  PRIVACY': u'QUANTITY'},
 {u'POLICY       Facebook': u'ORG'},
 {u'Twitter': u'GPE'},
 {u'Youtube       ': u'PERSON'},
 {u' ': u'NORP'},
 {u'Multimedia   Radio': u'ORG'},
 {u'Audio': u'PERSON'}]

In [67]:
htmldict['americanfreepress.net']['index.html']['text'][0:2000]

' terms of service about help privacy policy facebook twitter youtube search for home multimedia radio video audio the andrew carrington hitchcock afp show archives support store subscribe about login news ticker march 30 2017 american free press is under attack national news november 11 2017 trump will release jfk documents audio november 10 2017 that bloodbath in the old dominion politics november 10 2017 gop tax plan increases the most insidious tax freedom november 10 2017 globalists seek to scuttle brexit world november 7 2017 democrats behind fake news dossier featured audio trump will release jfk documents politics that bloodbath in the old dominion freedom gop tax plan increases the most insidious tax world globalists seek to scuttle brexit featured democrats behind fake news dossier featured cleaning house conspiracy buzz rhode island tackles geoengineering audio homelessness infectious diseases combine to create health disaster audio top tier treason and the uss liberty natio