This notebook is designed to pre-process pickled news articles that have been scraped from HTML or text documents.

The environment that was used is printed out below.

In [1]:
# Print the interpreter version string of the running kernel.
print(__import__('sys').version)
# IPython shell escape: list the packages installed in the conda env named NLP37.
!conda list -n NLP37

3.7.1 (default, Dec 10 2018, 22:54:23) [MSC v.1915 64 bit (AMD64)]
# packages in environment at C:\Anaconda3\envs\NLP37:
#
# Name                    Version                   Build  Channel
_pytorch_select           1.1.0                       cpu  
altair                    3.1.0                    py37_0    conda-forge
asn1crypto                0.24.0                   py37_0  
atomicwrites              1.3.0                    py37_1  
attrs                     19.1.0                   py37_1  
backcall                  0.1.0                    py37_0  
beautifulsoup4            4.7.1                    pypi_0    pypi
blas                      1.0                         mkl  
boto                      2.49.0                   py37_0    anaconda
boto3                     1.9.162                    py_0    anaconda
botocore                  1.12.163                   py_0    anaconda
branca                    0.3.1                      py_0    conda-forge
bs4                       0.

In [1]:
%pylab

from dateutil.parser import parse
from dateutil.tz import tz
from datetime import date, datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm
from toolz import compose, curry, concat
from collections import Counter
from unicodedata import normalize

import os, time, re, pytz

import pickle

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [2]:
MIN_DOC_SIZE = 20 # documents are kept only if len(doc['content']) > MIN_DOC_SIZE (see filter_length)

###########################################################################
#        Some regex pattern for removing invalid unicode characters       #
###########################################################################

NORMALIZATION_PROTOCOL = 'NFKD' # Alt: NFKC, NFC, NFD

# Matches either ONE control character (\x00-\x1f, \x7f-\x9f) or a run of two
# or more whitespace characters. The alternatives are tried left to right, so
# whitespace control characters such as '\n' and '\t' are matched one at a
# time by the first alternative; a run of them therefore produces several
# matches rather than one.
# NOTE(review): clean_gtd_info's pattern contains runs of three spaces, which
# presumably come from this per-character behaviour - confirm before changing.
CTRL_REGEX = re.compile(r'[\x00-\x1f\x7f-\x9f]|\s{2,}') # Alt: #r'[^\x00-\x7f]'
                                                        # replace 2 or more space: \s{2,}

# A "word" is a run of word characters and apostrophes (so "don't" is one token).
WORD_REGEX = re.compile(r'[\w\']+')

In [3]:
def save_pickle(filename, data, protocol=2):
    '''
    Pickles `data` into the file at `filename`, overwriting any existing
    file. The path is normalised with os.path.normpath before opening and
    `protocol` is forwarded to pickle.dump. Returns None.
    '''
    target = os.path.normpath(filename)
    with open(target, 'wb') as sink:
        pickle.dump(data, sink, protocol)

def load_pickle(filename):
    '''
    Reads the file at `filename` (normalised with os.path.normpath) and
    returns the unpickled object.
    '''
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    source = os.path.normpath(filename)
    with open(source, 'rb') as stream:
        payload = pickle.load(stream)
    return payload

def replace_bad_chars(text, regex=CTRL_REGEX):
    '''
    Substitutes a single space for every match of `regex` in `text`.
    With the default pattern the matches are control characters (such as
    a newline) and repeated whitespace.
    '''
    cleaned = regex.sub(r' ', text)
    return cleaned

def split_word_boundary(text, regex=WORD_REGEX):
    '''
    Returns the list of all non-overlapping matches of `regex` in `text`,
    in order of appearance (the result of regex.findall).
    '''
    tokens = regex.findall(text)
    return tokens

def normalize_unicode(text, protocol=NORMALIZATION_PROTOCOL):
    '''
    Applies unicode normalisation form `protocol` to `text`, then drops
    every character that cannot be encoded as ascii and returns the
    result as a str.
    '''
    decomposed = normalize(protocol, text)
    # 'ignore' silently discards anything outside the ascii range
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf8')

# Text-cleaning pipeline. toolz.compose applies its arguments from right to
# left, so a string is first unicode-normalised, then has bad characters
# replaced by spaces, and finally has every '&' replaced with 'and'.
clean_text = compose(lambda x: x.replace('&', 'and'),
                     replace_bad_chars,
                     normalize_unicode)

def map_clean_text(data):
    '''
    Applies clean_text to the 'content' field of every document in `data`.
    The documents are modified in place and the same `data` object is
    returned so the function can be used as a pipeline stage.
    '''
    print('Cleaning text...')
    # Let tqdm wrap the iterable: the bar is then closed automatically when
    # the loop finishes, instead of being left open and interleaving with
    # the output of later stages.
    for doc in tqdm(data):
        doc['content'] = clean_text(doc['content'])
    return data

def stamp_paragraph_breaks(data):
    '''
    Replaces every newline in each document's 'content' with the marker
    ' NEWPARAGRAPH ' and then replaces double spaces with single spaces
    (one pass). Documents are modified in place; `data` is returned.
    '''
    print('Stamping paragraphs...')
    # Let tqdm wrap the iterable so the bar is closed when the loop ends.
    for doc in tqdm(data):
        doc['content'] = doc['content'].replace('\n', ' NEWPARAGRAPH ').replace('  ', ' ')
    return data

In [4]:
def directory_explorer(extension, directory):
    '''
    A generator to find filenames with a given extension within a given
    directory.

    extension: suffix to look for, e.g. '.pkl'. Matching is
        case-insensitive, so '.pkl', '.PKL' and '.Pkl' are all found.
    directory: folder to list (not searched recursively).

    Yields '<directory>/<filename>' for each match, in sorted filename
    order so that repeated runs visit the files in the same order.
    '''
    wanted = extension.lower()
    # os.listdir returns entries in arbitrary order; sort for reproducibility
    for filename in sorted(os.listdir(os.path.normpath(directory))):
        if filename.lower().endswith(wanted):
            yield '%s/%s' % (directory, filename)

def aggregate_docs(filenames):
    '''
    Loads every pickle named in `filenames` and concatenates their items
    into one flat list, preserving file order and item order.
    '''
    return [doc
            for filename in filenames
            for doc in load_pickle(filename)]

In [5]:
def partition_by_date(lis):
    '''
    Groups documents by the calendar date of their 'datetime' field.
    Returns a list of lists, one per distinct date, ordered by the first
    appearance of each date; documents keep their input order within a
    group.
    '''
    groups = {}
    for doc in lis:
        day = doc['datetime'].date()
        groups.setdefault(day, []).append(doc)
    return list(groups.values())

In [11]:
def window_brackets(window_size, step, start='01 Jan 2017 00:00', end='05 Jan 2018 00:00'):
    '''
    Creates a list of (window_start, window_end) tuples of UTC-aware
    datetime objects. Each window spans 'window_size' hours and
    consecutive windows begin 'step' hours apart, i.e.
    No. tuples = ceil((end - start) / step).

    window_size, step: durations in hours; step must be positive.
    start, end: strings in '%d %b %Y %H:%M' format. A window is created
        for every window start in [start, end); the end of the last
        windows may lie beyond 'end'.

    Raises ValueError if step is not positive (the loop below would
    otherwise never terminate) or if start/end do not match the format.
    '''
    if step <= 0:
        raise ValueError('step must be a positive number of hours, got %r' % (step,))

    start = datetime.strptime(start, '%d %b %Y %H:%M').replace(tzinfo=pytz.utc)
    end = datetime.strptime(end, '%d %b %Y %H:%M').replace(tzinfo=pytz.utc)
    window_size = timedelta(hours=window_size)
    step = timedelta(hours=step)
    brackets = []

    while start < end:
        brackets.append((start, start + window_size))
        start += step

    return brackets

In [12]:
# Filter documents which are too short and have no useful information.
# Keeps only docs for which len(doc['content']) > MIN_DOC_SIZE. This is the
# builtin `filter` with its predicate pre-bound, so calling it on a
# collection returns a lazy, single-use iterator rather than a list.
filter_length = curry(filter)(lambda x: len(x['content']) > MIN_DOC_SIZE)

def datetime_sort(data):
    '''
    Returns a new list of the documents ordered by ascending 'datetime'.
    The sort is stable, so documents with equal datetimes keep their
    relative input order. The input itself is not modified.
    '''
    ordered = list(data)
    ordered.sort(key=lambda doc: doc['datetime'])
    return ordered

def datetime_array(lis):
    '''
    Collects the 'datetime' field of every document in the corpus, in
    order, and returns them as an array.
    '''
    stamps = []
    for doc in lis:
        stamps.append(doc['datetime'])
    return array(stamps)

def assign_ids(data):
    '''
    Writes a running integer into the 'id' field of each document,
    starting at 0 and following the current order of `data`. Documents
    are modified in place and `data` is returned.
    '''
    next_id = 0
    for doc in data:
        doc['id'] = next_id
        next_id += 1
    return data

In [13]:
def clean_gtd_info(data):
    '''
    Removes every occurrence of the 'Incident Summary: <dd/dd/dddd>: '
    header pattern from each string in `data` and strips surrounding
    whitespace. Returns a new list, showing a progress bar while working.
    '''
    summary_header = re.compile(r'   Incident Summary:   "?\d{2}/\d{2}/\d{4}: ')
    cleaned = []
    for info in tqdm(data):
        cleaned.append(summary_header.sub('', info).strip())
    return cleaned

In [14]:
def datetime_correction(
        data,
        custom_tz_path=r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Timezones__/timezones.pkl'):
    '''
    Some docs seem to be missing datetime which is because the date seems to have
    been collected in the content on some docs. This function will correct this.

    For every doc whose 'datetime' is still a string, the first three
    space-separated tokens of 'content' are taken as the date, stored in
    'raw-date', parsed (fuzzily) and converted to UTC. On success the three
    tokens are removed from 'content'; docs whose date cannot be parsed are
    deleted from `data`.

    data: list of doc dicts; modified in place and returned.
    custom_tz_path: path to a pickled {name: tzinfo} mapping of extra
        timezone names. Entries never override a pytz name. Pass None to
        use the pytz names only. A path that does not exist raises, as
        before, rather than silently parsing with fewer timezones.
    '''

    print('Correcting datetime captures...')
    def datetime_interpretor(*args, default_tzinfo=tz.gettz('UTC'), **kwargs):
        # called with fuzzy_with_tokens=True, so parse returns a tuple whose
        # first element is the datetime
        dt = parse(*args, **kwargs)[0]
        # dates without timezone information are taken to be UTC
        return dt.replace(tzinfo=dt.tzinfo or default_tzinfo)

    # NOTE(review): pytz zones attached through datetime.replace() are known
    # to use a zone's first recorded offset (often LMT) for region names;
    # confirm this does not matter for the names that occur in the corpus.
    timezones = {t: pytz.timezone(t) for t in pytz.all_timezones}

    if custom_tz_path:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(os.path.normpath(custom_tz_path), 'rb') as file:
            custom_tz = pickle.load(file)
        for tz_code in custom_tz.keys():
            if tz_code not in timezones:
                timezones[tz_code] = custom_tz[tz_code]

    deletes = []

    for i, doc in enumerate(data):
        if isinstance(doc['datetime'], str):
            content = doc['content'].split(' ')
            raw_date = ' '.join(content[:3])
            doc['raw-date'] = raw_date
            try:
                doc['datetime'] = datetime_interpretor(raw_date,
                                                    fuzzy_with_tokens=True,
                                                    tzinfos=timezones
                                                    ).astimezone(pytz.timezone('utc'))
            except (ValueError, OverflowError):
                # dateutil raises ValueError for unparseable strings and
                # OverflowError for out-of-range values; both mean the doc
                # has no usable date.
                deletes.append(i)
                continue
            doc['content'] = ' '.join(content[3:])
    print('%s docs removed due to invalid dates' % len(deletes))
    # delete from the back so the remaining indices stay valid
    for i in sorted(deletes, reverse=True):
        del data[i]

    return data

In [15]:
def publication_correction(data):
    '''
    Repairs and canonicalises the 'publication' field of every document.

    Docs whose publication is 'Unknown' have it recovered from the start
    of 'content' (a leading 'The Guardian...' sentence or one of the BBC
    Monitoring bylines), and the recovered text is removed from 'content'.
    Afterwards known name variants are mapped onto one canonical name.
    Documents are modified in place and `data` is returned.
    '''
    print('Correcting publication captures...')

    # Checked in this order: the longest byline must be tried first because
    # it begins with the shortest one.
    bbc_bylines = (
        'Supplied by BBC Worldwide Monitoring. By BBC Monitoring',
        'By BBC Monitoring',
        'Supplied by BBC Worldwide Monitoring',
    )
    exact_renames = (
        ('EuroNews - English Version', 'EuroNews'),
        ('Ukrinform(Ukraine)', 'Ukrinform'),
        ('Agence France Presse -- English', 'Agence France Presse'),
    )

    for doc in data:
        if doc['publication'] == 'Unknown':
            text = doc['content']
            sentences = text.split('. ')
            if sentences[0].startswith('The Guardian'):
                doc['publication'] = sentences[0].strip()
                doc['content'] = '. '.join(sentences[1:])
            else:
                for byline in bbc_bylines:
                    if text.startswith(byline):
                        doc['publication'] = 'BBC Monitoring'
                        doc['content'] = text[len(byline):]
                        break

        name = doc['publication']
        if 'The Guardian' in name:
            doc['publication'] = 'The Guardian'
        elif 'BBC Monitoring' in name or 'Supplied by BBC' in name:
            doc['publication'] = 'BBC Monitoring'
        else:
            for variant, canonical in exact_renames:
                if name == variant:
                    doc['publication'] = canonical
                    break
    return data

In [16]:
def windowed_dedupe(brackets, lis):
    '''
    Moves a sliding window accross the dataset and removes documents with
    identical content within the window. Within each window the document
    that appears first in `lis` is retained, which is the oldest one when
    `lis` is sorted by datetime.

    brackets: iterable of (start, end) datetimes; a document belongs to a
        window when start <= doc['datetime'] < end.
    lis: list of doc dicts with 'datetime' and (hashable) 'content';
        modified in place and returned.
    '''
    print('Removing duplicate documents... ')
    start_size = len(lis)

    # tqdm wraps the iterable so the bar is closed once all windows are done
    for start, end in tqdm(brackets):
        seen = set()
        deletes = []
        for position, doc in enumerate(lis):
            if start <= doc['datetime'] < end:
                # a set lookup replaces the pairwise comparison of every
                # document in the window against every later one
                if doc['content'] in seen:
                    deletes.append(position)
                else:
                    seen.add(doc['content'])

        # delete from the back so the remaining positions stay valid
        for position in reversed(deletes):
            del lis[position]

    end_size = len(lis)

    print('removed %d docs' % (start_size - end_size))

    return lis

In [17]:
def windowed_dechild(brackets, lis, datetime_array=datetime_array):
    '''
    Moves a sliding window accross the dataset and removes documents which
    are children i.e. their content is identical to a substring in another
    (parent) document within the window. The parent, which holds the most
    content, is retained. If two documents in a window have identical
    content, the one that appears first in `lis` is retained.

    brackets: iterable of (start, end) datetimes; a document belongs to a
        window when start <= doc['datetime'] < end.
    lis: list of doc dicts with 'datetime' and string 'content'; modified
        in place and returned.
    datetime_array: unused; kept so existing callers remain valid.
    '''
    print('Removing child documents... ')
    start_size = len(lis)

    # tqdm wraps the iterable so the bar is closed once all windows are done
    for start, end in tqdm(brackets):
        idx = [position for position, doc in enumerate(lis)
               if start <= doc['datetime'] < end]
        deletes = set()
        for a, first in enumerate(idx):
            for second in idx[a + 1:]:
                earlier = lis[first]['content']
                later = lis[second]['content']
                if earlier == later:
                    # exact duplicate: keep the one that appears first
                    deletes.add(second)
                elif earlier in later:
                    # the earlier doc is fully contained in the later one,
                    # so the earlier doc is the child
                    deletes.add(first)
                elif later in earlier:
                    deletes.add(second)

        # delete from the back so the remaining positions stay valid
        for position in sorted(deletes, reverse=True):
            del lis[position]

    end_size = len(lis)

    print('removed %d docs' % (start_size - end_size))

    return lis

#### Choose a pipe and make sure the input and output paths are correct. 

## Process document corpus

In [18]:
# Input (pickled scrape) and output (pre-processed corpus) locations.
# NOTE: these are absolute paths on the author's machine; edit before running.
FILENAME = r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Data__\02 HTML Scrapes\out\EuroNews_-_English_Version_Sky_News_Ukrinfor2019-07-18_03-41.pkl'
OUTPUT_FILENAME = r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Data__\03 Preprocessing\out\pre_processed.pkl'

# toolz.compose applies the stages from the LAST argument to the first:
# load -> clean text -> fix publication -> fix datetime -> drop short docs
# -> sort by datetime -> dedupe -> dechild -> assign ids -> save.
# Both windowing stages use 48-hour windows stepped every 12 hours over
# window_brackets' default start/end dates.
preprocess_pipe = compose(curry(save_pickle)(OUTPUT_FILENAME),
                          assign_ids,
                          curry(windowed_dechild)(window_brackets(48, 12)),
                          curry(windowed_dedupe)(window_brackets(48, 12)),
                          datetime_sort,
                          filter_length,
                          datetime_correction,
                          publication_correction,
                          map_clean_text,
                          load_pickle,
                          )

In [19]:
# Run the whole pipeline; the result is written to OUTPUT_FILENAME and the
# call itself evaluates to None because save_pickle is the final stage.
preprocess_pipe(FILENAME)

Cleaning text...


100%|██████████████████████████████████████████████████████████████████████████████| 236/236 [00:00<00:00, 1903.27it/s]


Correcting publication captures...
Correcting datetime captures...
0 docs removed due to invalid dates
Removing duplicate documents... 


100%|██████████████████████████████████████████████████████████████████████████████▊| 736/738 [00:00<00:00, 931.27it/s]

removed 8 docs


100%|██████████████████████████████████████████████████████████████████████████████| 738/738 [00:00<00:00, 1033.26it/s]


Removing child documents... 


 90%|██████████████████████████████████████████████████████████████████████▉        | 663/738 [00:01<00:00, 108.33it/s]

removed 1 docs


100%|███████████████████████████████████████████████████████████████████████████████| 738/738 [00:01<00:00, 430.57it/s]


In [1]:
# Load the pre-processed corpus; as the last expression of the cell the whole
# object is echoed as output. NOTE(review): uses load_pickle, so the cell
# that defines it must have been run first.
load_pickle(r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Data__\03 Preprocessing\out\pre_processed.pkl')

In [15]:
# Alternative input/output locations for the same pipeline.
# NOTE: this cell rebinds FILENAME, OUTPUT_FILENAME and preprocess_pipe, so
# whichever pipeline cell ran last determines what preprocess_pipe does.
FILENAME = r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Data__\03 Preprocessing\Raw Scrape.pkl'
OUTPUT_FILENAME = r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Data__\03 Preprocessing\Paragraphs.pkl'

# Stages run from the last argument to the first (load_pickle first,
# save_pickle last). The paragraph-stamping stage is currently disabled.
preprocess_pipe = compose(
                          curry(save_pickle)(OUTPUT_FILENAME),
                          assign_ids,
                          curry(windowed_dechild)(window_brackets(48, 12)),
                          curry(windowed_dedupe)(window_brackets(48, 12)),
                          datetime_sort,
                          filter_length,
                          datetime_correction,
                          publication_correction,
                          map_clean_text,
                          #stamp_paragraph_breaks,
                          load_pickle,
                          )

In [2]:
# Run whichever preprocess_pipe / FILENAME pair was defined most recently.
preprocess_pipe(FILENAME)

## Process SVM training data

In [13]:
# Locations for the SVM negative-example training data.
# NOTE(review): INPUT_FILENAME is not used in this cell. Both it and
# OUTPUT_FILENAME are .pkl files inside INPUT_FOLDER, so a re-run would
# presumably pick up earlier aggregates as input as well - confirm.
INPUT_FOLDER = r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Data__\04 Training Data\SVM Negatives'
INPUT_FILENAME = r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Data__\04 Training Data\SVM Negatives\Agg.pkl'
OUTPUT_FILENAME = r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Data__\04 Training Data\SVM Negatives\Agg2.pkl'

# Stages run from the last argument to the first: list the .pkl files in the
# folder -> load and concatenate them -> clean text -> fix publication ->
# fix datetime -> dedupe (48-hour windows every 24 hours from 1 Jan 2009)
# -> save. Unlike the corpus pipeline there is no datetime_sort stage, so
# the duplicate that is kept is the first in load order.
pipe = compose(curry(save_pickle)(OUTPUT_FILENAME),
               curry(windowed_dedupe)(window_brackets(48, 24, '01 Jan 2009 00:00')),
               datetime_correction,
               publication_correction,
               map_clean_text,
               #load_pickle,
               aggregate_docs,
               curry(directory_explorer)('.pkl'),
               )

pipe(INPUT_FOLDER)

Cleaning text...


100%|████████████████████████████████████████████████████████████████████████████| 6829/6829 [00:01<00:00, 3642.20it/s]


Correcting publication captures...
Correcting datetime captures...
27 docs removed due to invalid dates
Removing duplicate documents... 


100%|████████████████████████████████████████████████████████████████████████████▋| 3278/3291 [00:29<00:00, 154.53it/s]

removed 2232 docs


100%|█████████████████████████████████████████████████████████████████████████████| 3291/3291 [00:40<00:00, 154.53it/s]

## Process GTD descriptions

In [31]:
# Locations of the raw and pre-processed GTD incident descriptions.
# NOTE: rebinds INPUT_FILENAME and OUTPUT_FILENAME from the SVM section.
INPUT_FILENAME = r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Data__\GTD\Info.pkl'
OUTPUT_FILENAME = r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Data__\GTD\Preprocessed Info.pkl'


# Stages run from the last argument to the first: load -> clean_text on each
# item (lazily, via map) -> strip the 'Incident Summary' header -> save.
process_gtd_info = compose(curry(save_pickle)(OUTPUT_FILENAME),
                           clean_gtd_info,
                           curry(map)(clean_text),
                           load_pickle,
                           )
                           


In [32]:
# Run the GTD pipeline; the cleaned descriptions are written to OUTPUT_FILENAME.
process_gtd_info(INPUT_FILENAME)