In [1]:
%pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Python imports
import os
import sys; print(sys.version);
import time
import codecs
import re
import regex
import collections
import pathlib
import datetime

# Local imports
import definitions as defs

# Scrapy imports
import w3lib.html
import bs4 

# Utilities 
from tqdm.notebook import tqdm
from termcolor import colored

# Concurrency
import requests
import multiprocessing

def current_timestamp():
    return datetime.datetime.now().strftime("%d-%b-%Y %H:%M:%S.%f")
print(current_timestamp())

3.6.9 (default, Oct  8 2020, 12:12:24) 
[GCC 8.4.0]
18-Nov-2020 17:51:03.448980


# Terminal section

In [63]:
ls

wiki_00_m.txt  wiki_20_m.txt  wiki_40_m.txt  wiki_60_m.txt  wiki_80_m.txt
wiki_01_m.txt  wiki_21_m.txt  wiki_41_m.txt  wiki_61_m.txt  wiki_81_m.txt
wiki_02_m.txt  wiki_22_m.txt  wiki_42_m.txt  wiki_62_m.txt  wiki_82_m.txt
wiki_03_m.txt  wiki_23_m.txt  wiki_43_m.txt  wiki_63_m.txt  wiki_83_m.txt
wiki_04_m.txt  wiki_24_m.txt  wiki_44_m.txt  wiki_64_m.txt  wiki_84_m.txt
wiki_05_m.txt  wiki_25_m.txt  wiki_45_m.txt  wiki_65_m.txt  wiki_85_m.txt
wiki_06_m.txt  wiki_26_m.txt  wiki_46_m.txt  wiki_66_m.txt  wiki_86_m.txt
wiki_07_m.txt  wiki_27_m.txt  wiki_47_m.txt  wiki_67_m.txt  wiki_87_m.txt
wiki_08_m.txt  wiki_28_m.txt  wiki_48_m.txt  wiki_68_m.txt  wiki_88_m.txt
wiki_09_m.txt  wiki_29_m.txt  wiki_49_m.txt  wiki_69_m.txt  wiki_89_m.txt
wiki_10_m.txt  wiki_30_m.txt  wiki_50_m.txt  wiki_70_m.txt  wiki_90_m.txt
wiki_11_m.txt  wiki_31_m.txt  wiki_51_m.txt  wiki_71_m.txt  wiki_91_m.txt
wiki_12_m.txt  wiki_32_m.txt  wiki_52_m.txt  wiki_72_m.txt  wiki_92_m.txt
wiki_13_m.txt  wiki_33_m.

In [51]:
cd ../output

/home/filip_markoski/scraping/output


In [2]:
%cd /home/filip_markoski/scraping/output/sr/text/AG/

/home/filip_markoski/scraping/output/sr/text/AG


In [None]:
%cd /home/filip_markoski/scraping/output/sr/text/AG/

In [None]:
cat wiki_10_m.txt

# Python section

In [6]:

class EntityTemplate:

    def generate_template_regex(self, injection):
        template_regex_injection = r''
        for character in injection:
            template_regex_injection += f'[{character}]'
        regex_string = self.BASE_TEMPLATE_REGEX.format(injection=template_regex_injection)
        return  re.compile(regex_string, re.IGNORECASE | re.UNICODE | re.MULTILINE)
    
    def __init__(self, entity_name:str, entity_regex:str,
                 group_to_capture:int = 0, groups_to_concat:list=None,
                 do_split=False,
                 split_string='',
                 wrapper_string='{}', 
                 joiner_string=''):
        self.entity_name = entity_name 
        self.entity_presentation = ''.join([c for c in self.entity_name if c.isalpha()]).upper()
        self.entity_regex = re.compile(r'' + entity_regex, re.IGNORECASE | re.UNICODE | re.MULTILINE)
        self.group_to_capture = group_to_capture
        self.groups_to_concat = groups_to_concat
        self.wrapper_string = wrapper_string # e.g. '<section title={}>'
        self.joiner_string = joiner_string
        self.dictionary = dict()
        self.counter = 0
        
        self.do_split = do_split
        self.split_string = split_string

        self.BASE_TEMPLATE_STRING = '|%___{injection}_{{}}___%|'
        self.BASE_TEMPLATE_REGEX = '\|\%\_\_\_{injection}\_[0-9]+\_\_\_\%\|'
        self.template_string = self.BASE_TEMPLATE_STRING.format(injection=self.entity_presentation)
        self.template_regex = self.generate_template_regex(injection=self.entity_presentation)
        
    def fn_entity_to_template_replace(self, match):
        capture = None
        
        if self.groups_to_concat: # is not None, list(), or len() = 0 
            concatenated = collections.deque() 
            for group_i in self.groups_to_concat:
                concatenated.append(match.group(group_i))
            capture = self.joiner_string.join(concatenated)
        else:
            capture = match.group(self.group_to_capture)
        
        if capture == None:
            raise ValueError('The captured RegEx groups are invalid.')
            
        if self.do_split:
            parts = capture.split(self.split_string)
            capture = parts[-1]
        
        self.dictionary[self.counter] = (capture, match.start(), match.end())
        self.counter += 1
        return self.template_string.format(self.counter - 1)
    
    def entity_to_template(self, content):
        return self.entity_regex.sub(self.fn_entity_to_template_replace, content)
    
    def fn_template_to_entity_replace(self, match):
        substitution = self.dictionary[len(self.dictionary) - self.counter][0]
        self.counter -= 1
        substitution = self.wrapper_string.format(substitution)
        return substitution
    
    def template_to_entity(self, processed_content):
        return self.template_regex.sub(self.fn_template_to_entity_replace, processed_content)


In [None]:
source_file = os.path.join(defs.DATA_DIR, 'destination_item.txt')

# Prints every third line from each Doc object
with codecs.open(source_file, 'r', 'utf-8') as fin:
    content = fin.read()
    soup = bs4.BeautifulSoup(content, 'html.parser')
    docs = soup.find_all('doc')
    for doc in docs:
        parts = doc.text.split(os.linesep)
        inner = bs4.BeautifulSoup(doc.text)
        print(parts[2])

# For one file

In [29]:
%%time

mapping = {
    '__NOEDITSECTION__' : '',
    '__БЕЗКН__': '',
    '__БЕЗСАДРЖАЈА__': '',
    
    u"\u200B": ' ',
    u'\u0020': ' ',
    u'\u00A0': ' ',
    u'\u2000': ' ',
    u'\u2001': ' ',
    u'\u2002': ' ',
    u'\u2003': ' ',
    u'\u2004': ' ',
    u'\u2005': ' ',
    u'\u2006': ' ',
    u'\u2007': ' ',
    u'\u2008': ' ',
    u'\u2009': ' ',
    u'\u200A': ' ',
    u'\u2028': ' ',
    u'\u205F': ' ',
    u'\u3000': ' ',
       
    ' () ' : '',
    '()': '',
    '„' : '"',
    '“' : '"',
    '""': '"',
    ' .' : '.',
    ',\n' : ', ', 
    '-{' : '',
    '}-' : '',
    '„"': '"',
    '"“' : '"',
    ' (, - , ) ': ' ',

    'BULLET::::- ': '',
    'BULLET::::': '',
    'BULLET': '',
    '( ': '(',
    ' )': ')',
    '‎': ' ',
    'м   I': 'м I',
    'п. н. е.': 'п.н.е.',
    
    ' () ': '',
    '(– ': '(',
    '(, , ': '(',
    ' ; ': ';',
    '(; ; ': '(',
    '(, ': '(',
    '(, , ) ': '',
    '(, , )': '',    
    '(,)': '',
    '(;)' : '',
    '(; ;)': '',
    '(; - )': '',
    '(; -)': '',
    ', )': ')',
    '(, ;) ': '',
    '(; ': '(',
    '(; ; ': '(',
    '(", , "': '(',
    '("; ; "': '(',
    ' –)': ')',
    ' ,)': ')',
    ' ;)': ')',
    ' .)': ')',
    ',)': ')',
    ';)': ')',
    
    '(на , " ; на ; на ; на)': '',
    '(на , "" ; на ; на ; на)': '',
    '\n\n\n\n':'\n',
    '\n\n\n':'\n',
    '\n\n':'\n',
    '..':'.',
    '  ': ' ',
    '   ': ' ',
    '   ': ' ',
    '   ': ' ',
    
    # HTML Tags
    '<br>': '',
}

def replace_all(text, dic):
    for i, j in dic.items():
        text = text.replace(i, j)
    return text

def iterate_item(item):
    source_file, destination_file = item
    
    process = multiprocessing.current_process().name
    try:
        
        with codecs.open(source_file, 'r', 'utf-8') as fin:
            with codecs.open(destination_file, 'w', 'utf-8') as fout:

                #################
                ## PREPARATION ##
                #################
                
                # For Section:::: segments
                SectionEntityTemplate = EntityTemplate(entity_name='Section', entity_regex=r'(\bSection\b\:{4})(.+)', 
                     group_to_capture=2, wrapper_string='<section title=\"{}\">')
                
                # For Ordered Lists
                OrderedListEntityTemplate = EntityTemplate(entity_name='OrderedList',
                                                entity_regex=r'^([-\d+][\. \t]*)([^\r\n]+)(?=[\r\n]*)', 
                                                groups_to_concat=[1, 2])

                # For URLs
                UrlEntityTemplate = EntityTemplate(entity_name='Url', 
                    entity_regex = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})')
                
                # For non-standard websites and various file formats (e.g. google.com, file.txt)
                DomainUrlEntityTemplate = EntityTemplate(entity_name='DomainUrl',
                                                        entity_regex=r'[ \'\"\(\[\,]{0,1}([\-a-zA-Z0-9]+)\.(\.|com|org|net|int|edu|gov|mil|arpa|ac|ac|ac|uk|ac|in|ac|id|ad|ad|ae|af|ag|ai|al|am|ao|aq|ar|as|at|at|au|aw|ax|al|ad|az|ba|bb|bd|be|bf|bg|bg|bh|bi|bj|bm|bn|bo|bq|br|ed|br|go|br|bs|bt|bw|by|bz|st|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cx|cy|cz|de|dj|dk|dm|do|dz|ec|in|ec|ee|eg|eh|er|es|et|eu|eu|fi|fj|fk|fm|fo|fr|ga|gd|ge|gf|gg|gh|gi|gl|ga|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|aq|hn|hr|ht|hu|id|id|id|id|id|id|id|id|ie|il|im|in|in|in|in|in|in|in|io|iq|ir|ir|is|is|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|me|me|mg|mh|mk|ml|mm|mn|go|mn|or|mn|ed|mn|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|ne|ne|nf|ng|ni|nl|no|no|no|np|np|nr|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pn|pn|pr|ps|pt|pw|py|qa|re|ro|rs|sr|ru|su|rw|sa|sb|sc|sd|se|sg|sh|si|sk|sl|sm|sn|so|sr|ss|st|bz|su|sv|sx|si|sm|ms|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tp|tm|tn|to|tr|nc|tr|tt|tv|tw|tz|ua|ug|uk|uk|us|go|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|json|txt|aif|cda|mid|midi|mp3|mpa|ogg|wav|wma|wpl|zip|7z|arj|deb|pkg|rar|rpm|tar|gz|z|zip|bin|dmg|iso|toast|vcd|csv|dat|db|dbf|log|mdb|sav|g|sql|tar|xml|email|eml|emlx|msg|oft|ost|pst|vcf|exe|apk|bat|bin|cgi|pl|com|exe|gadget|jar|msi|py|wsf|fnt|fon|otf|ttf|ai|bmp|gif|ico|jpeg|jpg|png|ps|psd|svg|tif|tiff|asp|aspx|cer|cfm|cgi|pl|css|htm|html|js|jsp|part|php|py|rss|xhtml|key|odp|pps|ppt|pptx|c|cgi|pl|class|cpp|cs|h|java|php|py|sh|swift|vb|ods|xls|xlsm|xlsx|bak|cab|cfg|cpl|cur|dll|dmp|drv|icns|ico|ini|lnk|msi|sys|tmp|3g2|3gp|avi|flv|h264|264|m4v|mkv|mov|mp4|mpg|mpeg|rm|swf|vob|wmv|doc|docx|odt|pdf|rtf|tex|txt|wpd)+[^a-zA-Z]')
                
                # For Ellipses (...)
                ThreeDotsEntityTemplate = EntityTemplate(entity_name='ThreeDots',
                                                        entity_regex=r'(\.{3})')
                

                # For Intralinks
                SquareBracketEntityTemplate = EntityTemplate(entity_name='SquareBracket', 
                                                             entity_regex=r'\[{2}([^\]\]]*)\]{2}',
                                                             group_to_capture=1,
                                                            do_split=True,
                                                            split_string='|')
                
                ############################
                ## PROCESSING STARTS HERE ##
                ############################
                
                content = fin.read()
                content = content.encode('utf-8', 'strict').decode('utf-8', 'strict')
                
                content = w3lib.html.remove_tags(content, which_ones=('a','b','i','br','hr'))
                
                # Replaces formulas with a <formula> tag
                content = re.sub(r'formula_[0-9]+', '<formula>', content, flags=re.IGNORECASE | re.UNICODE | re.MULTILINE)
                # Removes faulty beginnings which start with a punctuation
                content = re.sub(r'^[\,\.\;\:\!\? ]+[ ]*', '', content, flags=re.IGNORECASE | re.UNICODE | re.MULTILINE)
                # Removes faulty brackets (,) (, ,) commonly found near the start of a wikipedia article
                content = re.sub(r"""[ \( ,;]+[-– "':\)]+""", ' ', content, flags=re.IGNORECASE | re.UNICODE | re.MULTILINE)

                content = ThreeDotsEntityTemplate.entity_to_template(content)
                content = replace_all(content, mapping)
                
                #############################
                ## BULLLET LIST PROCESSING ##
                #############################
                
                parts = content.split(os.linesep)
                parts = list(map(lambda x: x.strip(), parts))

                bullet_flag = False
                bullet_marker = 'BULLET::::'
                bullet_list = list()
                bullet_start = 0
                bullet_end = 0

                for index, part in enumerate(parts):
                    if bullet_marker in part:
                        parts[index] = '\n '.join(item.split(bullet_marker))

                content = os.linesep.join(parts)
                
                #######################
                ## ENTITY EXTRACTION ##
                #######################
                
                content = SquareBracketEntityTemplate.entity_to_template(content)
                content = OrderedListEntityTemplate.entity_to_template(content)
                content = SectionEntityTemplate.entity_to_template(content)
                content = UrlEntityTemplate.entity_to_template(content)
                content = DomainUrlEntityTemplate.entity_to_template(content)
                
                # Catches all full stops except those in numbers and URLs.
                content = re.sub(r'\.(?=[^0-9 ,\"\')])', '. ', content)
                content = re.sub(r'\,(?=[^0-9 .\"\'])', ', ', content)
                
                content = replace_all(content, mapping)
                
                #########################
                ## ENTITY RE-INJECTION ##
                #########################
                
                content = OrderedListEntityTemplate.template_to_entity(content)
                content = SquareBracketEntityTemplate.template_to_entity(content)
                content = DomainUrlEntityTemplate.template_to_entity(content)
                content = UrlEntityTemplate.template_to_entity(content)
                content = SectionEntityTemplate.template_to_entity(content)
                content = ThreeDotsEntityTemplate.template_to_entity(content)
                
                fout.write(content)
    except:
        colored(text=f'\t[{process}]: Error found at item - {source_file}, {error}', color='red')
        return item 
    
# source_file = os.path.join(defs.DATA_DIR, 'sl', 'text', 'AC', 'wiki_10')
# destination_file = os.path.join(defs.DATA_DIR, 'mk', 'text', 'AD', 'wiki_10_mod.txt')

source_file = os.path.join(defs.DATA_DIR, 'source_item.txt')
destination_file = os.path.join(defs.DATA_DIR, 'destination_item.txt')

item = (source_file, destination_file)
content = iterate_item(item)

CPU times: user 2.4 s, sys: 0 ns, total: 2.4 s
Wall time: 2.41 s


# For all files

In [30]:
%%time

session = None

def set_global_session():
    global session
    if not session:
        session = requests.Session()

def iterate_items_parallelized(items:list, level=1):
    if not items or len(items) == 0 or level > 3:
        print('Terminating...')
        return

    print(colored(text=f'Starting to iterate {len(items)} items... (Trial no. {level}){os.linesep}',
                  attrs=['bold']))

    with multiprocessing.Pool(initializer=set_global_session) as pool:
        pool.map(iterate_item, items)


def main(language):
    start_time = time.time()
    print('Start Time:', current_timestamp())
    
    lang_directory = os.path.join(defs.DATA_DIR, language, 'text')
    os.makedirs(lang_directory, exist_ok=True)
    subdirectories = list(map(lambda subdir: os.path.join(lang_directory, subdir), os.listdir(lang_directory)))
    # print(subdirectories) # returns absolute paths
    
    for subdirectory in subdirectories:

        print('Subdirectory:', subdirectory)

        # Construct the output directory path
        output_subdirectory = subdirectory.replace(defs.DATA_DIR, defs.OUTPUT_DIR)
        print('Output Subdirectory:', output_subdirectory)
        
        if not os.path.isdir(output_subdirectory):
            pathlib.Path(output_subdirectory).mkdir(parents=True, exist_ok=True)

        files = os.listdir(subdirectory)
        items = list(map(lambda file: (f'{subdirectory}/{file}', f'{output_subdirectory}/{file}_m.txt'), files))
            
        # Processing the items
        iterate_items_parallelized(items)
    
    duration = time.time() - start_time
    print(f"... terminated after {duration:.4f} seconds")
    
    
if __name__ == '__main__':
    languages = ['mk', 'bg', 'bs', 'hr', 'sh', 'sl', 'sr']
    
    for language in languages:
        main(language)


Start Time: 18-Nov-2020 19:25:18.209968
Subdirectory: /home/filip_markoski/scraping/data/mk/text/AD
Output Subdirectory: /home/filip_markoski/scraping/output/mk/text/AD
[1mStarting to iterate 101 items... (Trial no. 1)
[0m
Subdirectory: /home/filip_markoski/scraping/data/mk/text/AA
Output Subdirectory: /home/filip_markoski/scraping/output/mk/text/AA
[1mStarting to iterate 100 items... (Trial no. 1)
[0m
Subdirectory: /home/filip_markoski/scraping/data/mk/text/AB
Output Subdirectory: /home/filip_markoski/scraping/output/mk/text/AB
[1mStarting to iterate 100 items... (Trial no. 1)
[0m
Subdirectory: /home/filip_markoski/scraping/data/mk/text/AF
Output Subdirectory: /home/filip_markoski/scraping/output/mk/text/AF
[1mStarting to iterate 100 items... (Trial no. 1)
[0m
Subdirectory: /home/filip_markoski/scraping/data/mk/text/AG
Output Subdirectory: /home/filip_markoski/scraping/output/mk/text/AG
[1mStarting to iterate 29 items... (Trial no. 1)
[0m
Subdirectory: /home/filip_markoski/s

... terminated after 27.9310 seconds
Start Time: 18-Nov-2020 19:27:59.159234
Subdirectory: /home/filip_markoski/scraping/data/sr/text/AD
Output Subdirectory: /home/filip_markoski/scraping/output/sr/text/AD
[1mStarting to iterate 100 items... (Trial no. 1)
[0m
Subdirectory: /home/filip_markoski/scraping/data/sr/text/AA
Output Subdirectory: /home/filip_markoski/scraping/output/sr/text/AA
[1mStarting to iterate 101 items... (Trial no. 1)
[0m
Subdirectory: /home/filip_markoski/scraping/data/sr/text/AI
Output Subdirectory: /home/filip_markoski/scraping/output/sr/text/AI
[1mStarting to iterate 100 items... (Trial no. 1)
[0m
Subdirectory: /home/filip_markoski/scraping/data/sr/text/AT
Output Subdirectory: /home/filip_markoski/scraping/output/sr/text/AT
[1mStarting to iterate 18 items... (Trial no. 1)
[0m
Subdirectory: /home/filip_markoski/scraping/data/sr/text/AK
Output Subdirectory: /home/filip_markoski/scraping/output/sr/text/AK
[1mStarting to iterate 100 items... (Trial no. 1)
[0m