# PROVIDEDH Collaborative platform
## Jupyter notebook

In [1]:
import re
import os
from functools import reduce
from lxml import etree as et
from lxml.etree import Element
import itertools

## 1 Date Processing

In [2]:
namespaces = {'tei': 'http://www.tei-c.org/ns/1.0', 'xml': 'http://www.w3.org/XML/1998/namespace'}

In [3]:
def extract_date(text):
    date_dict = {'day': None, 'month': None, 'year': '-'}
    months = {
        'january':'01', 'jan':'01', 'february':'02', 'feb':'02', 'febr':'02', 'march':'03', 
        'mar':'03', 'april':'04', 'jpr':'04', 'may':'05', 'june':'06', 
        'jun':'06', 'july':'07', 'jul':'07', 'august':'08', 'aug':'08', 
        'september':'09', 'sep':'09', 'october':'10', 'oct':'10', 'november':'11', 
        'nov':'11', 'december':'12', 'dec':'12'
    }
    
    text = text.replace('-', ' ')
    text = text.replace(',', ' ')
    text = text.replace(';', ' ')
    text = text.replace('.', ' ')
    words = text.strip().split(' ')
        
    extract = True
    # 1. Match a full date
    if extract:
        match = re.match('.*([0-9]{1,2} [0-9]{1,2} [0-9]{4}).*', text)
        if not match is None:
            info = match.groups()[0].split(' ')
            date_dict['year'] = info[2]
            date_dict['month'] = info[1]
            date_dict['day'] = info[0]
            extract = False
        else:
            match = re.match('.*([0-9]{4} [0-9]{1,2} [0-9]{1,2}).*', text)
            if not match is None:
                info = match.groups()[0].split(' ')
                date_dict['year'] = info[0]
                date_dict['month'] = info[1]
                date_dict['day'] = info[2]
                extract = False

    # 2. Textual match of months
    if extract:
        month = reduce(lambda ac, dc: str(dc) if dc.lower() in months else ac, words, None)
        if not month is None:
            index = words.index(month)
            words.pop(index)
            date_dict['month'] = months[month.lower()]

    # 3. textual match of days 
    if extract:
        day = reduce(lambda ac, dc: str(dc) if re.match('[0-9]{1,2}th', dc) else ac, words, None)
        if not day is None:
            index = words.index(day)
            words.pop(index)
            date_dict['day'] = day.replace('th', '')
        else:
            day = reduce(lambda ac, dc: str(dc) if dc.endswith('nd') else ac, words, None)
            if not day is None:
                index = words.index(day)
                words.pop(index)
                date_dict['day'] = '02'
            else:
                day = reduce(lambda ac, dc: str(dc) if dc.endswith('rd') else ac, words, None)
                if not day is None:
                    index = words.index(day)
                    words.pop(index)
                    date_dict['day'] = '03'
                else:
                    day = reduce(lambda ac, dc: str(dc) if dc.endswith('st') else ac, words, None)
                    if not day is None:
                        index = words.index(day)
                        words.pop(index)
                        date_dict['day'] = '01'
            
    # 4. Matching four digit years
    if extract:
        year = reduce(lambda ac, dc: str(dc) if re.match('.*[0-9]{4}.*', dc)  else ac, words, None)
        if not year is None:
            index = words.index(year)
            words.pop(index)
            date_dict['year'] = re.match('([0-9]{4})', year).groups()[0]

    if not date_dict['day'] is None and date_dict['month'] is None:
        date_dict['month'] = '-'
            
    date_fields = filter(lambda x: not x is None, [date_dict['year'], date_dict['month'], date_dict['day']])
    date = '-'.join(date_fields)
    
    # 4. Matching days after the 12th
    if extract:
        day = reduce(lambda ac, dc: str(dc) if re.match('.*[0-9]{2}.*', dc)  else ac, words, None)
        if not day is None:
            day_int = int(re.match('.*([0-9]{2}).*', day).groups()[0])
            if day_int > 12:
                index = words.index(day)
                words.pop(index)
                date_dict['day'] = str(day_int)

    if not date_dict['day'] is None and date_dict['month'] is None:
        date_dict['month'] = '-'
            
    date_fields = filter(lambda x: not x is None, [date_dict['year'], date_dict['month'], date_dict['day']])
    date = '-'.join(date_fields)
    
    # 5. Matching days when the month is already specified
    if extract:
        day = reduce(lambda ac, dc: str(dc) if re.match('.*[0-9]{1,2}.*', dc)  else ac, words, None)
        if not day is None:
            if not date_dict['month'] is None:
                index = words.index(day)
                words.pop(index)
                date_dict['day'] = re.match('.*([0-9]{1,2}).*', day).groups()[0]

    if not date_dict['day'] is None and date_dict['month'] is None:
        date_dict['month'] = '-'
            
    date_fields = filter(lambda x: not x is None, [date_dict['year'], date_dict['month'], date_dict['day']])
    date = '-'.join(date_fields)
    
    if date.strip() == '-':
        date = ''
    
    return date

print('>', extract_date('06-09-1653'))
print('>', extract_date('1653-06-09'))
print('>', extract_date('24th January 1653'))
print('>', extract_date('1653'))
print('>', extract_date('January 1653'))
print('>', extract_date('24th of January'))
print('>', extract_date('24th of January, 1996'))
print('>', extract_date('24th'))
print('>', extract_date('march'))
print('>', extract_date('third sep'))
print('>', extract_date('23 sep'))
print('>', extract_date('09 June 1653#xd;Being'))

> 1653-09-6
> 1653-06-09
> 1653-01-24
> 1653
> 1653-01
> --01-24
> 1996-01-24
> ----24
> --03
> --09-03
> --09-23
> 1653-06-9


## 2 File processing

In [12]:
dep_folder = './depositions_subset/to_process/processed'
depositions = os.listdir(dep_folder)
processed = './depositions_subset/to_process/processed'

In [13]:
dates = []

for dep_name in depositions:
    with open(os.path.join(dep_folder, dep_name),'r') as file:
        dep_tree = et.fromstring(file.read().encode())
        dep_dates = dep_tree.xpath('.//tei:date', namespaces=namespaces)
        
        for date in dep_dates:
            if not date.text is None:
                date_formated = extract_date(date.text)
                if date_formated != '':
                    date.set('when', date_formated)
                
    with open(os.path.join(processed, dep_name),'w') as file:
        file.write(et.tostring(dep_tree).decode('UTF-8'))