# PK Date & Time Reconciliation

In [None]:
import csv
import json
import pprint
import datetime
import os
import re
from matplotlib import pyplot

%matplotlib inline

## Create output folder

Create the output folder in which all output files will be placed.

In [None]:
# Folder that receives every file written by this notebook.
outputfolder = './test/output'

# makedirs also creates a missing parent ('./test') and, with exist_ok=True,
# does not fail when the folder is already there, so no separate existence
# check (and no race between the check and the creation) is needed.
# NOTE(review): mode 0o766 leaves group/other without the execute (search)
# bit on the directory - confirm this is intended.
os.makedirs(outputfolder, mode=0o766, exist_ok=True)

%ls -l ./test

In [None]:
def writeFile(filename: str, contents: object):
    """Write the pretty-printed form of *contents* to a file in the output folder.

    filename -- name of the file, joined onto the module-level ``outputfolder``.
    contents -- any object; it is rendered with ``pprint.pformat``.

    The target file is opened in 'w' mode, so existing content is replaced.
    """
    target = os.path.join(outputfolder, filename)
    with open(target, 'w') as handle:
        rendered = pprint.pformat(contents)
        handle.write(rendered)

def writeJSON(filename: str, contents: object):
    """Write *contents* as JSON text to a file in the output folder.

    filename -- name of the file, joined onto the module-level ``outputfolder``.
    contents -- object passed to ``json.dumps`` with default settings.

    The target file is opened in 'w' mode, so existing content is replaced.
    """
    target = os.path.join(outputfolder, filename)
    with open(target, 'w') as handle:
        serialised = json.dumps(contents)
        handle.write(serialised)

## Create Mapping

Map PathWest CSV file records to Medrio CSV file records by building a lookup from randomisation number to subject ID.

In [None]:
randfilename = './test/rand.csv'

randCol = 'DARandNum_C' # column header for randomisation number
subjectCol = 'Subject ID' # column header for subject id

# Lookup table: randomisation number (as text) -> subject id.
subjectMap = {}

with open(randfilename) as randHandle:
    # One (randomisation number, subject id) pair per CSV row; a later row
    # with the same randomisation number replaces the earlier entry.
    subjectMap.update(
        (str(record[randCol]), record[subjectCol])
        for record in csv.DictReader(randHandle)
    )

# Keep a pretty-printed copy of the mapping in the output folder.
writeFile('subjectMapping.txt', subjectMap)

## Read Configuration File

Parse config file and establish relationships.

In [None]:
configfilename = './test/config.json'

# Placeholder so the name is bound before the file is parsed.
configDict = {}

with open(configfilename) as configHandle:
    # Parse the whole configuration document in one go.
    configDict = json.loads(configHandle.read())

# Echo the parsed configuration for a visual check.
pprint.pp(configDict)

## Read Medrio Source File

Create dictionary mapping for each subject to allow for hash map search of data.

In [None]:
sourcefilename = './test/source.csv'

# Nested lookup built below: subject id -> period digit -> column name -> value.
sourceMap = dict() # dictionary for subject data

subjectidcol = 'Subject ID'
visitCol = 'Visit'

# Regular expression objects
# Any column whose header contains "dat" / "tim" (case-insensitive) is parsed
# as a date / time respectively.
dateregex = re.compile(r'dat', flags=re.I)
timeregex = re.compile(r'tim', flags=re.I)
# NOTE(review): '.*' is greedy, so group(1) is the LAST digit that follows
# "period" in the text (e.g. "Period 12" yields '2'). Confirm the visit names
# in the data never carry a second number or a two-digit period.
periodRegex = re.compile(r'period.*([0-9])', flags=re.I)

# Read File
# newline='' lets the csv module do its own newline handling, as its
# documentation recommends for file objects.
with open(sourcefilename, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Get Period of Row (a missing/None visit is treated as unmatched
        # instead of raising on None.strip()).
        match = periodRegex.search((row.get(visitCol) or '').strip())
        if match is None:
            print("Error: Visit period could not be matched.")
            continue

        # Create the subject and period levels on first use.
        subjectData = sourceMap.setdefault(f'{row[subjectidcol]}', dict())
        periodData = subjectData.setdefault(f'{match.group(1)}', dict())

        # Record data values
        for key, rawValue in row.items():
            # DictReader files surplus cells under the key None and fills
            # missing cells with the value None; neither holds usable data.
            if key is None or rawValue is None:
                continue
            text = rawValue.strip()
            if text == '':
                continue # blank (or whitespace-only) cell

            if dateregex.search(key) is not None:
                # Date
                value = datetime.datetime.strptime(text, '%m/%d/%Y')
            elif timeregex.search(key) is not None:
                # Time
                value = datetime.datetime.strptime(text, '%H:%M')
            else:
                # Any other column is stored exactly as read.
                value = rawValue

            periodData[f'{key}'] = value

writeFile('medrioData.txt', sourceMap)

## Read Comparison File

In [None]:
# Nested lookup built below:
# randomisation number -> period digit -> timepoint -> {timepoint, date, time}.
comparisonMap = dict() # empty dictionary

comparisonfilename = './test/comparison.csv'

randomisationCol = 'Subject'
periodCol = 'Period'

# NOTE(review): '.*' is greedy, so group(1) is the LAST digit that follows
# "period" in the text (e.g. "Period 12" yields '2'). Confirm against the data.
periodRegex = re.compile(r'period.*([0-9])', flags=re.I)

scheduleCol = 'Scheduled time (hrs post dose)'
dateCol = 'Blood Sample date'
timeCol = 'Blood Sample time (24 hrs format)'

# Read File
# newline='' lets the csv module do its own newline handling, as its
# documentation recommends for file objects.
with open(comparisonfilename, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Skip rows without a sample date. A missing column still raises
        # KeyError; a None or whitespace-only cell counts as empty.
        dateText = (row[dateCol] or '').strip()
        if dateText == '':
            continue # Skip empty rows

        # Interpret Period (a None cell is treated as unmatched).
        match = periodRegex.search(row.get(periodCol) or '')
        if match is None:
            print("Error: Visit period could not be matched.")
            continue

        # Create the subject and period levels on first use.
        subjectData = comparisonMap.setdefault(f'{row[randomisationCol]}', dict())
        periodData = subjectData.setdefault(f'{match.group(1)}', dict())

        # Fetch information
        info = {
            "timepoint": row.get(scheduleCol, ''),
            "date": datetime.datetime.strptime(dateText, '%d-%b-%y'),
            "time": datetime.datetime.strptime(row.get(timeCol).strip(), '%H:%M:%S')
        }

        # Append new information. The membership test uses the same formatted
        # key that is stored, so a repeated timepoint is always re-checked.
        timepointKey = f'{info["timepoint"]}'
        if timepointKey not in periodData:
            periodData[timepointKey] = info
        elif periodData[timepointKey]['date'] != info['date']:
            # the date in this row does not match the previously recorded date
            raise ValueError("ERROR: date for data point is not consistent within file!")
        elif periodData[timepointKey]['time'] != info['time']:
            # the time in this row does not match the previously recorded time
            raise ValueError("ERROR: time for data point is not consistent within file!")

writeFile('comparisonData.txt', comparisonMap)

## Parse Data & Assess Equality

In [None]:
# Reconciliation report: two counters plus one list of findings per subject.
# 'total' counts assessed timepoints; 'errors' counts failing timepoints plus
# subjects that could not be mapped (those add to 'errors' but not 'total').
outputLog = {"errors": 0, "total": 0}

for subject in comparisonMap:
    subjectLog = list() # status reporting
    outputLog[f'{subject}'] = subjectLog

    # Loop through each subject
    medrioId = subjectMap.get(subject) # medrio subject id
    if medrioId is None:
        subjectLog.append({
            "status": "FAILED",
            "msg": "Medrio subject id could not be identified."
        })
        outputLog['errors'] += 1
        continue

    # Compare data & log any errors
    for period, timepoints in comparisonMap[subject].items():
        for timepoint, pkValues in timepoints.items():
            # A timepoint missing from the configuration still raises KeyError.
            lookuptable = configDict['match'][timepoint]
            outputLog['total'] += 1 # increment number of variables assessed

            # Fresh state for every timepoint so nothing leaks over from the
            # previous iteration when a lookup fails.
            errorObject = dict()
            dateError = {'error': False}
            timeError = {'error': False}

            try:
                sourceValues = sourceMap[f'{medrioId}'][period]
                sourceDate = sourceValues[f'{lookuptable["date"]}'].date()
                sourceTime = sourceValues[f'{lookuptable["time"]}'].time()
                pkDate = pkValues['date'].date()
                pkTime = pkValues['time'].time()
            except KeyError:
                errorObject = {
                    "error": True,
                    'msg': "Variable most likely not defined for subject in source.",
                }
            except Exception:
                # Best-effort: record the failure and move on to the next
                # timepoint instead of aborting the whole reconciliation.
                print('Unknown Error Occured.')
                errorObject = {
                    "error": True,
                    'msg': "Unknown error occured",
                }
            else:
                # Both sides were read successfully: compare them.
                if sourceDate != pkDate:
                    dateError = {
                        "variable": lookuptable['date'],
                        "source": sourceDate.isoformat(),
                        "pk": pkDate.isoformat(),
                        "error": True
                    }
                if sourceTime != pkTime:
                    timeError = {
                        "variable": lookuptable['time'],
                        "source": sourceTime.isoformat(),
                        "pk": pkTime.isoformat(),
                        "error": True
                    }

            # Exactly one error is counted per failing timepoint.
            if errorObject or dateError['error'] or timeError['error']:
                outputLog['errors'] += 1
                subjectLog.append({
                    **errorObject,
                    'period': f'Period {period}',
                    'timepoint': timepoint,
                    'subjectid': medrioId,
                    'date': dateError,
                    'time': timeError,
                })


writeJSON('output.json', outputLog)
writeFile('output.txt', outputLog)

## Plotting Results

In [None]:
# Pie chart of the two report counters, each wedge labelled with its count
# (red = errors, green = total).
# NOTE(review): the green wedge is the overall total, not the number of
# passes, so the wedges do not partition a whole - confirm this is intended.
errorCount = outputLog['errors']
totalCount = outputLog['total']
pyplot.pie([errorCount, totalCount], labels=[errorCount, totalCount], colors=['#f42613', '#47f747'])