# Mumbai legacy linelist data: Suspected dates

Identify and convert suspected dates.

Write out expressions to be satisfied for each column.
For each column, a suspected cell has two possible dates - the current date and an alternate date (the current date with day and month swapped).
Check which of these two dates satisfies the necessary column conditions. 
Set the cell value and status (GOOD_PARSE or BAD_PARSE) accordingly.

In [None]:
import pandas as pd
import numpy as np

import datetime as dt
from datetime import timedelta

from toolz import interleave

## Read data

In [None]:
# Location of the cleaned, consolidated ward linelist read by this notebook.
infile = "../data/consolidated_wards_clean.csv"

In [None]:
# Load the linelist; row 0 of the file supplies the column names.
D = pd.read_csv(
    infile,
    header=0,
    low_memory=False,
)

In [None]:
# Independent snapshot of the data as loaded (copy() is deep by default);
# kept so the processed frame can be validated against it later.
original = D.copy()

In [None]:
# Show the column labels as the cell output (inspection only; nothing is assigned).
D.columns

## Declarations

In [None]:
# Date columns, listed in the order in which they are processed further down.
# NOTE(review): the original author left open whether this list should be
# sorted chronologically - confirm before reordering.
date_columns = [
    'Date of Admission',
    'Date of Outcome',
    'Date of Sample Collection (FIRST)',
    'Date of Sample Collection (Second)',
    'Date of Sample Collection (Third)',
    'Date of Sample Collection (Fourth)',
    'Date of Sample Collection (Fifth)',
    'Date of Sample Collection (Sixth)',
    'Date of last test (to be left blank)',
]

# Each date column has a companion "<name>_Review" column holding its parse status.
review_columns = [name + '_Review' for name in date_columns]

In [None]:
# Inclusive [earliest, latest] bounds for an acceptable date:
# 10 March 2020 through 10 June 2020.
valid_interval = [dt.date(2020, month, 10) for month in (3, 6)]

In [None]:
# Ordering constraints, keyed by date column.
# Each entry is a list of dicts with two keys:
#   'other_col'  - the column the constraint compares against
#   'expression' - Python source text that refers to a row by the name `x`
#                  and parses both cells as '%Y-%m-%d' before comparing them.
# The expression strings are plain text here; they are only meaningful once
# evaluated with `x`, `dt` and `timedelta` in scope.
order_conditions={}

# Admission: not after Outcome, and within 10 days either side of the first sample.
order_conditions['Date of Admission'] = [
    {
        'other_col': "Date of Outcome", 
        'expression':"dt.datetime.strptime(x['Date of Admission'],'%Y-%m-%d')\
        <=dt.datetime.strptime(x['Date of Outcome'],'%Y-%m-%d')"
    },
    {
        'other_col': "Date of Sample Collection (FIRST)", 
        'expression':"(dt.datetime.strptime(x['Date of Admission'],'%Y-%m-%d')\
        <=dt.datetime.strptime(x['Date of Sample Collection (FIRST)'],'%Y-%m-%d')\
        +timedelta(10)) \
        and (dt.datetime.strptime(x['Date of Admission'],'%Y-%m-%d') \
        >=dt.datetime.strptime(x['Date of Sample Collection (FIRST)'],'%Y-%m-%d')\
        -timedelta(10))"
    }                              
]

# Outcome: not before Admission and not before the first sample.
order_conditions['Date of Outcome'] = [
    {
        'other_col': "Date of Admission", 
        'expression':"dt.datetime.strptime(x['Date of Outcome'],'%Y-%m-%d')\
        >=dt.datetime.strptime(x['Date of Admission'],'%Y-%m-%d')"
    },
    {
        'other_col': "Date of Sample Collection (FIRST)", 
        'expression':"dt.datetime.strptime(x['Date of Sample Collection (FIRST)'],'%Y-%m-%d')\
        <=dt.datetime.strptime(x['Date of Outcome'],'%Y-%m-%d')"
    }                             
]

# First sample: not after Outcome and not after the second sample.
order_conditions['Date of Sample Collection (FIRST)'] = [
    {
        'other_col': "Date of Outcome", 
        'expression':"dt.datetime.strptime(x['Date of Sample Collection (FIRST)'],'%Y-%m-%d')\
        <=dt.datetime.strptime(x['Date of Outcome'],'%Y-%m-%d')"
    },
    {
        'other_col': "Date of Sample Collection (Second)", 
        'expression':"dt.datetime.strptime(x['Date of Sample Collection (FIRST)'],'%Y-%m-%d')\
        <=dt.datetime.strptime(x['Date of Sample Collection (Second)'],'%Y-%m-%d')"
    }                             
]

# Second through Fifth sample: each lies between its previous and next sample
# (both comparisons are non-strict).
order_conditions['Date of Sample Collection (Second)'] = [
    {
        'other_col': "Date of Sample Collection (FIRST)", 
        'expression':"dt.datetime.strptime(x['Date of Sample Collection (FIRST)'],'%Y-%m-%d')\
        <=dt.datetime.strptime(x['Date of Sample Collection (Second)'],'%Y-%m-%d')"
    },
    {
        'other_col': "Date of Sample Collection (Third)", 
        'expression':"dt.datetime.strptime(x['Date of Sample Collection (Second)'],'%Y-%m-%d')\
        <=dt.datetime.strptime(x['Date of Sample Collection (Third)'],'%Y-%m-%d')"
    }                             
]

order_conditions['Date of Sample Collection (Third)'] = [
    {
        'other_col': "Date of Sample Collection (Second)", 
        'expression':"dt.datetime.strptime(x['Date of Sample Collection (Second)'],'%Y-%m-%d')\
        <=dt.datetime.strptime(x['Date of Sample Collection (Third)'],'%Y-%m-%d')"
    },
    {
        'other_col': "Date of Sample Collection (Fourth)", 
        'expression':"dt.datetime.strptime(x['Date of Sample Collection (Third)'],'%Y-%m-%d')\
        <=dt.datetime.strptime(x['Date of Sample Collection (Fourth)'],'%Y-%m-%d')"
    }                             
]

order_conditions['Date of Sample Collection (Fourth)'] = [
    {
        'other_col': "Date of Sample Collection (Third)", 
        'expression':"dt.datetime.strptime(x['Date of Sample Collection (Third)'],'%Y-%m-%d')\
        <=dt.datetime.strptime(x['Date of Sample Collection (Fourth)'],'%Y-%m-%d')"
    },
    {
        'other_col': "Date of Sample Collection (Fifth)", 
        'expression':"dt.datetime.strptime(x['Date of Sample Collection (Fourth)'],'%Y-%m-%d')\
        <=dt.datetime.strptime(x['Date of Sample Collection (Fifth)'],'%Y-%m-%d')"
    }                             
]

order_conditions['Date of Sample Collection (Fifth)'] = [
    {
        'other_col': "Date of Sample Collection (Fourth)", 
        'expression':"dt.datetime.strptime(x['Date of Sample Collection (Fourth)'],'%Y-%m-%d')\
        <=dt.datetime.strptime(x['Date of Sample Collection (Fifth)'],'%Y-%m-%d')"
    },
    {
        'other_col': "Date of Sample Collection (Sixth)", 
        'expression':"dt.datetime.strptime(x['Date of Sample Collection (Fifth)'],'%Y-%m-%d')\
        <=dt.datetime.strptime(x['Date of Sample Collection (Sixth)'],'%Y-%m-%d')"
    }                             
]

# Sixth sample: not before the fifth sample and not after the last test.
order_conditions['Date of Sample Collection (Sixth)'] = [
    {
        'other_col': "Date of Sample Collection (Fifth)", 
        'expression':"dt.datetime.strptime(x['Date of Sample Collection (Fifth)'],'%Y-%m-%d')\
        <=dt.datetime.strptime(x['Date of Sample Collection (Sixth)'],'%Y-%m-%d')"
    },
    {
        'other_col': "Date of last test (to be left blank)", 
        'expression':"dt.datetime.strptime(x['Date of Sample Collection (Sixth)'],'%Y-%m-%d')\
        <=dt.datetime.strptime(x['Date of last test (to be left blank)'],'%Y-%m-%d')"
    }  
]

# Last test: not before the sixth sample (the only constraint for this column).
order_conditions['Date of last test (to be left blank)'] = [
    {
        'other_col': "Date of Sample Collection (Sixth)", 
        'expression':"dt.datetime.strptime(x['Date of last test (to be left blank)'],'%Y-%m-%d')\
        >=dt.datetime.strptime(x['Date of Sample Collection (Sixth)'],'%Y-%m-%d')"
    }  
]


# Other conditions:
# Date of Outcome >= Date of Admission
# Date of Outcome >= Date of Sample Collection (FIRST)

# Date of Sample Collection (n) >= Date of Sample Collection (n-1)

# Last Test > Sample Collection??

## Parsing functions

In [None]:
def alternate_valid_date(x,valid_interval):
    """Find alternate date when there is ambiguity in date parsing.

    The alternate reading is the input date with day and month exchanged.

    Args:
        x(str): input date formatted as ``%Y-%m-%d``
        valid_interval(list[datetime.date], len=2): inclusive lower and upper
            bounds of acceptable dates
    Returns:
        str/None: Date obtained by exchanging day and month (``%Y-%m-%d``).
        None if ``x`` cannot be parsed, if the exchange does not give a real
        calendar date (original day greater than 12), or if the exchanged
        date falls outside ``valid_interval``.
    """
    valid_start, valid_end = valid_interval
    try:
        parsed = dt.datetime.strptime(x, "%Y-%m-%d")
        # Day and month swap places; this raises ValueError when the original
        # day cannot be a month number.
        swapped = dt.date(parsed.year, parsed.day, parsed.month)
    except (TypeError, ValueError):
        # TypeError: x is not a string (e.g. a missing value).
        # ValueError: x is not in the expected format, or the swap is invalid.
        return None

    if valid_start <= swapped <= valid_end:
        return swapped.strftime("%Y-%m-%d")
    return None

In [None]:
def choose_date(x):
    """Pick between the current and the alternate date and assign a review status.

    Args:
        x: mapping-like row with keys 'current', 'alternate',
            'status_current' and 'status_alternate'. The two status values
            are used for their truthiness only.
    Returns:
        tuple: ``(date, review)``.
            * alternate status truthy  -> (alternate, 'GOOD_PARSE')
            * only current truthy      -> (current, 'GOOD_PARSE')
            * neither truthy           -> (current, 'BAD_PARSE')
    """
    # NOTE(review): when BOTH dates satisfy their conditions the alternate is
    # chosen and the cell is marked GOOD_PARSE - confirm this tie-break is
    # intended rather than flagging the cell as still ambiguous.
    current_date = x['current']

    if not x['status_current'] and not x['status_alternate']:
        # Neither reading satisfies the column conditions.
        return (current_date, 'BAD_PARSE')

    if x['status_alternate']:
        # The alternate reading is acceptable: switch the value.
        return (x['alternate'], 'GOOD_PARSE')

    # Only the current reading is acceptable: keep it.
    return (current_date, 'GOOD_PARSE')

In [None]:
def eval_expression(x, expr):
    """Evaluate a constraint expression against one row.

    Args:
        x: the row; the expression text refers to it by the name ``x``.
        expr(str): Python expression source.
    Returns:
        The value of the expression, or False if evaluating it raised an
        ordinary exception (missing key, unparsable date, ...).
    """
    # NOTE: eval runs arbitrary code - only pass expressions defined in this
    # notebook, never text that comes from the data.
    try:
        return eval(expr)
    except Exception:
        # A row that cannot be evaluated counts as "condition not satisfied".
        # KeyboardInterrupt / SystemExit are deliberately NOT caught so a long
        # DataFrame.apply can still be interrupted.
        return False

In [None]:
# TODO: Refactor this (for efficiency)

def process_one_col(D, col, constraints):
    """Resolve the suspected dates of one column against its constraints.

    For every row the current value of ``col`` and its day/month-swapped
    alternate are each tested against all ``constraints``. A date passes a
    constraint only if the constraint's other column is marked 'GOOD_PARSE'
    and the constraint expression evaluates truthy; it passes overall only if
    it passes every constraint. ``choose_date`` then picks the value and the
    review status.

    Args:
        D(pandas.DataFrame): rows to process; ``col`` and ``col + "_Review"``
            are overwritten in place when the frame is not empty.
        col(str): name of the date column being resolved.
        constraints(list[dict]): dicts with keys 'other_col' and 'expression'.
    Returns:
        pandas.DataFrame: the same ``D`` object that was passed in.

    Reads the module-level ``valid_interval`` for the alternate-date bounds.
    """

    def _evaluate(frame, rule):
        # Row-wise truth value of the rule's expression over `frame`.
        return frame.apply(
            lambda row: eval_expression(row, rule['expression']), axis=1
        ).to_list()

    def _fold(previous, other_is_good, satisfied):
        # AND the new verdict onto the running status (None = nothing seen yet).
        verdict = other_is_good and satisfied
        return verdict if previous is None else previous and verdict

    # Alternate reading of every cell (None where no valid alternate exists),
    # and a copy of the frame carrying those alternates in `col`.
    swapped = D[col].apply(lambda value: alternate_valid_date(value, valid_interval))
    D_swapped = D.copy()
    D_swapped[col] = swapped

    row_count = D[col].shape[0]
    current_ok = [None] * row_count
    swapped_ok = [None] * row_count

    for rule in constraints:
        review_of_other = rule['other_col'] + "_Review"
        other_good = [status == 'GOOD_PARSE' for status in D[review_of_other]]

        current_hits = _evaluate(D, rule)
        current_ok = [
            _fold(prev, good, hit)
            for prev, good, hit in zip(current_ok, other_good, current_hits)
        ]

        swapped_hits = _evaluate(D_swapped, rule)
        swapped_ok = [
            _fold(prev, good, hit)
            for prev, good, hit in zip(swapped_ok, other_good, swapped_hits)
        ]

    # Everything choose_date needs, one row per input row.
    decisions = pd.DataFrame({'current': D[col], 'alternate': swapped})
    decisions['status_alternate'] = swapped_ok
    decisions['status_current'] = current_ok

    if decisions.empty:
        # Nothing to decide; leave D untouched.
        return D

    D[col] = decisions.apply(lambda row: choose_date(row)[0], axis=1)
    D[col + "_Review"] = decisions.apply(lambda row: choose_date(row)[1], axis=1)
    return D

## Process dates

In [None]:
# Per-column value counts of every column from position 40 onward, taken
# BEFORE the suspected dates are processed (compared with the "after" counts later).
vc_1 = (
    D.iloc[:, 40:]
    .apply(lambda column: column.value_counts())
    .T
    .stack()
)

In [None]:
# Resolve the suspected dates column by column and write a before/after
# review file for each column.
for col in date_columns:
    print(col)
    if col not in order_conditions:
        continue

    # Rows whose date in this column is still flagged as suspected.
    suspected = D[col+"_Review"] == 'SUSPECTED_PARSE'

    # Work on an explicit copy: process_one_col assigns into the frame it is
    # given, which would otherwise be a slice of D.
    D_tmp = D.loc[suspected, :].copy()
    review1 = D_tmp.copy(deep=True)  # state before processing
    D.loc[suspected, :] = process_one_col(D_tmp, col, order_conditions[col])
    review2 = D.loc[review1.index, :].copy(deep=True)  # state after processing

    # Columns to show: this column, the columns its constraints refer to,
    # and the review status of each of them.
    review_cols = [col] + [d['other_col'] for d in order_conditions[col]]
    review_cols += [c+'_Review' for c in review_cols]

    # Interleave original and modified rows for review. A stable sort is
    # required so that, for each index label, the original row stays ahead of
    # the modified row (the default sort kind does not guarantee this).
    review = pd.concat([review1, review2]).sort_index(kind='mergesort')
    review = review.loc[:, review_cols]
    print(review)
    review.to_csv("../reviews/"+col+".csv")

In [None]:
# Recompute overall status
#
# Per row, over the status columns: any value starting with "SUSPECTED" wins,
# otherwise any value starting with "BAD", otherwise the row is "GOOD".

# The scan covers every column from position 40 onward (offset inherited from
# the original code - assumes these are the status columns; TODO confirm).
# An existing 'overall_status' column is left out so that stale values from
# the input file or from a previous run of this cell cannot feed back into
# the result.
status_frame = D.iloc[:, 40:].drop(columns=['overall_status'], errors='ignore')

status_col = []

for i, row in enumerate(status_frame.itertuples(index=False, name=None)):
    if not i%1000:
        print("Processed "+str(i))
    # Non-string cells (e.g. missing values) carry no status and are skipped.
    labels = [value for value in row if isinstance(value, str)]
    if any(label.startswith("SUSPECTED") for label in labels):
        status_col.append("SUSPECTED")
    elif any(label.startswith("BAD") for label in labels):
        status_col.append("BAD")
    else:
        status_col.append("GOOD")

D['overall_status'] = status_col

In [None]:
# Same per-column value counts as vc_1, taken AFTER processing.
vc_2 = (
    D.iloc[:, 40:]
    .apply(lambda column: column.value_counts())
    .T
    .stack()
)

# Before and after counts side by side, one column each.
vc = pd.concat([counts.to_frame() for counts in (vc_1, vc_2)], axis=1)

vc.to_csv("../reviews/value_counts_suspected_before_after.csv")

## Write out data

In [None]:
# Persist the processed linelist as comma-separated text without the row index.
D.to_csv(
    '../data/consolidated_wards_clean_final.csv',
    index=False,
)