# Database parsing

In [None]:
import pandas, numpy as np
import parsingDatabaseUtils, re
import xml, itertools, collections, xml.etree.ElementTree as ET
import tqdm, importlib, dateparser, dateparser.search
import parseMeasurementsByDay

In [None]:
def _readIndexed(path, stringIndex=False):
    """Read a CSV using its first column as index; optionally cast the index labels to str."""
    table = pandas.read_csv(path, index_col=0)
    if stringIndex:
        table.index = table.index.map(str)
    return table


# Tables whose index is later looked up with string identifiers get a str index;
# `casos` keeps the index exactly as read from the file.
casos = _readIndexed('Venezolanas2/casos.csv')
pacientes = _readIndexed('Venezolanas2/pacientes.csv', stringIndex=True)
registros = _readIndexed('Venezolanas2/registros.csv', stringIndex=True)
procedimientos = _readIndexed('Venezolanas2/procedimientos.csv', stringIndex=True)
procedimientosDesc = _readIndexed('Venezolanas2/procedimientosID.csv')
entriesInfirmery = _readIndexed('Venezolanas2/enfermeriaMedidas.csv')
# diagnosis = pandas.read_csv('Venezolanas2/diagnosis.csv', index_col = 0)  (currently disabled)

# Pre-grouped views used to fetch every row belonging to one case/admission.
registrosByCaso = registros.groupby('Caso')
entriesInfirmeryByCase = entriesInfirmery.groupby('IdAdmision')

In [None]:
# Maps a procedure id (the <idProcedimiento> value) to a one-letter class.
# Only the 'p' class is consulted in this notebook, to select the procedures
# stored as births; the meaning of 'o' and 'a' is not documented here.
classificationProcedures = {
    'H2968': 'o', 'H2123': 'o', 'H0165': 'o', 'H0193': 'o',
    'H2120': 'o', 'H2379': 'o', 'H2383': 'o', 'H2386': 'o',
    'H2407': 'o', 'H2415': 'o', 'H2595': 'o', 'H2684': 'o',
    'H2849': 'o', 'H2852': 'o', 'H2880': 'o', 'H2882': 'o',
    'H2884': 'o', 'H2892': 'o', 'H2901': 'o', 'H2904': 'o',
    'H2910': 'o', 'H2916': 'o', 'H2959': 'o', 'H2963': 'o',
    'H2974': 'a', 'H2975': 'a',
    'H3038': 'o', 'H3065': 'o', 'H3066': 'o', 'H3078': 'o',
    'H3089': 'p', 'H3092': 'p', 'H3094': 'p',
    'H3099': 'a', 'H3100': 'a',
    'H3108': 'o', 'H3109': 'o', 'H3111': 'o', 'H3114': 'o',
    'H3118': 'o', 'H4421': 'o', 'H4494': 'o', 'H4496': 'o',
    'HE020': 'o',
}

## Select a case
Identify all cases with an associated procedure

In [None]:
# Identify all cases with an associated procedure

In [None]:
# Walk every record, link procedure descriptions to their case, and keep one
# class-'p' procedure per case in caseToProcedureBirths.
interventionToCase = {}        # procedure description id -> (case, history number, record index)
caseToProcedureBirths = {}     # case -> procedure row selected for that case
checkSeveralProcedures = []    # cases where two different 'p' procedures were found

procedureIdPattern = re.compile('<idProcedimiento>([a-zA-Z0-9]*)</idProcedimiento>')
emergencyCSectionTag = '<postDiagnosticoPrincipal>O821 - PARTO POR CESAREA DE EMERGENCIA</postDiagnosticoPrincipal>'

for i, r in tqdm.tqdm_notebook(registros.iterrows()):
    # Only records whose XML carries an IdDescripcion row reference a procedure.
    if not (isinstance(r.RegistroXML, str) and '<row NombreCampo="IdDescripcion"' in r.RegistroXML):
        continue

    et = ET.fromstring(r.RegistroXML)
    idDescripcionProcedimiento = et.find('.//row[@NombreCampo="IdDescripcion"]').get('ValorCampo')
    interventionToCase[idDescripcionProcedimiento] = (r.Caso, r.NumeroHistoria, i)

    procedure = procedimientos.loc[idDescripcionProcedimiento]
    procedureType = procedureIdPattern.findall(procedure.XmlDescripcion)[0]
    if classificationProcedures[procedureType] != 'p':
        continue

    # If it is the case that they need to stop labour because a c-section is needed
    if emergencyCSectionTag in procedure.XmlDescripcion:
        continue

    if r.Caso not in caseToProcedureBirths:
        caseToProcedureBirths[r.Caso] = procedure
        continue

    r1 = caseToProcedureBirths[r.Caso]
    if r1.XmlDescripcion == procedure.XmlDescripcion:
        # Same description registered twice: nothing to reconcile.
        continue

    # Two different 'p' procedures for one case: report it and keep the one
    # with the smaller FechaRegistro.
    # NOTE(review): FechaRegistro is compared as loaded from the CSV - confirm it orders chronologically.
    checkSeveralProcedures.append(r.Caso)
    print('error', r.Caso)
    if r1.FechaRegistro > procedure.FechaRegistro:
        caseToProcedureBirths[r.Caso] = procedure

In [None]:
import birthDatasetStructure
importlib.reload(birthDatasetStructure)  # pick up edits to the module without restarting the kernel

# Build one BirthDataset per case that has a selected birth procedure.
registrosByCaso = registros.groupby('Caso')
processedDatasets = {}
for c, p in tqdm.tqdm_notebook(caseToProcedureBirths.items()):
    if c not in casos.index:
        print('ERROR!', c)
        continue

    # Cases without infirmary entries get an empty frame instead.
    hasInfirmeryEntries = c in entriesInfirmeryByCase.groups
    entriesInf = entriesInfirmeryByCase.get_group(c) if hasInfirmeryEntries else pandas.DataFrame()

    processedDatasets[c] = birthDatasetStructure.BirthDataset(
        c,
        casos.loc[c],
        p,
        registrosByCaso.get_group(c),
        pacientes,
        entriesInf,
    )


In [None]:
# Look for twins: list every case holding more than one newborn record.
for c, p in processedDatasets.items():
    newbornCount = len(p.registrosRecienNacido)
    if newbornCount > 1:
        print(c, p.procTypeId, newbornCount)


In [None]:
import math
def relErrorInLimits(v1, v2, th = .05):
    """
    Tell whether two values are numerically close to each other.

    Both values are converted with float(); the symmetric relative difference
    2 * |v1 - v2| / (|v1| + |v2|) is then compared against `th`.

    v1, v2: anything float() accepts (numbers or numeric strings).
    th:     exclusive upper bound for the relative difference.

    Returns True when the values are equal or their relative difference is
    below `th`; False otherwise, including when a value cannot be converted.
    """
    try:
        v1 = float(v1)
        v2 = float(v2)
    except (TypeError, ValueError, OverflowError):
        return False

    # Equal values (notably 0 and 0) are trivially within limits; this also
    # keeps the denominator below from being zero.
    if v1 == v2:
        return True

    # Absolute values in the denominator: a signed sum made pairs of negative
    # numbers yield a negative "error" that always passed the threshold.
    relErr = 2 * abs(v1 - v2) / (abs(v1) + abs(v2))
    return relErr < th
def joinDicts(d1, d2):
    """
    Merge two dicts into a new one without modifying the inputs.

    For keys present in both with different values:
      * if relErrorInLimits accepts the pair, the mean of both (as floats) is kept;
      * otherwise d1's value is kept and an extra entry
        'error_key_<key>' = (True, (d1 value, d2 value)) is recorded.
    Everywhere else d1's entries take precedence over d2's.
    """
    winners = d1.copy()
    merged = d2.copy()
    for key, mine in d1.items():
        # Look the key up in the working copy: it may already hold error
        # entries added earlier in this loop.
        if key not in merged:
            continue
        theirs = merged[key]
        if mine != theirs:
            if relErrorInLimits(mine, theirs):
                winners[key] = (float(mine) + float(theirs)) / 2
            else:
                merged['error_key_' + str(key)] = (True, (mine, theirs))
    merged.update(winners)
    return merged

In [None]:
importlib.reload(parsingDatabaseUtils)  # pick up edits to the module without restarting the kernel

# For every case, merge mother data, procedure-description data and the data
# of each newborn record into a single dict, then tabulate all cases in `df`.
resSIP = {}
breakLoop = False
count = 0
for c, p in tqdm.tqdm_notebook(processedDatasets.items()):
    resSIP[c] = joinDicts(
        parsingDatabaseUtils.getMotherData(p),
        parsingDatabaseUtils.getInformationFromProcedureDescription(p),
    )
    for k in p.registrosRecienNacido:
        resSIP[c] = joinDicts(resSIP[c], parsingDatabaseUtils.getNewbornData(p, k))
    # Cases with at least one newborn record get VAR_0286 forced to '0'.
    if len(p.registrosRecienNacido):
        resSIP[c]['VAR_0286'] = '0'

df = pandas.DataFrame.from_dict(resSIP, orient='index')
#l = list(df[(df.VAR_0294 != df.VAR_0294) & (df.VAR_0293 != df.VAR_0293)].index)
#print(len(l))
print(count)

In [None]:
def toInt(s, d = 1):
    """
    Format `s` as an integer string left-padded with zeros to at least `d` characters.

    s: value accepted by int() (number or numeric string).
    d: minimum width of the result.

    Returns the padded string, or "X" when `s` cannot be converted to an
    integer (None, NaN, infinity, non-numeric text, ...).
    """
    try:
        s = str(int(s))
        return addZeros(s, d)
    # Only conversion failures mean "no value"; a bare except would also
    # swallow KeyboardInterrupt and hide unrelated bugs.
    except (TypeError, ValueError, OverflowError):
        return "X"
def addZeros(s, d):
    """Left-pad the string `s` with '0' up to `d` characters; longer strings are returned unchanged."""
    return '0' * (d - len(s)) + s 

# `datetime` is used below but is not among the notebook's imports.
import datetime

#Add SIP information
# ID01 is the concatenation of: the place id padded to 20 characters,
# VAR_0019 padded to 20 characters, VAR_0040 as an integer of at least
# 2 digits and VAR_0286 as an integer ("X" where a value is not an integer).
placeId = addZeros('806001061-8', 20)
df['ID01'] = placeId + df['VAR_0019'].map(lambda s: addZeros(s,20))  \
                                          + df['VAR_0040'].map(lambda s: toInt(s, 2)) + df['VAR_0286'].map(toInt)

# Render VAR_0198 as an integer string; NaN (the only value with s != s) is left untouched.
df['VAR_0198'] = df['VAR_0198'].map(lambda s: str(int(s)) if s == s else s)
df['VERSION'] = '4.1.2'
# Take the timestamp once so FECHA and HORA always describe the same instant.
exportTime = datetime.datetime.now()
df['FECHA'] = exportTime.strftime("%Y-%m-%d")
df['HORA'] = exportTime.strftime("%H:%M:%S")
df['USUARIO'] = 'CMRC-AUTO'

In [None]:
# Quality control: report every column where more than 5% of the rows hold a
# value that differs from itself (i.e. NaN), together with that share.
n = len(df)
for c in df.columns:
    missingShare = sum(df[c] != df[c]) / n
    if missingShare > .05:
        print(c, missingShare)

In [None]:
import parseMeasurementsByDay
importlib.reload(parseMeasurementsByDay)  # pick up edits to the module without restarting the kernel

# Per case: hospitalisation paraclinics followed by vital signs, plus (when an
# epicrisis exists) the paraclinics parsed from it.
measurements = {}
measurementsControlsPrenatal = {}
for c, p in tqdm.tqdm_notebook(processedDatasets.items()):
    hospitalParaclinics = parseMeasurementsByDay.getParaClinicsHospitalisation(p)
    vitalSigns = parseMeasurementsByDay.getAllVitalSigns(p)
    measurements[c] = hospitalParaclinics + vitalSigns
    if p.epicrisis is None:
        continue
    measurementsControlsPrenatal[c] = parseMeasurementsByDay.parseParaclinicsBeforeHospitalisation(p.epicrisis)

In [None]:
# For every case, summarise each measured field ('Campo') by the median,
# maximum and minimum of its values and store them in resSIP[c] under the keys
# '<Campo>_median', '<Campo>_max' and '<Campo>_min'.
for c, p in tqdm.tqdm_notebook(processedDatasets.items()):
    dfMeasurements = pandas.DataFrame(data = measurements[c], columns =['Campo', 'Fecha', 'Valor'])
    # Values may arrive as text with a decimal comma; normalise before casting.
    dfMeasurements['Valor'] = dfMeasurements['Valor'].map(lambda s: s.replace(',', '.') if isinstance(s, str) else s)
    dfMeasurements['Valor'] = dfMeasurements['Valor'].astype(float)
    dfValues = dfMeasurements.groupby('Campo')['Valor'].agg(['median', 'max', 'min'])

    res = {}
    for var, row in dfValues.iterrows():
        # Read the aggregated columns by label. `row.median()` / `row.max()` /
        # `row.min()` would call the Series methods over the three statistics,
        # which only happens to give the same numbers.
        for stat in ('median', 'max', 'min'):
            res[var + '_' + stat] = row[stat]
    resSIP[c].update(res)

In [None]:
# Tabulate resSIP (one row per case, keyed by the dict's keys) and write it to
# resultsSIP.csv in the working directory.
dfResSIP = pandas.DataFrame.from_dict(resSIP, orient = 'index')
dfResSIP.to_csv('resultsSIP.csv')

# Tests

In [None]:
# For each row whose 'sufrimientoFetal' column equals 'SI', print what
# findInXML returns for 'MedicamentosAdministrado' in that case's epicrisis XML.
flaggedCases = df[df['sufrimientoFetal'] == 'SI']
for c in flaggedCases.index:
    et = ET.fromstring(processedDatasets[c].epicrisis.RegistroXML)
    print(parsingDatabaseUtils.findInXML('MedicamentosAdministrado', et))

In [None]:
# Collect the cases lacking an `ingreso` record or an `epicrisis` record and
# print how many there are of each next to the total number of cases.
noEpicrsis = []

noIngreso = [c for c, p in processedDatasets.items() if p.ingreso is None]
noDischarge = [c for c, p in processedDatasets.items() if p.epicrisis is None]
print('no emergency' , len(noIngreso), 'no epi', len(noDischarge), len(processedDatasets))

In [None]:
# Show, for the first case without an `ingreso` record: its patient row,
# the case id and the case row.
(
    pacientes.loc[str(casos.loc[noIngreso[0], 'Paciente'])],
    noIngreso[0],
    casos.loc[noIngreso[0]],
)

In [None]:
#count = 0
#for c, p in processedDatasets.items():
#    if p.epicrisis is None:
#        continue
#    et = ET.fromstring(p.epicrisis.RegistroXML)
#    ant = parsingDatabaseUtils.fullCleanTxt(parsingDatabaseUtils.findInXML('AntecedentesHTML', et))
#    if not 'paracli' in ant:
#        count += 1
#    else:
#        print(ant)

In [None]:
# Pretty-print the XML of the first newborn record of one hard-coded case.
c = 'AD284225'
newbornRecords = processedDatasets[c].registrosRecienNacido
k = next(iter(newbornRecords.keys()))
parsingDatabaseUtils.prettyPrintXML(newbornRecords[k][k].RegistroXML)


In [None]:
# Count the cases whose epicrisis XML contains the text 'ECLAMPS'.
# (The previous leading `c = list(...)[884]` was overwritten by the loop
# right away and could raise IndexError on smaller datasets, so it is gone.)
count = 0
for c in processedDatasets.keys():
    try:
        if 'ECLAMPS' in processedDatasets[c].epicrisis.RegistroXML:
            count += 1
            #parsingDatabaseUtils.prettyPrintXML(processedDatasets[c].epicrisis.RegistroXML)
    # Skip cases with no epicrisis (None has no RegistroXML) or whose
    # RegistroXML is not text; anything else should surface as an error.
    except (AttributeError, TypeError):
        pass

In [None]:
# Write the current contents of `df` to test.csv in the working directory.
df.to_csv('test.csv')

In [None]:
# Pretty-print the epicrisis XML of the case at position 244 of processedDatasets.
c = list(processedDatasets)[244]
epicrisisXml = processedDatasets[c].epicrisis.RegistroXML
parsingDatabaseUtils.prettyPrintXML(epicrisisXml)

In [None]:
import cProfile

In [None]:
# Parse the paraclinics of every case that has an epicrisis.
# NOTE(review): parseParaclinics and paraclinicsToDF are neither defined nor
# imported in the visible part of this notebook - confirm where they come from.
paraClinics = {}
for c, p in tqdm.tqdm_notebook(processedDatasets.items()):
    if p.epicrisis is None:
        continue
    # Parse once and reuse the result; the epicrisis used to be parsed twice
    # and the first result thrown away.
    d = parseParaclinics(p.epicrisis)
    paraClinics[c] = paraclinicsToDF(d)


In [None]:
# Frequency of each value in the 'noParaclinicalTestsConfirmed' column.
# NOTE: dfParaClinics is only assigned further down in this file, so that cell must run first.
dfParaClinics['noParaclinicalTestsConfirmed'].value_counts()

In [None]:
# Tabulate the parsed paraclinics (one row per case), order the columns by
# name and write the table to paraclinics.csv.
dfParaClinics = pandas.DataFrame.from_dict(paraClinics, orient='index')
orderedColumns = sorted(dfParaClinics.columns)
dfParaClinics = dfParaClinics[orderedColumns]
dfParaClinics.to_csv('paraclinics.csv')

In [None]:
# For every case whose parsed paraclinics hold anything other than exactly one
# entry, print the 'text' entry followed by all remaining key/value pairs.
separator = '-----------------'
for c, p in paraClinics.items():
    if len(p) == 1:
        continue
    print(p['text'])
    print(separator)
    for k in p:
        if k != 'text':
            print(k, p[k])
    print(separator)

In [None]:
def merge2DF(df1, df2):
    """
    Intended contract (NOT IMPLEMENTED YET - the body is a stub and the
    function currently returns None):

    Return the merge of two dataframes that share the same index (1 to 1).
    For columns with the same name, check that the values are the same (or
    that one of them is NA), and create a new error column showing where
    there have been problems.
    """
    # TODO: implement; callers currently receive None.
    pass