In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
from datetime import datetime
import re
import random
import math

In [2]:
# Load the two input tables; both paths are relative to the notebook's folder.
# Event-code strings: comma-separated file, column types inferred by pandas.
codes = pd.read_csv('../patients_w_codes.csv')
# Patient information: tab-separated file, every column kept as text.
info = pd.read_csv('../data/info.tsv', dtype=str, sep='\t')

In [3]:
# Display the `codes` table read from '../patients_w_codes.csv'.
codes

Unnamed: 0,id,first_event_date,code
0,005cad4958846409,2020-08-14,B42A40A2H1F10A15B21B7B12I24I7F6B28B128A4D77B64...
1,0065b57e5a9e784f,2020-03-26,A37B234F4B4B8B2F2B2F3B4F0G3B2I8G17I215I82B9B23A
2,006e5a1bf72e8b73,2020-06-02,B56B7B35B11B3B11B10B13B14B8A178I21I123B6B5B6B5...
3,009d11a2497f1d47,2020-10-14,A334C142J
4,00a5cf0620b6adfc,2020-10-26,A201J213A
...,...,...,...
3298,ff32e199c9482c20,2020-05-07,B196B12B21B6F1B7B1I6B6A15B7B7I70B14B78B7B7B8B5...
3299,ff46357762ea097e,2020-04-20,B92B99B12B9B6B4B32F6I4B18I232C4A14B17I79D36B3B...
3300,ff9318bd975b7050,2020-08-10,A16A64B74B14A197I
3301,ffabda76a8fbaecd,2020-05-05,B24B182A12B13B7B6F1B0I7B1B5B7F1I56B15F218B4F10...


In [4]:
# Display the `info` table read from '../data/info.tsv' (all columns are text).
info

Unnamed: 0,id,hosp,age,sex,maca,pcc,gma,symp_date
0,fc34ff975cf7159e,,91.0,Dona,N/D,1,332.0,
1,dd656556a4d03913,,42.0,Dona,N/D,N/D,322.0,
2,5d4f2cc96647b02d,X,64.0,Dona,N/D,N/D,402.0,
3,0efe2e67b10abefb,,32.0,Dona,N/D,N/D,101.0,
4,6b0f04b8451798ed,,23.0,Dona,N/D,N/D,322.0,
...,...,...,...,...,...,...,...,...
3299,f471895734ac5352,,54.0,Dona,N/D,N/D,322.0,
3300,1b1b507dd7bacf09,,38.0,Home,N/D,N/D,104.0,
3301,310774418891f607,,55.0,Home,N/D,N/D,332.0,
3302,b57220c7796955b1,X,83.0,Home,N/D,N/D,334.0,


Things to do:

1. Identify and exclude patients without reinfection
2. Identify patients with more than one reinfection
3. Identify reinfected patients with PCR(+)-PCR(-)-PCR(+) (case 1) and PCR(+)-90days-PCR(+) (case 2)
4. Classify reinfected patients into different profiles:
    - Confirmed: PCR+(seq) -> PCR- -> PCR+(seq) | at least 90 days between PCR1 and PCR2
    - Suspicious: PCR+ -> PCR- -> PCR+ | at least 90 days between PCR1 and PCR2; no sequencing
    - Suspicious + clinical crit: same + symptomatology and/or hospitalization
    - Suspicious + serologic crit: same + negative serology (sero-) >= 14 days after the 1st PCR+, then positive serology (sero+) after the 2nd episode
    - Suspicious + genomic crit: same + only one of the PCRs is sequenced

In [98]:
# Cases 1 and 2 + exclude non-reinfected + multiple reinfections
# split into two lists
# access using i-1
# find one reinfection

In [8]:
# Function to split strings
def split_string(s):
    """
    Split a code string into its non-digit pieces and its numbers.

    The string is cut around every run of digits. Each piece that `int()`
    accepts is stored as an integer; every other piece is stored unchanged.

    s is a string code, e.g. 'A334C142J'.

    Returns two lists (A, B):
      - A: the pieces at even positions of the split, i.e. the text found
        before, between and after the digit runs (['A', 'C', 'J']).
      - B: the pieces at odd positions, i.e. the digit runs as integers
        ([334, 142]).
    A string that starts or ends with a digit yields an empty string ''
    as the first or last element of A.
    """
    S = []
    # Raw string: in a normal literal '\d' is an invalid escape sequence.
    # The capturing group makes re.split keep the digit runs in its result.
    for x in re.split(r'(\d+)', s):
        try:
            S.append(int(x))
        except ValueError:
            # Not a number: keep the piece as text.
            S.append(x)
    A, B = S[::2], S[1::2]
    return A, B

In [77]:
# trying random strings
# randrange(len(codes)) always yields a valid row position. The previous
# random.randint(0, 3303) hard-coded the table size and, because randint
# includes its upper bound, could also draw 3303 itself.
r = random.randrange(len(codes))
print(codes.iloc[r]['id'])
s = codes.iloc[r]['code']

# Split the sampled code string into its letters (A) and numbers (B).
A, B = split_string(s)

A

b40e67178d5f3029


['A', 'F', 'B', 'A']

In [128]:
def case2(A, B, pos):
    """
    Recognises case 2 reinfections (+|+)

    A is the list of event codes and B the list of numbers that go with
    them: B[k] is consumed when stepping from A[k] to A[k+1] (presumably the
    number of days between those two events - TODO confirm). pos is the
    collection of codes that count as positive.

    The scan starts at the first positive event and keeps a running count.
    At every later positive event, if the running count plus the current
    number is at least 90, ('Case 2', that total) is returned. If no positive
    event reaches the threshold, ('Possible case 2', running count) is
    returned.
    """
    # Position of the first positive event.
    start = 0
    while A[start] not in pos:
        start += 1

    elapsed = 0
    for k in range(start, len(B)):
        gap = B[k]
        if A[k + 1] in pos:
            total = elapsed + gap
            if total >= 90:
                return 'Case 2', total
            # NOTE(review): the count restarts at the number that led to this
            # positive event rather than at 0 - confirm this is intended.
            elapsed = gap
        else:
            # Non-positive event: keep accumulating.
            elapsed += gap
    return 'Possible case 2', elapsed

In [127]:
def case1(A, B, pos):
    """
    Recognises case 1 reinfections (+|-|+)

    A is the list of event codes and B the list of numbers that go with
    them: B[k] is consumed when stepping from A[k] to A[k+1] (presumably the
    number of days between those two events - TODO confirm). pos is the
    collection of codes that count as positive.

    Starting at the first positive event, the numbers are summed up to the
    first 'B' event that follows it, and then on to the first positive event
    after that 'B'. Returns ('Case 1', total) when the total is at least 90
    and ('Possible case 1', total) otherwise.

    The caller must make sure a 'B' followed by a positive event exists after
    the first positive event; otherwise the scan runs off the end of the
    lists and raises IndexError.
    """
    # Position of the first positive event.
    k = 0
    while A[k] not in pos:
        k += 1

    days = B[k]
    # Walk forward until the next event is the first 'B'.
    while A[k + 1] != 'B':
        k += 1
        days += B[k]
    # Step onto that 'B' and count the number attached to it.
    k += 1
    days += B[k]
    # Keep walking until the next event is positive again.
    while A[k + 1] not in pos:
        k += 1
        days += B[k]

    label = 'Case 1' if days >= 90 else 'Possible case 1'
    return label, days

In [99]:
# building main function
def classif(s, pos = ['A', 'C', 'D']):
    """
    Takes a code string and classifies it into reinfection case 1 or case 2.

    s is the code string; it is split into codes and numbers with
    split_string. pos lists the codes that count as a positive PCR; it is
    only read, never modified.

    If a 'B' (negative PCR) appears between the first and the last positive
    event, the string is handed to case1, otherwise to case2. The return
    value is whatever that function returns: a (label, count) tuple.

    Raises ValueError when the string contains no positive event at all.
    """
    A, B = split_string(s)
    # Positions of the positive events, in order of appearance.
    indices = [i for i, x in enumerate(A) if x in pos]
    if not indices:
        # Without this check min()/max() fail with an unhelpful
        # "empty sequence" message; keep the same exception type.
        raise ValueError('no positive code {!r} found in {!r}'.format(pos, s))
    first, last = indices[0], indices[-1]
    # searching for a negative PCR between the first and last positive PCR in the string
    if 'B' in A[first:last]:
        return case1(A, B, pos)
    return case2(A, B, pos)

In [124]:
# applying function
# Build the mapping id -> classif(code) in a single pass over the table.
# The previous loop re-filtered the whole frame for every id; it always used
# the first row found for an id, so only the first row per id is kept here.
first_per_id = codes.drop_duplicates(subset='id', keep='first')
d = {
    str(patient_id): classif(code)
    for patient_id, code in zip(first_per_id['id'], first_per_id['code'])
}

In [139]:
# Display the id -> (label, count) mapping built by the classification loop.
d

{'005cad4958846409': ('Case 1', 301),
 '0065b57e5a9e784f': ('Case 1', 659),
 '006e5a1bf72e8b73': ('Case 1', 355),
 '009d11a2497f1d47': ('Case 2', 334),
 '00a5cf0620b6adfc': ('Case 2', 414),
 '00cad70e300b78c8': ('Case 2', 450),
 '00f31c11a3a6f08d': ('Case 2', 143),
 '011e826ab054591f': ('Case 2', 176),
 '012a57320c31a43f': ('Case 1', 424),
 '0134889c8d47909e': ('Case 2', 183),
 '01394866d1e64c72': ('Case 1', 331),
 '0139873673801ea0': ('Case 1', 291),
 '01422e9f5d9ad070': ('Case 2', 136),
 '015271b745049623': ('Case 1', 210),
 '015d56e832d129b0': ('Case 2', 361),
 '0165639a7fb80826': ('Case 1', 243),
 '0180c7c2db07677c': ('Case 1', 508),
 '0197651a06a237d7': ('Case 2', 303),
 '0198a2202a0f910a': ('Case 1', 128),
 '019d07016a193c91': ('Case 1', 532),
 '01a300fb5234bab8': ('Case 2', 377),
 '01b1d27f1734da73': ('Case 1', 500),
 '01c190b6006937c5': ('Case 2', 134),
 '01cd65c858e5b632': ('Case 2', 169),
 '022c2f46c2667c5a': ('Case 2', 205),
 '022dfea715f9b986': ('Case 1', 158),
 '02369f6899

In [138]:
# Count how many ids received each label. The values of `d` are walked once;
# the previous version rebuilt list(d.values()) for every element.
np.unique([result[0] for result in d.values()], return_counts=True)

(array(['Case 1', 'Case 2', 'Possible case 1', 'Possible case 2'],
       dtype='<U15'),
 array([2175,  993,  115,   20]))