In [1]:
import pandas as pd
import numpy as np
import holoviews as hv

In [65]:
def sample_cleaner(sample):
    """Standardize a sample name.

    String inputs have every "-" removed, are lower-cased and are stripped of
    leading/trailing whitespace. Any non-string input (for example the NaN
    that pandas uses for an empty cell) is returned unchanged.

    Parameters
    ----------
    sample : object
        Raw sample name, or a missing-value placeholder.

    Returns
    -------
    object
        The cleaned string, or ``sample`` itself when it is not a string.
    """
    # Explicit type check instead of a bare ``except:`` so that unrelated
    # errors (and KeyboardInterrupt) are no longer silently swallowed.
    if not isinstance(sample, str):
        return sample
    return sample.replace("-", "").lower().strip()

def latent_fixer(x):
    """Build a "pl"-prefixed sample name from a numeric identifier.

    ``x`` is converted with ``int`` and then ``str``. A single-character
    result is prefixed with "pl00"; every other result (including values
    with three or more characters) is prefixed with "pl0".
    """
    digits = str(int(x))
    prefix = "pl00" if len(digits) == 1 else "pl0"
    return prefix + digits

In [104]:
# Read the first sheet of the biobank workbook; header=5 takes the column
# labels from the sixth row of the sheet. Rows that are entirely empty are
# discarded.
transmission = (
    pd.read_excel('../data/raw/CyMAF Biobank 21 Aug 2019.xlsx', sheet_name=0, header=5)
    .dropna(how='all')
    .copy()
)
# Normalise the subject codes, then forward-fill the gaps
# (presumably the sheet only states the code on a subject's first row — TODO confirm).
transmission['subject code'] = transmission['subject code'].apply(sample_cleaner).ffill()
# Remove the unlabeled first column.
transmission = transmission.drop(columns="Unnamed: 0")

In [105]:
# Load the merged measurements and keep only the PP / PL / NP / NL cohorts.
df = pd.read_csv('../data/processed/190530_merged_cmv_remmel.csv')
df = df[df['Cohort'].isin(['PP', 'PL', 'NP', 'NL'])].copy()
df['Sample'] = df['Sample'].apply(sample_cleaner)
# Fix a naming inconsistency: 'p97' is renamed to 'p097'
# (presumably to match the zero-padded codes used elsewhere — TODO confirm).
df.loc[df['Sample'] == 'p97', 'Sample'] = 'p097'

In [106]:
# Sample names of the PP cohort, leaving out every name that contains a "v".
pp_samples = df.loc[df['Cohort'] == 'PP', 'Sample']
pp_samples = set(pp_samples[~pp_samples.str.contains("v")])
# Subject codes available in the biobank sheet.
meta_samples = set(transmission['subject code'])
# Names present in both sources.
shared_samples = list(pp_samples & meta_samples)

In [107]:
# Restrict the biobank rows to the subjects found in both data sets and renumber the index.
transmission = (
    transmission[transmission['subject code'].isin(shared_samples)]
    .copy()
    .reset_index(drop=True)
)

In [87]:
# Number of missing values per column, sorted with the most incomplete column first.
counts = transmission.isna().sum().sort_values(ascending=False)

In [88]:
# Display the per-column missing-value counts.
counts

Unnamed: 0                                                    69
Liaison CMV IgG avidity II (Index <0,15 low;  >0,25 high)     67
Liaison CMV IgG II  (U/ml <12 negative, >14 positive)         65
Liaison CMV IgM II            (U/ml >22 positive)             65
PBMC (n°vials/n°PBMC per vial)                                49
transmission                                                  42
Subject no.                                                   42
age                                                           42
parity                                                        42
weeks of gestation at infection onset                         42
Anti-pentamer IgG titer (ELISA)                               13
Anti-gH/gL/gO IgG titer (ELISA)                               13
Anti-gB IgG titer (ELISA)                                     13
DNA (copies/ml)                                               10
ETI-CYTOK IgM (Index)                                          6
ETI-CYTOK IgG (UI/ml)    

In [92]:
# Identify the columns that carry a single value per subject: they are taken
# to be the ones whose missing-value count lies strictly between 40 and 45.
counts = transmission.isna().sum().sort_values(ascending=False)
counts_bool = (counts > 40) & (counts < 45)
one_per_subject = ['subject code', *counts[counts_bool].index]
# Keep only the rows in which every selected column is filled in
# (dropping rows with any NaN also removes the rows that are entirely NaN).
metadata = transmission[one_per_subject].dropna().copy()

In [94]:
# Preview the first rows of the per-subject metadata.
metadata.head()

Unnamed: 0,subject code,transmission,Subject no.,age,parity,weeks of gestation at infection onset
0,p005,NT,1.0,33.0,1.0,5.0
3,p007,T,3.0,37.0,1.0,8.0
6,p016,T,4.0,34.0,1.0,25.0
10,p021,T,5.0,31.0,1.0,26.0
14,p026,NT,8.0,29.0,1.0,1.0


In [95]:
# Rename by label instead of assigning a positional list of names: the
# incoming column order comes from sorting tied NaN counts, so a positional
# assignment could silently attach the wrong name to a column. Renaming by
# label is also safe to re-run (labels that are already renamed are ignored).
metadata = metadata.rename(columns={
    'subject code': 'subject_code',
    'Subject no.': 'subject_no',
    'weeks of gestation at infection onset': 'weeks_gestation_infection_onset',
})
# Put the columns in their final order and renumber the rows from zero.
metadata = metadata[['subject_no', 'subject_code', 'transmission', 'age', 'parity', 'weeks_gestation_infection_onset']].copy().reset_index(drop=True)

In [96]:
# Display the renamed and reordered per-subject metadata.
metadata

Unnamed: 0,subject_no,subject_code,transmission,age,parity,weeks_gestation_infection_onset
0,1.0,p005,NT,33.0,1.0,5.0
1,3.0,p007,T,37.0,1.0,8.0
2,4.0,p016,T,34.0,1.0,25.0
3,5.0,p021,T,31.0,1.0,26.0
4,8.0,p026,NT,29.0,1.0,1.0
5,11.0,p032,NT,36.0,1.0,25.0
6,17.0,p053,NT,31.0,0.0,10.0
7,24.0,p074,NT,40.0,1.0,9.0
8,26.0,p083,NT,38.0,2.0,18.0
9,27.0,p087,NT,39.0,1.0,17.0


In [98]:
# Display the per-column missing-value counts again for reference.
counts

Unnamed: 0                                                    69
Liaison CMV IgG avidity II (Index <0,15 low;  >0,25 high)     67
Liaison CMV IgG II  (U/ml <12 negative, >14 positive)         65
Liaison CMV IgM II            (U/ml >22 positive)             65
PBMC (n°vials/n°PBMC per vial)                                49
transmission                                                  42
Subject no.                                                   42
age                                                           42
parity                                                        42
weeks of gestation at infection onset                         42
Anti-pentamer IgG titer (ELISA)                               13
Anti-gH/gL/gO IgG titer (ELISA)                               13
Anti-gB IgG titer (ELISA)                                     13
DNA (copies/ml)                                               10
ETI-CYTOK IgM (Index)                                          6
ETI-CYTOK IgG (UI/ml)    

In [109]:
# Collapse to one row per subject, taking each column's first entry within the subject's group.
by_subject = (
    transmission
    .groupby('subject code')
    .first()
    .reset_index()
)
# Missing values that remain per column after collapsing, most incomplete first.
by_subject.isna().sum().sort_values(ascending=False)

Liaison CMV IgG avidity II (Index <0,15 low;  >0,25 high)     26
Liaison CMV IgG II  (U/ml <12 negative, >14 positive)         26
Liaison CMV IgM II            (U/ml >22 positive)             26
PBMC (n°vials/n°PBMC per vial)                                15
Anti-gB IgG titer (ELISA)                                      4
Anti-gH/gL/gO IgG titer (ELISA)                                4
Anti-pentamer IgG titer (ELISA)                                4
ETI-CYTOK IgG (UI/ml)                                          2
ETI-CYTOK IgM (Index)                                          2
Avidity index (%)                                              1
DNA (copies/ml)                                                1
NT titer HELF (AD169 Vo36)                                     0
NT titer ARPE-19 (VR1814 HUVEC/454)                            0
plasma (ml)                                                    0
Plaque inhibition ARPE (VR1814 HUVEC/454)                      0
serum (ml)               

In [115]:
# Missing-value count per column of the one-row-per-subject frame (unsorted).
nas = by_subject.isna().sum()

In [119]:
# Names of the columns that have no missing value in the per-subject frame.
complete_data_cols = nas[nas == 0].index.tolist()

In [122]:
# Keep only the columns listed in complete_data_cols. This overwrites
# `transmission`, so the dropped columns are no longer available afterwards.
transmission = transmission[complete_data_cols].copy()

In [124]:
# Drop every column that contains at least one NaN, leaving only the columns
# that are filled in on every row.
multiple_entries = transmission.dropna(axis=1).copy()

In [125]:
# Reference day (days after infection onset). Each subject's sample closest to
# this day is the one selected below; the value was chosen to get the smallest
# possible spread across subjects.
midpoint = 45

In [127]:
# Absolute gap in days between each sample and `midpoint`.
# (The column label says "mean", but the reference value is the fixed `midpoint`.)
multiple_entries['distance from mean'] = (multiple_entries['days after infection onset'] - midpoint).abs()

In [152]:
# For every subject, find the sampling day that lies closest to `midpoint`.
# When several rows share the smallest distance, the first one in frame order wins.
subjects_days = []
for s in multiple_entries['subject code'].unique():
    rows = multiple_entries.loc[multiple_entries['subject code'] == s]
    min_distance = rows['distance from mean'].min()
    closest = rows.loc[rows['distance from mean'] == min_distance, 'days after infection onset']
    min_day = int(closest.iloc[0])
    print(s, min_distance, min_day)
    subjects_days.append((s, min_day))
    
    

p005 11.0 56
p007 12.0 57
p016 27.0 72
p021 9.0 54
p026 2.0 47
p032 11.0 56
p053 2.0 43
p074 3.0 48
p083 4.0 49
p087 7.0 52
p088 10.0 55
p097 16.0 29
p100 11.0 34
p130 8.0 37
p200 14.0 31
p049 4.0 49
p062 15.0 60
p065 19.0 64
p073 16.0 29
p084 17.0 28
p085 11.0 56
p109 3.0 48
013003 16.0 29
013004 9.0 36
013015 5.0 40
013022 7.0 52
013025 4.0 41


In [153]:
# Display the (subject code, selected day) pairs.
subjects_days

[('p005', 56),
 ('p007', 57),
 ('p016', 72),
 ('p021', 54),
 ('p026', 47),
 ('p032', 56),
 ('p053', 43),
 ('p074', 48),
 ('p083', 49),
 ('p087', 52),
 ('p088', 55),
 ('p097', 29),
 ('p100', 34),
 ('p130', 37),
 ('p200', 31),
 ('p049', 49),
 ('p062', 60),
 ('p065', 64),
 ('p073', 29),
 ('p084', 28),
 ('p085', 56),
 ('p109', 48),
 ('013003', 29),
 ('013004', 36),
 ('013015', 40),
 ('013022', 52),
 ('013025', 41)]

In [154]:
# Collect, for each (subject, selected day) pair, the first matching row of
# `multiple_entries` as a plain list of values.
cols = multiple_entries.columns
new_data = []
for s, d in subjects_days:
    is_match = (multiple_entries['subject code'] == s) & (multiple_entries['days after infection onset'] == d)
    line = multiple_entries[is_match].reset_index(drop=True)
    if line.empty:
        # No row for this pair: report it and skip it. Previously a bare
        # ``except:`` printed the same information but then still appended the
        # empty DataFrame, which corrupts the row list used to build the frame.
        print(s, d)
        print(line)
        continue
    line = list(line.iloc[0])
    new_data.append(line)

In [145]:
# NOTE(review): `thing` is not assigned in any cell visible in this notebook, so
# this cell fails on a fresh kernel. It looks like a leftover scratch cell
# (presumably inspecting one selected row) — confirm and remove.
list(thing.iloc[0])

['p005', Timestamp('2011-04-21 00:00:00'), 56.0, 2560, 160, 0.6, 0.5, 11.0]

In [178]:
# Assemble the collected rows into a frame with the same column labels as `multiple_entries`.
final_dataframe = pd.DataFrame(new_data, columns=cols)

In [179]:
# Write the selected rows to CSV without the index column.
final_dataframe.to_csv('../data/processed/2019-10-14-transmission-data.csv', index=False)

In [180]:
# Attach the one-per-subject fields to the selected sample rows.
# NOTE(review): `single_entries` is assigned in a cell that appears further down
# in this file, and this cell overwrites its own input, so running it twice
# merges onto the already-merged frame.
final_dataframe = single_entries.merge(final_dataframe, on='subject code')

In [181]:
# Inner-join the measurements with the transmission data on the sample name,
# then drop the duplicated key column coming from the right-hand frame.
transmission_pp = (
    df.merge(final_dataframe, how='inner', left_on='Sample', right_on='subject code')
    .drop(columns='subject code')
)

In [183]:
# Write the merged frame to CSV without the index column.
transmission_pp.to_csv('../data/processed/2019-10-14-transmission-pp-repaired.csv', index=False)

In [184]:
# Display the merged frame.
transmission_pp

Unnamed: 0,Cohort,Sample,2B_CG1,2B_CG2,2B_DS-Cav1 A-subtype RSV,2B_Measles,2B_Pertactin,2B_Pertussis,2B_Rubella viral capsid,2B_gB,...,parity,weeks of gestation at infection onset,transmission,date of sample (dd/mm/year),days after infection onset,NT titer ARPE-19 (VR1814 HUVEC/454),Plaque inhibition ARPE (VR1814 HUVEC/454),serum (ml),plasma (ml),distance from mean
0,PP,013003,46192.5,11916.0,55972.0,66.0,61.0,111.0,132.0,642.0,...,1.0,8.0,T,2009-06-30,29.0,1280,80,1.0,0.5,16.0
1,PP,013004,14034.0,5061.0,6494.0,52.0,84.0,120.0,128.0,266.0,...,1.0,18.0,NT,2009-07-02,36.0,1280,160,1.0,0.5,9.0
2,PP,013015,12500.0,2121.0,9311.0,48.0,57.0,240.0,147.5,3952.0,...,1.0,7.0,T,2009-10-19,40.0,2560,160,1.0,0.5,5.0
3,PP,013022,8507.0,1783.0,13581.5,75.0,75.0,163.0,144.0,1022.0,...,1.0,8.0,T,2009-12-31,52.0,640,160,1.0,0.5,7.0
4,PP,013025,16317.0,2552.0,1820.0,72.5,58.5,115.5,131.0,91.0,...,1.0,22.0,T,2009-12-15,41.0,2560,160,1.0,0.5,4.0
5,PP,p005,74591.0,1464.0,125404.0,87.5,212.0,606.0,211.0,4079.0,...,1.0,5.0,NT,2011-04-21,56.0,2560,160,0.6,0.5,11.0
6,PP,p007,84256.5,31781.0,75221.5,73.0,56.0,232.0,185.0,4949.5,...,1.0,8.0,T,2011-05-11,57.0,5120,1280,0.75,0.5,12.0
7,PP,p016,72436.0,2238.0,45424.0,175.0,47.0,339.0,160.0,167.0,...,1.0,25.0,T,2011-08-24,72.0,2560,320,0.6,0.5,27.0
8,PP,p021,12386.0,1998.5,88188.5,58.0,65.5,199.0,135.0,203.0,...,1.0,26.0,T,2011-08-08,54.0,1280,1280,0.5,0.5,9.0
9,PP,p026,7709.0,542.0,27080.0,62.0,80.0,280.0,212.0,53.0,...,1.0,1.0,NT,2011-08-09,47.0,160,160,1.0,0.5,2.0


In [167]:
# NOTE(review): `subset` is not assigned in any cell visible in this notebook,
# and it is passed positionally, so it is not bound to dropna's `subset=`
# keyword. The result is not stored either. This looks like an unfinished
# scratch cell — confirm and remove, or spell out `subset=[...]`.
transmission.dropna(subset)

Unnamed: 0,subject code,Subject no.,age,parity,weeks of gestation at infection onset,transmission,date of sample (dd/mm/year),days after infection onset,NT titer HELF (AD169 Vo36),NT titer ARPE-19 (VR1814 HUVEC/454),Plaque inhibition ARPE (VR1814 HUVEC/454),serum (ml),plasma (ml)
0,p005,1.0,33.0,1.0,5.0,NT,2011-04-21,56.0,20,2560,160,0.60,0.5
1,p005,,,,,,2011-05-17,82.0,20,2560,160,1.00,0.5
2,p005,,,,,,2011-09-06,194.0,20,2560,1280,0.50,0.5
3,p007,3.0,37.0,1.0,8.0,T,2011-05-11,57.0,20,5120,1280,0.75,0.5
4,p007,,,,,,2011-06-13,90.0,40,10240,640,0.75,0.5
5,p007,,,,,,2011-10-20,219.0,320,20480,1280,0.60,0.5
6,p016,4.0,34.0,1.0,25.0,T,2011-06-29,16.0,<5,320,160,0.50,0.5
7,p016,,,,,,2011-08-24,72.0,160,2560,320,0.60,0.5
8,p016,,,,,,2011-09-07,86.0,,2560,160,0.75,0.5
9,p016,,,,,,2012-01-19,220.0,320,5120,160,0.50,0.5


In [177]:
# Per-subject fields: keep the rows on which all of them are filled in,
# then renumber the rows from zero.
single_entries = (
    transmission[[
        'subject code',
        'Subject no.',
        'age',
        'parity',
        'weeks of gestation at infection onset',
        'transmission',
    ]]
    .dropna()
    .reset_index(drop=True)
    .copy()
)