In [1]:
import pickle
import pandas as pd
import numpy as np

In [2]:
# Create the geno framework
# Each non-blank line of the feature file is whitespace-separated: the first
# token is used as the ID, every remaining token goes into the MUTATIONS list.
with open('../Data/Walker2015Lancet/dummyfeats1.txt', 'r') as f:
    # Skip blank lines (e.g. a trailing empty line at EOF): ''.split() is [],
    # so indexing [0] on it would raise IndexError.
    geno_lines = [line.strip() for line in f if line.strip()]

# Split each line once and reuse the tokens for both columns.
geno_fields = [line.split() for line in geno_lines]
col_id = [fields[0] for fields in geno_fields]
col_muts = [fields[1:] for fields in geno_fields]

df_geno = pd.DataFrame(data={'ID': col_id, 'MUTATIONS': col_muts}).set_index('ID')

In [3]:
df_geno.head()

Unnamed: 0_level_0,MUTATIONS
ID,Unnamed: 1_level_1
00-R0025,"[eis_V163I, pncA_Y95D, rrs_A1401G, rpoB_S450L,..."
00-R0086,"[gyrA_G668D, gyrA_S95T, gidB_E92D, katG_R463L,..."
00-R0178,"[rrs_A1401G, gyrA_G668D, gyrA_S95T, gyrA_E21Q,..."
00-R0223,"[gyrA_E21Q, gidB_S100F]"
00-R0308,"[rpoB_S450L, gyrA_G668D, gyrA_S95T, gyrA_E21Q,..."


In [4]:
# Build the pheno framework: read the phenotype CSV, using its first column
# as the row index.
df_pheno = pd.read_csv(
    '../Data/Walker2015Lancet/dummyphenos1.csv',
    index_col=0,
)

# Preview the first five rows.
df_pheno.head()

Unnamed: 0_level_0,SM,KAN,AK,CAP,EMB,CIP,OFX,MOX,INH,RIF,PZA,source,linName
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
00-R0025,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,Maha,EastAsia
00-R0086,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,Maha,EastAsia
00-R0178,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,Maha,European
00-R0223,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,0.0,0.0,-1.0,Maha,European
00-R0308,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,Maha,European


In [5]:
# Set the float columns to string: -1 -> nan, 0 -> 'S', 1 -> 'R'

def phenoFloatToStr(x):
    """Map a numeric phenotype code to its string label.

    Mapping (each numeric code is matched within a tolerance of 1e-5):
        null / -1 -> np.nan
        0         -> 'S'
        1         -> 'R'

    Values that are already the labels 'S' or 'R' are returned unchanged, so
    applying this function to an already-converted column is a no-op. This
    keeps the in-place conversion cell safe to re-run.

    Parameters
    ----------
    x : float, str or null
        A single phenotype value.

    Returns
    -------
    str or float
        'S', 'R', or np.nan.

    Raises
    ------
    ValueError
        If ``x`` is neither null, one of the codes -1/0/1, nor an existing
        'S'/'R' label.
    """
    # Already-converted labels pass through; comparing a str against the
    # float bounds below would otherwise raise TypeError.
    if isinstance(x, str):
        if x in ('S', 'R'):
            return x
        raise ValueError('Invalid value: {}'.format(x))

    tol = 1e-5  # codes are stored as floats, so compare with a small tolerance
    if pd.isnull(x) or abs(x + 1) < tol:
        return np.nan
    if abs(x) < tol:
        return 'S'
    if abs(x - 1) < tol:
        return 'R'
    raise ValueError('Invalid value: {}'.format(x))
    
# Convert the first 11 columns of df_pheno in place, one column at a time,
# from numeric codes to string labels; the remaining columns are left as-is.
for pheno_col in df_pheno.columns[:11]:
    converted = df_pheno[pheno_col].apply(phenoFloatToStr)
    df_pheno[pheno_col] = converted

In [7]:
# Merge genotypes and phenotypes on their shared index.
# how='inner' keeps only the IDs present in BOTH frames; rows found in just
# one of them are dropped.
df_geno_pheno = df_geno.join(other=df_pheno, how='inner')

df_geno_pheno

Unnamed: 0_level_0,MUTATIONS,SM,KAN,AK,CAP,EMB,CIP,OFX,MOX,INH,RIF,PZA,source,linName
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
00-R0025,"[eis_V163I, pncA_Y95D, rrs_A1401G, rpoB_S450L,...",,,,,R,,,,R,R,,Maha,EastAsia
00-R0086,"[gyrA_G668D, gyrA_S95T, gidB_E92D, katG_R463L,...",,,,,R,,,,R,R,R,Maha,EastAsia
00-R0178,"[rrs_A1401G, gyrA_G668D, gyrA_S95T, gyrA_E21Q,...",,,,,R,,,,R,R,R,Maha,European
00-R0223,"[gyrA_E21Q, gidB_S100F]",,,,,S,,,,S,S,,Maha,European
00-R0308,"[rpoB_S450L, gyrA_G668D, gyrA_S95T, gyrA_E21Q,...",,,,,R,,,,R,R,R,Maha,European
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WMB496,"[gyrA_G668D, gyrA_S95T, katG_R463L, gyrA_E21Q,...",S,,,,S,,,,R,,,Thailand,IndianOcean
X44787,"[gyrA_G668D, gyrA_S95T, katG_R463L, gyrA_E21Q,...",,,,,S,,,,S,S,S,LID_derivation,
X46120,"[gyrA_G668D, gyrA_S95T, katG_R463L, gyrA_E21Q,...",,,,,S,,,,S,S,S,LID_derivation,
YA00000352-S19,"[gyrA_G668D, gyrA_S95T, gyrA_E21Q, gidB_S100F,...",S,S,S,S,S,,S,S,S,S,S,LID_validation,


In [8]:
# Persist the combined geno/pheno dataframe as a pickle file
output_path = '../Data/Walker2015Lancet.pkl'
df_geno_pheno.to_pickle(output_path)