In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from copy import deepcopy
import time
import random
from scipy import stats
from scipy.stats import mode
import seaborn as sns
import warnings
from collections import defaultdict
warnings.filterwarnings("ignore")

## Annotate positions and within antibiotic resistance encoding genes or their upstream/downstream regions

In [6]:
!mkdir 03.annotation
## Extract the upstream and downstrean regions for each resistance-associated gene ###
# Take the antibiotic resistance gene list and define the upstream and downstream regions
ab_genes = pd.read_csv("input/AB_genes.csv")

genome_position_to_gene = pd.read_csv("input/genome_position_to_gene.csv", index_col=0)
genome_position_to_gene = genome_position_to_gene.sort_values("i")

# Initialize column to be added
data = ab_genes
data[["locus_name", "gene_start", "gene_end"]] = np.nan
data[[ "upstream_gene", "downstream_gene", "upstream_start", 
      "upstream_end", "downstream_start", "downstream_end"]] = np.nan

for idx, row in data.iterrows():
    
    # Add gene locus and gene start/end locations
    positions = genome_position_to_gene.query("gene==@row.gene")
    
    gene_start = positions.i.min()
    gene_end = positions.i.max()
    
    data.loc[idx, "gene_start"] = gene_start
    data.loc[idx, "gene_end"] = gene_end
    data.loc[idx, "locus_name"] = list(genome_position_to_gene.locus)[0]
    
    data.loc[idx, "upstream_end"] = gene_start - 1
    data.loc[idx, "downstream_start"] = gene_end + 1
    
    # upstream stuff
    upstream_all = genome_position_to_gene.query("i < @gene_start")
    upstream_gene = list(upstream_all.gene)[-1]
    upstream = genome_position_to_gene.query("gene==@upstream_gene")
    data.loc[idx, "upstream_gene"] = upstream_gene
    data.loc[idx, "upstream_start"] = upstream.i.max() + 1 
    
    # downstream stuff
    downstream_all = genome_position_to_gene.query("i > @gene_end")
    downstream_gene = list(downstream_all.gene)[0]
    downstream = genome_position_to_gene.query("gene==@downstream_gene")
    data.loc[idx, "downstream_gene"] = downstream_gene
    data.loc[idx, "downstream_end"] = downstream.i.min() - 1

data.to_csv("03.annotation/AB_genes_position_annotation.csv")

mkdir: cannot create directory ‘03.annotation’: File exists


In [8]:
data = pd.read_csv("03.annotation/AB_genes_position_annotation.csv")
ab_to_region = {}
for idx, row in data.iterrows():
    gene_name = str(row.gene)
    for position in list(range(int(row.upstream_start), int(row.upstream_end))) +\
        list(range(int(row.downstream_start), int(row.downstream_end))):
        ab_to_region[position] = "r_" + gene_name

# # Read in the SNP annotation file
snps = pd.read_pickle("input/genotypes_SNP_annotation.pkl")

# Filter for just snps in our analysis
positions = pd.read_csv("01.mutation_and_comutation_data/first_order_statistics.csv", index_col=0)
snps = snps.query("pos in @positions.pos")

# annotate the region around genes
for idx, row in snps.iterrows():
    position = row.pos
    
    if position in ab_to_region:
        snps.loc[idx, "gene_name"] = ab_to_region[position]

# need to manually add name for rrs
snps.loc[snps.query("gene_id=='Rvnr01'").index, "gene_name"] = 'rrs'


In [9]:
# Make a list of antibiotic-associated gene names
ab_to_genes = defaultdict(list)
ab_genes_list = []

for idx,row in data.iterrows():
    #print(row)
    ab_to_genes[row.Drug].append(row.gene)
    ab_to_genes[row.Drug].append("r_"+row.gene)
    ab_genes_list += [row.gene, "r_"+row.gene]
    
AB_gene_names = list(set(ab_genes_list))

In [10]:
! cp /n/data1/hms/dbmi/farhat/anna/databases/list_lineage_specific_snps_99_1.txt input/
! cp /n/data1/hms/dbmi/farhat/anna/databases/Coll_2014_lineage_barcode.csv input/

In [11]:
# add lineage annotaiton to the results

# Lineage positions from Freschi et al 2021
lineage_positions = pd.read_csv(
    "input/list_lineage_specific_snps_99_1.txt", header=None)
lineage_positions["position"] = [int(x.split("_")[0]) for x in lineage_positions[0]]

# Lineage positions from Coll et al 2014
coll_positions = pd.read_csv(
    "input/Coll_2014_lineage_barcode.csv",
    index_col=0
)

# Find out which SNPs are in the lineage set
coll_pos = set(coll_positions.position)
lineage_pos = set(lineage_positions.position)
lineage_mutation_indices  = snps.query("pos in @coll_pos or pos in @lineage_pos").index

# Annotate in SNP dataframe
snps["is_lineage"] = False
snps.loc[lineage_mutation_indices, "is_lineage"] = True

snps.to_csv("03.annotation/snps_with_gene_annotation.csv")

In [16]:
# Add gene and lineage  annotation to the results
results = pd.read_csv("02.comutation_pvalues/results_file_allpairs.csv")
results = results.merge(snps[["pos", "gene_name", "gene_id", "is_lineage"]], left_on="position_i", right_on="pos")
results = results.merge(snps[["pos", "gene_name", "gene_id", "is_lineage"]], left_on="position_j", right_on="pos", suffixes=["_i", "_j"])
results.columns

results = results[[
   'position_i', 'gene_name_i','position_j', 'gene_name_j',
    'n_mutations_i',
   'n_mutations_j', 'n_branches_with_i', 'n_branches_with_j',
   'n_muts_i_on_branch_j', 'n_muts_j_on_branch_i', 'n_muts_same_branch',
   'n_branch_with_i_no_j', 'n_branch_with_j_no_i', 'n_branch_i_and_j',
   'pval_beta_i_on_j', 'BH_critical_value', 'pval_beta_i_on_j_BH_sig',
    'is_lineage_i','is_lineage_j',
    'gene_id_i', 
    'gene_id_j', 

]]
results.to_csv("03.annotation/results_allpairs_annotated.csv")

results.query("pval_beta_i_on_j_BH_sig").to_csv("03.annotation/results_significant_annotated.csv")

## Hits within a particular antibiotic category

In [18]:
!mkdir 03.annotation/AB_to_AB
# make sure that both results are in the antibiotic resistance category
r = results.query("gene_name_i in @AB_gene_names and gene_name_j in @AB_gene_names")

#filter lineage results
significant_results = r.query('pval_beta_i_on_j_BH_sig and not is_lineage_i and not is_lineage_j')
all_results = r.query('not is_lineage_i and not is_lineage_j')

drugs = []
n = []

for key,vals in ab_to_genes.items():

    significant_results.query("gene_name_i in @vals and gene_name_j in @vals").sort_values(
        "n_muts_i_on_branch_j", ascending=False
    ).to_csv(
     f"03.annotation/AB_to_AB/{key}_significant_pairs_0p001.csv", index=None
    )
    
    all_results.query("gene_name_i in @vals and gene_name_j in @vals").sort_values(
        "n_muts_i_on_branch_j", ascending=False
    ).to_csv(
     f"03.annotation/AB_to_AB/{key}_all_results.csv", index=None
    )
    
    sig_results =significant_results.query("gene_name_i in @vals and gene_name_j in @vals")
    drugs.append(key)
    n.append(len(sig_results))

### Make table of all single mutations associated with each antibiotic

In [19]:
snps = pd.read_csv("03.annotation/snps_with_gene_annotation.csv")
first_order = pd.read_csv("01.mutation_and_comutation_data/first_order_statistics.csv", index_col=0)
snps = snps.merge(first_order[["pos","n_mutations"]], on="pos", how="left")

for key,vals in ab_to_genes.items():
    #print(key, vals)
    snps_x = snps.query("gene_name in @vals")
    print(key, len(snps_x))
    snps_x.to_csv(f"03.annotation/AB_to_AB/{key}_single_mutations.csv")



INH 80
RIF 154
EMB 81
PZA 158
MFX 33
LEV 33
OFX 33
AMI 103
CAP 92
KAN 103
STM 210
ETH 68
PAS 26
LZD 7
CYS 16
CFZ 10
CIP 33
RFB 154


# From AB to other genes

In [20]:
subset = results.query("(gene_name_i in @AB_gene_names and not gene_name_j in @AB_gene_names) or (gene_name_i not in @AB_gene_names and gene_name_j in @AB_gene_names)")
len(subset)

143473

In [21]:
significant_results = subset.query('pval_beta_i_on_j_BH_sig and not is_lineage_i and not is_lineage_j')
!mkdir 03.annotation/AB_to_UK

for key,vals in ab_to_genes.items():
    print(key)
    #print(significant_results.query("gene_name_i in @vals and gene_name_j in @vals"))
    x = significant_results.query("gene_name_j in @vals").sort_values(
        "n_muts_i_on_branch_j", ascending=False
    )
    print(len(x))
    
    significant_results.query("gene_name_j in @vals").sort_values(
        "n_muts_i_on_branch_j", ascending=False
    ).to_csv(
     f"03.annotation/AB_to_UK/{key}_significant_pairs.csv", index=None
    )
significant_results.query("gene_name_j in @AB_gene_names").to_csv("03.annotation/AB_to_UK_all.csv")

INH
231
RIF
282
EMB
209
PZA
153
MFX
59
LEV
59
OFX
59
AMI
265
CAP
215
KAN
265
STM
432
ETH
135
PAS
90
LZD
2
CYS
152
CFZ
3
CIP
59
RFB
282


## Between antibiotics (due to colinearity)

In [12]:
!mkdir 03.annotation/between_AB

r = results.query("gene_name_i in @AB_gene_names and gene_name_j in @AB_gene_names")

significant_results = r.query('pval_beta_i_on_j_BH_sig and not is_lineage_i and not is_lineage_j')
all_results = r.query('not is_lineage_i and not is_lineage_j')

for key,vals in ab_to_genes.items():
    print(key, vals)

    significant_results.query("not gene_name_i in @vals and gene_name_j in @vals").sort_values(
        "n_muts_i_on_branch_j", ascending=False
    ).to_csv(
     f"03.annotation/between_AB/{key}_significant_pairs_0p001.csv", index=None
    )
    
    sig_results =significant_results.query("gene_name_i in @vals and not gene_name_j in @vals")
    print(len(sig_results))

mkdir: cannot create directory ‘03.annotation/between_AB’: File exists
INH ['ahpC', 'r_ahpC', 'fabG1', 'r_fabG1', 'inhA', 'r_inhA', 'katG', 'r_katG', 'mshA', 'r_mshA', 'furA', 'r_furA', 'ndh', 'r_ndh', 'Rv1258c', 'r_Rv1258c', 'Rv2752c', 'r_Rv2752c']
264
RIF ['rpoB', 'r_rpoB', 'rpoA', 'r_rpoA', 'rpoC', 'r_rpoC', 'Rv2752c', 'r_Rv2752c']
475
EMB ['embA', 'r_embA', 'embB', 'r_embB', 'embC', 'r_embC', 'embR', 'r_embR', 'ubiA', 'r_ubiA']
441
PZA ['pncA', 'r_pncA', 'clpC1', 'r_clpC1', 'panD', 'r_panD', 'Rv1258c', 'r_Rv1258c', 'PPE35', 'r_PPE35', 'Rv3236c', 'r_Rv3236c']
275
MFX ['gyrA', 'r_gyrA', 'gyrB', 'r_gyrB']
213
LEV ['gyrA', 'r_gyrA', 'gyrB', 'r_gyrB']
213
OFX ['gyrA', 'r_gyrA', 'gyrB', 'r_gyrB']
213
AMI ['rrs', 'r_rrs', 'eis', 'r_eis', 'whiB7', 'r_whiB7', 'whiB6', 'r_whiB6', 'ccsA', 'r_ccsA', 'fprA', 'r_fprA', 'aftB', 'r_aftB']
207
CAP ['rrs', 'r_rrs', 'tlyA', 'r_tlyA', 'whiB6', 'r_whiB6', 'fprA', 'r_fprA', 'ccsA', 'r_ccsA', 'aftB', 'r_aftB']
129
KAN ['rrs', 'r_rrs', 'eis', 'r_eis', 'wh

## All antibiotic-associated

In [13]:
r = results.query("gene_name_i in @AB_gene_names or gene_name_j in @AB_gene_names")

significant_results = r.query('pval_beta_i_on_j_BH_sig and not is_lineage_i and not is_lineage_j')
all_results = r.query('not is_lineage_i and not is_lineage_j')

significant_results.to_csv("03.annotation/all_AB_associated_significant.csv")
print(len(significant_results))

9522
