# 01

# Compute pairwise associations between pan-genome allele frequencies and AMR phenotypes
1. Mutual Information
2. Chi-squared
3. ANOVA

### Load _M. tuberculosis_ pan-genome dataset

In [1]:
import pandas as pd
from tqdm import tqdm
from svm_epistasis_funcs import *

# new packages to import for this script
import statsmodels
import statsmodels.api as sm
import heapq, random
import entropy_estimators as ee

pd.options.mode.chained_assignment = None  # default='warn'

%matplotlib inline

  from pandas.core import datetools


In [2]:
# --- Locations of the pan-genome inputs and of the generated supplements ----
# Directory prefixes keep their trailing slash: file names are appended to
# them by plain string concatenation throughout the notebook.
SUPP_DIR = "../supplement/"
FINAL_SUPP_DIR = SUPP_DIR + "supplement_final/"
DATA_DIR = "../data/"

# --- Key data matrices (file names, resolved against DATA_DIR) --------------
Y_PHENOTYPES_FILE = "resistance_data.csv"          # the species-phenotype matrix
X_PANGEN_ALLELES_FILE = "pangen_allele_df.csv"     # the species-allele matrix
X_PANGEN_CLUSTERS_FILE = "pangen_cluster_df.csv"   # the species-cluster matrix (each cluster represents a "gene")
STRAIN_INFO_FILE = "PATRIC_genome_info.csv"        # per-strain information table
CLUSTER_INFO_FILE = "cluster_info.csv"             # per-cluster information table

In [3]:
# Every table is a CSV under DATA_DIR; its first column is used as the row index.

# Strain information
strain_info_df = pd.read_csv(DATA_DIR + STRAIN_INFO_FILE, index_col=0)

# Cluster information (indexed by cluster label, e.g. "Cluster 0")
clust_info_df = pd.read_csv(DATA_DIR + CLUSTER_INFO_FILE, index_col=0)

# Pan-genome, in allele (variant) form and in cluster form
pan_variant_df = pd.read_csv(DATA_DIR + X_PANGEN_ALLELES_FILE, index_col=0)
pan_cluster_df = pd.read_csv(DATA_DIR + X_PANGEN_CLUSTERS_FILE, index_col=0)

# Resistance phenotypes; the column labels, as plain strings, form the drug list
resistance_data = pd.read_csv(DATA_DIR + Y_PHENOTYPES_FILE, index_col=0)
drug_list = list(map(str, resistance_data.columns))

### Filter out Mobile elements, PE family proteins, and hypotheticals.
This should reduce the number of variants (columns) from 15367 to 9975.

In [4]:
# Remove alleles whose parent cluster is uninformative for the association
# analysis: clusters with neither an Rv locus nor a gene name, mobile elements,
# transposases, PE/PPE family proteins and hypothetical proteins.
# `fillna` already returns a new frame, so no extra copy is needed.
pan_variant_df = pan_variant_df.fillna(0)

# A cluster is excluded when its product annotation CONTAINS any of these
# terms ("hypothetical protein" also covers the exact-match case).
EXCLUDED_PRODUCT_TERMS = (
    "PE-PGRS",
    "PPE family",
    "PE family protein",
    "hypothetical protein",
    "Hypothetical protein",
    "Transposase",
)

remove_cols = []
for col in tqdm(pan_variant_df.columns):
    # Allele columns are named "Cluster<N>_<allele>"; the annotation table is
    # indexed by "Cluster <N>" (with a space).
    col_name = col.split("_")[0].replace("ter", "ter ")
    # One row lookup instead of three separate .loc calls.
    clust_to_RV, gene_name, product = clust_info_df.loc[
        col_name, ["clust_to_rv", "gene_name", "product"]]

    # The annotation table stores the string "0" for a missing value.
    is_unannotated = clust_to_RV == "0" and gene_name == "0"
    is_excluded_product = (
        product == "Mobile element protein"
        or any(term in product for term in EXCLUDED_PRODUCT_TERMS)
    )
    # Each column is recorded at most once, so no de-duplication is required.
    if is_unannotated or is_excluded_product:
        remove_cols.append(col)

n_alleles_before = len(pan_variant_df.columns)
# Re-bind instead of dropping in place so the step is explicit in the lineage.
pan_variant_df = pan_variant_df.drop(remove_cols, axis=1)

# Single-argument print(...) produces the same text under Python 2 and 3.
print("Number of alleles (variants) before filtering... %d" % n_alleles_before)
print("Number of alleles after filtering... %d" % len(pan_variant_df.columns))
print("Number of alleles removed... %d" % len(remove_cols))

# Independent copy used as the feature matrix by the association cells below.
data_to_plot = pan_variant_df.copy()

100%|██████████| 15367/15367 [00:01<00:00, 13407.07it/s]


Number of alleles (variants) before filtering... 15367
Number of alleles after filtering... 9975
Number of alleles removed... 5392


## Perform classical GWAS with three different association metrics.
Compute pairwise associations between allele frequencies and AMR phenotypes

Leaving out rifabutin and nicotinamide.

### Mutual Information

In [5]:
def get_mi_var_drug(gene_train, drug_target):
    """Score every column of ``gene_train`` against ``drug_target`` with ``ee.midd``.

    Missing values in a column are replaced by 0 before scoring.
    ``ee.midd`` is presumably the discrete mutual-information estimator of
    the ``entropy_estimators`` package -- confirm against that package.

    Returns a NumPy array holding one score per column, in column order.
    """
    scores = []
    for allele in gene_train.columns:
        allele_values = gene_train[allele].fillna(0).values
        scores.append(ee.midd(allele_values, drug_target.values))
    return np.array(scores)

def add_info_sum_row(clust_MI_dic, all_cluster_df):
    """Attach cluster annotations to per-cluster MI sums, highest sum first.

    Parameters:
        clust_MI_dic: dict mapping a cluster label to its summed score.
        all_cluster_df: DataFrame indexed by cluster label; every key of
            ``clust_MI_dic`` must be present in its index.

    Returns:
        DataFrame indexed by cluster label whose first column is "MI sum",
        followed by all columns of ``all_cluster_df``, sorted by "MI sum"
        in descending order.
    """
    clust_count_df = pd.DataFrame.from_dict(clust_MI_dic, orient="index")
    clust_count_df.columns = ["MI sum"]
    # `.ix` is deprecated and removed in pandas 1.0; the labels are index
    # values, so label-based `.loc` is the direct replacement. Selecting with
    # the score frame's own index keeps both frames in the same row order.
    cluster_annotations = all_cluster_df.loc[clust_count_df.index, :]
    final_output_df = pd.concat([clust_count_df, cluster_annotations], axis=1)
    return final_output_df.sort_values(["MI sum"], axis=0, ascending=False,
                                       na_position='last')

In [7]:
# Preview the first rows of the cluster information table (indexed by cluster label).
clust_info_df.head()

Unnamed: 0,clust_to_rv,gene_name,ortho,cog,product,refseq,count,score,name_to_rv,pan
Cluster 0,Rv2048c,pks12,653045.Strvi_4160,Q,Polyketide synthase,AN47_01827,1590,7958.6,0,Core
Cluster 1,Rv3344c,PE_PGRS49,0,0,PE-PGRS family protein,X171_03503,794,0.0,0,Acces
Cluster 10,0,MRA_3390,419947.MRA_3390,N,PPE family protein,T604_03914,45,282.5,0,Acces
Cluster 100,Rv3512,PE_PGRS56,0,0,PE-PGRS family protein,MT7199_3572,692,0.0,0,Acces
Cluster 1000,0,0,0,0,PE-PGRS family protein,AN96_03817,4,0.0,0,Uniq


In [11]:
# Mutual-information association between every allele and each drug phenotype.
# Per drug: score all alleles, keep the best-scoring ones, sum their scores per
# parent cluster and write the highest-scoring clusters to one sheet.
MI_TOP_ALLELES = 50   # alleles (ranked by MI) that contribute to the cluster sums
MI_TOP_CLUSTERS = 40  # clusters written to each sheet

# Used as a context manager so the workbook is saved and closed even when a
# drug fails midway (and because `writer.save()` no longer exists in pandas 2).
with pd.ExcelWriter(SUPP_DIR+'DataFile01_a.xlsx') as writer:
    for drug_name in tqdm(["isoniazid", "ethambutol", "rifampicin", "pyrazinamide", "ethionamide",
                           "ofloxacin", "4-aminosalicylic_acid", "streptomycin", "amikacin", "cycloserine",
                           "kanamycin", "capreomycin"]):

        # The MDR/XDR branch is not reached with the list above; it is kept so
        # those labels can be added to the list. Note the swapped return order.
        if drug_name == "MDR" or drug_name == "XDR":
            X1, y1 = get_MDR_training(resistance_data, data_to_plot, drug_name)
        else:
            y1, X1 = get_target_data(data_to_plot, resistance_data, drug_name)

        mi_drug = get_mi_var_drug(X1, y1)
        print(drug_name)

        # The threshold is the score of the MI_TOP_ALLELES-th best allele;
        # every allele tied at the threshold is kept, so more than
        # MI_TOP_ALLELES alleles may be selected.
        mi_threshold = min(heapq.nlargest(MI_TOP_ALLELES, mi_drug))
        where_max = np.where(mi_drug >= mi_threshold)[0]

        # Sum the scores of the selected alleles per parent cluster.
        cluster_info_dic = {}
        for x in where_max:
            # "Cluster<N>_<allele>" -> "Cluster <N>", the label used by clust_info_df.
            clust_name = str(X1.columns[x]).split("_")[0].replace("ter", "ter ")
            cluster_info_dic[clust_name] = cluster_info_dic.get(clust_name, 0) + mi_drug[x]

        final_df = add_info_sum_row(cluster_info_dic, clust_info_df)
        final_df[:MI_TOP_CLUSTERS].to_excel(writer, sheet_name=drug_name+"_MI", index=True)

  0%|          | 0/12 [00:00<?, ?it/s]

S: 506 | R: 1057


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  # Remove the CWD from sys.path while we load stuff.
  8%|▊         | 1/12 [00:28<05:10, 28.23s/it]

isoniazid
S: 848 | R: 492


 17%|█▋        | 2/12 [00:53<04:26, 26.62s/it]

ethambutol
S: 578 | R: 983


 25%|██▌       | 3/12 [01:21<04:04, 27.18s/it]

rifampicin
S: 92 | R: 137


 33%|███▎      | 4/12 [01:29<02:59, 22.45s/it]

pyrazinamide
S: 353 | R: 209


 42%|████▏     | 5/12 [01:43<02:24, 20.64s/it]

ethionamide
S: 554 | R: 302


 50%|█████     | 6/12 [02:01<02:01, 20.17s/it]

ofloxacin
S: 295 | R: 80


 58%|█████▊    | 7/12 [02:11<01:33, 18.78s/it]

4-aminosalicylic_acid
S: 732 | R: 663


 67%|██████▋   | 8/12 [02:37<01:18, 19.64s/it]

streptomycin
S: 257 | R: 142


 75%|███████▌  | 9/12 [02:47<00:55, 18.66s/it]

amikacin
S: 262 | R: 71


 83%|████████▎ | 10/12 [02:57<00:35, 17.77s/it]

cycloserine
S: 550 | R: 278


 92%|█████████▏| 11/12 [03:14<00:17, 17.72s/it]

kanamycin
S: 237 | R: 141


100%|██████████| 12/12 [03:25<00:00, 17.13s/it]

capreomycin





### Chi-squared

In [12]:
# Chi-squared association between every allele and each drug phenotype.
# Per drug: discard low-variance alleles, score the rest with `chi2`,
# Bonferroni-correct the p-values and write the best-scoring significant
# alleles, annotated with their cluster information, to one sheet.
# `VarianceThreshold` and `chi2` are provided by the star import at the top of
# the notebook (presumably scikit-learn's -- confirm in svm_epistasis_funcs).
CHI2_MAX_ROWS = 40  # significant alleles written per drug

new_column_order = ["chi2", "pVal", "pVal_corrected", "clust_to_rv", "gene_name", "product", "cog", "count",
                    "ortho", "pan", "refseq", "score"]

# Used as a context manager so the workbook is saved and closed even when a
# drug fails midway (and because `writer.save()` no longer exists in pandas 2).
with pd.ExcelWriter(SUPP_DIR+'DataFile01_b.xlsx') as writer:
    for drug_name in tqdm(["isoniazid", "ethambutol", "rifampicin", "pyrazinamide", "ethionamide",
                           "ofloxacin", "4-aminosalicylic_acid", "streptomycin", "amikacin", "cycloserine",
                           "kanamycin", "capreomycin"]):

        # The MDR/XDR branch is not reached with the list above; it is kept so
        # those labels can be added to the list. Note the swapped return order.
        if drug_name == "MDR" or drug_name == "XDR":
            X1, y1 = get_MDR_training(resistance_data, data_to_plot, drug_name)
        else:
            y1, X1 = get_target_data(data_to_plot, resistance_data, drug_name)

        print("Before removing low variance:  %s" % (X1.shape,))
        selector = VarianceThreshold(threshold=.01)
        selector.fit(X1)  # only the support mask is needed, not the transformed array
        X1 = X1[X1.columns[selector.get_support()]].copy()
        print("After removing low variance:  %s" % (X1.shape,))

        # Row 0 holds the statistic and row 1 the p-value, one column per allele.
        # Order the alleles by statistic (then p-value), largest first, and
        # transpose so that each allele becomes a row.
        chi2_test = pd.DataFrame(list(chi2(X1, y1)))
        chi2_test.columns = X1.columns
        chi2_test = chi2_test.sort_values([0, 1], axis=1, ascending=False).T
        chi2_test.columns = ["chi2", "pVal"]

        # Bonferroni multiple-testing correction, through the public
        # statsmodels entry point rather than the `sandbox` module path.
        rejected_list, pvalue_corrected_list = sm.stats.multipletests(
            chi2_test["pVal"], alpha=0.005, method='bonferroni', is_sorted=False)[:2]

        chi2_test_corrected = chi2_test[rejected_list].copy()
        chi2_test_corrected["corrected_pVal"] = pvalue_corrected_list[rejected_list]
        print(chi2_test_corrected.shape)

        # Collect one record per allele and build the frame once: growing a
        # DataFrame with `.append` in a loop is quadratic and the method was
        # removed in pandas 2.0.
        records = []
        cluster_labels = []
        for allele, row in chi2_test_corrected.iloc[:CHI2_MAX_ROWS, :].iterrows():
            # "Cluster<N>_<allele>" -> "Cluster <N>", the label used by clust_info_df.
            cluster = allele.split("_")[0].replace("ter", "ter ")
            record = clust_info_df.loc[cluster, :].to_dict()
            record["chi2"] = row["chi2"]
            record["pVal"] = row["pVal"]
            record["pVal_corrected"] = row["corrected_pVal"]
            records.append(record)
            cluster_labels.append(cluster)

        # `columns=` selects and orders the output columns (dropping
        # "name_to_rv") and still yields a header-only sheet when no allele
        # passes the correction, where the original reorder raised a KeyError.
        chi2_test_top50_df = pd.DataFrame(records, index=cluster_labels, columns=new_column_order)

        chi2_test_top50_df.to_excel(writer, sheet_name=drug_name+"_chi2", index=True)

  0%|          | 0/12 [00:00<?, ?it/s]

S: 506 | R: 1057
Before removing low variance:  (1563, 9973)


  if np.issubdtype(mask.dtype, np.int):


After removing low variance:  (1563, 8652)
(1047, 3)


  8%|▊         | 1/12 [00:03<00:42,  3.82s/it]

S: 848 | R: 492
Before removing low variance:  (1340, 9973)
After removing low variance:  (1340, 8521)
(948, 3)


 17%|█▋        | 2/12 [00:07<00:35,  3.55s/it]

S: 578 | R: 983
Before removing low variance:  (1561, 9973)
After removing low variance:  (1561, 8651)
(1333, 3)


 25%|██▌       | 3/12 [00:10<00:31,  3.55s/it]

S: 92 | R: 137
Before removing low variance:  (229, 9973)
After removing low variance:  (229, 6773)


 33%|███▎      | 4/12 [00:12<00:25,  3.15s/it]

(9, 3)
S: 353 | R: 209
Before removing low variance:  (562, 9973)
After removing low variance:  (562, 7608)
(130, 3)


 42%|████▏     | 5/12 [00:15<00:21,  3.04s/it]

S: 554 | R: 302
Before removing low variance:  (856, 9973)
After removing low variance:  (856, 7872)
(138, 3)


 50%|█████     | 6/12 [00:18<00:18,  3.00s/it]

S: 295 | R: 80
Before removing low variance:  (375, 9973)
After removing low variance:  (375, 7302)
(135, 3)


 58%|█████▊    | 7/12 [00:20<00:14,  2.93s/it]

S: 732 | R: 663
Before removing low variance:  (1395, 9973)
After removing low variance:  (1395, 8405)
(899, 3)


 67%|██████▋   | 8/12 [00:23<00:11,  2.95s/it]

S: 257 | R: 142
Before removing low variance:  (399, 9973)
After removing low variance:  (399, 6865)
(46, 3)


 75%|███████▌  | 9/12 [00:26<00:08,  2.89s/it]

S: 262 | R: 71
Before removing low variance:  (333, 9973)
After removing low variance:  (333, 6803)


 83%|████████▎ | 10/12 [00:27<00:05,  2.79s/it]

(5, 3)
S: 550 | R: 278
Before removing low variance:  (828, 9973)
After removing low variance:  (828, 7840)
(176, 3)


 92%|█████████▏| 11/12 [00:30<00:02,  2.78s/it]

S: 237 | R: 141
Before removing low variance:  (378, 9973)
After removing low variance:  (378, 7027)
(69, 3)


100%|██████████| 12/12 [00:32<00:00,  2.75s/it]


### ANOVA

In [13]:
# ANOVA F-test association between every allele and each drug phenotype.
# Per drug: discard low-variance alleles, score the rest with `f_classif`,
# Bonferroni-correct the p-values and write the best-scoring significant
# alleles, annotated with their cluster information, to one sheet.
# `VarianceThreshold` and `f_classif` are provided by the star import at the top
# of the notebook (presumably scikit-learn's -- confirm in svm_epistasis_funcs).
ANOVA_MAX_ROWS = 40  # significant alleles written per drug

new_column_order = ["F_value", "pVal", "pVal_corrected", "clust_to_rv", "gene_name", "product", "cog", "count",
                    "ortho", "pan", "refseq", "score"]

# Used as a context manager so the workbook is saved and closed even when a
# drug fails midway (and because `writer.save()` no longer exists in pandas 2).
with pd.ExcelWriter(SUPP_DIR+'DataFile01_c.xlsx') as writer:
    for drug_name in tqdm(["isoniazid", "ethambutol", "rifampicin", "pyrazinamide", "ethionamide",
                           "ofloxacin", "4-aminosalicylic_acid", "streptomycin", "amikacin", "cycloserine",
                           "kanamycin", "capreomycin"]):

        # The MDR/XDR branch is not reached with the list above; it is kept so
        # those labels can be added to the list. Note the swapped return order.
        if drug_name == "MDR" or drug_name == "XDR":
            X1, y1 = get_MDR_training(resistance_data, data_to_plot, drug_name)
        else:
            y1, X1 = get_target_data(data_to_plot, resistance_data, drug_name)

        print("Before removing low variance:  %s" % (X1.shape,))
        selector = VarianceThreshold(threshold=.01)
        selector.fit(X1)  # only the support mask is needed, not the transformed array
        X1 = X1[X1.columns[selector.get_support()]].copy()
        print("After removing low variance:  %s" % (X1.shape,))

        # Row 0 holds the F statistic and row 1 the p-value, one column per
        # allele. Order the alleles by statistic (then p-value), largest first,
        # and transpose so that each allele becomes a row.
        ANOVA_test = pd.DataFrame(list(f_classif(X1, y1)))
        ANOVA_test.columns = X1.columns
        ANOVA_test = ANOVA_test.sort_values([0, 1], axis=1, ascending=False).T
        ANOVA_test.columns = ["F_value", "pVal"]

        # Bonferroni multiple-testing correction, through the public
        # statsmodels entry point rather than the `sandbox` module path.
        rejected_list, pvalue_corrected_list = sm.stats.multipletests(
            ANOVA_test["pVal"], alpha=0.005, method='bonferroni', is_sorted=False)[:2]

        ANOVA_test_corrected = ANOVA_test[rejected_list].copy()
        ANOVA_test_corrected["corrected_pVal"] = pvalue_corrected_list[rejected_list]
        print(ANOVA_test_corrected.shape)

        # Collect one record per allele and build the frame once: growing a
        # DataFrame with `.append` in a loop is quadratic and the method was
        # removed in pandas 2.0.
        records = []
        cluster_labels = []
        for allele, row in ANOVA_test_corrected.iloc[:ANOVA_MAX_ROWS, :].iterrows():
            # "Cluster<N>_<allele>" -> "Cluster <N>", the label used by clust_info_df.
            cluster = allele.split("_")[0].replace("ter", "ter ")
            record = clust_info_df.loc[cluster, :].to_dict()
            record["F_value"] = row["F_value"]
            record["pVal"] = row["pVal"]
            record["pVal_corrected"] = row["corrected_pVal"]
            records.append(record)
            cluster_labels.append(cluster)

        # `columns=` selects and orders the output columns (dropping
        # "name_to_rv") and still yields a header-only sheet when no allele
        # passes the correction, where the original reorder raised a KeyError.
        ANOVA_test_top50_df = pd.DataFrame(records, index=cluster_labels, columns=new_column_order)

        ANOVA_test_top50_df.to_excel(writer, sheet_name=drug_name+"_ANOVA", index=True)

  0%|          | 0/12 [00:00<?, ?it/s]

S: 506 | R: 1057
Before removing low variance:  (1563, 9973)
After removing low variance:  (1563, 8652)
(1560, 3)


  8%|▊         | 1/12 [00:04<00:46,  4.25s/it]

S: 848 | R: 492
Before removing low variance:  (1340, 9973)
After removing low variance:  (1340, 8521)
(1550, 3)


 17%|█▋        | 2/12 [00:07<00:38,  3.89s/it]

S: 578 | R: 983
Before removing low variance:  (1561, 9973)
After removing low variance:  (1561, 8651)
(1827, 3)


 25%|██▌       | 3/12 [00:11<00:34,  3.79s/it]

S: 92 | R: 137
Before removing low variance:  (229, 9973)
After removing low variance:  (229, 6773)
(101, 3)


 33%|███▎      | 4/12 [00:13<00:27,  3.45s/it]

S: 353 | R: 209
Before removing low variance:  (562, 9973)
After removing low variance:  (562, 7608)
(235, 3)


 42%|████▏     | 5/12 [00:16<00:23,  3.30s/it]

S: 554 | R: 302
Before removing low variance:  (856, 9973)
After removing low variance:  (856, 7872)
(230, 3)


 50%|█████     | 6/12 [00:19<00:19,  3.27s/it]

S: 295 | R: 80
Before removing low variance:  (375, 9973)
After removing low variance:  (375, 7302)
(233, 3)


 58%|█████▊    | 7/12 [00:22<00:15,  3.15s/it]

S: 732 | R: 663
Before removing low variance:  (1395, 9973)
After removing low variance:  (1395, 8405)
(1466, 3)


 67%|██████▋   | 8/12 [00:25<00:12,  3.16s/it]

S: 257 | R: 142
Before removing low variance:  (399, 9973)
After removing low variance:  (399, 6865)
(255, 3)


 75%|███████▌  | 9/12 [00:27<00:09,  3.11s/it]

S: 262 | R: 71
Before removing low variance:  (333, 9973)
After removing low variance:  (333, 6803)
(20, 3)


 83%|████████▎ | 10/12 [00:30<00:06,  3.01s/it]

S: 550 | R: 278
Before removing low variance:  (828, 9973)
After removing low variance:  (828, 7840)
(427, 3)


 92%|█████████▏| 11/12 [00:33<00:03,  3.00s/it]

S: 237 | R: 141
Before removing low variance:  (378, 9973)
After removing low variance:  (378, 7027)
(193, 3)


100%|██████████| 12/12 [00:35<00:00,  2.97s/it]


## Write all three association types to a single file!

In [14]:
# Merge the three per-method workbooks into one workbook with one sheet per
# drug. On each sheet the MI, chi-squared and ANOVA tables sit side by side
# under a title row.

# Columns copied from each per-method sheet (loop-invariant, so defined once).
MI_cols = ["MI sum", "clust_to_rv", "gene_name", "product"]
chi2_cols = ["chi2", "pVal", "pVal_corrected", "clust_to_rv", "gene_name", "product"]
ANOVA_cols = ["F_value", "pVal", "pVal_corrected", "clust_to_rv", "gene_name", "product"]

# Title row: the three titles land in columns 1, 7 and 15, matching the
# `startcol` of the three tables written below.
sheet_header_str = ["", "Mutual Information"] + [""]*5 + ["Chi-squared"] + [""]*7 + ["ANOVA F-test"]
sheet_header_df = pd.DataFrame(sheet_header_str, columns=["Association"]).T

# Used as a context manager so the workbook is saved and closed even when a
# drug fails midway (and because `writer.save()` no longer exists in pandas 2).
with pd.ExcelWriter(FINAL_SUPP_DIR+'Supplementary_Data_File_1.xlsx') as writer:
    for drug_name in tqdm(["isoniazid",
                           "ethambutol", "rifampicin", "pyrazinamide", "ethionamide",
                           "ofloxacin", "4-aminosalicylic_acid", "streptomycin", "amikacin", "cycloserine",
                           "kanamycin", "capreomycin"]):

        MI_excel_df = pd.read_excel(SUPP_DIR+"DataFile01_a.xlsx", sheet_name=drug_name+"_MI")
        Chi2_excel_df = pd.read_excel(SUPP_DIR+"DataFile01_b.xlsx", sheet_name=drug_name+"_chi2")
        ANOVA_excel_df = pd.read_excel(SUPP_DIR+"DataFile01_c.xlsx", sheet_name=drug_name+"_ANOVA")

        sheet_header_df.to_excel(writer, sheet_name=drug_name, header=False, index=False, startrow=0)
        MI_excel_df[MI_cols].to_excel(writer, sheet_name=drug_name, index=True, startrow=1, startcol=1)
        Chi2_excel_df[chi2_cols].to_excel(writer, sheet_name=drug_name, index=True, startrow=1, startcol=7)
        ANOVA_excel_df[ANOVA_cols].to_excel(writer, sheet_name=drug_name, index=True, startrow=1, startcol=15)

100%|██████████| 12/12 [00:05<00:00,  2.05it/s]


## Check if primary genes are in the MI, Chi-squared, or ANOVA F-test associations

- NOTE - embC alleles were grouped in the embB cluster. Thus embB is sufficient in this case.

In [40]:
# Primary resistance loci per drug: drug name -> list of Rv locus identifiers.
# The gene name of each locus is given next to it, following the original
# annotations of this table.
primary_amr_genes = {
    "isoniazid": [
        "Rv1908c",  # katG
        "Rv1484",   # inhA
        "Rv1483",   # fabG1
    ],
    "rifampicin": [
        "Rv0667",   # rpoB
        "Rv0668",   # rpoC
        "Rv3238c",  # (no gene name given)
    ],
    # embC (Rv3793) is deliberately left out: its alleles were grouped in the
    # embB cluster, so embB covers it.
    "ethambutol": [
        "Rv3795",   # embB
        "Rv3806c",  # ubiA -- NOTE(review): name inferred from the order of the original comment
        "Rv1267c",  # embR -- NOTE(review): name inferred from the order of the original comment
    ],
    "pyrazinamide": [
        "Rv2043c",  # pncA
    ],
    "streptomycin": [
        "Rv0682",   # rpsL
        "Rv3919c",  # gidB
    ],
    # The original comment named "gyrA, gyrB", but only one locus is listed.
    "ofloxacin": [
        "Rv0006",   # gyrA -- NOTE(review): confirm whether gyrB should be added
    ],
    "4-aminosalicylic_acid": [
        "Rv2447c",  # folC
        "Rv2764c",  # thyA
    ],
    "ethionamide": [
        "Rv3854c",  # ethA
        "Rv1484",   # inhA
    ],
}

In [43]:
# For every drug with known primary resistance genes, report which of the three
# association tables (MI, chi-squared, ANOVA) contain each gene, and collect the
# (drug, gene) pairs that appear in none of them.

# (label used in the printed report, workbook file name, sheet-name suffix)
ASSOCIATION_SHEETS = [
    ("Mutual information", "DataFile01_a.xlsx", "_MI"),
    ("CHI-squared", "DataFile01_b.xlsx", "_chi2"),
    ("ANOVA", "DataFile01_c.xlsx", "_ANOVA"),
]

no_associations = []
for drug_name in tqdm(["isoniazid",
                       "ethambutol", "rifampicin", "pyrazinamide", "ethionamide",
                       "ofloxacin", "4-aminosalicylic_acid", "streptomycin", "amikacin", "cycloserine",
                       "kanamycin", "capreomycin"]):

    # Skip drugs without listed primary genes before touching the workbooks,
    # so their sheets are never read.
    if drug_name not in primary_amr_genes:
        continue

    # Rv identifiers present on this drug's sheet, one set per association test.
    rv_ids_by_test = []
    for test_label, file_name, sheet_suffix in ASSOCIATION_SHEETS:
        excel_df = pd.read_excel(SUPP_DIR+file_name, sheet_name=drug_name+sheet_suffix)
        rv_ids_by_test.append((test_label, set(excel_df["clust_to_rv"].astype(str))))

    print("%s ---" % drug_name)
    for key_gene in primary_amr_genes[drug_name]:
        print(key_gene)
        found = False
        for test_label, rv_ids in rv_ids_by_test:
            if key_gene in rv_ids:
                print("    in %s associations" % test_label)
                found = True

        if not found:
            no_associations.append((drug_name, key_gene))

  8%|▊         | 1/12 [00:00<00:04,  2.28it/s]

isoniazid ---
Rv1908c
    in Mutual information associations
    in CHI-squared associations
    in ANOVA associations
Rv1484
Rv1483


 17%|█▋        | 2/12 [00:00<00:04,  2.30it/s]

ethambutol ---
Rv3795
    in Mutual information associations
    in CHI-squared associations
    in ANOVA associations
Rv3806c
Rv1267c


 25%|██▌       | 3/12 [00:01<00:03,  2.30it/s]

rifampicin ---
Rv0667
    in Mutual information associations
    in CHI-squared associations
    in ANOVA associations
Rv0668
Rv3238c


 33%|███▎      | 4/12 [00:01<00:03,  2.32it/s]

pyrazinamide ---
Rv2043c
    in Mutual information associations
    in CHI-squared associations
    in ANOVA associations


 42%|████▏     | 5/12 [00:02<00:03,  2.31it/s]

ethionamide ---
Rv3854c
Rv1484


 50%|█████     | 6/12 [00:02<00:02,  2.32it/s]

ofloxacin ---
Rv0006
    in Mutual information associations
    in CHI-squared associations
    in ANOVA associations


 58%|█████▊    | 7/12 [00:03<00:02,  2.31it/s]

4-aminosalicylic_acid ---
Rv2447c
Rv2764c


 67%|██████▋   | 8/12 [00:03<00:01,  2.31it/s]

streptomycin ---
Rv0682
    in Mutual information associations
    in CHI-squared associations
    in ANOVA associations
Rv3919c


100%|██████████| 12/12 [00:05<00:00,  2.33it/s]


In [44]:
# (drug, gene) pairs whose primary gene appeared in none of the three association tables.
no_associations

[('isoniazid', 'Rv1484'),
 ('isoniazid', 'Rv1483'),
 ('ethambutol', 'Rv3806c'),
 ('ethambutol', 'Rv1267c'),
 ('rifampicin', 'Rv0668'),
 ('rifampicin', 'Rv3238c'),
 ('ethionamide', 'Rv3854c'),
 ('ethionamide', 'Rv1484'),
 ('4-aminosalicylic_acid', 'Rv2447c'),
 ('4-aminosalicylic_acid', 'Rv2764c'),
 ('streptomycin', 'Rv3919c')]