In [1]:
import os
import pandas as pd

datapath = "../data"

In [3]:
dfs = pd.read_excel(
    os.path.join(datapath, "raw", "Bosc_etal_21_pebble.xlsx"), sheet_name=None
)
for k, v in dfs.items():
    print(k, len(v))

essential_vi 433
essential_novi 27
essentialdomain_novi 3
essentialdomain_vi 26
growthdefect_vi 121
growthdefect_novi 14
nonessential_novi 2987
nonessential_vi 4
uncertain_vi 8
uncertain_novi 103
growthadvantage_vi 2
growthadvantage_novi 308


In [4]:
orfid_sets = {name: set(df["ORFID"].dropna()) for name, df in dfs.items()}

from collections import defaultdict

orfid_map = defaultdict(list)
for sheet, ids in orfid_sets.items():
    for orf in ids:
        orfid_map[orf].append(sheet)

duplicates = {orf: sheets for orf, sheets in orfid_map.items() if len(sheets) > 1}

print(f"Number of ORFIDs found in more than one sheet: {len(duplicates)}")

Number of ORFIDs found in more than one sheet: 0


In [27]:
import requests


def uniprot_accession(gene):
    url = "https://rest.uniprot.org/uniprotkb/search"
    query = f"gene_exact:{gene} AND organism_id:83332"
    params = {"query": query, "format": "json", "size": 1}
    r = requests.get(url, params=params)
    r.raise_for_status()
    results = r.json().get("results", [])
    if results:
        if len(results) > 1:
            print(gene)
        uniprot_id = results[0]["primaryAccession"]
        try:
            orf_id = results[0]["genes"][0]["orderedLocusNames"][0]["value"]
        except:
            uniprot_id = None  # to check manually
            orf_id = None
        return uniprot_id, orf_id
    return None, None

In [28]:
for k, v in dfs.items():
    if k == "nonessential_novi":
        uniprot_ids = []
        for i, gene in enumerate(dfs[k]["ORFID"].tolist()):
            uniprot_id, orf_id = uniprot_accession(gene)
            if orf_id == gene:
                uniprot_ids += [uniprot_id]
            else:
                uniprot_ids += [None]
                print(
                    f"[{k}] MISMATCH: query={gene}, orderedLocus={orf_id}, UniProt={uniprot_id}"
                )
        dfs[k]["uniprot_id"] = uniprot_ids
        dfs[k].to_csv(os.path.join(datapath, "processed", f"{k}.csv"), index=False)

[nonessential_novi] MISMATCH: query=Rv1181, orderedLocus=None, UniProt=None
[nonessential_novi] MISMATCH: query=Rv2922A, orderedLocus=Rv2922.1c, UniProt=P9WQC9
[nonessential_novi] MISMATCH: query=Rv1369c, orderedLocus=Rv0796, UniProt=P9WKH9
[nonessential_novi] MISMATCH: query=Rv1370c, orderedLocus=Rv0795, UniProt=P9WKH5
[nonessential_novi] MISMATCH: query=Rv1756c, orderedLocus=Rv0796, UniProt=P9WKH9
[nonessential_novi] MISMATCH: query=Rv1757c, orderedLocus=Rv0795, UniProt=P9WKH5
[nonessential_novi] MISMATCH: query=Rv1763, orderedLocus=Rv0795, UniProt=P9WKH5
[nonessential_novi] MISMATCH: query=Rv1764, orderedLocus=Rv0796, UniProt=P9WKH9
[nonessential_novi] MISMATCH: query=Rv2105, orderedLocus=Rv0795, UniProt=P9WKH5
[nonessential_novi] MISMATCH: query=Rv2106, orderedLocus=Rv0796, UniProt=P9WKH9
[nonessential_novi] MISMATCH: query=Rv2167c, orderedLocus=Rv0796, UniProt=P9WKH9
[nonessential_novi] MISMATCH: query=Rv2168c, orderedLocus=Rv0795, UniProt=P9WKH5
[nonessential_novi] MISMATCH: quer

In [34]:
nonessential_novi = {
    "Rv1181": "A0A089QRB9",
    "Rv2922A": "P9WQC9",
    "Rv1369c": "P9WKH9",
    "Rv1370c": "P9WKH5",
    "Rv1756c": "P9WKH9",
    "Rv1757c": "P9WKH5",
    "Rv1763": "P9WKH5",
    "Rv1764": "P9WKH9",
    "Rv2105": "P9WKH5",
    "Rv2106": "P9WKH9",
    "Rv2167c": "P9WKH9",
    "Rv2168c": "P9WKH5",
    "Rv2278": "P9WKH5",
    "Rv2279": "P9WKH9",
    "Rv2307D": "L7N683",
    "Rv2354": "P9WKH5",
    "Rv2355": "P9WKH9",
    "Rv2479c": "P9WKH9",
    "Rv2480c": "P9WKH5",
    "Rv2512c": "P60230",
    "Rv2561": "P9WL99",
    "Rv2562": "P9WL99",
    "Rv2648": "P9WKH5",
    "Rv2649": "P9WKH9",
    "Rv2814c": "P9WKH9",
    "Rv2815c": "P9WKH5",
    "Rv3023c": "P96354",
    "Rv3115": "P96354",
    "Rv3184": "P9WKH5",
    "Rv3185": "P9WKH9",
    "Rv3186": "P9WKH5",
    "Rv3187": "P9WKH9",
    "Rv3325": "P9WKH5",
    "Rv3326": "P9WKH5",
    "Rv3380c": "P9WKH9",
    "Rv3381c": "P9WKH5",
    "Rv3467": "	Q50655",
    "Rv3474": "P9WKH5",
    "Rv3475": "P9WKH9",
    "Rv3798": "P9WKH7",
    "Rv3844": "P96234",
}

growthadv_novi = {"Rv0815c": "P9WHF9"}

uncertain_novi = {
    "Rv3021c": "P9WHY7",
}

In [35]:
# add missing items from manual curation

df = pd.read_csv(os.path.join(datapath, "processed", "growthadvantage_novi.csv"))
df["uniprot_id"] = df["uniprot_id"].fillna(df["ORFID"].map(growthadv_novi))
df.to_csv(os.path.join(datapath, "processed", "growthadvantage_novi.csv"), index=False)

df = pd.read_csv(os.path.join(datapath, "processed", "nonessential_novi.csv"))
df["uniprot_id"] = df["uniprot_id"].fillna(df["ORFID"].map(nonessential_novi))
df.to_csv(os.path.join(datapath, "processed", "nonessential_novi.csv"), index=False)

df = pd.read_csv(os.path.join(datapath, "processed", "uncertain_novi.csv"))
df["uniprot_id"] = df["uniprot_id"].fillna(df["ORFID"].map(uncertain_novi))
df.to_csv(os.path.join(datapath, "processed", "uncertain_novi.csv"), index=False)

In [59]:
datasets = [
    "essential_novi",
    "essential_vi",
    "essentialdomain_novi",
    "essentialdomain_vi",
    "growthadvantage_novi",
    "growthadvantage_vi",
    "growthdefect_novi",
    "growthdefect_vi",
    "nonessential_vi",
    "nonessential_novi",
    "uncertain_novi",
    "uncertain_vi",
]
for d in datasets:
    df = pd.read_csv(os.path.join(datapath, "processed", f"{d}.csv"))
    print(d, len(df[df["uniprot_id"].isna()]))

essential_novi 0
essential_vi 0
essentialdomain_novi 0
essentialdomain_vi 0
growthadvantage_novi 0
growthadvantage_vi 0
growthdefect_novi 0
growthdefect_vi 0
nonessential_vi 0
nonessential_novi 0
uncertain_novi 31
uncertain_vi 0


In [60]:
## Join into single dataset
dfs = []
datasets = [
    "essential_novi",
    "essential_vi",
    "essentialdomain_novi",
    "essentialdomain_vi",
    "growthadvantage_novi",
    "growthadvantage_vi",
    "growthdefect_novi",
    "growthdefect_vi",
    "nonessential_vi",
    "nonessential_novi",
    "uncertain_novi",
    "uncertain_vi",
]
for d in datasets:
    df = pd.read_csv(os.path.join(datapath, "processed", f"{d}.csv"))
    print(d, df.shape)
    if "Unnamed: 9" in list(df.columns):
        df = df.drop(columns="Unnamed: 9")
        df = df[~df["VI"].astype(str).str.contains("Not Present.")]
    print(df.shape)
    df = df[~df["uniprot_id"].isna()]
    print(df.shape)
    df = df[~df["VI"].isna()]
    print(df.shape)
    dfs += [df]

df = pd.concat(dfs)
df.to_csv(os.path.join(datapath, "processed", "bosc_uniprots.csv"), index=False)

essential_novi (27, 10)
(27, 10)
(27, 10)
(27, 10)
essential_vi (433, 10)
(433, 10)
(433, 10)
(433, 10)
essentialdomain_novi (3, 10)
(3, 10)
(3, 10)
(3, 10)
essentialdomain_vi (26, 10)
(26, 10)
(26, 10)
(26, 10)
growthadvantage_novi (308, 11)
(301, 10)
(301, 10)
(301, 10)
growthadvantage_vi (2, 10)
(2, 10)
(2, 10)
(2, 10)
growthdefect_novi (14, 10)
(14, 10)
(14, 10)
(14, 10)
growthdefect_vi (121, 10)
(121, 10)
(121, 10)
(121, 10)
nonessential_vi (4, 10)
(4, 10)
(4, 10)
(4, 10)
nonessential_novi (2987, 11)
(2930, 10)
(2930, 10)
(2930, 10)
uncertain_novi (103, 11)
(97, 10)
(69, 10)
(69, 10)
uncertain_vi (8, 10)
(8, 10)
(8, 10)
(8, 10)


## Adding chembl and selected genes

In [119]:
chembl = pd.read_csv(os.path.join(datapath, "raw", "assays_singleprotein_uniprot.csv"))
known = pd.read_csv(os.path.join(datapath, "raw", "mtb_targets.csv"))

in_chembl = set(chembl["Uniprot ID"].tolist())
in_known = set(known["uniprot_ac"].tolist())

print(len(in_chembl), len(in_known), len(in_chembl.intersection(in_known)))
print("In known but not in chembl: ", in_known - in_chembl)

interesting = set(list(in_chembl) + list(in_known))
print(len(interesting))

132 32 19
In known but not in chembl:  {'P9WKD3', 'P9WPS1', 'P9WFX9', 'P9WFR5', 'P9WFY1', 'P9WJV5', 'P9WH43', 'I6XI14', 'P9WIL3', 'I6Y4C7', 'P9WNL7', 'P9WG45', 'P9WJY5'}
145


In [120]:
df = pd.read_csv(os.path.join(datapath, "processed", "bosc_uniprots.csv"))
df["in_chembl"] = [1 if x in in_chembl else 0 for x in df["uniprot_id"].tolist()]
df["in_known"] = [1 if x in in_known else 0 for x in df["uniprot_id"].tolist()]
df

Unnamed: 0,ORFID,name,patric,essentiality,strain,VI,VI_lower,VI_higher,high_confidence,uniprot_id,in_chembl,in_known
0,Rv0416,thiS,thiamine biosynthesis protein ThiS,Essential,H37Rv,-7.926,-8.953,-6.891,False,P96262,0,0
1,Rv0414c,thiE,thiamine-phosphate pyrophosphorylase ThiE,Essential,H37Rv,-0.281,-0.480,-0.090,False,P9WG75,0,0
2,Rv0054,ssb,single-strand DNA-binding protein Ssb,Essential,H37Rv,-10.732,-14.644,-6.738,False,P9WGD5,0,0
3,Rv0705,rpsS,30S ribosomal protein S19 RpsS,Essential,H37Rv,-17.252,-20.415,-13.994,False,P9WH45,0,0
4,Rv0710,rpsQ,30S ribosomal protein S17 RpsQ,Essential,H37Rv,-17.589,-18.498,-16.616,False,P9WH51,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3933,Rv3459c,rpsK,30S ribosomal protein S11 RpsK,Uncertain,H37Rv,-11.728,-14.195,-9.328,True,P9WH65,0,0
3934,Rv3924c,rpmH,50S ribosomal protein L34 RpmH,Uncertain,H37Rv,-11.992,-13.633,-10.352,True,P9WH93,0,0
3935,Rv0979A,rpmF,50S ribosomal protein L32 RpmF,Uncertain,H37Rv,-6.578,-7.897,-5.288,True,P9WH99,0,0
3936,Rv0723,rplO,50S ribosomal protein L15 RplO,Uncertain,H37Rv,-15.142,-17.770,-12.528,True,P9WHD7,0,0


In [121]:
uniprots = df["uniprot_id"].tolist()
missing = in_known - set(uniprots)
print(len(missing), missing)
uniprots = df["uniprot_id"].tolist()
missing = in_chembl - set(uniprots)
print(len(missing), missing)

0 set()
15 {'Q50575', 'A0A0H3M0T0', 'A5U6Z7', 'P9WKK6', 'A0A045ISB3', 'P96884', 'A5U4N0', 'P95276', 'P9WPS0', 'P9WH10', 'P96830', 'P9WID4', 'A0A0T9WNE5', 'Q93SP7', 'O69638'}


In [122]:
missing_info = {
    "Q50575": [
        None,
        "pncA",
        "Nicotinamide deamidase",
        None,
        None,
        None,
        None,
        None,
        None,
        "Q50575",
        1,
        0,
    ],
    "A0A0H3M0T0": [
        None,
        "ptbB",
        "Phosphotyrosine protein phosphatase ptpB",
        None,
        None,
        None,
        None,
        None,
        None,
        "A0A0H3M0T0",
        1,
        0,
    ],
    "A5U6Z7": [
        "MRA_3015",
        "hupB",
        "DNA-binding protein HupB",
        None,
        "Hr7Ra",
        None,
        None,
        None,
        None,
        "A5U6Z7",
        1,
        0,
    ],
    "P9WKK6": [
        "MT0483",
        "icl1",
        "Isocitrate lyase 1",
        None,
        "CDC1551",
        None,
        None,
        None,
        None,
        "P9WKK6",
        1,
        0,
    ],
    "A0A045ISB3": [
        None,
        "iphP",
        "Phosphotyrosine protein phosphatase",
        None,
        None,
        None,
        None,
        None,
        None,
        "A0A045ISB3",
        1,
        0,
    ],
    "P96884": [
        None,
        "birA",
        "biotin--[biotin carboxyl-carrier protein] ligase",
        None,
        None,
        None,
        None,
        None,
        None,
        "P96884",
        1,
        0,
    ],
    "A5U4N0": [
        "MRA_2218",
        "adoK",
        "Adenosine kinase",
        None,
        "Hr7Ra",
        None,
        None,
        None,
        None,
        "A5U4N0",
        1,
        0,
    ],
    "P95276": [
        "MT1988",
        None,
        "Epoxide hydrolase B",
        None,
        "CDC1551",
        None,
        None,
        None,
        None,
        "P95276",
        1,
        0,
    ],
    "P9WPS0": [
        "MT1345",
        "atpE",
        "ATP synthase subunit c",
        None,
        "CDC1551",
        None,
        None,
        None,
        None,
        "P9WPS0",
        1,
        0,
    ],
    "P9WH10": [
        "MT3571",
        "rmlC",
        "dTDP-4-dehydrorhamnose 3,5-epimerase",
        None,
        "CDC1551",
        None,
        None,
        None,
        None,
        "P9WH10",
        1,
        0,
    ],
    "P96830": [
        "MT0162",
        None,
        "Tyrosine specific protein phosphatases domain-containing protein",
        None,
        "CDC1551",
        None,
        None,
        None,
        None,
        "P96830",
        1,
        0,
    ],
    "P9WID4": [
        "MT2258",
        "adoK",
        "Adenosine kinase",
        None,
        "CDC1551",
        None,
        None,
        None,
        None,
        "P9WID4",
        1,
        0,
    ],
    "A0A0T9WNE5": [
        None,
        "cyp121",
        "Cytochrome P450 MT2",
        None,
        None,
        None,
        None,
        None,
        None,
        "A0A0T9WNE5",
        1,
        0,
    ],
    "Q93SP7": [
        None,
        "pncA",
        "Nicotinamide deamidase",
        None,
        None,
        None,
        None,
        None,
        None,
        "Q93SP7",
        1,
        0,
    ],
    "O69638": [
        "MT3771",
        None,
        "Hydrolase, alpha/beta hydrolase fold family",
        None,
        None,
        None,
        None,
        None,
        None,
        "O69638",
        1,
        0,
    ],
}

In [123]:
cols = df.columns.tolist()
missing_df = pd.DataFrame.from_dict(
    missing_info,
    orient="index",
)
missing_df = missing_df.reset_index(drop=True)
missing_df.columns = cols
df = pd.concat([df, missing_df])
df

  df = pd.concat([df, missing_df])


Unnamed: 0,ORFID,name,patric,essentiality,strain,VI,VI_lower,VI_higher,high_confidence,uniprot_id,in_chembl,in_known
0,Rv0416,thiS,thiamine biosynthesis protein ThiS,Essential,H37Rv,-7.926,-8.953,-6.891,False,P96262,0,0
1,Rv0414c,thiE,thiamine-phosphate pyrophosphorylase ThiE,Essential,H37Rv,-0.281,-0.480,-0.090,False,P9WG75,0,0
2,Rv0054,ssb,single-strand DNA-binding protein Ssb,Essential,H37Rv,-10.732,-14.644,-6.738,False,P9WGD5,0,0
3,Rv0705,rpsS,30S ribosomal protein S19 RpsS,Essential,H37Rv,-17.252,-20.415,-13.994,False,P9WH45,0,0
4,Rv0710,rpsQ,30S ribosomal protein S17 RpsQ,Essential,H37Rv,-17.589,-18.498,-16.616,False,P9WH51,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
10,MT0162,,Tyrosine specific protein phosphatases domain-...,,CDC1551,,,,,P96830,1,0
11,MT2258,adoK,Adenosine kinase,,CDC1551,,,,,P9WID4,1,0
12,,cyp121,Cytochrome P450 MT2,,,,,,,A0A0T9WNE5,1,0
13,,pncA,Nicotinamide deamidase,,,,,,,Q93SP7,1,0


In [124]:
df.to_csv(os.path.join(datapath, "processed", "master_uniprots.csv"), index=False)

In [125]:
df = pd.read_csv(os.path.join(datapath, "processed", "master_uniprots.csv"))
df_ = df[df["VI"].isna()]

In [126]:
names = df_["name"].tolist()

In [127]:
for n in names:
    print(df[df["name"] == n])

        ORFID  name                              patric  essentiality strain  \
1275  Rv2043c  pncA  pyrazinamidase/nicotinamidase PncA  NonEssential  H37Rv   
3938      NaN  pncA              Nicotinamide deamidase           NaN    NaN   
3951      NaN  pncA              Nicotinamide deamidase           NaN    NaN   

         VI  VI_lower  VI_higher high_confidence uniprot_id  in_chembl  \
1275  1.113    -0.942      4.621           False     I6XD65          0   
3938    NaN       NaN        NaN             NaN     Q50575          1   
3951    NaN       NaN        NaN             NaN     Q93SP7          1   

      in_known  
1275         0  
3938         0  
3951         0  
        ORFID  name                                    patric  essentiality  \
1188  Rv0153c  ptbB  phosphotyrosine protein phosphatase PtpB  NonEssential   
3939      NaN  ptbB  Phosphotyrosine protein phosphatase ptpB           NaN   

     strain     VI  VI_lower  VI_higher high_confidence  uniprot_id  \
1188 