In [1]:
import pandas as pd

# Read the master UniProt table into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/processed/master_uniprots.csv")

In [2]:
# Collect (name, uniprot_id) pairs for every row whose strain is not "H37Rv".
non_h37rv_rows = df.loc[df["strain"] != "H37Rv", ["name", "uniprot_id"]]
orphan_names = list(zip(non_h37rv_rows["name"], non_h37rv_rows["uniprot_id"]))

In [3]:
# Show the (name, uniprot_id) pairs gathered for rows whose strain is not "H37Rv".
orphan_names

[('pncA', 'Q50575'),
 ('ptbB', 'A0A0H3M0T0'),
 ('hupB', 'A5U6Z7'),
 ('icl1', 'P9WKK6'),
 ('iphP', 'A0A045ISB3'),
 ('birA', 'P96884'),
 ('adoK', 'A5U4N0'),
 (nan, 'P95276'),
 ('atpE', 'P9WPS0'),
 ('rmlC', 'P9WH10'),
 (nan, 'P96830'),
 ('adoK', 'P9WID4'),
 ('cyp121', 'A0A0T9WNE5'),
 ('pncA', 'Q93SP7'),
 (nan, 'O69638')]

In [4]:
# UniProt accession replacements, keyed by the accession to be replaced.
synonyms = {"P95276": "I6YC03", "P96830": "I6WXK4", "O69638": "I6YCQ4"}

# uniprot_outs: non-H37Rv accessions that share a name with an orphan entry.
# chembl_uniprot: H37Rv accession -> in_chembl value pooled over all rows
# carrying the same name.
uniprot_outs = []
chembl_uniprot = {}
seen_names = set()
for name, _orphan_uniprot in orphan_names:
    # Rows without a name cannot be matched by name, so they are skipped here.
    # pd.isna is used instead of comparing str(name) to "nan".
    if pd.isna(name):
        continue
    # A name may appear several times in orphan_names; handling it once gives
    # the same mapping and avoids appending the same accessions repeatedly.
    if name in seen_names:
        continue
    seen_names.add(name)

    same_name = df[df["name"] == name]
    is_h37rv = same_name["strain"] == "H37Rv"

    # Clamp the summed in_chembl values of the group to at most 1.
    in_chembl = min(1, sum(same_name["in_chembl"].tolist()))

    uniprot_outs += same_name.loc[~is_h37rv, "uniprot_id"].tolist()
    for p in same_name.loc[is_h37rv, "uniprot_id"].tolist():
        chembl_uniprot[p] = in_chembl

In [5]:
# Keep only the rows whose UniProt id is not listed in uniprot_outs.
listed_in_outs = df["uniprot_id"].isin(uniprot_outs)
df = df[~listed_in_outs]

In [6]:
# Rebuild the in_chembl column: ids found in chembl_uniprot take the mapped
# value, every other row keeps the value it already has.
in_chembls = [
    chembl_uniprot.get(uniprot, current)
    for current, uniprot in zip(df["in_chembl"].tolist(), df["uniprot_id"].tolist())
]

df.loc[:, "in_chembl"] = in_chembls

In [7]:
# Swap each UniProt id that has an entry in synonyms for its replacement;
# ids without an entry are left unchanged.
uniprots = [synonyms.get(accession, accession) for accession in df["uniprot_id"].tolist()]

df.loc[:, "uniprot_id"] = uniprots

In [8]:
# in_chembl values forced for specific UniProt ids.
prot2chembl = {
    "I6YC03": 1,
    "I6YCQ4": 1,
}

# Drop rows by ORFID in two passes: the Rv-prefixed ids, then the MT-prefixed ids.
remove_orfids = ["Rv0795", "Rv2561"]
df = df.loc[~df["ORFID"].isin(remove_orfids)]

remove_orfids = ["MT0162", "MT3771", "MT1988"]
df = df.loc[~df["ORFID"].isin(remove_orfids)]

# UniProt ids that occur more than once after the removals.
# NOTE(review): no visible cell reads this list; presumably kept for inspection.
repeated_rows = df["uniprot_id"].duplicated()
duplicate_uniprots = df.loc[repeated_rows, "uniprot_id"].unique().tolist()

# Apply the overrides; rows whose id is not in prot2chembl keep their value.
chembls = [
    prot2chembl.get(accession, current)
    for accession, current in zip(df["uniprot_id"].tolist(), df["in_chembl"].tolist())
]

df.loc[:, "in_chembl"] = chembls

In [9]:
# Old column name -> new column name.
rename = dict(
    uniprot_id="uniprot_ac",
    ORFID="orf_id",
    patric="patric_name",
    VI="vi",
    VI_lower="vi_lower",
    VI_higher="vi_higher",
    in_known="is_known",
)

# Rename the columns, renumber the index from zero, and drop the "name" column.
df = (
    df.rename(columns=rename)
    .reset_index(drop=True)
    .drop(columns=["name"])
)

In [10]:
# Write the cleaned table to CSV without the index column.
df.to_csv(path_or_buf="../data/processed/01_master_uniprots.csv", index=False)

In [11]:
# Display the resulting DataFrame for a visual check.
df

Unnamed: 0,orf_id,patric_name,essentiality,strain,vi,vi_lower,vi_higher,high_confidence,uniprot_ac,in_chembl,is_known
0,Rv0416,thiamine biosynthesis protein ThiS,Essential,H37Rv,-7.926,-8.953,-6.891,False,P96262,0,0
1,Rv0414c,thiamine-phosphate pyrophosphorylase ThiE,Essential,H37Rv,-0.281,-0.480,-0.090,False,P9WG75,0,0
2,Rv0054,single-strand DNA-binding protein Ssb,Essential,H37Rv,-10.732,-14.644,-6.738,False,P9WGD5,0,0
3,Rv0705,30S ribosomal protein S19 RpsS,Essential,H37Rv,-17.252,-20.415,-13.994,False,P9WH45,0,0
4,Rv0710,30S ribosomal protein S17 RpsQ,Essential,H37Rv,-17.589,-18.498,-16.616,False,P9WH51,0,0
...,...,...,...,...,...,...,...,...,...,...,...
3931,Rv3459c,30S ribosomal protein S11 RpsK,Uncertain,H37Rv,-11.728,-14.195,-9.328,True,P9WH65,0,0
3932,Rv3924c,50S ribosomal protein L34 RpmH,Uncertain,H37Rv,-11.992,-13.633,-10.352,True,P9WH93,0,0
3933,Rv0979A,50S ribosomal protein L32 RpmF,Uncertain,H37Rv,-6.578,-7.897,-5.288,True,P9WH99,0,0
3934,Rv0723,50S ribosomal protein L15 RplO,Uncertain,H37Rv,-15.142,-17.770,-12.528,True,P9WHD7,0,0
