# Gene post-processing

Motivation: ensure genes are displayed correctly and can be searched properly. This principally concerns UniProt entries.

In [1]:
import pandas as pd
# Display the pandas version this kernel is running (recorded output: '1.4.0').
pd.__version__

'1.4.0'

In [None]:
import pandas as pd
import numpy as np
import pickle

In [None]:
# Load the node table, then record which knowledge base each node comes from:
# the text before the first ":" of its Id (e.g. "uniprot" for "uniprot:...").
nodes_df = pd.read_pickle("./verit-web/nodes.pkl")
id_fields = nodes_df["Id"].str.split(":")
nodes_df["KB"] = id_fields.str.get(0)

In [None]:
# Collect every distinct UniProt Id present in the node table and write the
# bare accessions to a one-column CSV, ready for the UniProt ID-mapping tool.
is_uniprot = nodes_df["Id"].str.startswith("uniprot:")
up_ids = nodes_df[is_uniprot].drop_duplicates(subset="Id")["Id"]
# Keep only the second ":"-separated field, i.e. the part after "uniprot:".
up_ids = up_ids.str.split(":").str.get(1)
up_ids.to_frame().to_csv("~/Downloads/UniProt_node_IDs.csv", index=False)

In [None]:
# Read the mapping tool's export. Only the "From" and "Gene Names" fields
# should have been selected when downloading: exactly two columns are expected.
up_df = pd.read_csv("~/Downloads/nodes_uniprot_primary_v1.tsv", sep="\t")

# "Gene Names" holds space-separated names; give every name its own row.
gene_name_lists = up_df["Gene Names"].str.split(" ")
up_df["Gene Names"] = gene_name_lists
one_name_per_row = up_df.explode("Gene Names")
up_df = one_name_per_row.reset_index(drop=True).drop_duplicates()

# Relabel the two columns and put the "uniprot:" prefix back on each accession
# so the Ids have the same form as the "uniprot:..." Ids in nodes_df.
up_df.columns = ["Id", "Label"]
up_df["Id"] = "uniprot:" + up_df["Id"]

# Persist the UniProt Id -> gene name mapping.
with open("./uniprot_nodes.pkl", "wb") as out_file:
    pickle.dump(up_df, out_file)

In [None]:
# Attach each node's existing "PR" value to the UniProt gene-name rows; the
# left join keeps every (Id, Label) pair from up_df.
node_pr = nodes_df[["Id", "PR"]]
up_temp = up_df.merge(node_pr, how="left", on="Id").drop_duplicates()

# Mark where each row came from: UniProt-derived labels get Priority=True,
# rows already in the node table get Priority=False.
up_temp["Priority"] = True
nodes_df["Priority"] = False
up_temp["KB"] = "uniprot"

# Stack both tables. The descending sort on (Id, Priority) puts the
# Priority=True rows ahead of the Priority=False rows of the same Id.
combined_nodes = pd.concat([nodes_df, up_temp])
combined_nodes = combined_nodes.sort_values(by=["Id", "Priority"], ascending=False)
# Discard rows with any missing value, renumber, then drop exact duplicates.
nodes_df = combined_nodes.dropna().reset_index(drop=True).drop_duplicates()

# Persist as nodes.pkl (the important part for displaying the correct symbol).
with open("./PMC_OA_pickles/nodes.pkl", "wb") as out_file:
    pickle.dump(nodes_df, out_file)

In [None]:
cdb_df = pd.read_pickle("../combinedDBs.pkl")

# Work on a copy of the UniProt mapping with its columns relabelled to
# "id" / "name" (presumably combinedDBs' own column names - TODO confirm).
up_temp = up_df.copy()
up_temp.columns = ["id", "name"]

# Mark where each row came from: UniProt-derived rows get Priority=True,
# rows already in combinedDBs get Priority=False.
up_temp["Priority"] = True
cdb_df["Priority"] = False

# Stack both tables, sort descending on (id, Priority) so Priority=True rows
# precede Priority=False rows of the same id, renumber, drop exact duplicates.
stacked = pd.concat([cdb_df, up_temp])
stacked = stacked.sort_values(by=["id", "Priority"], ascending=False)
cdb_df = stacked.reset_index(drop=True).drop_duplicates()

# Persist as combinedDBs.pkl.
# NOTE(review): the original note here was left unfinished ("important for
# ensuring ..."); presumably this is what makes genes searchable - confirm.
with open("./PMC_OA_pickles/combinedDBs.pkl", "wb") as out_file:
    pickle.dump(cdb_df, out_file)

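## If pandas 1.1.5 pickles are needed, run the cells below
First export the pickles to CSV with the current pandas version; pandas must then be downgraded before the CSVs are re-pickled.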

In [None]:
# Dump the node table to CSV so it can be read back in after pandas has been
# downgraded.
nodes_pickle_path = "~/Downloads/verit-web/PMC_OA_pickles/nodes.pkl"
pkl = pd.read_pickle(nodes_pickle_path)
pkl.to_csv(path_or_buf="./p115_nodes.csv", index=False)

In [None]:
# Dump the UniProt mapping table to CSV so it can be read back in after pandas
# has been downgraded. The file name is all lower-case so that it matches the
# name the pandas 1.1.5 cell reads ("./p115_uniprot_nodes.csv") and the sibling
# "./p115_nodes.csv"; the earlier "P115_..." spelling only resolved on
# case-insensitive filesystems.
# NOTE(review): the mapping cell writes "./uniprot_nodes.pkl" - confirm that
# file is what ends up under PMC_OA_pickles/ before running this.
pkl = pd.read_pickle("~/Downloads/verit-web/PMC_OA_pickles/uniprot_nodes.pkl")
pkl.to_csv("./p115_uniprot_nodes.csv", index=False)

### Now, downgrade pandas

In [None]:
import pandas as pd
import pickle
# Guard: stop here unless the kernel is now running pandas 1.1.5, so that the
# pickles written below are produced by that version.
assert pd.__version__ == "1.1.5"

In [None]:
# Re-pickle the CSV exports under the downgraded pandas: each CSV is read and
# written straight back out as a pickle in the pandas_1.1.5_pickles folder.
# NOTE(review): read_csv's default NA parsing turns text such as "NA" or "nan"
# into missing values - confirm no gene symbol is affected by the round trip.
conversions = (
    (
        "./p115_nodes.csv",
        "C:/Users/ericj/Downloads/verit-web/PMC_OA_pickles/pandas_1.1.5_pickles/nodes.pkl",
    ),
    (
        "./p115_uniprot_nodes.csv",
        "C:/Users/ericj/Downloads/verit-web/PMC_OA_pickles/pandas_1.1.5_pickles/uniprot_nodes.pkl",
    ),
)
for csv_path, pickle_path in conversions:
    csv = pd.read_csv(csv_path)
    csv.to_pickle(pickle_path)