In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Root folder (relative to the working directory) that the screen indexes are
# read from and the term lists are written to.
data_dir = Path("data")

# Gene-level column names. NOTE(review): hit_col is not referenced in the
# cells shown here — presumably used further down; confirm.
gene_col, hit_col = "OFFICIAL_SYMBOL", "HIT"

# Screen-level column names used to build the term lists.
cell_col = "CELL_TYPE"
method_col = "LIBRARY_METHODOLOGY"
# Not a raw input column: it is created later by joining PHENOTYPE and NOTES.
phenotype_col = "PHENOTYPE_NOTES"

In [None]:
# Screen indexes: one tab-separated file per organism, located under data_dir.
human_screens, mouse_screens = (
    pd.read_csv(data_dir / index_file, sep="\t")
    for index_file in (
        "screens/index_homo_sapiens.tsv",
        "screens/index_mus_musculus.tsv",
    )
)

# Genome tables: tab-separated files addressed relative to the working
# directory (two levels up), not through data_dir.
human_genome, mouse_genome = (
    pd.read_csv(genome_file, sep="\t")
    for genome_file in (
        "../../genomes/genome_homo_sapiens.tsv",
        "../../genomes/genome_mus_musculus.tsv",
    )
)

In [None]:
# Stack the human and mouse screen tables row-wise. Row labels are carried
# over unchanged from each input (no ignore_index).
all_screens = pd.concat(objs=[human_screens, mouse_screens], axis=0)

In [None]:
# Keep only the screens whose significance criteria do not contain "OR",
# then renumber the rows from 0.
# - na=False: a missing criteria value counts as "no OR" (the row is kept) and
#   the mask is guaranteed boolean. Without it, a NaN in an object-dtype column
#   leaves NaN in the mask and `~` raises TypeError.
# - regex=False: "OR" is matched as a literal, case-sensitive substring.
# NOTE(review): this is a substring test, so any upper-case token that merely
# contains "OR" would also exclude the screen — confirm that is intended.
has_or = all_screens["SIGNIFICANCE_CRITERIA"].str.contains("OR", regex=False, na=False)
screens = all_screens[~has_or].reset_index(drop=True)

In [None]:
def check_not_empty(xs):
    """Assert that a Series of terms has no null or blank entries.

    An entry is rejected when it is null, or when it is a string that is empty
    after stripping surrounding whitespace.

    Parameters
    ----------
    xs : pandas.Series
        Values to validate; must support the ``.str`` accessor.

    Raises
    ------
    AssertionError
        If at least one entry is null or blank. The message reports how many
        entries failed and the name of the Series.
    """
    # Null entries are caught by isnull(); for those, .str.strip() yields a
    # missing value, so the equality test alone would not flag them.
    is_blank = xs.isnull() | (xs.str.strip() == "")
    assert not is_blank.any(), (
        f"{int(is_blank.sum())} null or blank value(s) found in {xs.name!r}"
    )

In [None]:
# Distinct cell types among the retained screens, sorted.
cells = (
    screens[cell_col]
    .drop_duplicates()
    .sort_values()
)
# Validate before writing: every term must be non-null and non-blank.
check_not_empty(cells)
cells.to_csv(path_or_buf=data_dir / "terms/cells.csv", index=False)

In [None]:
# Derived column: "<PHENOTYPE>. <NOTES>" for each screen.
phenotype_text = screens["PHENOTYPE"]
notes_text = screens["NOTES"]
screens[phenotype_col] = (phenotype_text + ". ") + notes_text

# Distinct phenotype descriptions, sorted.
phenotypes = (
    screens[phenotype_col]
    .drop_duplicates()
    .sort_values()
)
# Validate before writing: every term must be non-null and non-blank.
check_not_empty(phenotypes)
phenotypes.to_csv(path_or_buf=data_dir / "terms/phenotypes.csv", index=False)

In [None]:
# Distinct library methodologies among the retained screens, sorted.
methods = (
    screens[method_col]
    .drop_duplicates()
    .sort_values()
)
# Validate before writing: every term must be non-null and non-blank.
check_not_empty(methods)
methods.to_csv(path_or_buf=data_dir / "terms/methods.csv", index=False)

In [None]:
# Restrict both genome tables to rows whose Gene_Type is PROTEIN_CODING.
human_genome = human_genome.loc[human_genome["Gene_Type"].eq("PROTEIN_CODING")]
mouse_genome = mouse_genome.loc[mouse_genome["Gene_Type"].eq("PROTEIN_CODING")]

# After filtering, each gene symbol may appear at most once per organism.
for genome in (human_genome, mouse_genome):
    assert genome[gene_col].is_unique

In [None]:
# Write the human genome table as CSV, omitting the row index.
human_genes_csv = data_dir / "terms/genes_human.csv"
human_genome.to_csv(human_genes_csv, index=False)

In [None]:
# Write the mouse genome table as CSV, omitting the row index.
mouse_genes_csv = data_dir / "terms/genes_mouse.csv"
mouse_genome.to_csv(mouse_genes_csv, index=False)