# Description

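For a set of genes selected from the CRISPR lipid screen table (`lipid_DEG.csv`, filtered with `LIPIDS_GENE_SET_QUERY`), this notebook takes their S-MultiXcan gene-trait association values, divides each trait column by its sum, adds up the resulting values across the selected genes for every trait, and ranks the traits by that total. The full ranking is saved to `traits.pkl`, and the top 100 traits are summarized individually and by trait category.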

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
from IPython.display import HTML

from entity import Trait, Gene
from data.cache import read_data
import conf

# Settings

In [3]:
# Used as the first part of the output folder name (see OUTPUT_DIR).
EXPERIMENT_NAME = "single_gene"

# Label of the selected gene set; second part of the output folder name.
LIPIDS_GENE_SET = "gene_set_decrease"
# pandas query string evaluated on the differentially-expressed-genes table to
# pick the genes of this set (here: rows whose `rank` is -3 or -2).
LIPIDS_GENE_SET_QUERY = "(rank == -3) | (rank == -2)"

In [4]:
# Results of this experiment / gene set combination get their own subfolder
# under the CRISPR analyses results directory.
_output_subfolder = f"{EXPERIMENT_NAME}-{LIPIDS_GENE_SET}"
OUTPUT_DIR = Path(conf.RESULTS["CRISPR_ANALYSES"]["BASE_DIR"]) / _output_subfolder

# Create the folder (and any missing parents); do nothing if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/crispr_analyses/single_gene-gene_set_decrease')

# Data loading

## S-MultiXcan results

### Load

In [5]:
# Path to the S-MultiXcan results file, as defined in the project configuration.
smultixcan_results_filename = conf.PHENOMEXCAN[
    "SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"
]

display(smultixcan_results_filename)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/smultixcan-efo_partial-mashr-zscores.pkl')

In [6]:
# Load the gene-by-trait results table (genes in rows, traits in columns; see
# the preview further down).
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
smultixcan_results = pd.read_pickle(smultixcan_results_filename)

### Rename genes and remove repeated ones

In [7]:
# Relabel the row index through Gene.GENE_ID_TO_NAME_MAP. With a dict-like
# mapper, pandas leaves labels that are not in the mapping unchanged. After
# relabeling, some names occur more than once (listed by the next cell).
smultixcan_results = smultixcan_results.rename(index=Gene.GENE_ID_TO_NAME_MAP)

In [8]:
# Gene names that appear more than once in the index: every occurrence except
# the first one is reported here.
smultixcan_results.index[smultixcan_results.index.duplicated(keep="first")]

Index(['SPATA13', 'LINC01422', 'LINC00484', 'MAL2', 'GOLGA8M', 'LINC01115',
       'LYNX1'],
      dtype='object', name='gene_name')

In [9]:
# Keep only the first row for every gene name; later rows carrying an
# already-seen name are dropped.
_first_occurrence = ~smultixcan_results.index.duplicated(keep="first")
smultixcan_results = smultixcan_results.loc[_first_occurrence]

In [10]:
smultixcan_results.shape

(22508, 3752)

In [11]:
smultixcan_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DPM1,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
SCYL3,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
C1orf112,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
FGR,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
CFH,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


### Standardize by trait (divide each trait column by its sum)

In [12]:
def _divide_by_total(trait_values):
    """Return the values of one trait column divided by the column's sum."""
    return trait_values / trait_values.sum()


# `apply` hands the function one column (trait) at a time, so every trait is
# scaled by its own total over all genes.
_tmp = smultixcan_results.apply(_divide_by_total)

In [13]:
_tmp.shape

(22508, 3752)

In [14]:
assert _tmp.shape == smultixcan_results.shape

In [15]:
# some testing: spot-check a few (trait, gene) pairs of the normalized table
# against the same value computed by hand from the original table.
# The comparison is exact on purpose: both sides perform the same division of
# the same two floating-point numbers.
for _trait, _gene in (
    ("body height", "SCYL3"),
    ("100001_raw-Food_weight", "DPM1"),
    ("estrogen-receptor negative breast cancer", "CFH"),
    ("asthma", "C1orf112"),
):
    assert (
        _tmp.loc[_gene, _trait]
        == smultixcan_results.loc[_gene, _trait] / smultixcan_results[_trait].sum()
    )

In [16]:
# From here on `smultixcan_results` refers to the column-normalized table; the
# cells below work with these normalized values.
smultixcan_results = _tmp

## Differentially expressed genes

### Load

In [17]:
# Table of differentially expressed genes, stored in the CRISPR data folder.
input_filepath = Path(conf.CRISPR["BASE_DIR"]) / "lipid_DEG.csv"
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/crispr_screen/lipid_DEG.csv')

In [18]:
# Load the differentially expressed genes table (its columns are shown in the
# preview below).
deg_genes = pd.read_csv(input_filepath)

In [19]:
deg_genes.shape

(462, 12)

In [20]:
deg_genes.head()

Unnamed: 0,gene_name,GFPLow_vs_UnSorted.log2FC,GFPLow_vs_UnSorted.FDR,GFPLow_vs_UnSorted.DEG,GFPHigh_vs_UnSorted.log2FC,GFPHigh_vs_UnSorted.FDR,GFPHigh_vs_UnSorted.DEG,GFPHigh_vs_GFPLow.log2FC,GFPHigh_vs_GFPLow.FDR,GFPHigh_vs_GFPLow.DEG,lipid effect,rank
0,ABCA2,0.558668,0.116008,False,0.784919,0.027128,True,0.22625,0.632691,False,decrease *,-1
1,ABCB7,0.798525,0.02922,True,-0.238342,0.790778,False,-1.036866,0.000693,True,increase **,2
2,ABCE1,0.684075,0.047833,True,0.044326,0.975255,False,-0.63975,0.049229,True,increase **,2
3,ABCG2,0.438126,0.049607,True,0.37555,0.229539,False,-0.062576,0.883678,False,increase *,1
4,ABHD10,-0.641405,0.009846,True,-0.241943,0.658553,False,0.399461,0.156967,False,decrease *,-1


### Select gene set

In [21]:
# Rows of the differentially expressed genes table that match the gene-set
# query defined in the Settings section.
df = deg_genes.query(LIPIDS_GENE_SET_QUERY)

In [22]:
df.shape

(96, 12)

In [23]:
# Distinct gene names of the selected gene set, as a plain Python list.
_selected_gene_names = df["gene_name"]
df_genes = _selected_gene_names.unique().tolist()

# Show how many genes there are and a short sample of them.
display(len(df_genes), df_genes[:10])

# Expected number of distinct genes for this gene set.
assert len(df_genes) == 96

96

['ABHD5',
 'ACVR1B',
 'AGPAT3',
 'AMMECR1',
 'ARMC12',
 'ATP5O',
 'BACH1',
 'BCL9L',
 'BLCAP',
 'BMPER']

In [24]:
# keep genes present in S-MultiXcan results
df_genes_present = smultixcan_results.index.intersection(df_genes).tolist()

display(len(df_genes_present))
display(df_genes_present[:10])

assert len(df_genes_present) == 85

85

['ABHD5',
 'PTBP1',
 'NDUFB4',
 'NFYC',
 'LRRC40',
 'KEAP1',
 'OSBPL8',
 'NDUFB7',
 'CSK',
 'SQLE']

# Get top traits

In [25]:
def _positive_scores(gene_name):
    """Return one gene's values across traits, keeping only those > 0.

    The row for `gene_name` is read from `smultixcan_results`; entries that are
    not strictly positive are dropped and the rest is sorted from largest to
    smallest. The result is a Series indexed by trait.
    """
    gene_scores = smultixcan_results.loc[gene_name]
    return gene_scores[gene_scores > 0.0].sort_values(ascending=False)


# One Series per selected gene; they are combined across genes in the next cell.
traits = [_positive_scores(g) for g in df_genes_present]

In [26]:
# Stack the per-gene Series into one long table. After `reset_index` the trait
# labels are in a column called "index" and the values in a column called 0.
_stacked = pd.concat(traits).reset_index()

# Add up the values of every trait across genes, then rank traits by that
# total, largest first.
_totals = _stacked.groupby("index").sum()
_ranked = _totals.sort_values(0, ascending=False).reset_index()

# Give the two columns meaningful names.
traits_df = _ranked.rename(columns={"index": "trait", 0: "value"})

In [27]:
# add trait category
trait_code_to_trait_obj = [
    Trait.get_trait(full_code=t)
    if not Trait.is_efo_label(t)
    else Trait.get_traits_from_efo(t)
    for t in traits_df["trait"]
]

In [28]:
def _category_of(trait_obj):
    """Return the category of a trait lookup result.

    When the lookup produced a list, the category of its first element is used.
    """
    if isinstance(trait_obj, list):
        return trait_obj[0].category
    return trait_obj.category


# New `category` column, in the same row order as `traits_df["trait"]`.
traits_df = traits_df.assign(
    category=[_category_of(t) for t in trait_code_to_trait_obj]
)

In [29]:
traits_df.shape

(3752, 3)

In [30]:
traits_df.head()

Unnamed: 0,trait,value,category
0,6150_4-Vascularheart_problems_diagnosed_by_doc...,0.005252,Diseases (cardiovascular)
1,4079_raw-Diastolic_blood_pressure_automated_re...,0.005248,Blood pressure
2,hypertension,0.005163,Diseases (cardiovascular)
3,ASTHMA_EOSINOPHIL_SUGG-Suggestive_for_eosinoph...,0.005096,Diseases (FinnGen)
4,6153_2-Medication_for_cholesterol_blood_pressu...,0.005081,Medication


In [31]:
# Absolute path of the file where the trait ranking is stored.
output_file = (Path(OUTPUT_DIR) / "traits.pkl").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/crispr_analyses/single_gene-gene_set_decrease/traits.pkl')

In [32]:
# Save the complete trait ranking (all traits, not only the top ones shown in
# the summary below).
traits_df.to_pickle(output_file)

# Summary

In [33]:
# `traits_df` is already sorted by `value` in descending order, so these are
# the 100 traits with the largest totals.
top_traits = traits_df.head(100)

In [34]:
# Lift pandas' row, column and column-width display limits inside this block
# only, so the whole table is shown without truncation.
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "max_colwidth", None
):
    display(top_traits)

Unnamed: 0,trait,value,category
0,6150_4-Vascularheart_problems_diagnosed_by_doctor_High_blood_pressure,0.005252,Diseases (cardiovascular)
1,4079_raw-Diastolic_blood_pressure_automated_reading,0.005248,Blood pressure
2,hypertension,0.005163,Diseases (cardiovascular)
3,ASTHMA_EOSINOPHIL_SUGG-Suggestive_for_eosinophilic_asthma,0.005096,Diseases (FinnGen)
4,6153_2-Medication_for_cholesterol_blood_pressure_diabetes_or_take_exogenous_hormones_Blood_pressure_medication,0.005081,Medication
5,20153_raw-Forced_expiratory_volume_in_1second_FEV1_predicted,0.005032,Spirometry
6,6150_100-Vascularheart_problems_diagnosed_by_doctor_None_of_the_above,0.005031,Diseases (cardiovascular)
7,20003_1141191044-Treatmentmedication_code_levothyroxine_sodium,0.00498,Medications
8,30030_raw-Haematocrit_percentage,0.004969,Blood count
9,20003_1140860696-Treatmentmedication_code_lisinopril,0.004963,Medications


# Summary using trait categories

In [35]:
# Average `value` of the top traits within each category, highest first.
# Only the numeric `value` column is selected before `.mean()`: `top_traits`
# also holds the string column `trait`, and since pandas 2.0 `GroupBy.mean()`
# no longer drops non-numeric columns silently (it raises a TypeError instead).
# The double brackets keep the result a DataFrame, so the final table still has
# the columns `category` and `value`.
top_traits_categories = (
    top_traits.groupby("category")[["value"]]
    .mean()
    .sort_values("value", ascending=False)
    .reset_index()
)

In [36]:
top_traits_categories.head()

Unnamed: 0,category,value
0,Diseases (cardiovascular),0.005148
1,Blood pressure,0.004937
2,Blood,0.004875
3,Medication,0.004869
4,Anxiety,0.004834


In [37]:
# Lift pandas' row, column and column-width display limits inside this block
# only, so every category is shown without truncation.
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None, "max_colwidth", None
):
    display(top_traits_categories)

Unnamed: 0,category,value
0,Diseases (cardiovascular),0.005148
1,Blood pressure,0.004937
2,Blood,0.004875
3,Medication,0.004869
4,Anxiety,0.004834
5,Body size measures,0.004801
6,Vitamin/mineral supplements yesterday,0.004794
7,Early life factors,0.004792
8,Diseases (FinnGen),0.00479
9,Blood count,0.004777


In [38]:
# Walk through the categories in the order of `top_traits_categories` (highest
# average value first) and list the top traits that belong to each one.
for category in top_traits_categories["category"]:
    # Category name as a section heading in the cell output.
    display(HTML(f"<h2>{category}</h2>"))

    # Top traits of this category with their value, smallest value first.
    _in_category = top_traits["category"] == category
    _df = top_traits[_in_category].groupby("trait")["value"].mean().sort_values()
    display(_df)

trait
6150_100-Vascularheart_problems_diagnosed_by_doctor_None_of_the_above    0.005031
hypertension                                                             0.005163
6150_4-Vascularheart_problems_diagnosed_by_doctor_High_blood_pressure    0.005252
Name: value, dtype: float64

trait
94_raw-Diastolic_blood_pressure_manual_reading         0.004672
4080_raw-Systolic_blood_pressure_automated_reading     0.004890
4079_raw-Diastolic_blood_pressure_automated_reading    0.005248
Name: value, dtype: float64

trait
erythrocyte count    0.004875
Name: value, dtype: float64

trait
6177_2-Medication_for_cholesterol_blood_pressure_or_diabetes_Blood_pressure_medication                            0.004657
6153_2-Medication_for_cholesterol_blood_pressure_diabetes_or_take_exogenous_hormones_Blood_pressure_medication    0.005081
Name: value, dtype: float64

trait
20419-Difficulty_concentrating_during_worst_period_of_anxiety    0.004834
Name: value, dtype: float64

trait
49_raw-Hip_circumference    0.004735
50_raw-Standing_height      0.004807
20015_raw-Sitting_height    0.004862
Name: value, dtype: float64

trait
20084_468-Vitamin_andor_mineral_supplement_use_Multivitamin_    0.004794
Name: value, dtype: float64

trait
20022_raw-Birth_weight    0.004740
body height               0.004843
Name: value, dtype: float64

trait
XVII_MALFORMAT_ABNORMAL-Congenital_malformations_deformations_and_chromosomal_abnormalities    0.004614
RHEUMA_SEROPOS_OTH-Otherunspecified_seropositiverheumatoid_arthritis                           0.004666
RHEUMA_SEROPOS-Seropositive_rheumatoid_arthritis                                               0.004733
C3_SKIN-Malignant_neoplasm_of_skin                                                             0.004765
C_SKIN                                                                                         0.004765
F5_ALCOHOLAC-Acute_alcohol_intoxication                                                        0.004894
ASTHMA_EOSINOPHIL_SUGG-Suggestive_for_eosinophilic_asthma                                      0.005096
Name: value, dtype: float64

trait
30100_raw-Mean_platelet_thrombocyte_volume    0.004615
30000_raw-White_blood_cell_leukocyte_count    0.004632
30010_raw-Red_blood_cell_erythrocyte_count    0.004655
30260_raw-Mean_reticulocyte_volume            0.004723
30080_raw-Platelet_count                      0.004880
30020_raw-Haemoglobin_concentration           0.004961
30030_raw-Haematocrit_percentage              0.004969
Name: value, dtype: float64

trait
3062_raw-Forced_vital_capacity_FVC                              0.004619
20151_raw-Forced_vital_capacity_FVC_Best_measure                0.004648
3063_raw-Forced_expiratory_volume_in_1second_FEV1               0.004661
3064_raw-Peak_expiratory_flow_PEF                               0.004867
20153_raw-Forced_expiratory_volume_in_1second_FEV1_predicted    0.005032
Name: value, dtype: float64

trait
5265_raw-Corneal_resistance_factor_left    0.004762
Name: value, dtype: float64

trait
20003_1140866280-Treatmentmedication_code_bumetanide              0.004570
20003_1140860728-Treatmentmedication_code_quinapril               0.004601
20003_1140888366-Treatmentmedication_code_thiamine_preparation    0.004617
20003_1140874744-Treatmentmedication_code_gliclazide              0.004720
20003_1140872492-Treatmentmedication_code_nicotine_product        0.004727
20003_1140872198-Treatmentmedication_code_sodium_valproate        0.004737
20003_1140884600-Treatmentmedication_code_metformin               0.004812
20003_1141194794-Treatmentmedication_code_bendroflumethiazide     0.004852
20003_1140860696-Treatmentmedication_code_lisinopril              0.004963
20003_1141191044-Treatmentmedication_code_levothyroxine_sodium    0.004980
Name: value, dtype: float64

trait
40001_C920-Underlying_primary_cause_of_death_ICD10_C920_Acute_myeloid_leukaemia    0.004757
Name: value, dtype: float64

trait
6148_1-Eye_problemsdisorders_Diabetes_related_eye_disease    0.004627
2217_raw-Age_started_wearing_glasses_or_contact_lenses       0.004852
Name: value, dtype: float64

trait
6158_1-Why_reduced_smoking_Illness_or_ill_health    0.004621
2907-Ever_stopped_smoking_for_6_months              0.004782
Name: value, dtype: float64

trait
22617_1141-Job_SOC_coding_Quality_assurance_managers                                                                                  0.004569
22601_11123211-Job_coding_director_or_chief_executive_of_private_organisationcompany_company_chairman_or_president_general_manager    0.004589
22601_11343224-Job_coding_advertising_or_public_relations_manager_mediapublicity_manager_campaignfundraising_manager                  0.004603
22617_1233-Job_SOC_coding_Hairdressing_and_beauty_salon_managers_and_proprietors                                                      0.004623
22617_3119-Job_SOC_coding_Science_and_engineering_technicians_nec                                                                     0.004626
22601_23293031-Job_coding_other_researchers_including_in_broadcasting_journalism_photography_printing_and_publishing                  0.004633
22617_1173-Job_SOC_coding_Senior_officers_in_fire_ambulance_prison_and_related_services                                               0.

trait
20113_4-Illnesses_of_adopted_mother_Bowel_cancer    0.004697
Name: value, dtype: float64

trait
1518-Hot_drink_temperature                                                 0.004616
1548-Variation_in_diet                                                     0.004655
1538_1-Major_dietary_changes_in_the_last_5_years_Yes_because_of_illness    0.004664
1478-Salt_added_to_food                                                    0.004837
Name: value, dtype: float64

trait
4105_raw-Heel_bone_mineral_density_BMD_left                           0.004578
4123_raw-Heel_quantitative_ultrasound_index_QUI_direct_entry_right    0.004697
4125_raw-Heel_bone_mineral_density_BMD_Tscore_automated_right         0.004697
4124_raw-Heel_bone_mineral_density_BMD_right                          0.004750
Name: value, dtype: float64

trait
22507_raw-Age_of_stopping_smoking    0.004675
Name: value, dtype: float64

trait
pituitary gland adenoma        0.004632
hypothyroidism AND myxedema    0.004678
Name: value, dtype: float64

trait
20127_raw-Neuroticism_score    0.004575
mood swings                    0.004715
Name: value, dtype: float64

trait
23125_raw-Arm_fatfree_mass_left       0.004588
23105_raw-Basal_metabolic_rate        0.004599
23114_raw-Leg_predicted_mass_right    0.004612
23117_raw-Leg_fatfree_mass_left       0.004621
23118_raw-Leg_predicted_mass_left     0.004621
23113_raw-Leg_fatfree_mass_right      0.004624
23122_raw-Arm_predicted_mass_right    0.004627
23121_raw-Arm_fatfree_mass_right      0.004639
23102_raw-Whole_body_water_mass       0.004649
23101_raw-Whole_body_fatfree_mass     0.004649
23130_raw-Trunk_predicted_mass        0.004706
23129_raw-Trunk_fatfree_mass          0.004711
Name: value, dtype: float64

trait
47_raw-Hand_grip_strength_right    0.004576
46_raw-Hand_grip_strength_left     0.004697
Name: value, dtype: float64

trait
5540_0-Surgeryamputation_of_toe_or_leg_No    0.004625
Name: value, dtype: float64

trait
cutaneous melanoma                               0.004581
cholesteatoma of middle ear                      0.004585
pleural empyema                                  0.004628
diabetes mellitus                                0.004644
J20-Diagnoses_main_ICD10_J20_Acute_bronchitis    0.004678
Name: value, dtype: float64

trait
699_raw-Length_of_time_at_current_address    0.004614
Name: value, dtype: float64

trait
nasal cavity polyp    0.004595
Name: value, dtype: float64

trait
intracranial volume measurement    0.004595
Name: value, dtype: float64