# Description

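This notebook ranks traits by their association with a set of genes taken from a CRISPR-screen table of lipid-related differentially expressed genes (`lipid_DEG.csv`). The S-MultiXcan gene-trait values are first divided by the total of each trait column. Then, for the genes selected with `LIPIDS_GENE_SET_QUERY` that are also present in the S-MultiXcan results, the positive values are summed per trait. The ranked traits are saved to `traits.pkl`, and the top 100 are summarized by trait category.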

# Modules loading

In [1]:
# Reload imported modules automatically before running each cell, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
from IPython.display import HTML

from entity import Trait, Gene
from data.cache import read_data
import conf

# Settings

In [3]:
# Label of this analysis; combined with LIPIDS_GENE_SET to name the output folder.
EXPERIMENT_NAME = "single_gene"

# Name of the gene set under study and the pandas `query` expression used to
# select its rows from the differentially expressed genes table.
# NOTE(review): ranks 2 and 3 presumably mark the stronger "increase" lipid
# effects -- confirm against the provenance of lipid_DEG.csv.
LIPIDS_GENE_SET = "gene_set_increase"
LIPIDS_GENE_SET_QUERY = "(rank == 3) | (rank == 2)"

In [4]:
# Results of this experiment/gene-set pair live in their own subfolder of the
# CRISPR analyses results directory; create it (and any parents) if missing.
_crispr_results_dir = conf.RESULTS["CRISPR_ANALYSES"]["BASE_DIR"]
OUTPUT_DIR = Path(_crispr_results_dir) / f"{EXPERIMENT_NAME}-{LIPIDS_GENE_SET}"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/crispr_analyses/single_gene-gene_set_increase')

# Data loading

## S-MultiXcan results

### Load

In [5]:
# Path of the S-MultiXcan results file, as configured in `conf`.
_phenomexcan_conf = conf.PHENOMEXCAN
smultixcan_results_filename = _phenomexcan_conf[
    "SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"
]
display(smultixcan_results_filename)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/smultixcan-efo_partial-mashr-zscores.pkl')

In [6]:
# Load the gene x trait results table.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
smultixcan_results = pd.read_pickle(smultixcan_results_filename)

### Rename genes and remove repeated ones

In [7]:
# Relabel the row index through Gene.GENE_ID_TO_NAME_MAP.
# NOTE(review): presumably maps gene IDs to gene names -- confirm in `entity.Gene`.
smultixcan_results = smultixcan_results.rename(index=Gene.GENE_ID_TO_NAME_MAP)

In [8]:
# Show the index labels that are repeated (each occurrence after the first one).
smultixcan_results.index[smultixcan_results.index.duplicated(keep="first")]

Index(['SPATA13', 'LINC01422', 'LINC00484', 'MAL2', 'GOLGA8M', 'LINC01115',
       'LYNX1'],
      dtype='object', name='gene_name')

In [9]:
# For every repeated index label keep only its first row.
_is_repeated_label = smultixcan_results.index.duplicated(keep="first")
smultixcan_results = smultixcan_results.loc[~_is_repeated_label]

In [10]:
# Dimensions after dropping the repeated index labels.
smultixcan_results.shape

(22508, 3752)

In [11]:
# Preview the first rows of the table.
smultixcan_results.head()

Unnamed: 0_level_0,100001_raw-Food_weight,100002_raw-Energy,100003_raw-Protein,100004_raw-Fat,100005_raw-Carbohydrate,100006_raw-Saturated_fat,100007_raw-Polyunsaturated_fat,100008_raw-Total_sugars,100009_raw-Englyst_dietary_fibre,100010-Portion_size,...,visual impairment,vitiligo,vitreous body disease,vocal cord polyp,voice disorders,wellbeing measurement AND family relationship,wheezing,whooping cough,worry measurement,wrist fracture
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DPM1,1.145442,0.724557,0.090876,0.298165,1.134347,1.371138,0.065718,0.794317,0.600342,0.317652,...,0.360518,1.351624,1.157695,0.835289,1.173072,1.33728,1.743822,1.017226,1.512184,0.972241
SCYL3,0.618066,1.028131,2.21842,0.762584,0.934418,0.192993,1.08023,0.765997,0.375898,0.678731,...,2.134504,0.12783,0.53469,0.120516,0.517464,2.545363,0.673331,2.003092,0.344,2.033122
C1orf112,0.515724,0.403596,1.251359,0.433091,0.413466,0.246261,1.236151,0.82743,0.571985,0.782174,...,1.768905,0.992408,0.548215,0.412341,1.499415,1.36678,0.443318,0.41763,0.225934,1.613246
FGR,0.280781,0.25391,0.879148,0.352705,0.051846,0.184212,0.148566,0.009989,0.363751,0.374514,...,0.656552,2.046041,2.746832,0.108211,1.008258,0.755695,0.896228,0.875047,0.476405,1.693057
CFH,0.548127,0.389877,0.723469,1.16725,0.315952,0.324939,1.613932,0.311432,0.333548,1.807243,...,0.260482,0.646204,1.08024,0.67833,1.465358,0.307672,0.118376,1.419812,2e-06,1.040737


### Standardize by trait

In [12]:
def _divide_by_column_total(column):
    """Scale one trait column so that its values are fractions of the column total."""
    return column / column.sum()


# Applied column by column (one column per trait).
_tmp = smultixcan_results.apply(_divide_by_column_total)

In [13]:
# Dimensions of the rescaled table.
_tmp.shape

(22508, 3752)

In [14]:
# Rescaling must not add or drop any row or column.
assert _tmp.shape == smultixcan_results.shape

In [15]:
# Spot-check a few (trait, gene) pairs: each rescaled entry must equal the
# original entry divided by the total of its trait column.
_spot_checks = [
    ("body height", "SCYL3"),
    ("100001_raw-Food_weight", "DPM1"),
    ("estrogen-receptor negative breast cancer", "CFH"),
    ("asthma", "C1orf112"),
]

for _trait, _gene in _spot_checks:
    _expected = (
        smultixcan_results.loc[_gene, _trait] / smultixcan_results[_trait].sum()
    )
    assert _tmp.loc[_gene, _trait] == _expected

In [16]:
# Checks passed: from here on the main name refers to the rescaled values.
smultixcan_results = _tmp

## Differentially expressed genes

### Load

In [17]:
# The differentially expressed genes table sits in the CRISPR data directory.
_crispr_data_dir = conf.CRISPR["BASE_DIR"]
input_filepath = Path(_crispr_data_dir) / "lipid_DEG.csv"
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/crispr_screen/lipid_DEG.csv')

In [18]:
# Read the differentially expressed genes table.
deg_genes = pd.read_csv(input_filepath)

In [19]:
# Number of rows and columns in the table.
deg_genes.shape

(462, 12)

In [20]:
# Preview the first rows, including the `rank` column used to select the gene set.
deg_genes.head()

Unnamed: 0,gene_name,GFPLow_vs_UnSorted.log2FC,GFPLow_vs_UnSorted.FDR,GFPLow_vs_UnSorted.DEG,GFPHigh_vs_UnSorted.log2FC,GFPHigh_vs_UnSorted.FDR,GFPHigh_vs_UnSorted.DEG,GFPHigh_vs_GFPLow.log2FC,GFPHigh_vs_GFPLow.FDR,GFPHigh_vs_GFPLow.DEG,lipid effect,rank
0,ABCA2,0.558668,0.116008,False,0.784919,0.027128,True,0.22625,0.632691,False,decrease *,-1
1,ABCB7,0.798525,0.02922,True,-0.238342,0.790778,False,-1.036866,0.000693,True,increase **,2
2,ABCE1,0.684075,0.047833,True,0.044326,0.975255,False,-0.63975,0.049229,True,increase **,2
3,ABCG2,0.438126,0.049607,True,0.37555,0.229539,False,-0.062576,0.883678,False,increase *,1
4,ABHD10,-0.641405,0.009846,True,-0.241943,0.658553,False,0.399461,0.156967,False,decrease *,-1


### Select gene set

In [21]:
# Keep only the rows matching the gene-set query defined in the settings.
df = deg_genes.query(LIPIDS_GENE_SET_QUERY)

In [22]:
# Size of the selected gene set.
df.shape

(175, 12)

In [23]:
# Distinct gene names of the selected set, in order of first appearance.
df_genes = df["gene_name"].drop_duplicates().tolist()

display(len(df_genes), df_genes[:10])

# Sanity check: expected number of genes for this gene set.
assert len(df_genes) == 175

175

['ABCB7',
 'ABCE1',
 'ABHD13',
 'ACACA',
 'ACTR1A',
 'ACVR1',
 'AGPAT6',
 'AKIRIN2',
 'AP2S1',
 'ASCC3']

In [24]:
# Only genes that also appear in the S-MultiXcan results can be analyzed.
_genes_in_both = smultixcan_results.index.intersection(df_genes)
df_genes_present = _genes_in_both.tolist()

display(len(df_genes_present), df_genes_present[:10])

# Sanity check: expected number of genes found in the S-MultiXcan results.
assert len(df_genes_present) == 164

164

['DHX33',
 'RPAP3',
 'QPCTL',
 'TTC27',
 'PHF20',
 'AP2S1',
 'NOP16',
 'POLR3E',
 'DGAT2',
 'RPL18']

# Get top traits

In [25]:
def _positive_values_sorted(gene_values):
    """Return the strictly positive entries of `gene_values`, largest first."""
    positive = gene_values[gene_values > 0.0]
    return positive.sort_values(ascending=False)


# One series of trait values per selected gene.
traits = [
    _positive_values_sorted(smultixcan_results.loc[gene])
    for gene in df_genes_present
]

In [26]:
# Stack the per-gene series into one long table; the trait labels land in the
# "index" column and the values in column 0.
_stacked = pd.concat(traits).reset_index()

# Total value per trait across all selected genes, largest first.
_per_trait_total = _stacked.groupby("index").sum()
_ranked = _per_trait_total.sort_values(0, ascending=False).reset_index()

traits_df = _ranked.rename(columns={"index": "trait", 0: "value"})

In [27]:
def _lookup_trait(label):
    """Return what `label` resolves to through the `Trait` lookup helpers."""
    if Trait.is_efo_label(label):
        return Trait.get_traits_from_efo(label)
    return Trait.get_trait(full_code=label)


# One lookup result per row of traits_df, in the same order; used to add the
# trait category.
trait_code_to_trait_obj = [_lookup_trait(label) for label in traits_df["trait"]]

In [28]:
# When a lookup returned a list, the category of its first element is used.
_categories = [
    t[0].category if isinstance(t, list) else t.category
    for t in trait_code_to_trait_obj
]
traits_df = traits_df.assign(category=_categories)

In [29]:
# One row per trait; columns: trait, value, category.
traits_df.shape

(3752, 3)

In [30]:
# Highest-ranked traits (the table is sorted by value, descending).
traits_df.head()

Unnamed: 0,trait,value,category
0,30180_raw-Lymphocyte_percentage,0.010133,Blood count
1,30200_raw-Neutrophill_percentage,0.009718,Blood count
2,30140_raw-Neutrophill_count,0.009544,Blood count
3,30010_raw-Red_blood_cell_erythrocyte_count,0.009362,Blood count
4,lymphocyte count,0.009343,Blood


In [31]:
# Absolute path of the file where the ranked traits are saved.
output_file = (Path(OUTPUT_DIR) / "traits.pkl").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/crispr_analyses/single_gene-gene_set_increase/traits.pkl')

In [32]:
# Save the ranked traits table to disk.
traits_df.to_pickle(output_file)

# Summary

In [33]:
# Keep the 100 highest-ranked traits (traits_df is sorted by value, descending).
top_traits = traits_df.head(100)

In [34]:
# Temporarily lift pandas' display limits so the whole table is rendered.
_no_display_limits = (
    "display.max_rows", None,
    "display.max_columns", None,
    "max_colwidth", None,
)
with pd.option_context(*_no_display_limits):
    display(top_traits)

Unnamed: 0,trait,value,category
0,30180_raw-Lymphocyte_percentage,0.010133,Blood count
1,30200_raw-Neutrophill_percentage,0.009718,Blood count
2,30140_raw-Neutrophill_count,0.009544,Blood count
3,30010_raw-Red_blood_cell_erythrocyte_count,0.009362,Blood count
4,lymphocyte count,0.009343,Blood
5,23130_raw-Trunk_predicted_mass,0.009334,Impedance measures
6,23129_raw-Trunk_fatfree_mass,0.009327,Impedance measures
7,30040_raw-Mean_corpuscular_volume,0.009309,Blood count
8,30270_raw-Mean_sphered_cell_volume,0.009252,Blood count
9,30000_raw-White_blood_cell_leukocyte_count,0.00925,Blood count


# Summary using trait categories

In [35]:
# Average value per trait category among the top traits, highest first.
# Only the numeric "value" column is aggregated: averaging the whole frame
# would also include the text "trait" column, which recent pandas versions
# reject with a TypeError instead of silently dropping it.
top_traits_categories = (
    top_traits.groupby("category")[["value"]]
    .mean()
    .sort_values("value", ascending=False)
    .reset_index()
)

In [36]:
# Categories with the highest average value.
top_traits_categories.head()

Unnamed: 0,category,value
0,Blood count,0.009015
1,Impedance measures,0.009012
2,Blood,0.008935
3,Body size measures,0.008902
4,Sun exposure,0.008825


In [37]:
# Temporarily lift pandas' display limits so every category is rendered.
_no_display_limits = (
    "display.max_rows", None,
    "display.max_columns", None,
    "max_colwidth", None,
)
with pd.option_context(*_no_display_limits):
    display(top_traits_categories)

Unnamed: 0,category,value
0,Blood count,0.009015
1,Impedance measures,0.009012
2,Blood,0.008935
3,Body size measures,0.008902
4,Sun exposure,0.008825
5,Early life factors,0.008805
6,Spirometry,0.008778
7,Diseases (ICD10 main),0.008744
8,Milk/eggs/cheese yesterday,0.008726
9,Female-specific factors,0.008716


In [38]:
# For each category (highest average first) show its top traits with their
# values, in ascending order.
for category in top_traits_categories["category"]:
    display(HTML(f"<h2>{category}</h2>"))

    _category_rows = top_traits[top_traits["category"] == category]
    _df = _category_rows.groupby("trait")["value"].mean().sort_values()
    display(_df)

trait
30110_raw-Platelet_distribution_width                   0.008428
30280_raw-Immature_reticulocyte_fraction                0.008441
30030_raw-Haematocrit_percentage                        0.008535
30160-Basophill_count                                   0.008565
30190_raw-Monocyte_percentage                           0.008579
30240_raw-Reticulocyte_percentage                       0.008619
30120_raw-Lymphocyte_count                              0.008766
30290_raw-High_light_scatter_reticulocyte_percentage    0.008792
30090_raw-Platelet_crit                                 0.008821
30250_raw-Reticulocyte_count                            0.008877
30300_raw-High_light_scatter_reticulocyte_count         0.009012
30050_raw-Mean_corpuscular_haemoglobin                  0.009132
30260_raw-Mean_reticulocyte_volume                      0.009170
30000_raw-White_blood_cell_leukocyte_count              0.009250
30270_raw-Mean_sphered_cell_volume                      0.009252
30040_raw-Mean_corp

trait
23114_raw-Leg_predicted_mass_right    0.008672
23118_raw-Leg_predicted_mass_left     0.008672
23117_raw-Leg_fatfree_mass_left       0.008678
23113_raw-Leg_fatfree_mass_right      0.008682
23106_raw-Impedance_of_whole_body     0.008843
23105_raw-Basal_metabolic_rate        0.008934
23121_raw-Arm_fatfree_mass_right      0.009088
23122_raw-Arm_predicted_mass_right    0.009095
23126_raw-Arm_predicted_mass_left     0.009100
23101_raw-Whole_body_fatfree_mass     0.009124
23109_raw-Impedance_of_arm_right      0.009134
23102_raw-Whole_body_water_mass       0.009150
23110_raw-Impedance_of_arm_left       0.009177
23125_raw-Arm_fatfree_mass_left       0.009179
23129_raw-Trunk_fatfree_mass          0.009327
23130_raw-Trunk_predicted_mass        0.009334
Name: value, dtype: float64

trait
monocyte count                             0.008483
myeloid white cell count                   0.008897
granulocyte count                          0.008919
sum of neutrophil and eosinophil counts    0.008931
sum of basophil and neutrophil counts      0.008950
neutrophil count                           0.008952
leukocyte count                            0.009006
lymphocyte count                           0.009343
Name: value, dtype: float64

trait
50_raw-Standing_height      0.008807
20015_raw-Sitting_height    0.008996
Name: value, dtype: float64

trait
1050-Time_spend_outdoors_in_summer               0.008399
1737-Childhood_sunburn_occasions                 0.008415
1727-Ease_of_skin_tanning                        0.009030
1747_2-Hair_colour_natural_before_greying_Red    0.009072
1717-Skin_colour                                 0.009208
Name: value, dtype: float64

trait
20022_raw-Birth_weight    0.008715
body height               0.008895
Name: value, dtype: float64

trait
20153_raw-Forced_expiratory_volume_in_1second_FEV1_predicted    0.008778
Name: value, dtype: float64

trait
sleep disorder          0.008560
rheumatoid arthritis    0.008929
Name: value, dtype: float64

trait
100920_2102-Type_milk_consumed_semiskimmed    0.008726
Name: value, dtype: float64

trait
2764_raw-Age_at_last_live_birth                    0.008484
3581_raw-Age_at_menopause_last_menstrual_period    0.008947
Name: value, dtype: float64

trait
2159-Ever_had_samesex_intercourse          0.008603
2149-Lifetime_number_of_sexual_partners    0.008828
Name: value, dtype: float64

trait
22601_21253043-Job_coding_chemical_engineer_plastics_engineer_pharmaceutical_engineer    0.008633
22617_2125-Job_SOC_coding_Chemical_engineers                                             0.008634
22601_51112476-Job_coding_farmer_farming_contractor_herd_manager_smallholder_bailiff     0.008685
22617_5111-Job_SOC_coding_Farmers                                                        0.008706
22617_5216-Job_SOC_coding_Pipe_fitters                                                   0.008839
Name: value, dtype: float64

trait
20118_7-Home_area_population_density_urban_or_rural_EnglandWales_Village_less_sparse    0.008681
Name: value, dtype: float64

trait
894-Duration_of_moderate_activity    0.008669
Name: value, dtype: float64

trait
4080_raw-Systolic_blood_pressure_automated_reading    0.008668
Name: value, dtype: float64

trait
5610_1-Which_eyes_affected_by_presbyopia_Right_eye    0.008573
Name: value, dtype: float64

trait
attention deficit hyperactivity disorder    0.008568
Name: value, dtype: float64

trait
6177_1-Medication_for_cholesterol_blood_pressure_or_diabetes_Cholesterol_lowering_medication    0.008415
6177_100-Medication_for_cholesterol_blood_pressure_or_diabetes_None_of_the_above                0.008707
Name: value, dtype: float64

trait
M13_DUPUTRYEN-Palmar_fascial_fibromatosis_Dupuytren    0.008445
RHEUMA_SEROPOS-Seropositive_rheumatoid_arthritis       0.008457
SLEEP-Sleep_disorders_combined                         0.008566
G6_SLEEPAPNO-Sleep_apnoea                              0.008716
Name: value, dtype: float64

trait
20107_1-Illnesses_of_father_Heart_disease                  0.008396
20107_101-Illnesses_of_father_None_of_the_above_group_2    0.008693
Name: value, dtype: float64

trait
2664_4-Reason_for_reducing_amount_of_alcohol_drunk_Financial_reasons    0.008542
Name: value, dtype: float64

trait
1478-Salt_added_to_food    0.008497
1379-Lambmutton_intake     0.008545
1528-Water_intake          0.008546
1448_1-Bread_type_White    0.008566
Name: value, dtype: float64

trait
404_raw-Duration_to_first_press_of_snapbutton_in_each_round    0.008497
20023_raw-Mean_time_to_correctly_identify_matches              0.008578
Name: value, dtype: float64

trait
5111_raw-3mm_asymmetry_angle_left    0.008515
Name: value, dtype: float64

trait
20003_1141167334-Treatmentmedication_code_colofac100_tablet                   0.008433
20003_1140861958-Treatmentmedication_code_simvastatin                         0.008466
20003_1141195224-Treatmentmedication_code_formoterol                          0.008468
20003_1141164086-Treatmentmedication_code_salmeterolfluticasone_propionate    0.008521
20003_1141146188-Treatmentmedication_code_latanoprost                         0.008659
Name: value, dtype: float64

trait
30510_raw-Creatinine_enzymatic_in_urine    0.008486
Name: value, dtype: float64

trait
deep vein thrombosis                                        0.008404
6150_2-Vascularheart_problems_diagnosed_by_doctor_Angina    0.008465
Name: value, dtype: float64

trait
100018_raw-Retinol    0.008415
Name: value, dtype: float64

trait
40001_C439-Underlying_primary_cause_of_death_ICD10_C439_Malignant_melanoma_of_skin_unspecified    0.008414
Name: value, dtype: float64

trait
3894_raw-Age_heart_attack_diagnosed    0.008412
Name: value, dtype: float64

trait
chronic pain    0.00841
Name: value, dtype: float64

trait
estrogen-receptor negative breast cancer    0.008398
Name: value, dtype: float64