In [1]:
import pandas as pd

In [11]:
# load all dfs
enrichr_df = pd.read_csv("enr_data/Enrichr_df.csv", index_col=0) # sorted by Term alphabetically
offline_df = pd.read_csv("enr_data/offline_df.csv", index_col=0) # sorted by Term alphabetically
string_df = pd.read_csv("enr_data/string_df.csv", index_col=0) # already sorted by fdr & by raw p-value
gsea_df = pd.read_csv("enr_data/gsea_df.csv", index_col=0) # sorted by absoulte NES
preranked_df = pd.read_csv("enr_data/preranked_df.csv", index_col=0) # sorted by absoulte NES

In [37]:
# sorting
enrichr_df.sort_values(by=['Adjusted P-value'], inplace=True, ascending=True)
offline_df.sort_values(by=['Adjusted P-value'], inplace=True, ascending=True)
enrichr_df.reset_index(drop=True, inplace=True)
offline_df.reset_index(drop=True, inplace=True)
print(enrichr_df.head())

                 Gene_set                                            Term  \
174  ReactomePathways.gmt    Response to elevated platelet cytosolic Ca2+   
145  ReactomePathways.gmt                          Platelet degranulation   
143  ReactomePathways.gmt  Platelet activation, signaling and aggregation   
92   ReactomePathways.gmt      Intrinsic Pathway of Fibrin Clot Formation   
148  ReactomePathways.gmt      Post-translational protein phosphorylation   

    Overlap       P-value  Adjusted P-value  Odds Ratio  Combined Score  \
174    9/84  3.044820e-07          0.000034   11.475569      172.186938   
145    9/82  2.466298e-07          0.000034   11.758513      178.910215   
143   9/129  1.145495e-05          0.000855    7.419854       84.416339   
92     4/15  1.838297e-05          0.000877   27.238770      297.013883   
148    6/51  1.958012e-05          0.000877   12.213309      132.404435   

                                                 Genes  \
174  APOA1;AHSG;KNG1;ITIH4;C

check if enrichr and offline are the same

In [10]:
enrichr_df.compare(offline_df, align_axis=1, keep_shape=True)
all(enrichr_df == offline_df)

True

compare GO enrichment methods

In [12]:
print("number of")
print("enriched terms GSEApy", enrichr_df.shape[0], "enriched terms STRING", string_df.shape[0])

enriched terms GSEApy 224 enriched terms STRING 5


In [15]:
# Can all of the string enriched terms be found in GSEApy results?
for term in string_df["description"].tolist():
    if term not in enrichr_df["Term"].tolist():
        print(term)
    else:
        print("found")

found
found
found
found
found


In [35]:
for col in enrichr_df.columns: # on column to find out if already sorted
    print(col, enrichr_df[col].is_monotonic_increasing)

Gene_set True
Term False
Overlap False
P-value False
Adjusted P-value True
Odds Ratio False
Combined Score False
Genes False
Proteins False


In [38]:
overlap_df = enrichr_df.loc[enrichr_df["Term"].isin(string_df["description"].tolist())]
print(overlap_df)

                Gene_set                                               Term  \
1   ReactomePathways.gmt                             Platelet degranulation   
3   ReactomePathways.gmt         Intrinsic Pathway of Fibrin Clot Formation   
4   ReactomePathways.gmt         Post-translational protein phosphorylation   
5   ReactomePathways.gmt  Regulation of Insulin-like Growth Factor (IGF)...   
12  ReactomePathways.gmt                                         Hemostasis   

   Overlap       P-value  Adjusted P-value  Odds Ratio  Combined Score  \
1     9/82  2.466298e-07          0.000034   11.758513      178.910215   
3     4/15  1.838297e-05          0.000877   27.238770      297.013883   
4     6/51  1.958012e-05          0.000877   12.213309      132.404435   
5     6/56  3.379333e-05          0.001262   11.121316      114.496698   
12  11/288  3.079131e-04          0.005306    4.068858       32.899538   

                                                Genes  \
1   APOA1;AHSG;KNG1;ITI

In [54]:
# jaccard index
enrichr_terms = enrichr_df["Term"]
string_terms = string_df["description"]

def jaccard_similarity(A, B):
    nominator = A.intersection(B)
    denominator = A.union(B)
    similarity = len(nominator)/len(denominator)
    return similarity

enrichr_terms_s = set(enrichr_df["Term"])
string_terms_s = set(string_df["description"])
print("jaccard index", jaccard_similarity(enrichr_terms_s, string_terms_s))

enrichr_top_ten = set(enrichr_df.loc[:10, "Term"])
print("jaccard top ten", jaccard_similarity(enrichr_top_ten, string_terms))

enrichr_top_ten = set(enrichr_df.loc[:5, "Term"])
print("jaccard top five", jaccard_similarity(enrichr_top_ten, string_terms))

jaccard index 0.022321428571428572
jaccard top ten 0.3333333333333333
jaccard top five 0.5714285714285714


In [64]:
# mean rank difference
# def mean_rank_difference(list_a, list_b):
#     rank_dict = {element: index+1 for index, element in enumerate(list_a)}
#     total_diff = 0
#
#     for index, element in enumerate(list_b):
#         if element in rank_dict:
#             rank_a = rank_dict[element]
#             rank_diff = abs(rank_a - (index+1))
#             total_diff += rank_diff
#
#     mean_rank_diff = total_diff / len(list_b)
#     return mean_rank_diff

def mean_rank_difference(rank_dict_a, rank_dict_b):
    total_diff = 0

    for element, rank_b in rank_dict_b.items():
        if element in rank_dict_a:
            rank_a = rank_dict_a[element]
            rank_diff = abs(rank_a - rank_b)
            total_diff += rank_diff

    mean_rank_diff = total_diff / len(rank_dict_b)
    return mean_rank_diff

string_dict = {element: index+1 for index, element in enumerate(string_terms)}
overlap_dict = dict(zip(overlap_df["Term"], overlap_df.index))
# print(string_dict)
# print(overlap_dict)
print("mean rank difference", mean_rank_difference(string_dict, overlap_dict))

mean rank difference 2.8


In [None]:
# pearson correlation of ranks

In [32]:
# sorted by absolute NES value

Name True
Term False
ES False
NES False
NOM p-val False
FDR q-val False
FWER p-val False
Tag % False
Gene % False
Lead_genes False
Lead_proteins False
