<a href="https://colab.research.google.com/github/dikraMasrour/Breast_Cancer_Risk_Factor_Prediction_KG/blob/main/BC_KG_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Breast cancer biomedical KG Exploratory Data Analysis

In [None]:
# imports
import pandas as pd
from collections import Counter

In [None]:
# mounting drive
# Makes Google Drive visible under /content/drive, which is where the dataset
# path used below points. google.colab is provided by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the triples file on the mounted Drive.
dataPath = "/content/drive/MyDrive/Colab Notebooks/KG_breast_cancer/Copie de all_data_triples_can.csv"
# The file name ends in .csv, so the gzip compression is passed explicitly
# instead of being inferred from the extension.
df = pd.read_csv(dataPath, compression='gzip')
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,PREDICATE,ORIGIN_ID,SUBJECT_CUI,OBJECT_CUI
0,PROCESS_OF,0,C0003725,C0999630
1,ISA,1,C0039258,C0446169
2,ISA,2,C0318627,C0206590
3,ISA,3,C0446169,C0003725
4,PROCESS_OF,4,C0012634,C0020114


In [None]:
# (number of rows, number of columns) of the loaded table
df.shape

(28720666, 4)

Rename columns to prepare for creating a TorchKGE knowledge graph

In [None]:
# SUBJECT_CUI -> 'from', PREDICATE -> 'rel', OBJECT_CUI -> 'to'
df.rename(
    columns=dict(SUBJECT_CUI='from', PREDICATE='rel', OBJECT_CUI='to'),
    inplace=True,
)
df.head()

Unnamed: 0,rel,ORIGIN_ID,from,to
0,PROCESS_OF,0,C0003725,C0999630
1,ISA,1,C0039258,C0446169
2,ISA,2,C0318627,C0206590
3,ISA,3,C0446169,C0003725
4,PROCESS_OF,4,C0012634,C0020114


Drop unused 'ORIGIN_ID' column

In [None]:
# Remove the unused ORIGIN_ID column by rebinding `df` rather than mutating it.
# errors='ignore' makes the cell safe to run again once the column is gone
# (the in-place drop raised KeyError on a second run).
df = df.drop(columns='ORIGIN_ID', errors='ignore')
df.head()

Unnamed: 0,rel,from,to
0,PROCESS_OF,C0003725,C0999630
1,ISA,C0039258,C0446169
2,ISA,C0318627,C0206590
3,ISA,C0446169,C0003725
4,PROCESS_OF,C0012634,C0020114


In [None]:
# distinct relationships: computed once, then reused for the values and their count
# (the column has tens of millions of rows, so a second unique() pass is wasteful)
unique_rels = df['rel'].unique()
unique_rels, unique_rels.shape

(array(['PROCESS_OF', 'ISA', 'CAUSES', 'LOCATION_OF', 'PART_OF',
        'PRODUCES', 'INTERACTS_WITH', 'AFFECTS', 'TREATS', 'USES',
        'COEXISTS_WITH', 'DIAGNOSES', 'compared_with', 'INHIBITS',
        'STIMULATES', 'CONVERTS_TO', 'ASSOCIATED_WITH', 'NEG_OCCURS_IN',
        'OCCURS_IN', 'NEG_ASSOCIATED_WITH', 'COMPLICATES', 'PREVENTS',
        'NEG_CAUSES', 'NEG_PROCESS_OF', 'PREDISPOSES', 'NEG_TREATS',
        'higher_than', 'METHOD_OF', 'NEG_AFFECTS', 'lower_than',
        'DISRUPTS', 'MEASURES', 'AUGMENTS', 'ADMINISTERED_TO',
        'NEG_STIMULATES', 'NEG_PART_OF', 'NEG_PRODUCES', 'PRECEDES',
        'MANIFESTATION_OF', 'NEG_ADMINISTERED_TO', 'NEG_LOCATION_OF',
        'NEG_COEXISTS_WITH', 'NEG_INTERACTS_WITH', 'NEG_AUGMENTS',
        'NEG_ISA', 'MEASUREMENT_OF', 'same_as', 'NEG_INHIBITS',
        'NEG_DISRUPTS', 'NEG_USES', 'NEG_MEASURES', 'NEG_PREDISPOSES',
        'NEG_PREVENTS', 'NEG_DIAGNOSES', 'NEG_METHOD_OF', 'NEG_same_as',
        'NEG_higher_than', 'NEG_PRECEDES', 'NE

In [None]:
# distinct subjects: 'from' is the renamed SUBJECT_CUI column
# unique() is evaluated once and reused for the values and their count
unique_subjects = df['from'].unique()
unique_subjects, unique_subjects.shape

(array(['C0003725', 'C0039258', 'C0318627', ..., 'C3645163', 'C1283601',
        'C1979657'], dtype=object), (309555,))

In [None]:
# distinct objects: 'to' is the renamed OBJECT_CUI column
# unique() is evaluated once and reused for the values and their count
unique_objects = df['to'].unique()
unique_objects, unique_objects.shape

(array(['C0999630', 'C0446169', 'C0206590', ..., 'C0948269', 'C1625850',
        'C5466877'], dtype=object), (263115,))

#### Filtering the least common relationships



In [None]:
# tally how many triples use each relationship
rel_counter = Counter(df['rel'])
# keep the relationships that occur fewer than 100 times (the ones to remove)
filtered_rel_counter = dict(
    (name, n) for name, n in rel_counter.items() if n < 100
)
# names of those rare relationships
filtered_rel = [*filtered_rel_counter]
len(filtered_rel), filtered_rel

(6, ['NEG_MEASUREMENT_OF', 'PREP', '241', '1532', 'NOM', 'VERB'])

In [None]:
# Rows whose relationship is one of the rare ones. Series.isin does the
# membership test in one vectorised pass instead of a Python loop that looked
# up df['rel'][i] for every index label.
rare_rel_mask = df['rel'].isin(filtered_rel)
# index labels of those rows, kept as a set under the original name
rel_to_drop = set(df.index[rare_rel_mask])
# drop filtered relationships from df; rebinding (no inplace) keeps the cell re-runnable
df = df.drop(index=df.index[rare_rel_mask])

In [None]:
# number of distinct relationships currently in df
len(df['rel'].unique())

62

#### Filtering the least common objects and subjects
Here, we collect the row indices whose subject is rare and the row indices whose object is rare, and drop only the rows that appear in both lists.


In [None]:
# Number of triples per (from, to) pair, most frequent first.
# name=0 pins the label of the count column: pandas >= 2.0 would otherwise name
# it 'count', while older versions used 0, which is the label read as vc[0] below.
vc = df[['from', 'to']].value_counts().reset_index(name=0)

In [None]:
# Pairs seen fewer than 10 times. The count is always the last column of `vc`,
# so it is read by position instead of by a label that differs between pandas
# versions (0 before 2.0, 'count' afterwards).
pair_counts = vc.iloc[:, -1]
to_drop = vc[pair_counts < 10]
to_drop

Unnamed: 0,from,to,0
89438,C0034693,107228383,9
89439,C0051979,C0812242,9
89440,C0033684,C0007620,9
89441,C0009498,C0010957,9
89442,C0015505,C0019080,9
...,...,...,...
18629954,C0082180,C1148576,1
18629955,C0082180,C1150423,1
18629956,C0082180,C1155265,1
18629957,C0082180,C1155266,1


In [None]:
# Index labels of the rows whose (from, to) pair is listed in `to_drop`
# (the table of rare pairs built from `vc`).
# The earlier version tested each pair against `filtered_from`, a name that is
# only defined further down and that holds single CUIs, so no pair could match.
pair_flags = df[['from', 'to']].merge(
    to_drop[['from', 'to']],
    on=['from', 'to'],
    how='left',
    indicator=True,
    validate='many_to_one',  # each rare pair must appear once, so no row of df is duplicated
)
# a left merge keeps one output row per row of df, in the same order,
# so the flag column can be applied positionally to df.index
result = df.index[(pair_flags['_merge'] == 'both').to_numpy()]

Int64Index([       0,        1,        2,        3,        4,        5,
                   6,        7,        8,        9,
            ...
            28720656, 28720657, 28720658, 28720659, 28720660, 28720661,
            28720662, 28720663, 28720664, 28720665],
           dtype='int64', length=28679384)

In [None]:
# Keep only the triples whose (from, to) pair is NOT listed in `to_drop`.
# df.isin(<DataFrame>) compares cell by cell on matching index and column
# labels, and indexing with the resulting boolean frame masks cells instead of
# removing rows, so the earlier version never dropped anything.
pair_flags = df[['from', 'to']].merge(
    to_drop[['from', 'to']],
    on=['from', 'to'],
    how='left',
    indicator=True,
    validate='many_to_one',  # each rare pair must appear once, so no row of df is duplicated
)
# 'left_only' marks rows of df whose pair has no match among the rare pairs
result = df[(pair_flags['_merge'] == 'left_only').to_numpy()]
result

Unnamed: 0,rel,from,to
0,PROCESS_OF,C0003725,C0999630
1,ISA,C0039258,C0446169
2,ISA,C0318627,C0206590
3,ISA,C0446169,C0003725
4,PROCESS_OF,C0012634,C0020114
...,...,...,...
28720661,COEXISTS_WITH,C1413909,C1413914
28720662,PROCESS_OF,C4023614,C1413909
28720663,PROCESS_OF,C4023614,C1413909
28720664,PROCESS_OF,C4023614,C1413914


In [None]:
# Index labels of the rows whose (from, to) pair occurs more than 100 times.
# value_counts() yields one entry per distinct pair, so it cannot index df.index
# directly (that raised ValueError); the group size is broadcast back to the rows.
pair_size = df.groupby(['from', 'to'])['rel'].transform('size')
# reindex guards against rows left out of the groups (e.g. a missing key)
frequent_pair = (pair_size > 100).reindex(df.index, fill_value=False)
ind = df.index[frequent_pair.to_numpy()]

  ind = df[df.value_counts(['from', 'to']) > 100]


ValueError: ignored

In [None]:
# occurrences of every subject CUI (the 'from' column)
from_counter = Counter(df['from'])
# subjects that appear in fewer than 10 triples
filtered_from_counter = dict(
    (cui, n) for cui, n in from_counter.items() if n < 10
)
# CUIs of those rare subjects
filtered_from = [*filtered_from_counter]
len(filtered_from)

183189

In [None]:
# Index labels of the rows whose subject is rare. Series.isin replaces a Python
# loop that scanned the `filtered_from` list once for every row of df.
from_to_drop = set(df.index[df['from'].isin(filtered_from)])

In [None]:
# occurrences of every object CUI (the 'to' column)
to_counter = Counter(df['to'])
# objects that appear in fewer than 10 triples
filtered_to_counter = dict(
    (cui, n) for cui, n in to_counter.items() if n < 10
)
# CUIs of those rare objects
filtered_to = [*filtered_to_counter]
len(filtered_to)

151721

In [None]:
# Index labels of the rows whose object is rare. Series.isin replaces a Python
# loop that scanned the `filtered_to` list once for every row of df.
to_to_drop = set(df.index[df['to'].isin(filtered_to)])

In [None]:
# row labels flagged by both filters, i.e. rare subject AND rare object
to_drop = set(from_to_drop).intersection(to_to_drop)
# show only the size: echoing the whole set would flood the cell output
len(to_drop)