<a href="https://colab.research.google.com/github/dikraMasrour/Breast_Cancer_Risk_Factor_Prediction_KG/blob/main/BC_KG_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Breast cancer biomedical KG Exploratory Data Analysis

In [26]:
# imports
import pandas as pd
from collections import Counter

In [1]:
# Attach Google Drive to the Colab runtime so files under the mount point
# can be read like local files. Only works inside Google Colab.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [6]:
# Location of the triples file on the mounted Drive. The name ends in .csv,
# but the file is read as gzip, so the compression is passed explicitly
# rather than being inferred from the extension.
dataPath = (
    "/content/drive/MyDrive/Colab Notebooks/KG_breast_cancer/"
    "Copie de all_data_triples_can.csv"
)
df = pd.read_csv(filepath_or_buffer=dataPath, compression="gzip")

# Preview the first five triples.
df.head(5)

Unnamed: 0,PREDICATE,ORIGIN_ID,SUBJECT_CUI,OBJECT_CUI
0,PROCESS_OF,0,C0003725,C0999630
1,ISA,1,C0039258,C0446169
2,ISA,2,C0318627,C0206590
3,ISA,3,C0446169,C0003725
4,PROCESS_OF,4,C0012634,C0020114


In [7]:
# (row count, column count) of the loaded triples frame
n_rows, n_cols = df.shape
(n_rows, n_cols)

(28720666, 4)

Rename the columns to `from`, `rel` and `to`, the names TorchKGE expects when building a knowledge graph from a DataFrame

In [9]:
# Old column name -> new column name: subject becomes 'from',
# predicate becomes 'rel', object becomes 'to'.
triple_column_names = {
    'PREDICATE': 'rel',
    'SUBJECT_CUI': 'from',
    'OBJECT_CUI': 'to',
}

# Renamed in place so the large frame is not duplicated.
df.rename(columns=triple_column_names, inplace=True)
df.head(5)

Unnamed: 0,rel,ORIGIN_ID,from,to
0,PROCESS_OF,0,C0003725,C0999630
1,ISA,1,C0039258,C0446169
2,ISA,2,C0318627,C0206590
3,ISA,3,C0446169,C0003725
4,PROCESS_OF,4,C0012634,C0020114


Drop unused 'ORIGIN_ID' column

In [12]:
# Remove the unused 'ORIGIN_ID' column in place.
# errors='ignore' makes the cell safe to re-run: without it, a second
# execution raises KeyError because the column has already been dropped.
df.drop(columns='ORIGIN_ID', inplace=True, errors='ignore')
df.head()

Unnamed: 0,rel,from,to
0,PROCESS_OF,C0003725,C0999630
1,ISA,C0039258,C0446169
2,ISA,C0318627,C0206590
3,ISA,C0446169,C0003725
4,PROCESS_OF,C0012634,C0020114


In [16]:
# Distinct relationship types and how many there are.
# unique() is evaluated once and reused: the frame has ~28.7M rows, so
# scanning the column a second time just to read .shape is wasted work.
rel_values = df['rel'].unique()
rel_values, rel_values.shape

(array(['PROCESS_OF', 'ISA', 'CAUSES', 'LOCATION_OF', 'PART_OF',
        'PRODUCES', 'INTERACTS_WITH', 'AFFECTS', 'TREATS', 'USES',
        'COEXISTS_WITH', 'DIAGNOSES', 'compared_with', 'INHIBITS',
        'STIMULATES', 'CONVERTS_TO', 'ASSOCIATED_WITH', 'NEG_OCCURS_IN',
        'OCCURS_IN', 'NEG_ASSOCIATED_WITH', 'COMPLICATES', 'PREVENTS',
        'NEG_CAUSES', 'NEG_PROCESS_OF', 'PREDISPOSES', 'NEG_TREATS',
        'higher_than', 'METHOD_OF', 'NEG_AFFECTS', 'lower_than',
        'DISRUPTS', 'MEASURES', 'AUGMENTS', 'ADMINISTERED_TO',
        'NEG_STIMULATES', 'NEG_PART_OF', 'NEG_PRODUCES', 'PRECEDES',
        'MANIFESTATION_OF', 'NEG_ADMINISTERED_TO', 'NEG_LOCATION_OF',
        'NEG_COEXISTS_WITH', 'NEG_INTERACTS_WITH', 'NEG_AUGMENTS',
        'NEG_ISA', 'MEASUREMENT_OF', 'same_as', 'NEG_INHIBITS',
        'NEG_DISRUPTS', 'NEG_USES', 'NEG_MEASURES', 'NEG_PREDISPOSES',
        'NEG_PREVENTS', 'NEG_DIAGNOSES', 'NEG_METHOD_OF', 'NEG_same_as',
        'NEG_higher_than', 'NEG_PRECEDES', 'NE

In [18]:
# Distinct subjects: 'from' is the renamed SUBJECT_CUI column.
# unique() is evaluated once and reused for both the values and the count.
from_values = df['from'].unique()
from_values, from_values.shape

(array(['C0003725', 'C0039258', 'C0318627', ..., 'C3645163', 'C1283601',
        'C1979657'], dtype=object), (309555,))

In [19]:
# Distinct objects: 'to' is the renamed OBJECT_CUI column.
# unique() is evaluated once and reused for both the values and the count.
to_values = df['to'].unique()
to_values, to_values.shape

(array(['C0999630', 'C0446169', 'C0206590', ..., 'C0948269', 'C1625850',
        'C5466877'], dtype=object), (263115,))

#### Filtering out the least common relationships

In [35]:
# Tally how many triples use each relationship type.
rel_counter = Counter(df['rel'])

# Keep only the relationship types that occur more than 100 times.
filtered_rel_counter = {}
for rel_name, n_triples in rel_counter.items():
    if n_triples > 100:
        filtered_rel_counter[rel_name] = n_triples

filtered_rel_counter

{'PROCESS_OF': 1180072,
 'ISA': 392697,
 'CAUSES': 1157832,
 'LOCATION_OF': 3701432,
 'PART_OF': 1502015,
 'PRODUCES': 580496,
 'INTERACTS_WITH': 2707623,
 'AFFECTS': 2453641,
 'TREATS': 1614997,
 'USES': 591553,
 'COEXISTS_WITH': 2263722,
 'DIAGNOSES': 340216,
 'compared_with': 626344,
 'INHIBITS': 1327363,
 'STIMULATES': 1607718,
 'CONVERTS_TO': 57804,
 'ASSOCIATED_WITH': 1300032,
 'NEG_OCCURS_IN': 4121,
 'OCCURS_IN': 64051,
 'NEG_ASSOCIATED_WITH': 75210,
 'COMPLICATES': 52635,
 'PREVENTS': 273651,
 'NEG_CAUSES': 67821,
 'NEG_PROCESS_OF': 101909,
 'PREDISPOSES': 421774,
 'NEG_TREATS': 122657,
 'higher_than': 165684,
 'METHOD_OF': 166825,
 'NEG_AFFECTS': 299225,
 'lower_than': 29952,
 'DISRUPTS': 713488,
 'MEASURES': 268593,
 'AUGMENTS': 761943,
 'ADMINISTERED_TO': 290007,
 'NEG_STIMULATES': 98532,
 'NEG_PART_OF': 109876,
 'NEG_PRODUCES': 42173,
 'PRECEDES': 185181,
 'MANIFESTATION_OF': 49537,
 'NEG_ADMINISTERED_TO': 20923,
 'NEG_LOCATION_OF': 202149,
 'NEG_COEXISTS_WITH': 140106,
 'N