<a href="https://colab.research.google.com/github/dikraMasrour/Breast_Cancer_Risk_Factor_Prediction_KG/blob/main/BC_KG_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Breast cancer biomedical KG Exploratory Data Analysis

In [1]:
# imports
import pandas as pd
import numpy as np
from collections import Counter
import gc

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive, the root
# under which the data file read by this notebook lives.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Location of the raw triples file on the mounted Drive.
dataPath = "/content/drive/MyDrive/Colab Notebooks/KG_breast_cancer/Copie de all_data_triples_can.csv"
# The file is read with gzip decompression even though its name ends in .csv.
df = pd.read_csv(dataPath, compression='gzip')
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,PREDICATE,ORIGIN_ID,SUBJECT_CUI,OBJECT_CUI
0,PROCESS_OF,0,C0003725,C0999630
1,ISA,1,C0039258,C0446169
2,ISA,2,C0318627,C0206590
3,ISA,3,C0446169,C0003725
4,PROCESS_OF,4,C0012634,C0020114


In [4]:
# (rows, columns) of the freshly loaded frame
df.shape

(28720666, 4)

Rename columns to prepare for creating a TorchKGE knowledge graph

In [5]:
# Switch to the from / rel / to column naming used for the knowledge graph.
df.rename(
    columns=dict(
        PREDICATE='rel',
        SUBJECT_CUI='from',
        OBJECT_CUI='to',
    ),
    inplace=True,
)
df.head()

Unnamed: 0,rel,ORIGIN_ID,from,to
0,PROCESS_OF,0,C0003725,C0999630
1,ISA,1,C0039258,C0446169
2,ISA,2,C0318627,C0206590
3,ISA,3,C0446169,C0003725
4,PROCESS_OF,4,C0012634,C0020114


Drop unused 'ORIGIN_ID' column

In [6]:
# ORIGIN_ID is unused from here on; remove the column in place.
df.drop(columns='ORIGIN_ID', inplace=True)
df.head()

Unnamed: 0,rel,from,to
0,PROCESS_OF,C0003725,C0999630
1,ISA,C0039258,C0446169
2,ISA,C0318627,C0206590
3,ISA,C0446169,C0003725
4,PROCESS_OF,C0012634,C0020114


In [7]:
# distinct relationships
# unique() scans the whole column, so compute it once and reuse the result
# for both the displayed values and their count.
unique_rels = df['rel'].unique()
unique_rels, unique_rels.shape

(array(['PROCESS_OF', 'ISA', 'CAUSES', 'LOCATION_OF', 'PART_OF',
        'PRODUCES', 'INTERACTS_WITH', 'AFFECTS', 'TREATS', 'USES',
        'COEXISTS_WITH', 'DIAGNOSES', 'compared_with', 'INHIBITS',
        'STIMULATES', 'CONVERTS_TO', 'ASSOCIATED_WITH', 'NEG_OCCURS_IN',
        'OCCURS_IN', 'NEG_ASSOCIATED_WITH', 'COMPLICATES', 'PREVENTS',
        'NEG_CAUSES', 'NEG_PROCESS_OF', 'PREDISPOSES', 'NEG_TREATS',
        'higher_than', 'METHOD_OF', 'NEG_AFFECTS', 'lower_than',
        'DISRUPTS', 'MEASURES', 'AUGMENTS', 'ADMINISTERED_TO',
        'NEG_STIMULATES', 'NEG_PART_OF', 'NEG_PRODUCES', 'PRECEDES',
        'MANIFESTATION_OF', 'NEG_ADMINISTERED_TO', 'NEG_LOCATION_OF',
        'NEG_COEXISTS_WITH', 'NEG_INTERACTS_WITH', 'NEG_AUGMENTS',
        'NEG_ISA', 'MEASUREMENT_OF', 'same_as', 'NEG_INHIBITS',
        'NEG_DISRUPTS', 'NEG_USES', 'NEG_MEASURES', 'NEG_PREDISPOSES',
        'NEG_PREVENTS', 'NEG_DIAGNOSES', 'NEG_METHOD_OF', 'NEG_same_as',
        'NEG_higher_than', 'NEG_PRECEDES', 'NE

In [8]:
# distinct subjects ('from' is the renamed SUBJECT_CUI column)
# unique() scans the whole column, so compute it once and reuse the result.
unique_subjects = df['from'].unique()
unique_subjects, unique_subjects.shape

(array(['C0003725', 'C0039258', 'C0318627', ..., 'C3645163', 'C1283601',
        'C1979657'], dtype=object), (309555,))

In [9]:
# distinct objects ('to' is the renamed OBJECT_CUI column)
# unique() scans the whole column, so compute it once and reuse the result.
unique_objects = df['to'].unique()
unique_objects, unique_objects.shape

(array(['C0999630', 'C0446169', 'C0206590', ..., 'C0948269', 'C1625850',
        'C5466877'], dtype=object), (263115,))

In [10]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(inplace=True)

In [11]:
# Remove, in place, rows that exactly repeat an earlier row, then show the
# resulting (rows, columns).
df.drop_duplicates(inplace=True)
df.shape

(23765960, 3)

#### Filtering the least common relationships



In [12]:
# tally how often each relationship label occurs
rel_counter = Counter(df['rel'])
# keep only the rare labels, i.e. those seen fewer than 100 times
filtered_rel_counter = dict(
    (label, n_rows)
    for label, n_rows in rel_counter.items()
    if n_rows < 100
)
# names of the rare relationships (the ones removed from df afterwards)
filtered_rel = [label for label in filtered_rel_counter]
len(filtered_rel_counter), filtered_rel_counter

(6,
 {'NEG_MEASUREMENT_OF': 43,
  'PREP': 4,
  '241': 1,
  '1532': 1,
  'NOM': 2,
  'VERB': 1})

In [13]:
# get indices to drop: row labels whose relationship is one of the rare ones.
# Series.isin performs the membership test in a single vectorized pass rather
# than a Python-level lookup for each of the millions of rows.
rel_to_drop = df.index[df['rel'].isin(filtered_rel)]
# drop filtered relationships from df
df.drop(rel_to_drop, inplace=True)

In [14]:
# number of distinct relationships left in df
len(df['rel'].unique())

62

In [15]:
# (rows, columns) after the rare relationships were dropped
df.shape

(23765908, 3)

In [16]:
# Release the helper objects of the relationship filter, then run a
# garbage-collection pass.
del filtered_rel_counter, filtered_rel, rel_counter, rel_to_drop
gc.collect()

0

#### Filtering the least common couples of subject and object


In [29]:
# Count how many rows share each (from, to) couple, most frequent first.
# name=0 pins the label of the count column: pandas < 2.0 labels it 0 while
# pandas >= 2.0 labels it 'count', and the later filtering indexes vc[0].
vc = df[['from', 'to']].value_counts().reset_index(name=0)

In [30]:
# number of unique couples
vc.shape[0]

18629959

In [31]:
# Preview the first rows of the couple counts.
vc.head()

Unnamed: 0,from,to,0
0,C0033684,C0002520,19
1,C0003250,C0003241,19
2,C0021665,C0023630,19
3,C0017337,C0033684,19
4,C0035668,C0033684,18


In [32]:
# least common couples: (from, to) pairs whose count (column 0) is at most 1
vc_filter = vc[vc[0] <= 1]
vc_filter.shape

(15486113, 3)

In [33]:
# Preview the first rows of the least common couples.
vc_filter.head()

Unnamed: 0,from,to,0
3143846,C0395295,C0040674,1
3143847,C0596764,C0001948,1
3143848,C0597357,C0002864,1
3143849,C0420187,C0175677,1
3143850,C0430389,C0007908,1


In [40]:
# Row labels of df whose (from, to) couple is among the least common ones.
# reset_index() exposes the row labels as an 'index' column so they survive
# the inner merge, which keeps only rows with a matching couple in vc_filter.
index_to_drop = df.reset_index().merge(vc_filter, on=['from', 'to'], how='inner')['index']

In [None]:
# The couple-count frames are no longer needed; release them and run a
# garbage-collection pass.
del vc_filter, vc
gc.collect()

In [43]:
# Remove, in place, the rows whose labels were collected in index_to_drop,
# then show the resulting (rows, columns).
df.drop(index_to_drop, axis=0, inplace=True)
df.shape

(8279795, 3)

In [44]:
# Run a garbage-collection pass.
gc.collect()

0

In [47]:
# Persist the filtered frame as a pickle file on the mounted Drive.
df.to_pickle('/content/drive/MyDrive/Colab Notebooks/KG_breast_cancer/preprocessed_KG.pkl')