<a href="https://colab.research.google.com/github/dikraMasrour/Breast_Cancer_Risk_Factor_Prediction_KG/blob/main/BC_KG_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Breast cancer biomedical KG Exploratory Data Analysis

In [3]:
# imports
import pandas as pd
import numpy as np
from collections import Counter

In [4]:
# Mount Google Drive under /content/drive (Colab-only API); the dataset path
# used when loading the triples points into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Location of the triples file on the mounted Drive.
dataPath = "/content/drive/MyDrive/Colab Notebooks/KG_breast_cancer/Copie de all_data_triples_can.csv"
# gzip decompression is requested explicitly even though the name ends in .csv.
df = pd.read_csv(dataPath, compression='gzip')
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,PREDICATE,ORIGIN_ID,SUBJECT_CUI,OBJECT_CUI
0,PROCESS_OF,0,C0003725,C0999630
1,ISA,1,C0039258,C0446169
2,ISA,2,C0318627,C0206590
3,ISA,3,C0446169,C0003725
4,PROCESS_OF,4,C0012634,C0020114


In [None]:
# (rows, columns) of the loaded triples table
df.shape

(28720666, 4)

Rename columns to prepare for creating a TorchKGE knowledge graph

In [6]:
# Map the source column names onto the from / rel / to names used from here on
df.rename(
    {'PREDICATE': 'rel', 'SUBJECT_CUI': 'from', 'OBJECT_CUI': 'to'},
    axis='columns',
    inplace=True,
)
df.head()

Unnamed: 0,rel,ORIGIN_ID,from,to
0,PROCESS_OF,0,C0003725,C0999630
1,ISA,1,C0039258,C0446169
2,ISA,2,C0318627,C0206590
3,ISA,3,C0446169,C0003725
4,PROCESS_OF,4,C0012634,C0020114


Drop unused 'ORIGIN_ID' column

In [7]:
# Remove the ORIGIN_ID column in place, then preview the remaining columns
df.drop(columns=['ORIGIN_ID'], inplace=True)
df.head()

Unnamed: 0,rel,from,to
0,PROCESS_OF,C0003725,C0999630
1,ISA,C0039258,C0446169
2,ISA,C0318627,C0206590
3,ISA,C0446169,C0003725
4,PROCESS_OF,C0012634,C0020114


In [None]:
# distinct relationships (computed once rather than twice: the table has ~28.7M rows)
unique_rels = df['rel'].unique()
unique_rels, unique_rels.shape

(array(['PROCESS_OF', 'ISA', 'CAUSES', 'LOCATION_OF', 'PART_OF',
        'PRODUCES', 'INTERACTS_WITH', 'AFFECTS', 'TREATS', 'USES',
        'COEXISTS_WITH', 'DIAGNOSES', 'compared_with', 'INHIBITS',
        'STIMULATES', 'CONVERTS_TO', 'ASSOCIATED_WITH', 'NEG_OCCURS_IN',
        'OCCURS_IN', 'NEG_ASSOCIATED_WITH', 'COMPLICATES', 'PREVENTS',
        'NEG_CAUSES', 'NEG_PROCESS_OF', 'PREDISPOSES', 'NEG_TREATS',
        'higher_than', 'METHOD_OF', 'NEG_AFFECTS', 'lower_than',
        'DISRUPTS', 'MEASURES', 'AUGMENTS', 'ADMINISTERED_TO',
        'NEG_STIMULATES', 'NEG_PART_OF', 'NEG_PRODUCES', 'PRECEDES',
        'MANIFESTATION_OF', 'NEG_ADMINISTERED_TO', 'NEG_LOCATION_OF',
        'NEG_COEXISTS_WITH', 'NEG_INTERACTS_WITH', 'NEG_AUGMENTS',
        'NEG_ISA', 'MEASUREMENT_OF', 'same_as', 'NEG_INHIBITS',
        'NEG_DISRUPTS', 'NEG_USES', 'NEG_MEASURES', 'NEG_PREDISPOSES',
        'NEG_PREVENTS', 'NEG_DIAGNOSES', 'NEG_METHOD_OF', 'NEG_same_as',
        'NEG_higher_than', 'NEG_PRECEDES', 'NE

In [None]:
# distinct subjects ('from' is the renamed SUBJECT_CUI column); computed once
unique_subjects = df['from'].unique()
unique_subjects, unique_subjects.shape

(array(['C0003725', 'C0039258', 'C0318627', ..., 'C3645163', 'C1283601',
        'C1979657'], dtype=object), (309555,))

In [None]:
# distinct objects ('to' is the renamed OBJECT_CUI column); computed once
unique_objects = df['to'].unique()
unique_objects, unique_objects.shape

(array(['C0999630', 'C0446169', 'C0206590', ..., 'C0948269', 'C1625850',
        'C5466877'], dtype=object), (263115,))

#### Filtering the least common relationships



In [8]:
# Tally how often each relationship occurs
rel_counter = Counter(df['rel'])
# Keep only the rare relationships, i.e. those seen fewer than 100 times
filtered_rel_counter = dict(
    (rel, count) for rel, count in rel_counter.items() if count < 100
)
# Names of the rare relationships
filtered_rel = [*filtered_rel_counter]
len(filtered_rel_counter), filtered_rel_counter

(6,
 {'NEG_MEASUREMENT_OF': 43,
  'PREP': 10,
  '241': 2,
  '1532': 2,
  'NOM': 6,
  'VERB': 2})

In [9]:
# Labels of the rows whose relationship is one of the rare ones.
# Vectorised membership test: a Python-level loop over df.index with per-row
# scalar lookups is far too slow for a table of this size.
rel_to_drop = set(df.index[df['rel'].isin(filtered_rel)])
# drop filtered relationships from df
df.drop(rel_to_drop, inplace=True)

In [10]:
# number of distinct relationships currently present in df
len(df['rel'].unique())

62

In [11]:
# Discard, in place, every row with a missing value in any column; then report the size
df.dropna(axis=0, how='any', inplace=True)
df.shape

(28720601, 3)

#### Filtering the least common objects and subjects
Here, we collect the row indices to drop for rare subjects (`from`) and for rare objects (`to`) separately, and only drop the rows that appear in both lists.


In [12]:
# Occurrence counts per subject ('from') and per object ('to').
# The count column is named 0 explicitly: DataFrame.value_counts() yields an
# unnamed Series on older pandas but one named 'count' on pandas >= 2.0, and
# the following cells select the counts with vcf[0] / vct[0].
vcf = df[['from']].value_counts().reset_index(name=0)
vct = df[['to']].value_counts().reset_index(name=0)

In [16]:
# Retain only the entities that occur more than 100 times
vcf = vcf.loc[vcf[0].gt(100)]
vct = vct.loc[vct[0].gt(100)]

In [17]:
# Display the subject ('from') counts table
vcf

Unnamed: 0,from,0
0,C0007634,94387
1,C0087111,86054
2,100862685,84220
3,C0033684,60306
4,C0013227,55787
...,...,...
35846,C0006718,101
35847,C0555278,101
35848,C0001219,101
35849,C0555351,101


In [18]:
# Display the object ('to') counts table
vct

Unnamed: 0,to,0
0,C0030705,182005
1,C0007634,106400
2,C0012634,103948
3,100862685,81561
4,C0027651,71207
...,...,...
32475,C4076168,101
32476,C0068711,101
32477,C0040017,101
32478,C0029117,101


In [19]:
# Average occurrence count in the subject table and in the object table, respectively
tuple(counts[0].mean() for counts in (vcf, vct))

(701.2956402889738, 788.7815270935961)

In [20]:
# Occurrence counts per (from, to) pair. The count column is named 0 explicitly
# because pandas >= 2.0 would otherwise call it 'count', breaking the vc[0]
# lookups that follow.
vc = df[['from', 'to']].value_counts().reset_index(name=0)

In [41]:
# (from, to) pairs that occur fewer than 3 times
vc_filter = vc.loc[vc[0].lt(3)]
vc_filter

Unnamed: 0,from,to,0
1720440,C0042285,C0026285,2
1720441,C0040300,C1576408,2
1720442,C1419252,C0596901,2
1720443,C0040300,C1565747,2
1720444,C0040300,C1574460,2
...,...,...,...
18629954,C0082180,C1148576,1
18629955,C0082180,C1150423,1
18629956,C0082180,C1155265,1
18629957,C0082180,C1155266,1


In [52]:
# How many subjects listed in vcf also appear as the subject of at least one pair in vc_filter
vcf['from'].isin(vc_filter['from'].drop_duplicates()).sum()

35533

In [43]:
# Independent (deep) copy so the experiments below leave df untouched
temp = df.copy(deep=True)

In [None]:
# Keep only the rows whose (from, to) pair is NOT one of the rare pairs.
# DataFrame.isin(DataFrame) compares cell by cell on matching index and column
# labels, so it cannot test pair membership, and indexing with the resulting
# boolean frame masks cells to NaN instead of removing rows. Compare the pairs
# as tuples through a MultiIndex instead.
row_pairs = pd.MultiIndex.from_frame(temp[['from', 'to']])
rare_pairs = pd.MultiIndex.from_frame(vc_filter[['from', 'to']])
new = temp[~row_pairs.isin(rare_pairs)]
new

In [None]:
# Minimum number of occurrences a subject needs in order to be kept
MIN_FROM_COUNT = 10
# get count for each distinct subject
from_counter = Counter(df['from'])
# collect the rare subjects: those occurring fewer than MIN_FROM_COUNT times
filtered_from_counter = {subj: count for subj, count in from_counter.items() if count < MIN_FROM_COUNT}
# get the rare subjects as a list
filtered_from = list(filtered_from_counter.keys())
len(filtered_from)

183189

In [None]:
# Labels of the rows whose subject is one of the rare subjects.
# Series.isin builds its lookup from filtered_from once; a Python-level loop
# with a per-row `in` test against that long list re-scans it for every row.
from_to_drop = set(df.index[df['from'].isin(filtered_from)])

In [None]:
# Minimum number of occurrences an object needs in order to be kept
MIN_TO_COUNT = 10
# get count for each distinct object
to_counter = Counter(df['to'])
# collect the rare objects: those occurring fewer than MIN_TO_COUNT times
filtered_to_counter = {obj: count for obj, count in to_counter.items() if count < MIN_TO_COUNT}
# get the rare objects as a list
filtered_to = list(filtered_to_counter.keys())
len(filtered_to)

151721

In [None]:
# Labels of the rows whose object is one of the rare objects.
# Series.isin builds its lookup from filtered_to once; a Python-level loop
# with a per-row `in` test against that long list re-scans it for every row.
to_to_drop = set(df.index[df['to'].isin(filtered_to)])

In [None]:
# Rows to drop: those flagged both for a rare subject and for a rare object
to_drop = set(from_to_drop).intersection(to_to_drop)
# Show only the size: echoing the whole set would flood the cell output
len(to_drop)