## Corpus Analysis
#### David da Costa Correia @ FCUL & INSA

In [2]:
import pandas as pd
import json
import os
import zipfile

# Location of the corpus: either a folder or a .zip archive (the archive gives better performance)
CORPUS_FOLDER = '../outputs/corpus.zip'
# Working directory at the moment this cell is executed
ROOT = os.getcwd()

In [3]:
def load_corpus(corpus_dir:os.PathLike) -> tuple:
	"""Load every annotated article of the corpus into memory.

	corpus_dir may be a folder (walked recursively) or a '.zip' archive,
	given either as a string or as a path-like object.

	Every file whose name ends in '.log' is treated as the corpus log; every
	other file must be named '<PMID>.<ext>' and is parsed as JSON.

	Returns a tuple (corpus, errors, log_path):
		corpus   -- {pmid: parsed JSON content, or None if it could not be read}
		errors   -- list of the PMIDs whose file could not be read/parsed
		log_path -- path of the log file on disk, or None if the corpus has none.
		            For a zip archive the log is extracted into the current
		            working directory and the extracted path is returned.

	Raises ValueError if a non-log file name is not an integer PMID.
	"""
	corpus_dir = os.fspath(corpus_dir)
	zipped = corpus_dir.endswith('.zip')

	corpus = {}
	errors = []
	log_path = None
	zip_dir = zipfile.ZipFile(corpus_dir) if zipped else None
	try:
		if zipped:
			# Archives may list directory entries ('folder/'); they are not articles
			files = [name for name in zip_dir.namelist() if not name.endswith('/')]
		else:
			files = [
				os.path.join(root, name)
				for root, _, names in os.walk(corpus_dir)
				for name in names
			]

		for file in files:
			if file.endswith('.log'):
				if zipped:
					# extract() returns the path of the file it created
					log_path = zip_dir.extract(file)
				else:
					# Already on disk: nothing to extract
					log_path = os.path.abspath(file)
				continue

			# basename handles both zip member names and OS-specific paths
			pmid = int(os.path.splitext(os.path.basename(file))[0])
			corpus[pmid] = None

			# Best effort: an unreadable article is recorded and skipped
			try:
				if zipped:
					with zip_dir.open(file) as f:
						corpus[pmid] = json.load(f)
				else:
					with open(file, 'r', encoding='utf-8') as f:
						corpus[pmid] = json.load(f)
			except Exception:
				errors.append(pmid)
	finally:
		if zip_dir is not None:
			zip_dir.close()

	return corpus, errors, log_path


def get_ents(corpus):
	"""Collect the (lower-cased text, ID) pairs of every annotated relation.

	Returns (rna_ents, phe_ents). Each is a tuple of three lists indexed by
	the relation label: [0] negatives, [1] positives, [2] all relations.
	Articles stored as None are skipped.
	"""
	rna_ents = tuple([] for _ in range(3))
	phe_ents = tuple([] for _ in range(3))

	annotated = (art for art in corpus.values() if art is not None)
	for art in annotated:
		for sent in art:
			for rel in sent['relations']:
				# Put the ncRNA first, whichever side of the relation it is on
				rna_ent, phe_ent = rel['e1'], rel['e2']
				if rna_ent['type'] != 'ncRNA':
					rna_ent, phe_ent = phe_ent, rna_ent

				rna = (rna_ent['text'].lower(), rna_ent['ID'])
				phe = (phe_ent['text'].lower(), phe_ent['ID'])

				# File under the relation's own label and under "all" (index 2)
				for bucket in (rel['relation'], 2):
					rna_ents[bucket].append(rna)
					phe_ents[bucket].append(phe)

	return rna_ents, phe_ents

# Every line reported through print_and_log, newline-terminated, in call order
LOG_LINES = []
def print_and_log(line=''):
	"""Print line and remember it (plus a trailing newline) in LOG_LINES."""
	print(line)
	entry = line+'\n'
	LOG_LINES.append(entry)

def k_higher_value_counts(k:int, data):
	"""Return a table of the k most frequent values of data.

	data may be a pandas Series or anything a Series can be built from.
	The result has one row per value with, in order: the value, its absolute
	count and its relative frequency over all of data.
	"""
	series = data if isinstance(data, pd.Series) else pd.Series(data)
	top_counts = series.value_counts()[:k]
	top_ratios = series.value_counts(normalize=True)[:k]
	table = pd.concat([top_counts, top_ratios], axis=1)
	return table.reset_index()

In [3]:
# Load Data & Corpus
# corpus: {pmid: article content or None}; errors: PMIDs that failed to load;
# corpus_log: path of the corpus log file
corpus, errors, corpus_log = load_corpus(CORPUS_FOLDER)
# Number of articles that could not be loaded
print(len(errors))

0


In [4]:
# Check if entities were correctly linked to their IDs
# Get true IDs from merpy lexicon
from merpy import mer_path

def _read_links(links_file):
	"""Read a two-column, tab-separated links file into a {term: ID} dict."""
	links = {}
	with open(links_file, 'r') as f:
		for line in f:
			# A blank (e.g. trailing) line would break the two-value unpacking below
			if not line.strip('\r\n'):
				continue
			term, term_id = line.split('\t')
			links[term] = term_id.strip('\n')
	return links

# ncRNA links first, then phenotype links (a later file wins on a repeated term)
ent_ids = {}
for links_name in ('ncrnas-rnac_links.tsv', 'phenotypes_links.tsv'):
	ent_ids.update(_read_links(mer_path+'data/'+links_name))

In [5]:
# Compare given IDs to true IDs
# mismatches: {lower-cased entity text: {'true_id': lexicon ID, 'given_id': corpus ID}}
mismatches = {}
for art in (a for a in corpus.values() if a is not None):
	for sent in art:
		for rel in sent['relations']:
			for ent in (rel['e1'], rel['e2']):
				given_id = ent['ID']
				name = ent['text'].lower()
				true_id = ent_ids.get(name)
				# Only entities with an ID on both sides can be compared
				comparable = given_id is not None and true_id is not None
				if comparable and given_id != true_id:
					mismatches[name] = {'true_id':true_id, 'given_id':given_id}

print(mismatches)


{}


In [6]:
# Check for relation mislabels
# Known relations of the dataset as (ncRNA ID, phenotype ID, PMID) triples,
# taken from the 1st, 3rd and 5th of the six columns of each row
data = pd.read_csv('../outputs/dataset/rel_dataset.csv', sep='\t')
true_labels = {
	(rna_id, phen_id, int(pmid))
	for rna_id,_,phen_id,_,pmid,_ in (tuple(row) for _,row in data.iterrows())
}

# A relation is mislabelled when its annotated label disagrees with the
# presence of its (ncRNA ID, phenotype ID, PMID) triple in the dataset
mislabels = []
for pmid, art in corpus.items():
	if art is None:
		continue
	for sent in art:
		for rel in sent['relations']:
			e1, e2 = rel['e1'], rel['e2']
			rna_ent, phen_ent = (e1, e2) if e1['type'] == 'ncRNA' else (e2, e1)
			triple = (rna_ent['ID'], phen_ent['ID'], pmid)

			annotated_positive = bool(rel['relation'])
			if annotated_positive != (triple in true_labels):
				mislabels.append(triple)

print(mislabels)

[]


In [7]:
# Map each raw article's PMID to its 'is_full_text' value; the parsed
# article list itself is not kept around
articles_file = '../data/articles.json'
with open(articles_file, 'r') as f:
	raw_articles_type = {int(art['PMID']):art['is_full_text'] for art in json.load(f)}

In [8]:
# 1. Number of Annotations (positive and negative)
# Keep the current log contents; they are written back when the log is updated
with open(corpus_log, 'r') as f:
	log_lines_old = f.readlines()

# General counts: all raw articles, annotated articles, relations, negatives, positives
t_art = len(raw_articles_type)
n_art = n_rel = n_neg = n_pos = 0

# Full-text specific counts
n_ft = n_rel_ft = n_neg_ft = n_pos_ft = 0

for pmid, art in corpus.items():
	if art is None:
		continue
	n_art += 1
	# Check if it is a full text or an abstract
	full_text = raw_articles_type[pmid]
	if full_text:
		n_ft += 1
	for sent in art:
		rels = sent['relations']
		# A relation is positive when its label is 1, negative otherwise
		sent_pos = sum(1 for rel in rels if rel['relation'] == 1)
		sent_neg = len(rels) - sent_pos
		n_rel += len(rels)
		n_pos += sent_pos
		n_neg += sent_neg
		if full_text:
			n_rel_ft += len(rels)
			n_pos_ft += sent_pos
			n_neg_ft += sent_neg

# Abstract counts are whatever is not full text
n_ab = n_art-n_ft
n_rel_ab = n_rel-n_rel_ft
n_neg_ab = n_neg-n_neg_ft
n_pos_ab = n_pos-n_pos_ft

print_and_log(f'\nTotal Relations: {n_rel}')
print_and_log(f'Global:		{n_art}/{t_art} (Ratio: {n_art/t_art:.3f})')
print_and_log(f'	Positives: {n_pos} (Ratio: {n_pos/n_rel:.3f})')
print_and_log(f'	Negatives: {n_neg}')
print_and_log(f'Full-text:	 {n_ft}')
print_and_log(f'	Relations: {n_rel_ft}')
print_and_log(f'	Positives: {n_pos_ft} (Ratio: {n_pos_ft/n_rel_ft:.3f})')
print_and_log(f'	Negatives: {n_neg_ft}')
print_and_log(f'Abstracts:	 {n_ab}')
print_and_log(f'	Relations: {n_rel_ab}')
print_and_log(f'	Positives: {n_pos_ab} (Ratio: {n_pos_ab/n_rel_ab:.3f})')
print_and_log(f'	Negatives: {n_neg_ab}')


Total Relations: 400645
Global:        21619/29281 (Ratio: 0.738)
    Positives: 84416 (Ratio: 0.211)
    Negatives: 316229
Full-text:     19295
    Relations: 391096
    Positives: 82005 (Ratio: 0.210)
    Negatives: 309091
Abstracts:     2324
    Relations: 9549
    Positives: 2411 (Ratio: 0.252)
    Negatives: 7138


In [9]:
# 2. Number of annotated ncRNAs and HPO terms
rnas, phes = get_ents(corpus)

# Distinct (text, ID) pairs seen in negatives, in positives and overall
neg_rnas, pos_rnas, all_rnas = (set(ents) for ents in rnas)
neg_phes, pos_phes, all_phes = (set(ents) for ents in phes)

n_neg_rna = len(neg_rnas)
n_pos_rna = len(pos_rnas)
n_both_rna = len(neg_rnas & pos_rnas) # appear with both labels
n_rna = len(all_rnas)
n_neg_phe = len(neg_phes)
n_pos_phe = len(pos_phes)
n_both_phe = len(neg_phes & pos_phes) # appear with both labels
n_phe = len(all_phes)

print_and_log(f'\nNumber of annotated ncRNAs: {n_rna}')
print_and_log(f'	In positives: {n_pos_rna} (Ratio: {n_pos_rna/n_rna:.3f})')
print_and_log(f'	In negatives: {n_neg_rna} (Ratio: {n_neg_rna/n_rna:.3f})')
print_and_log(f'	Overlap: {n_both_rna/n_rna:.3f}')
# Disabled comparison against the size of the custom lexicon:
# print(f'	Total number of ncRNA names in the custom lexicon: {442900}')
# print(f'	Ratio: {len(set(rnas))/442900:.3f}')
print_and_log(f'Number of annotated HPO Terms: {n_phe}')
print_and_log(f'	In positives: {n_pos_phe} (Ratio: {n_pos_phe/n_phe:.3f})')
print_and_log(f'	In negatives: {n_neg_phe} (Ratio: {n_neg_phe/n_phe:.3f})')
print_and_log(f'	Overlap: {n_both_phe/n_phe:.3f}')

# Disabled comparison against the number of HPO terms:
# print(f'	Total number of "Phenotypic abnormality" descendants as of Jan 2024: {17425}')
# print(f'	Ratio: {len(set(phes))/17425:.3f}')


Number of annotated ncRNAs: 4093
	In positives: 1414 (Ratio: 0.345)
	In negatives: 3932 (Ratio: 0.961)
	Overlap: 0.306
Number of annotated HPO Terms: 1486
	In positives: 289 (Ratio: 0.194)
	In negatives: 1478 (Ratio: 0.995)
	Overlap: 0.189


In [10]:
def _log_top_entities(title, ent_kind, ents, k=10):
	"""Print and log a table of the k most frequent entities of ents."""
	print_and_log(title)
	print_and_log('%-45s | %6s | %5s' % (ent_kind, 'N Rels', 'Ratio'))
	for _,row in k_higher_value_counts(k, ents).iterrows():
		ent, c, p = row
		print_and_log('%-45s | %6i | %5.3f' % (str(ent), c, p))

# (title, column header, entities) -- index 2 = all, 1 = positives, 0 = negatives
_TOP_TABLES = (
	('\nMost annotated ncRNAs:', 'ncRNA', rnas[2]),
	('\nMost annotated ncRNAs in Positives:', 'ncRNA', rnas[1]),
	('\nMost annotated ncRNAs in Negatives:', 'ncRNA', rnas[0]),
	('\nMost annotated Phenotypes:', 'Phenotype', phes[2]),
	('\nMost annotated Phenotypes in Positives:', 'Phenotype', phes[1]),
	('\nMost annotated Phenotypes in Negatives:', 'Phenotype', phes[0]),
)
for title, ent_kind, ents in _TOP_TABLES:
	_log_top_entities(title, ent_kind, ents)


Most annotated ncRNAs:
ncRNA                                         | N Rels | Ratio
('mir-21', 'URS00000AF93C')                   |  14166 | 0.035
('malat1', 'URS00025F0023')                   |   7996 | 0.020
('hotair', 'URS00026A2894')                   |   7811 | 0.019
('mir-34a', 'URS000033F823')                  |   7599 | 0.019
('mir-155', 'URS000062749E')                  |   6538 | 0.016
('mir-145', 'URS00004F4657')                  |   5077 | 0.013
('neat1', 'URS00025DF744')                    |   4411 | 0.011
('pvt1', 'URS0002853A3B')                     |   4311 | 0.011
('mir-146a', 'URS000075D8A0')                 |   3986 | 0.010
('uca1', 'URS00025DE66C')                     |   3738 | 0.009

Most annotated ncRNAs in Positives:
ncRNA                                         | N Rels | Ratio
('hotair', 'URS00026A2894')                   |   3706 | 0.044
('malat1', 'URS00025F0023')                   |   3362 | 0.040
('neat1', 'URS00025DF744')                    |   2157 | 

In [11]:
# # Chart
# source = [0, 0]
# target = [1, 2]
# value = [n_pos, n_neg]
# label = [
#	 # f'Articles: {len(raw_articles)}',
#	 # f'Full Texts: {ft}',
#	 # f'Abstracts: {ab}',
#	 # f'Annotated: {n_art}',
#	 # f'Not Annotated: {t_art-n_art}',
#	 f'Annotations: {n_ann}',
#	 f'Positives: {n_pos}',
#	 f'Negatives: {n_neg}',
#		 ]

# node = dict(
#	 pad=15, 
#	 thickness=15, 
#	 line={"color":"black", "width":0.5}, 
#	 label=label
#	 )

# test_source = [0,0,1,1] 
# test_target = [2,3,4,2]
# test_value = [2,10,6,2]
# test_link = dict(source=test_source, target=test_target, value=test_value)
# test_data = go.Sankey(link=test_link)

# link = dict(source=source, target=target, value=value)
# data = go.Sankey(link=link, node=node)
# fig = go.Figure(data)

# # for i, (source, target, value) in enumerate(zip(source, target, value)):
# #	 annotation_text = f"{value}"
# #	 print(annotation_text)
# #	 fig.add_annotation(
# #		 x=0.5,  # Adjust x-coordinate as needed
# #		 y=(i*0.1 + 0.5),  # Adjust y-coordinate as needed
# #		 xref="paper",
# #		 yref="paper",
# #		 text=annotation_text,
# #		 showarrow=True,
# #		 font=dict(color="black")
# #	 )

# fig.show()

In [12]:
# Update corpus log file: previous contents, a separator, then the lines
# collected by print_and_log
with open(corpus_log, 'w') as f:
	f.writelines(log_lines_old)
	f.write('\n\n-- Complete Analysis --\n')
	f.writelines(LOG_LINES)