In [1]:
import pandas as pd
from math import log
from collections import defaultdict

In [26]:
def get_phen(row):
	"""Return the phenotype mention of an annotation row.

	The text is read from ``e1_text`` when ``e1_type`` equals ``'Phenotype'``
	and from ``e2_text`` in every other case.
	"""
	if row['e1_type'] == 'Phenotype':
		return row['e1_text']
	return row['e2_text']

def get_rna(row):
	"""Return the normalised ncRNA mention of an annotation row, or None if blacklisted.

	The mention comes from ``e1_text`` when ``e1_type`` equals ``'ncRNA'`` and
	from ``e2_text`` otherwise. It is lower-cased and stripped of hyphens
	before being checked against the blacklist below.
	"""
	# Mentions that are discarded. Rationale for some of the entries:
	#   - 'gst': links to misc-RNA URS00004F7BA4 but conflicts with the
	#     glutathione S-transferase genes, which are protein coding
	#   - '16s rrna': links with studies about anti-Shine-Dalgarno sequences
	#     (ASD) instead of autism
	#   - 'asm': anti-seizure medication
	#   - 'storm': refers to cytokine storm
	ambiguous_mentions = ['air','top','fast','dams','particle','cardinal','digit','laser','gst','asm','storm','16s rrna','trn','tag','cntnap2','maas']

	text_column = 'e1_text' if row['e1_type'] == 'ncRNA' else 'e2_text'
	normalised = row[text_column].lower().replace('-','')

	if normalised in ambiguous_mentions:
		return None
	return normalised

def get_pair(row):
	"""Return the row's entity IDs as a tuple whose first element is the ncRNA side.

	When ``e1_type`` equals ``'ncRNA'`` the result is ``(e1_ID, e2_ID)``;
	otherwise the two IDs are swapped.
	"""
	if row['e1_type'] == 'ncRNA':
		return (row['e1_ID'], row['e2_ID'])
	return (row['e2_ID'], row['e1_ID'])

def k_higher_value_counts(k:int, data):
	"""Return the ``k`` most frequent values of ``data`` with counts and relative frequencies.

	Parameters
	----------
	k : int
		Number of leading rows (most frequent values first) to keep.
	data : pandas.Series or any input accepted by ``pandas.Series``
		Values to count; non-Series input is wrapped in a Series first.

	Returns
	-------
	pandas.DataFrame
		One row per retained value: the value itself (moved out of the index
		by ``reset_index``), its absolute count and its normalised frequency
		over all of ``data``. Column labels are whatever pandas assigns to the
		``value_counts`` results.
	"""
	if not isinstance(data, pd.Series):
		data = pd.Series(data)
	# .head(k) is always positional. A plain [:k] slice on the value-indexed
	# result can be interpreted as a label slice when the counted values are
	# floats, which selects the wrong rows or raises.
	counts = data.value_counts().head(k)
	ratios = data.value_counts(normalize=True).head(k)
	return pd.concat([counts, ratios], axis=1).reset_index()

def npmi(relation, data):
	"""Normalised pointwise mutual information of an (ncRNA, phenotype) pair.

	A row of ``data`` counts as containing an entity when the entity compares
	equal to the value of any of the row's columns. With ``n_t`` rows in
	total, ``n_rna`` / ``n_phe`` rows containing each entity and ``n_rna_phe``
	rows containing both, the score is

		log(n_rna_phe * n_t / (n_rna * n_phe)) / -log(n_rna_phe / n_t)

	Parameters
	----------
	relation : tuple
		``(rna, phe)`` identifiers to look for.
	data : pandas.DataFrame
		Annotation rows to count over.

	Returns
	-------
	float
		The NPMI in ``[-1, 1]``. The limiting cases that the raw formula
		cannot evaluate are returned explicitly: ``-1.0`` when the two
		entities never co-occur, ``1.0`` when they co-occur in every row, and
		``nan`` when either entity is absent from ``data`` (including empty
		``data``), where the measure is undefined.
	"""
	rna, phe = relation
	n_t = len(data)

	# Single pass over the rows instead of three row-wise apply() calls.
	n_rna = n_phe = n_rna_phe = 0
	for cells in data.itertuples(index=False, name=None):
		has_rna = rna in cells
		has_phe = phe in cells
		n_rna += has_rna
		n_phe += has_phe
		n_rna_phe += has_rna and has_phe

	if n_rna == 0 or n_phe == 0:
		# Undefined: one of the marginal probabilities is zero.
		return float('nan')
	if n_rna_phe == 0:
		# log(0) in the numerator; NPMI tends to -1 as co-occurrence vanishes.
		return -1.0
	if n_rna_phe == n_t:
		# -log(1) == 0 in the denominator; complete co-occurrence scores 1.
		return 1.0

	return log((n_rna_phe*n_t)/(n_rna*n_phe))/(-log(n_rna_phe/n_t))

def get_relation_npmis(data):
	"""Score every distinct relation in ``data['pair']`` with ``npmi``.

	Returns a DataFrame with one row per unique pair: the pair itself in
	column ``relation`` and its score, computed against the full ``data``,
	in column ``npmi``.
	"""
	def score(pair):
		return npmi(pair, data)

	unique_pairs = data['pair'].unique()
	table = pd.DataFrame(unique_pairs, columns=['relation'])
	table['npmi'] = table['relation'].apply(score)
	return table

In [27]:
# Load the extracted annotations (tab-separated) and derive, per row, the
# normalised ncRNA mention, the phenotype mention and the (ncRNA ID, phenotype ID)
# pair. Rows with any missing value are then dropped, which includes rows
# whose ncRNA mention was rejected by get_rna (it returns None for those).
results = (
	pd.read_csv('../outputs/asd/asd_rels.csv', sep='\t')
	.assign(
		rna=lambda frame: frame.apply(get_rna, axis=1),
		phen=lambda frame: frame.apply(get_phen, axis=1),
		pair=lambda frame: frame.apply(get_pair, axis=1),
	)
	.dropna()
)
# results.drop_duplicates(inplace=True)

# Collect every surface form seen for each ncRNA ID and each phenotype ID.
rna_names = defaultdict(set)
phe_names = defaultdict(set)
for rna_name, phe_name, (rna_id, phe_id) in zip(results['rna'], results['phen'], results['pair']):
	rna_names[rna_id].add(rna_name)
	phe_names[phe_id].add(phe_name)

# Corpus summary.
pos_rows = results[results['label'] == 1]
neg_rows = results[results['label'] == 0]
rna_ids = results['pair'].apply(lambda pair: pair[0])

print('Number of sentences:    ', results['sentence'].nunique())
print('Number of annotations:  ', len(results))
print('    Positives:          ', len(pos_rows))
print('    Negatives:          ', len(neg_rows))
print('Number of relations:    ', results['pair'].nunique())
print('    Positives:          ', pos_rows['pair'].nunique())
print('    Negatives:          ', neg_rows['pair'].nunique())
print('Number of unique ncRNAs:', rna_ids.nunique())
print('Number of articles:     ', results['pmcid'].nunique())


Number of sentences:     1123
Number of annotations:   1401
    Positives:           1372
    Negatives:           28
Number of relations:     310
    Positives:           306
    Negatives:           20
Number of unique ncRNAs: 257
Number of articles:      549


In [28]:
def get_names(x):
	"""Map an (ncRNA ID, phenotype ID) pair to the sets of names recorded for each ID.

	Looks the IDs up in ``rna_names`` and ``phe_names``; an ID with no entry
	yields ``None`` for its side.
	"""
	rna_id, phe_id = x[0], x[1]
	return rna_names.get(rna_id), phe_names.get(phe_id)

# NPMI per distinct relation, annotated with the surface forms seen for each ID.
npmis = get_relation_npmis(results)
npmis['names'] = npmis['relation'].apply(get_names)

In [29]:
# Rank the relations from highest to lowest NPMI and show the first ten.
npmis = npmis.sort_values(by='npmi', ascending=False)
print('NPMI of relations')
npmis.head(10)

NPMI of relations


Unnamed: 0,relation,npmi,names
142,"(URS00003FE4D4, HP:0000717)",0.240401,"({hsamir106a5p}, {autism, Autism})"
270,"(URS000075A884, HP:0000717)",0.240401,"({hsamir200a, mir200a}, {autism, Autism})"
297,"(URS0000669010, HP:0000717)",0.240401,"({hsamir598}, {autism, Autism})"
292,"(URS0000499F46, HP:0000717)",0.217401,"({hsamir4436b3p}, {autism, Autism})"
299,"(URS000075B6A0, HP:0000717)",0.217401,"({hsamir211}, {autism, Autism})"
140,"(URS00005C2E31, HP:0000717)",0.217401,"({hsamir1915p}, {autism, Autism})"
280,"(URS00027C0F22, HP:0000717)",0.217401,"({nras}, {autism, Autism})"
141,"(URS000025D232, HP:0000717)",0.217401,"({hsamir1395p}, {autism, Autism})"
50,"(URS00025DA306, HP:0000717)",0.217401,"({hcp5}, {autism, Autism})"
144,"(URS00005B3525, HP:0000717)",0.217401,"({hsamir1955p}, {autism, Autism})"


In [5]:
# Most common ncRNAs related to ASD
# Split the ncRNA mentions by annotation label, then isolate the mentions
# that occur under one label only.
positives = results.loc[results['label'] == 1, 'rna']
negatives = results.loc[results['label'] == 0, 'rna']
overlap = set(positives) & set(negatives)

positives_only = positives[~positives.isin(overlap)]
negatives_only = negatives[~negatives.isin(overlap)]

print('ncRNAs in positives')
display(k_higher_value_counts(10, positives))
# Alternative views, currently disabled:
# print('ncRNAs in negatives')
# display(k_higher_value_counts(10, negatives))
# print('ncRNAs only in positives')
# display(k_higher_value_counts(10, positives_only))
# print('ncRNAs only in negatives')
# display(k_higher_value_counts(10, negatives_only))

ncRNAs in positives


Unnamed: 0,rna,count,proportion
0,bdnf,508,0.369993
1,mir137,102,0.07429
2,il1rapl1,52,0.037873
3,mir1290,21,0.015295
4,mir146a,19,0.013838
5,mir132,18,0.01311
6,nhip,17,0.012382
7,mir320a,14,0.010197
8,thril,11,0.008012
9,hotair,11,0.008012


In [35]:
# Inspect every annotation for one ncRNA; append ['sentence'].tolist() to list only the sentences.
results.loc[results['rna'] == 'hsamir211']

Unnamed: 0,e1_type,e1_text,e1_ID,e1_start,e1_end,e2_type,e2_text,e2_ID,e2_start,e2_end,sentence,pmcid,label,rna,phen,pair
2942,ncRNA,hsa-miR-211,URS000075B6A0,91,102,Phenotype,autism,HP:0000717,150,156,"Among the 71 miRNAs, a few including hsa-miR-4...",3581547,1,hsamir211,autism,"(URS000075B6A0, HP:0000717)"


In [139]:
# Results table
# Top ncRNAs among the positive annotations: share of positive annotations
# as a percentage string, plus the number of distinct articles and sentences
# in which each ncRNA appears.
ncrnas_results = k_higher_value_counts(10, positives)
ncrnas_results['%'] = ncrnas_results['proportion'].apply(lambda x: f'{x*100:.2f}')
ncrnas_results = ncrnas_results.drop('proportion', axis=1)
# NOTE(review): the two counts below are taken over all of `results`, not only
# the positive annotations that the ranking is based on. Confirm this is intended.
ncrnas_results['n_articles'] = ncrnas_results['rna'].apply(lambda x: len(results[results['rna'] == x]['pmcid'].unique()))
ncrnas_results['n_sentences'] = ncrnas_results['rna'].apply(lambda x: len(results[results['rna'] == x]['sentence'].unique()))
# 1-based ranks sized from the table itself: a fixed ten-element list raises a
# length-mismatch ValueError whenever fewer than ten distinct ncRNAs are present.
ncrnas_results.index = range(1, len(ncrnas_results) + 1)
# ncrnas_results.to_latex('./ncrnas.tex')