In [1]:
import pandas as pd
from math import log
from collections import defaultdict

In [26]:
def get_phen(row):
	"""Return the phenotype mention of a relation row.

	The text comes from 'e1_text' when 'e1_type' is 'Phenotype', and from
	'e2_text' in every other case.
	"""
	if row['e1_type'] == 'Phenotype':
		return row['e1_text']
	return row['e2_text']

def get_rna(row):
	"""Return the normalised ncRNA mention of a relation row, or None when it is filtered out.

	The mention is read from 'e1_text' when 'e1_type' is 'ncRNA' and from
	'e2_text' otherwise; it is then lower-cased and stripped of hyphens.
	"""
	# Normalised mentions that are excluded. Author notes on some entries:
	#   'gst'      - links to misc-RNA URS00004F7BA4 but conflicts with the
	#                glutathione S-transferase genes, which are protein coding
	#   '16s rrna' - links with studies about anti-Shine-Dalgarno (ASD)
	#                sequences instead of autism
	#   'asm'      - anti-seizure medication
	#   'storm'    - refers to cytokine storm
	excluded = (
		'air', 'top', 'fast', 'dams', 'particle', 'cardinal', 'digit', 'laser',
		'gst', 'asm', 'storm', '16s rrna', 'trn', 'tag', 'cntnap2', 'maas',
	)

	mention = row['e1_text'] if row['e1_type'] == 'ncRNA' else row['e2_text']
	rna = mention.lower().replace('-','')

	if rna in excluded:
		return None
	return rna

def get_pair(row):
	"""Return the entity IDs of a relation row as a (ncRNA ID, other ID) tuple.

	When 'e1_type' is 'ncRNA' the order is (e1_ID, e2_ID); otherwise the two
	IDs are swapped so that e2_ID comes first.
	"""
	first_id, second_id = row['e1_ID'], row['e2_ID']
	if row['e1_type'] == 'ncRNA':
		return (first_id, second_id)
	return (second_id, first_id)

def k_higher_value_counts(k:int, data):
	"""Tabulate the k most frequent values of `data`.

	`data` is wrapped in a pandas Series when it is not one already. The
	returned DataFrame holds, per value, its absolute count and its relative
	frequency (share of all entries), with the values themselves moved out of
	the index into a regular column by `reset_index()`.
	"""
	series = data if isinstance(data, pd.Series) else pd.Series(data)

	top_counts = series.value_counts()[:k]
	top_ratios = series.value_counts(normalize=True)[:k]

	table = pd.concat([top_counts, top_ratios], axis=1)
	return table.reset_index()

def npmi(relation, data):
	"""Normalised pointwise mutual information (NPMI) of an (ncRNA, phenotype) pair.

	NPMI = ln(p(rna, phe) / (p(rna) * p(phe))) / -ln(p(rna, phe)), where each
	probability is estimated as a row frequency in `data`.

	Args:
		relation: a 2-item sequence `(rna, phe)`.
		data: DataFrame of annotations. A row counts as mentioning an entity
			when any of its cell values equals that entity.

	Returns:
		The NPMI as a float. The limiting values are returned where the
		formula itself is undefined: -1.0 when the two entities never share a
		row, 1.0 when every row contains both, and NaN when `data` has no rows.
	"""
	rna, phe = relation
	n_t = len(data)
	if n_t == 0:
		return float('nan')

	# One pass over the rows instead of three DataFrame.apply(axis=1) scans.
	n_rna = n_phe = n_rna_phe = 0
	for values in data.itertuples(index=False, name=None):
		has_rna = rna in values
		has_phe = phe in values
		n_rna += has_rna
		n_phe += has_phe
		n_rna_phe += has_rna and has_phe

	if n_rna_phe == 0:
		# log(0) is undefined; NPMI tends to -1 when the pair never co-occurs.
		return -1.0
	if n_rna_phe == n_t:
		# -log(1) == 0 would divide by zero; complete co-occurrence is NPMI == 1.
		return 1.0

	return log((n_rna_phe*n_t)/(n_rna*n_phe))/(-log(n_rna_phe/n_t))

def get_relation_npmis(data):
	"""Score every distinct relation found in data['pair'] with its NPMI.

	Returns a DataFrame with a 'relation' column (the unique values of
	data['pair']) and an 'npmi' column holding `npmi(relation, data)`.
	"""
	def _score(relation):
		return npmi(relation, data)

	# The array returned by unique() is handed to the constructor unchanged.
	table = pd.DataFrame(data['pair'].unique(), columns=['relation'])
	table['npmi'] = table['relation'].apply(_score)
	return table

In [None]:
# Load the extracted relations (tab-separated) and derive the helper columns.
results = pd.read_csv('../outputs/asd/asd_rels.csv', sep='\t')

for column, extractor in (('rna', get_rna), ('phen', get_phen), ('pair', get_pair)):
	results[column] = results.apply(extractor, axis=1)

# Drops every row with a missing value in any column; this includes the rows
# whose ncRNA mention get_rna() rejected (it returns None for those).
results = results.dropna()
# Duplicate rows are kept; the de-duplication step is disabled:
# results.drop_duplicates(inplace=True)

# Map each entity ID to the set of surface forms it appears under.
rna_names = defaultdict(set)
phe_names = defaultdict(set)
for rna_name, phe_name, (rna_id, phe_id) in zip(results['rna'], results['phen'], results['pair']):
	rna_names[rna_id].add(rna_name)
	phe_names[phe_id].add(phe_name)

# Summary statistics, overall and split by label (1 = positive, 0 = negative).
pos_rows = results[results['label'] == 1]
neg_rows = results[results['label'] == 0]
rna_ids = results['pair'].apply(lambda pair: pair[0])

print('Number of sentences:    ', results['sentence'].nunique())
print('Number of annotations:  ', len(results))
print('    Positives:          ', len(pos_rows))
print('    Negatives:          ', len(neg_rows))
print('Number of relations:    ', len(results['pair'].unique()))
print('    Positives:          ', len(pos_rows['pair'].unique()))
print('    Negatives:          ', len(neg_rows['pair'].unique()))
print('Number of unique ncRNAs:', len(rna_ids.unique()))
print('Number of articles:     ', results['pmcid'].nunique())


In [28]:
def get_names(x):
	"""Look up the recorded names for a relation.

	`x` is indexed as (ncRNA ID, phenotype ID); the result is a tuple with the
	entries of `rna_names` and `phe_names` for those IDs (None for an unknown ID).
	"""
	rna_id, phe_id = x[0], x[1]
	return rna_names.get(rna_id), phe_names.get(phe_id)

# NPMI per distinct relation, annotated with the names behind each ID.
npmis = get_relation_npmis(results)
npmis['names'] = npmis['relation'].apply(get_names)

In [None]:
# Rank the relations from highest to lowest NPMI and show the first ten.
npmis = npmis.sort_values(by='npmi', ascending=False)
print('NPMI of relations')
npmis[:10]

In [None]:
# Most common ncRNAs related to ASD
positives = results[results['label'] == 1]['rna']
negatives = results[results['label'] == 0]['rna']
overlap = set(positives).intersection(set(negatives))

positives_only = positives[~positives.isin(overlap)]
negatives_only = negatives[~negatives.isin(overlap)]

print('ncRNAs in positives')
display(k_higher_value_counts(10, positives))
# print('ncRNAs in negatives')
# display(k_higher_value_counts(10, negatives))
# print('ncRNAs only in positives')
# display(k_higher_value_counts(10, positives_only))
# print('ncRNAs only in negatives')
# display(k_higher_value_counts(10, negatives_only))

In [None]:
# Inspect every annotation of one ncRNA; append ['sentence'].tolist() to list only the sentences.
results.loc[results['rna'] == 'hsamir211']

In [139]:
# Results table
# Top ncRNAs among the positive annotations, with their share formatted as a
# percentage string and the number of distinct articles / sentences per ncRNA.
# NOTE(review): the article and sentence counts are taken over all of `results`
# (both labels), not only the positives - confirm this is intended.
ncrnas_results = k_higher_value_counts(10, positives)
ncrnas_results['%'] = ncrnas_results['proportion'].apply(lambda x: f'{x*100:.2f}')
ncrnas_results = ncrnas_results.drop(columns='proportion')
ncrnas_results['n_articles'] = ncrnas_results['rna'].apply(lambda x: len(results[results['rna'] == x]['pmcid'].unique()))
ncrnas_results['n_sentences'] = ncrnas_results['rna'].apply(lambda x: len(results[results['rna'] == x]['sentence'].unique()))
# 1-based rank index sized to the table: a fixed 10-item list raises a
# length-mismatch ValueError when fewer than 10 distinct ncRNAs are available.
ncrnas_results.index = range(1, len(ncrnas_results) + 1)
# ncrnas_results.to_latex('./ncrnas.tex')