In [134]:
import pandas as pd

In [135]:
def get_phen(row):
	"""Return the phenotype mention of a relation row.

	The text of entity 1 is returned when entity 1 is typed 'Phenotype';
	otherwise the text of entity 2 is returned.
	"""
	if row['e1_type'] == 'Phenotype':
		return row['e1_text']
	return row['e2_text']

def get_rna(row):
	"""Return the normalised ncRNA mention of a relation row, or None if excluded.

	The mention is taken from entity 1 when it is typed 'ncRNA', otherwise from
	entity 2. It is lower-cased and stripped of hyphens before being compared
	with the exclusion list.
	"""
	# Notes on some of the excluded names:
	#   'gst'      - its link to misc-RNA URS00004F7BA4 conflicts with the
	#                glutathione S-transferase genes, which are protein coding
	#   '16s rrna' - links to studies about anti-Shine-Dalgarno (ASD) sequences
	#                instead of autism
	#   'asm'      - anti-seizure medication
	#   'storm'    - refers to cytokine storm
	excluded = (
		'air', 'top', 'fast', 'dams', 'particle', 'cardinal', 'digit', 'laser',
		'gst', 'asm', 'storm', '16s rrna', 'trn', 'tag', 'cntnap2',
	)
	text_key = 'e1_text' if row['e1_type'] == 'ncRNA' else 'e2_text'
	rna = row[text_key].lower().replace('-', '')
	return None if rna in excluded else rna

def get_pair(row):
	"""Return the two entity IDs of a relation row as a tuple.

	The order is (e1_ID, e2_ID) when entity 1 is typed 'ncRNA' and
	(e2_ID, e1_ID) otherwise.
	"""
	first, second = row['e1_ID'], row['e2_ID']
	if row['e1_type'] != 'ncRNA':
		first, second = second, first
	return (first, second)


def k_higher_value_counts(k:int, data):
	"""Tabulate the first k entries of the value counts of `data`.

	Args:
		k: number of leading value-count entries to keep.
		data: a pandas Series, or anything accepted by the `pd.Series` constructor.

	Returns:
		A DataFrame (index reset) holding, for each kept value, its absolute
		count and its normalised frequency side by side.
	"""
	series = data if isinstance(data, pd.Series) else pd.Series(data)
	leading = slice(None, k)
	# First the absolute counts, then the normalised ones.
	columns = [series.value_counts(normalize=flag)[leading] for flag in (False, True)]
	return pd.concat(columns, axis=1).reset_index()

In [136]:
# Load the relation annotations (tab-separated), add the derived columns
# row by row, then discard every row that holds a missing value. Rows for
# which get_rna returned None (excluded names) are removed by that last step.
results = (
	pd.read_csv('../outputs/asd/asd_rels.csv', sep='\t')
	.assign(
		rna=lambda frame: frame.apply(get_rna, axis=1),
		phen=lambda frame: frame.apply(get_phen, axis=1),
		pair=lambda frame: frame.apply(get_pair, axis=1),
	)
	.dropna()
)

# Corpus summary: sentence / annotation / relation counts, split by label.
print('Number of sentences:    ', results['sentence'].nunique())
print('Number of annotations:  ', len(results))
print('    Positives:          ', (results['label'] == 1).sum())
print('    Negatives:          ', (results['label'] == 0).sum())
print('Number of relations:    ', results['pair'].nunique())
print('    Positives:          ', results.loc[results['label'] == 1, 'pair'].nunique())
print('    Negatives:          ', results.loc[results['label'] == 0, 'pair'].nunique())
# The first element of each pair is the ID placed first by get_pair.
print('Number of unique ncRNAs:', results['pair'].map(lambda pair: pair[0]).nunique())
print('Number of articles:     ', results['pmcid'].nunique())


Number of sentences:     1125
Number of annotations:   1403
    Positives:           1373
    Negatives:           29
Number of relations:     311
    Positives:           307
    Negatives:           21
Number of unique ncRNAs: 258
Number of articles:      551


In [137]:
# Most common ncRNAs related to ASD
# ncRNA names of the annotations labelled 1 and 0 respectively.
positives = results.loc[results['label'] == 1, 'rna']
negatives = results.loc[results['label'] == 0, 'rna']
# Names that appear under both labels.
overlap = set(positives) & set(negatives)

# Names that appear under exactly one of the two labels.
positives_only = positives.loc[~positives.isin(overlap)]
negatives_only = negatives.loc[~negatives.isin(overlap)]

print('ncRNAs in positives')
display(k_higher_value_counts(10, positives))
# Alternative views, disabled:
# print('ncRNAs in negatives')
# display(k_higher_value_counts(10, negatives))
# print('ncRNAs only in positives')
# display(k_higher_value_counts(10, positives_only))
# print('ncRNAs only in negatives')
# display(k_higher_value_counts(10, negatives_only))



ncRNAs in positives


Unnamed: 0,rna,count,proportion
0,bdnf,508,0.369993
1,mir137,102,0.07429
2,il1rapl1,52,0.037873
3,mir1290,21,0.015295
4,mir146a,19,0.013838
5,mir132,18,0.01311
6,nhip,17,0.012382
7,mir320a,14,0.010197
8,thril,11,0.008012
9,hotair,11,0.008012


In [144]:
# Show every annotation whose normalised ncRNA name is 'nhip'
# (append ['sentence'].tolist() to list the sentences instead).
results.loc[results['rna'] == 'nhip']

Unnamed: 0,e1_type,e1_text,e1_ID,e1_start,e1_end,e2_type,e2_text,e2_ID,e2_start,e2_end,sentence,pmcid,label,rna,phen,pair
402,Phenotype,ASD,HP:0000729,0,3,ncRNA,NHIP,URS00026A276D,56,60,ASD placenta and brain samples show significan...,10560404,1,nhip,ASD,"(URS00026A276D, HP:0000729)"
403,Phenotype,ASD,HP:0000729,0,3,ncRNA,NHIP,URS00026A276D,56,60,ASD placenta and brain samples show significan...,10560404,1,nhip,ASD,"(URS00026A276D, HP:0000729)"
404,Phenotype,ASD,HP:0000729,32,35,ncRNA,NHIP,URS00026A276D,105,109,Using RNAseq on both postmortem ASD and contro...,10560404,1,nhip,ASD,"(URS00026A276D, HP:0000729)"
405,ncRNA,NHIP,URS00026A276D,39,43,Phenotype,ASD,HP:0000729,94,97,A common structural variant within the NHIP lo...,10560404,1,nhip,ASD,"(URS00026A276D, HP:0000729)"
406,ncRNA,NHIP,URS00026A276D,83,87,Phenotype,ASD,HP:0000729,155,158,"Therefore, profiling epigenomic signatures or ...",10560404,1,nhip,ASD,"(URS00026A276D, HP:0000729)"
1012,Phenotype,ASD,HP:0000729,78,81,ncRNA,NHIP,URS00026A276D,168,172,We use whole genome bisulfite sequencing in pl...,8848662,1,nhip,ASD,"(URS00026A276D, HP:0000729)"
1013,ncRNA,NHIP,URS00026A276D,19,23,Phenotype,ASD,HP:0000729,226,229,"Within this locus, NHIP is functionally charac...",8848662,1,nhip,ASD,"(URS00026A276D, HP:0000729)"
1014,ncRNA,NHIP,URS00026A276D,0,4,Phenotype,ASD,HP:0000729,159,162,NHIP overexpression increases cellular prolife...,8848662,1,nhip,ASD,"(URS00026A276D, HP:0000729)"
1015,ncRNA,NHIP,URS00026A276D,56,60,Phenotype,ASD,HP:0000729,149,152,A common structural variant disrupting the pro...,8848662,1,nhip,ASD,"(URS00026A276D, HP:0000729)"
1016,Phenotype,ASD,HP:0000729,0,3,ncRNA,NHIP,URS00026A276D,49,53,ASD placental samples showed significantly low...,8848662,1,nhip,ASD,"(URS00026A276D, HP:0000729)"


In [139]:
# Results table
# Top-k ncRNAs among the positive annotations, with the percentage of positive
# annotations each one accounts for and its article / sentence coverage.
TOP_K = 10
ncrnas_results = k_higher_value_counts(TOP_K, positives)
ncrnas_results['%'] = ncrnas_results['proportion'].map(lambda x: f'{x*100:.2f}')
ncrnas_results = ncrnas_results.drop('proportion', axis=1)

# Distinct articles and sentences per ncRNA, computed with a single grouping
# instead of re-filtering the whole frame once per table row.
# NOTE(review): these are counted over all rows of `results`, whereas 'count'
# and '%' come from the label == 1 rows only - confirm this is intended.
rows_by_rna = results.groupby('rna')
ncrnas_results['n_articles'] = ncrnas_results['rna'].map(rows_by_rna['pmcid'].nunique())
ncrnas_results['n_sentences'] = ncrnas_results['rna'].map(rows_by_rna['sentence'].nunique())

# 1-based rank. Derived from the actual number of rows so the assignment does
# not raise a length mismatch when fewer than TOP_K distinct ncRNAs exist.
ncrnas_results.index = range(1, len(ncrnas_results) + 1)
# ncrnas_results.to_latex('./ncrnas.tex')