In [1]:
import pickle
from indra_db.util import get_db, get_ro

In [2]:
# Handle for the 'primary' database; later cells run raw SQL through db.session.
db = get_db('primary')

In [56]:
# SQL that lists every table outside the `pg_catalog` and `information_schema`
# schemas (rows come from pg_catalog.pg_tables, all columns selected).
show_tables = """SELECT *
                 FROM
                     pg_catalog.pg_tables
                 WHERE
                     schemaname != 'pg_catalog'
                 AND schemaname != 'information_schema';
              """

In [58]:
# Run the table-listing query; `res` holds the result object for later inspection.
res = db.session.execute(show_tables)

In [10]:
# Materialize the result rows so they render as the cell output.
list(res)

[('public', 'raw_muts', 'tester', None, True, False, True, False),
 ('public', 'text_ref', 'tester', None, True, False, True, False),
 ('public', 'rejected_statements', 'tester', None, True, False, True, False),
 ('public', 'reading_updates', 'tester', None, True, False, False, False),
 ('public', 'raw_mods', 'tester', None, True, False, True, False),
 ('public', 'raw_agents', 'tester', None, True, False, True, False),
 ('public', 'text_content', 'tester', None, True, False, True, False),
 ('public', 'source_file', 'tester', None, True, False, False, False),
 ('public', 'updates', 'tester', None, True, False, False, False),
 ('public', 'pa_agents', 'tester', None, True, False, True, False),
 ('public', 'raw_unique_links', 'tester', None, True, False, True, False),
 ('public', 'preassembly_updates', 'tester', None, True, False, False, False),
 ('public', 'pa_mods', 'tester', None, True, False, True, False),
 ('public', 'pa_muts', 'tester', None, True, False, True, False),
 ('public', 'p

In [53]:
def describe_table(tablename):
    """Return the columns of a database table together with their data types.

    The lookup is done against ``information_schema.columns`` through the
    notebook-level ``db`` handle.

    Parameters
    ----------
    tablename : str
        Name of the table to describe. It is passed to the database as a
        bound parameter (``:tablename``), never interpolated into the SQL.

    Returns
    -------
    list
        One ``(column_name, data_type)`` row per column, ordered by schema and
        then by the column's position in the table. Empty if no table has
        this name.
    """
    # Without an ORDER BY the database may return the rows in any order, so
    # sort explicitly to get the declared column order on every call.
    # NOTE(review): newer SQLAlchemy releases require raw SQL strings to be
    # wrapped in sqlalchemy.text() before execute() — confirm against the
    # installed version.
    query = """
            SELECT
                column_name, data_type
            FROM
                information_schema.columns
            WHERE
                table_name = :tablename
            ORDER BY
                table_schema, ordinal_position;
            """
    res = db.session.execute(query, {'tablename': tablename})
    return list(res)

This query does a self join on the raw agents table to find the texts grounded to each HGNC gene, along with the statement IDs for these texts.

In [82]:
# Roll back the session's current transaction before running the next query.
# NOTE(review): presumably needed to recover from an earlier failed statement — confirm.
db.session.rollback()

In [83]:
# Pair the 'TEXT' and 'HGNC' entries of the same agent: raw_agents is joined to
# itself on (stmt_id, ag_num), keeping rows where x is the 'TEXT' entry and y
# is the 'HGNC' entry. raw_statements and reading are joined in only to fetch
# the reader behind each statement.
# Each result row is (stmt_id, ag_num, TEXT db_id, HGNC db_id, reader).
query = """
        SELECT x.stmt_id, x.ag_num, x.db_id, y.db_id, w.reader
        FROM
            raw_agents x
        INNER JOIN
            raw_agents y
        ON x.stmt_id = y.stmt_id AND x.ag_num = y.ag_num
        INNER JOIN
            raw_statements z
        ON x.stmt_id = z.id
        INNER JOIN
            reading w
        ON z.reading_id = w.id
        WHERE
            x.db_name = 'TEXT' AND y.db_name = 'HGNC'
        AND x.stmt_id IS NOT NULL
        """
# Pull every row into memory as a list so it can be indexed and pickled.
text2grounding = list(db.session.execute(query))

The output of this query is pickled so that we don't need to run the query multiple times.

In [85]:
# Cache the query result on disk; the path is relative to the working directory.
with open('text2grounding.pkl', 'wb') as f:
    pickle.dump(text2grounding, f)

In [88]:
# Spot-check a single row: (stmt_id, ag_num, TEXT db_id, HGNC db_id, reader).
text2grounding[2]

(16891, 1, 'renin', '9958', 'REACH')

In [3]:
# Reload the cached query result instead of re-running the query.
# NOTE: pickle.load can execute arbitrary code; only load a file this notebook wrote.
with open('text2grounding.pkl', 'rb') as f:
    text2grounding = pickle.load(f)

In [89]:
from collections import defaultdict
from indra.databases.hgnc_client import get_hgnc_name

In [90]:
# Nested index: gene name -> agent text -> set of (stmt_id, reader) pairs.
# Inner containers are created automatically on first access.
hgnc_texts = defaultdict(lambda: defaultdict(set))

In [91]:
# Fill `hgnc_texts` from the query rows: every agent text is filed under the
# gene name that `get_hgnc_name` returns for the row's HGNC id.
# HGNC ids for which no name comes back are collected in `bad` instead.
bad = []
for stmt_id, _, agent_text, hgnc_id, reader in text2grounding:
    gene_name = get_hgnc_name(hgnc_id)
    if not gene_name:
        bad.append(hgnc_id)
        continue
    hgnc_texts[gene_name][agent_text].add((stmt_id, reader))

In [92]:
import nltk
from nltk.corpus import words

In [19]:
# Scratch check: building a set from a dict keeps only the dict's keys.
t = {1: 2, 2: 3}
set(t)

{1, 2}

In [93]:
# Set of entries from the NLTK `words` corpus, used for set intersection below.
# NOTE(review): assumes the corpus is already available locally
# (e.g. via nltk.download('words')) — confirm.
wordset = set(words.words())

Get all genes that have at least one grounding from a common English word. Only texts with more than 10 statements are considered. For each such gene, calculate the ratio of its statements that came from a common English word.

In [94]:
# For every gene, keep the agent texts backed by more than 10 (stmt_id, reader)
# entries and check which of those texts also appear in `wordset`.
# Genes with at least one such text are stored as:
#   word_texts[gene] = (kept texts -> entries,
#                       tuple of the overlapping texts,
#                       entries of the overlapping texts / ALL entries of the gene)
word_texts = {}
for gene, stmts_by_text in hgnc_texts.items():
    frequent = {}
    n_total = 0
    for text, stmts in stmts_by_text.items():
        # The denominator counts every entry, including those of rare texts.
        n_total += len(stmts)
        if len(stmts) > 10:
            frequent[text] = stmts
    common = set(frequent) & wordset
    if not common:
        continue
    n_common = sum(len(frequent[word]) for word in common)
    word_texts[gene] = (frequent, tuple(common), n_common / n_total)

In [95]:
# Rank the (gene, info) pairs from the highest to the lowest common-word ratio
# (the third element of each info tuple). The sort is stable, so ties keep
# their original relative order.
worst_words = sorted(
    word_texts.items(),
    key=lambda gene_entry: gene_entry[1][2],
    reverse=True,
)

In [144]:
# Inspect the (filtered texts, overlapping words, ratio) tuple of the entry at
# index 26 of the ranking.
worst_words[26][1]

({'cryptic': {(40125, 'SPARSER'),
   (98971, 'SPARSER'),
   (197912, 'SPARSER'),
   (199033, 'SPARSER'),
   (320937, 'SPARSER'),
   (504873, 'SPARSER'),
   (681201, 'SPARSER'),
   (683599, 'SPARSER'),
   (718489, 'SPARSER'),
   (722199, 'SPARSER'),
   (725408, 'SPARSER'),
   (908773, 'SPARSER'),
   (917917, 'SPARSER'),
   (1141107, 'SPARSER'),
   (1142882, 'SPARSER'),
   (1142897, 'SPARSER'),
   (1374610, 'SPARSER'),
   (1374611, 'SPARSER'),
   (1374612, 'SPARSER'),
   (1384894, 'SPARSER'),
   (1385662, 'SPARSER'),
   (1410052, 'SPARSER'),
   (1538471, 'SPARSER'),
   (1636061, 'SPARSER'),
   (1771416, 'SPARSER'),
   (1797345, 'SPARSER'),
   (1901309, 'SPARSER'),
   (2120002, 'SPARSER'),
   (2120013, 'SPARSER'),
   (2295771, 'SPARSER'),
   (2295778, 'SPARSER'),
   (2417438, 'SPARSER'),
   (2535032, 'SPARSER'),
   (2877812, 'SPARSER'),
   (2880859, 'SPARSER'),
   (3196649, 'SPARSER'),
   (3222152, 'SPARSER'),
   (3245641, 'SPARSER'),
   (3272694, 'SPARSER'),
   (3315248, 'SPARSER'),
   (

### describe_table('raw_statements')