In [1]:
import pickle

import pandas as pd
from sqlalchemy import text as sql_text

from indra_db.util import get_db, get_ro

In [2]:
db = get_db('primary')

In [3]:
# SQL that lists every table outside PostgreSQL's own system schemas
# (pg_catalog and information_schema).
show_tables = """SELECT *
                 FROM
                     pg_catalog.pg_tables
                 WHERE
                     schemaname != 'pg_catalog'
                 AND schemaname != 'information_schema';
              """

In [4]:
# Raw SQL strings must be wrapped in text() for SQLAlchemy 1.4+/2.0;
# passing a bare str to Session.execute is deprecated and later rejected.
res = db.session.execute(sql_text(show_tables))

In [5]:
list(res)

[('public', 'text_ref', 'tester', None, True, False, True, False),
 ('public', 'raw_muts', 'tester', None, True, False, True, False),
 ('public', 'rejected_statements', 'tester', None, True, False, True, False),
 ('public', 'reading_updates', 'tester', None, True, False, False, False),
 ('public', 'raw_mods', 'tester', None, True, False, True, False),
 ('public', 'raw_agents', 'tester', None, True, False, True, False),
 ('public', 'text_content', 'tester', None, True, False, True, False),
 ('public', 'mesh_ref_annotations', 'tester', None, True, False, True, False),
 ('public', 'source_file', 'tester', None, True, False, False, False),
 ('public', 'updates', 'tester', None, True, False, False, False),
 ('public', 'pa_agents', 'tester', None, True, False, True, False),
 ('public', 'raw_unique_links', 'tester', None, True, False, True, False),
 ('public', 'preassembly_updates', 'tester', None, True, False, False, False),
 ('public', 'pa_mods', 'tester', None, True, False, True, False),
 

In [6]:
def describe_table(tablename):
    """Return the column names and data types of a database table.

    The lookup uses the module-level ``db`` handle and matches on table name
    only (it is not schema-qualified), so tables with the same name in
    different schemas are reported together.

    Parameters
    ----------
    tablename : str
        Name of the table to look up in ``information_schema.columns``.

    Returns
    -------
    list
        One ``(column_name, data_type)`` row per column, ordered by the
        column's position in the table. Empty if no table matches.
    """
    # text() is required for raw SQL strings in SQLAlchemy 1.4+/2.0 and is
    # what makes the :tablename bind parameter explicit. ORDER BY makes the
    # column order deterministic instead of relying on scan order.
    query = sql_text("""
            SELECT
                column_name, data_type
            FROM
                information_schema.columns
            WHERE
                table_name = :tablename
            ORDER BY
                ordinal_position;
            """)
    res = db.session.execute(query, {'tablename': tablename})
    return list(res)

This query does a self-join on the raw agents table to find the texts grounded to each HGNC gene, along with the statement IDs for these texts.

In [None]:
db.session.rollback()

In [None]:
# Pair each agent's raw TEXT entry with its HGNC entry (same statement, same
# agent slot) and attach the reader that produced the statement.
query = """
        SELECT x.stmt_id, x.ag_num, x.db_id, y.db_id, w.reader
        FROM
            raw_agents x
        INNER JOIN
            raw_agents y
        ON x.stmt_id = y.stmt_id AND x.ag_num = y.ag_num
        INNER JOIN
            raw_statements z
        ON x.stmt_id = z.id
        INNER JOIN
            reading w
        ON z.reading_id = w.id
        WHERE
            x.db_name = 'TEXT' AND y.db_name = 'HGNC'
        AND x.stmt_id IS NOT NULL
        """
# sql_text() is required for raw SQL strings by SQLAlchemy 1.4+/2.0. Rows are
# converted to plain tuples so that the pickled cache written from this list
# does not depend on SQLAlchemy's row class being importable at load time.
text2grounding = [tuple(row) for row in db.session.execute(sql_text(query))]

The output of this query has been pickled, so we don't need to run it multiple times.

In [None]:
with open('text2grounding.pkl', 'wb') as f:
    pickle.dump(text2grounding, f)

In [None]:
text2grounding[2]

In [7]:
# Reload the cached query result instead of re-running the database query.
# NOTE: pickle.load can execute arbitrary code; only load a file that was
# written by this notebook itself.
with open('text2grounding.pkl', 'rb') as f:
    text2grounding = pickle.load(f)

In [8]:
from collections import defaultdict
from indra.databases.hgnc_client import get_hgnc_name

In [9]:
hgnc_texts = defaultdict(lambda: defaultdict(set))

In [10]:
# Fill hgnc_texts as gene symbol -> agent text -> {(stmt_id, reader), ...}.
# HGNC ids for which no gene name is returned are collected in `bad`.
bad = []
for stmt_id, _, text, hgnc_id, reader in text2grounding:
    name = get_hgnc_name(hgnc_id)
    if not name:
        bad.append(hgnc_id)
        continue
    hgnc_texts[name][text].add((stmt_id, reader))

In [11]:
import nltk
from nltk.corpus import words

In [12]:
wordset = set(words.words())

Get all genes that were grounded from a common English word. Keep only texts with more than 10 statements. For each gene, calculate the proportion of its statements that came from a common English word.

In [13]:
# For every gene that has at least one well-supported agent text which is also
# an English word, record:
#   (texts with > 10 statements, overlapping words, proportion, total count)
word_texts = {}
for gene, info in hgnc_texts.items():
    # Denominator: all statements for the gene, across every agent text.
    total_stmt_count = sum(len(stmts) for stmts in info.values())
    filtered_info = {
        text: stmts
        for text, stmts in info.items()
        if len(stmts) > 10
    }
    texts = set(filtered_info)
    overlap = texts & wordset
    if not overlap:
        continue
    # Numerator: statements whose agent text is one of the English words.
    count4word_texts = sum(
        len(stmts) for text, stmts in filtered_info.items() if text in overlap
    )
    proportion = count4word_texts/total_stmt_count
    word_texts[gene] = (filtered_info, tuple(overlap), proportion, total_stmt_count)

In [14]:
# Genes ordered from the highest to the lowest proportion of statements that
# came from an English word (element 2 of each word_texts value).
worst_words = sorted(word_texts.items(), key=lambda item: item[1][2], reverse=True)

In [15]:
# One row per gene: symbol, comma-joined English words, proportion, total count.
rows = [
    [gene, ','.join(info[1]), info[2], info[3]]
    for gene, info in worst_words
]
df = pd.DataFrame(rows, columns=['gene', 'english_words', 'proportion', 'total_stmts'])

In [None]:
df[df.proportion > 0.8].to_csv('worst_english_words.tsv', sep='\t', index=False)

In [16]:
from indra_db.util.content_scripts import get_text_content_from_stmt_ids
from indra.literature.adeft_tools import get_text_content_for_gene, universal_extract_text

In [None]:
THG1L_texts = get_text_content_for_gene('THG1L')

In [None]:
texts = [universal_extract_text(text) for text in THG1L_texts]

In [None]:
from sklearn.svm import OneClassSVM
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF features (unigrams and bigrams, English stop words removed) feeding a
# one-class SVM.
tfidf_step = TfidfVectorizer(ngram_range=(1, 2),
                             max_features=10000,
                             stop_words='english')
osvm_step = OneClassSVM(gamma='scale')
pipeline = Pipeline([('tfidf', tfidf_step), ('osvm', osvm_step)])

In [None]:
# Hyperparameter grid for the tfidf -> osvm pipeline.
# 'osvm__degree' was removed: degree only affects the polynomial kernel, and
# the pipeline's OneClassSVM uses the default RBF kernel, so searching over it
# doubled the number of fits without changing any model.
params = {'tfidf__max_features': [100, 1000],
          'tfidf__ngram_range': [(1, 1), (1, 2)],
          'osvm__nu': [0.05, 0.1, 0.15, 0.3]}

In [None]:
# 5-fold cross-validated grid search over `params`, run with 8 parallel jobs.
# NOTE(review): the fit cells pass all-ones labels, so 'accuracy' here
# presumably measures the fraction of held-out texts predicted as inliers --
# confirm this is the intended model-selection criterion.
grid_search = GridSearchCV(pipeline, params, cv=5, n_jobs=8, scoring='accuracy')

In [None]:
grid_search.fit(texts, [1]*len(texts))

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_score_

In [None]:
# Statement ids (first element of each (stmt_id, reader) pair) for the agent
# text 'foci' of the second-ranked gene in worst_words.
THG1L_stmt_ids = [stmt_id for stmt_id, _reader in worst_words[1][1][0]['foci']]

In [None]:
THG1L_foci_texts = get_text_content_from_stmt_ids(THG1L_stmt_ids)

In [None]:
db_texts = list(THG1L_foci_texts[1].values())

In [None]:
db_texts = [universal_extract_text(t) for t in db_texts if t]

In [None]:
preds = grid_search.predict(db_texts)

In [None]:
p = [(x+1)/2 for x in preds]

In [None]:
sum(p)/len(p)

In [None]:
db_texts[1]

In [None]:
F13B_texts = get_text_content_for_gene('F13B')

In [None]:
# Run the training content through universal_extract_text first, exactly as is
# done for the THG1L and INS training texts, so the model is fit on extracted
# text rather than on the raw content returned by get_text_content_for_gene.
F13B_plain_texts = [universal_extract_text(text) for text in F13B_texts]
grid_search.fit(F13B_plain_texts, [1]*len(F13B_plain_texts))

In [None]:
grid_search.best_score_

In [None]:
worst_words[0][0]

In [None]:
# Statement ids (first element of each (stmt_id, reader) pair) for the agent
# text 'fibrin' of the top-ranked gene in worst_words.
F13B_stmt_ids = [stmt_id for stmt_id, _reader in worst_words[0][1][0]['fibrin']]

In [None]:
F13B_db_texts = get_text_content_from_stmt_ids(F13B_stmt_ids)

In [None]:
F13B_db_texts = list(F13B_db_texts[1].values())

In [None]:
F13B_db_texts[10]

In [None]:
F13B_db_texts = [universal_extract_text(t) for t in F13B_db_texts if t]

In [None]:
preds = grid_search.predict(F13B_db_texts)

In [None]:
preds = [(x+1)/2 for x in preds]

In [None]:
sum(preds)/len(preds)

In [None]:
F13B_db_texts[50]

In [None]:
list(worst_words[0][1][0].keys())

In [None]:
list(hgnc_texts['F13B'].keys())

In [None]:
hgnc_texts['F13B']['F13b']

In [None]:
insulin_texts = get_text_content_for_gene('INS')

In [None]:
insulin_texts = [universal_extract_text(text) for text in insulin_texts]

In [None]:
pipeline.fit(insulin_texts)

In [None]:
df.head()

In [None]:
df.head(20)

In [None]:
# Statement ids (first element of each (stmt_id, reader) pair) for the agent
# text 'insulin' of the entry at index 15 of worst_words.
INS_stmts = [stmt_id for stmt_id, _reader in worst_words[15][1][0]['insulin']]

In [None]:
INS_db_texts = get_text_content_from_stmt_ids(INS_stmts)

In [None]:
db_texts = [universal_extract_text(t) for t in INS_db_texts[1].values() if t]

In [None]:
preds = pipeline.predict(db_texts)

In [None]:
sum([(x+1)/2 for x in preds])/len(preds)

In [None]:
worst_words[8][1]

In [18]:
db.session.rollback()

In [20]:
describe_table('raw_agents')

[('id', 'integer'),
 ('stmt_id', 'integer'),
 ('db_name', 'character varying'),
 ('db_id', 'character varying'),
 ('ag_num', 'integer'),
 ('role', 'character varying')]

In [19]:
describe_table('raw_statements')

[('id', 'integer'),
 ('uuid', 'character varying'),
 ('batch_id', 'integer'),
 ('mk_hash', 'bigint'),
 ('text_hash', 'bigint'),
 ('source_hash', 'bigint'),
 ('db_info_id', 'integer'),
 ('reading_id', 'bigint'),
 ('type', 'character varying'),
 ('indra_version', 'character varying'),
 ('json', 'bytea'),
 ('create_date', 'timestamp without time zone')]

In [48]:
from sqlalchemy import text as sql_text

In [31]:
# Ids of 'Complex' statements that have a raw agent whose TEXT entry matches
# 'MYL_' (in SQL LIKE, '_' matches exactly one character). A statement id
# appears once per matching agent, so duplicates are possible.
query = """
        SELECT stmt.id
        FROM
            raw_statements stmt
        INNER JOIN
            (SELECT * FROM raw_agents ra
             WHERE 
                 ra.db_name = 'TEXT' AND
                 ra.db_id LIKE 'MYL_' AND
                 ra.stmt_id IS NOT NULL) myl
        ON stmt.id = myl.stmt_id
        WHERE
            stmt.type = 'Complex'
        """
# Raw SQL strings must be wrapped in text() for SQLAlchemy 1.4+/2.0, matching
# how the myl_binders query below is executed.
myl_statements = list(db.session.execute(sql_text(query)))

In [43]:
myl_statements = [x[0] for x in myl_statements]

In [44]:
myl_statements

[1588298,
 630404,
 727638,
 1300348,
 3220238,
 2122166,
 3220239,
 3732159,
 3220241,
 3460408,
 3644367,
 2122166,
 3406339,
 3732119,
 4316383,
 4316383,
 9001130,
 9468876,
 9332344,
 9468896,
 12764781,
 10571812,
 11050455,
 11927064,
 12670277,
 10875295,
 10950823,
 15190087,
 12670277,
 12764781,
 13192177,
 13192178,
 13507319,
 15742542,
 15938982,
 13669793,
 16193140,
 17377778,
 17929576,
 17459630,
 17459633,
 21847662,
 18651888,
 18651889,
 18703343,
 19574845,
 20071602,
 20784443,
 20784448,
 21154623,
 18651882,
 19285091,
 19343200,
 19382034,
 20842389,
 21404102,
 21924344,
 22381540,
 22381547,
 22403359,
 22449565,
 22449570,
 22449573,
 22449579,
 22449585,
 24880576,
 22461951,
 24880583,
 26775344,
 25577698,
 26180323,
 26180323,
 26616054,
 25624507,
 26551956,
 30237880,
 30268535,
 30268539,
 30268542,
 30237880,
 30268535,
 30268539,
 30268542]

In [54]:
# Fetch the db_id of every raw agent row attached to one of the statements in
# myl_statements. There is no db_name filter, so ids from every namespace
# stored for those agents are returned together.
query = """
        SELECT db_id 
        FROM
            raw_agents
        WHERE
            stmt_id = ANY(:stmts)
        """
# myl_statements is bound to the :stmts parameter as a list.
myl_binders = list(db.session.execute(sql_text(query), {'stmts': myl_statements}))

In [52]:
db.session.rollback()

In [55]:
myl_binders

[('miRNA-29a-3p',),
 ('miRNA-29a-3p',),
 ('Q15746',),
 ('7590',),
 ('MYLK',),
 ('MYLK',),
 ('P60660',),
 ('Q9UBC3',),
 ('DNMT3B',),
 ('2979',),
 ('DNMT3B',),
 ('Q15746',),
 ('MYLK',),
 ('7590',),
 ('MYLK',),
 ('P05556',),
 ('ITGB1',),
 ('6153',),
 ('ITGB1',),
 ('P15692',),
 ('VEGFA',),
 ('12680',),
 ('VEGFA',),
 ('Q15746',),
 ('MYLK',),
 ('7590',),
 ('MYLK',),
 ('Q9V3E7',),
 ('ALI',),
 ('Ref1',),
 ('7587',),
 ('MYL6',),
 ('MYL6',),
 ('P35579',),
 ('7579',),
 ('MYH9',),
 ('MYH9',),
 ('P05091',),
 ('SPP1',),
 ('SPP1',),
 ('P10916',),
 ('MYL2',),
 ('7583',),
 ('MYL2',),
 ('P05091',),
 ('ALDH2',),
 ('404',),
 ('ALDH2',),
 ('P15516',),
 ('Pb',),
 ('5284',),
 ('HTN3',),
 ('P10916',),
 ('MYL2',),
 ('7583',),
 ('MYL2',),
 ('ALDH2',),
 ('404',),
 ('ALDH2',),
 ('P15516',),
 ('Pb',),
 ('5284',),
 ('HTN3',),
 ('P10916',),
 ('MYL2',),
 ('7583',),
 ('MYL2',),
 ('P10916',),
 ('MYL2',),
 ('7583',),
 ('MYL2',),
 ('P05091',),
 ('ALDH2',),
 ('404',),
 ('ALDH2',),
 ('P15516',),
 ('Pb',),
 ('5284',),
 ('HT

In [62]:
[x for x in worst_words if 'light' in x[1][0]]

[('TNFSF14',
  ({'Light': {(1594793, 'REACH'),
     (24196521, 'REACH'),
     (11162151, 'REACH'),
     (29708839, 'REACH'),
     (13507377, 'REACH'),
     (26284051, 'REACH'),
     (25103105, 'REACH'),
     (27001709, 'REACH'),
     (1437342, 'REACH'),
     (11399948, 'REACH'),
     (29572999, 'REACH'),
     (13670338, 'REACH'),
     (20101058, 'REACH'),
     (15307824, 'REACH'),
     (16332247, 'REACH'),
     (23311831, 'REACH'),
     (11077197, 'REACH'),
     (30650215, 'REACH'),
     (26739477, 'REACH'),
     (27427974, 'REACH'),
     (645428, 'REACH'),
     (17090210, 'REACH'),
     (3505967, 'REACH'),
     (27606831, 'REACH'),
     (30155293, 'REACH'),
     (9322405, 'REACH'),
     (29269242, 'REACH'),
     (15251432, 'REACH'),
     (21690344, 'REACH'),
     (19298406, 'REACH'),
     (24039633, 'REACH'),
     (19215503, 'REACH'),
     (24669419, 'REACH'),
     (28447032, 'REACH'),
     (24902177, 'REACH'),
     (28062016, 'REACH'),
     (13168269, 'REACH'),
     (22144712, 'REACH

In [63]:
type(hgnc_texts)

collections.defaultdict

In [74]:
crys = [(x, y) for x, y in hgnc_texts.items() if 'crystallin' in y]

In [75]:
len(crys)

1

In [76]:
crys

[('CRYGC',
  defaultdict(set,
              {'gammaC-crystallin': {(149076, 'REACH'),
                (9442622, 'REACH'),
                (9442624, 'REACH'),
                (16212257, 'REACH'),
                (17729323, 'REACH'),
                (17729328, 'REACH'),
                (17729329, 'REACH'),
                (19477920, 'REACH'),
                (19477922, 'REACH')},
               'crystallin': {(198061, 'SPARSER'),
                (230764, 'SPARSER'),
                (673705, 'SPARSER'),
                (698313, 'SPARSER'),
                (1150382, 'SPARSER'),
                (1150384, 'SPARSER'),
                (1152838, 'SPARSER'),
                (1532585, 'SPARSER'),
                (1538478, 'SPARSER'),
                (1538481, 'SPARSER'),
                (1541240, 'SPARSER'),
                (1541242, 'SPARSER'),
                (1541243, 'SPARSER'),
                (1541244, 'SPARSER'),
                (1541245, 'SPARSER'),
                (1635900, 'SPARSER'),
 