# Calculate Odds Ratios for each Square Bin

In [1]:
%load_ext autoreload
%autoreload 2

import csv
from collections import Counter, defaultdict
import json
import lzma
from multiprocessing import Process, Manager
from pathlib import Path
import pickle
import re
import sys
from threading import Thread

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

from annorxiver_modules.corpora_comparison_helper import (
    aggregate_word_counts,
    get_term_statistics,
)

from annorxiver_modules.word_bin_helper import lemmatize_tokens, write_lemma_counts

# Gather Paper Bins Dataframe

In [2]:
pmc_df = pd.read_csv("output/paper_dataset/paper_dataset_tsne_square.tsv", sep="\t")
print(pmc_df.shape)
pmc_df.head()

(1809901, 5)


Unnamed: 0,dim1,dim2,journal,document,squarebin_id
0,18.007381,-18.416918,Med_Hist_Suppl,PMC2557413,0
1,18.007381,-18.416918,Med_Hist_Suppl,PMC2530993,0
2,18.007381,-18.416918,Med_Hist_Suppl,PMC2557483,0
3,18.007381,-18.416918,Med_Hist_Suppl,PMC2642002,0
4,18.007381,-18.416918,Med_Hist_Suppl,PMC2530995,0


In [3]:
word_count_folder = Path("../pmc_corpus/pmc_word_counts/")

In [4]:
word_counter_file = "output/app_plots/global_doc_word_counter.tsv.xz"
field_names = ["document", "lemma", "count"]
n_jobs = 3
QUEUE_SIZE = 75000  # Queue Size if too big then will need to make smaller
doc_xpath = "//abstract/sec/*|//abstract/p|//body/sec/*|//body/p"

In [5]:
with Manager() as m:

    # Set up the Queue
    doc_path_queue = m.JoinableQueue(QUEUE_SIZE)
    lemma_queue = m.JoinableQueue(QUEUE_SIZE)

    # Start the document object feeder
    t = Thread(
        target=write_lemma_counts,
        args=(word_counter_file, field_names, lemma_queue, n_jobs),
    )
    t.start()

    running_jobs = []
    # Start the jobs
    for job in range(n_jobs):
        p = Process(
            target=lemmatize_tokens, args=(doc_xpath, doc_path_queue, lemma_queue)
        )
        running_jobs.append(p)
        p.start()

    for idx, row in tqdm_notebook(pmc_df.iterrows()):
        doc_path = f"../journals/{row['journal']}/{row['document']}.nxml"
        doc_path_queue.put(doc_path)

    # Poison pill to end running jobs
    for job in running_jobs:
        doc_path_queue.put(None)

    # Wait for jobs to finish
    for job in running_jobs:
        job.join()

    # Wait until thread is done running
    t.join()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [14]:
with lzma.open(word_counter_file, "rt") as infile:
    reader = csv.DictReader(infile, delimiter="\t")

    background_bin_dictionaries = defaultdict(Counter)
    word_bin_dictionaries = {
        squarebin_id: defaultdict(Counter)
        for squarebin_id in pmc_df.squarebin_id.unique()
    }

    document_mapper = dict(zip(pmc_df.document.tolist(), pmc_df.squarebin_id.tolist()))

    for line in tqdm_notebook(reader):
        squarebin_id = document_mapper[line["document"]]
        background_bin_dictionaries.update({line["lemma"]: int(line["count"])})
        word_bin_dictionaries[squarebin_id].update({line["lemma"]: int(line["count"])})

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [15]:
cutoff_score = 20
background_sum = sum(background_bin_dictionaries.values())
bin_ratios = {}

for squarebin in tqdm_notebook(word_bin_dictionaries):

    bin_dict = word_bin_dictionaries[squarebin]
    bin_sum = sum(word_bin_dictionaries[squarebin].values())

    # Try and filter out low count tokens to speed function up
    filtered_bin_dict = {
        lemma: bin_dict[lemma] for lemma in bin_dict if bin_dict[lemma] > cutoff_score
    }

    if len(filtered_bin_dict) > 0:
        bin_dict = filtered_bin_dict

    # Calculate odds ratio
    bin_words = set(bin_dict.keys())
    background_words = set(background_bin_dictionaries.keys())
    words_to_compute = bin_words & background_words

    word_odd_ratio_records = []
    for idx, word in enumerate(words_to_compute):
        top = float(bin_dict[word] * background_sum)
        bottom = float(background_bin_dictionaries[word] * bin_sum)
        word_odd_ratio_records.append(
            {"lemma": word, "odds_ratio": np.log(top / bottom)}
        )

    sorted(word_odd_ratio_records, key=lambda x: x["odds_ratio"], reverse=True)
    bin_ratios[squarebin] = word_odd_ratio_records[0:20]

HBox(children=(IntProgress(value=0, max=512), HTML(value='')))




# Insert Bin Word Associations in JSON File

In [16]:
square_bin_plot_df = pd.read_json(
    open(Path("output") / Path("app_plots") / Path("pmc_square_plot.json"))
)
square_bin_plot_df.head()

Unnamed: 0,x,y,xmin,xmax,ymin,ymax,count,bin_id,pc,journal
0,18.275,-18.275,17.85,18.7,-18.7,-17.85,2044,0,"[{'score': -0.399191306, 'pc': '03'}, {'score'...","{'Med_Hist_Suppl': 40, 'Yale_J_Biol_Med': 482,..."
1,-7.225,-14.875,-7.65,-6.8,-15.3,-14.45,2,1,"[{'score': -0.48889934160000004, 'pc': '04'}, ...",{'Comp_Funct_Genomics': 2}
2,-7.225,-13.175,-7.65,-6.8,-13.6,-12.75,1,2,"[{'score': 0.3814907737, 'pc': '02'}, {'score'...",{'Comp_Funct_Genomics': 1}
3,-1.275,-13.175,-1.7,-0.8499999,-13.6,-12.75,1,3,"[{'score': 0.5376298982000001, 'pc': '01'}, {'...",{'Comp_Funct_Genomics': 1}
4,-0.425,-12.325,-0.85,8.5e-08,-12.75,-11.9,2,4,"[{'score': 0.6109766762000001, 'pc': '01'}, {'...","{'Int_J_Biomed_Imaging': 1, 'Comput_Math_Metho..."


In [17]:
bin_odds_df = pd.DataFrame.from_records(
    [{"bin_id": key, "bin_odds": bin_ratios[key]} for key in bin_ratios]
)
bin_odds_df.head()

Unnamed: 0,bin_id,bin_odds
0,0,"[{'lemma': ' ', 'odds_ratio': 16.517769203672..."
1,1,"[{'lemma': 'question', 'odds_ratio': 13.217313..."
2,2,"[{'lemma': 'new', 'odds_ratio': 15.76399740129..."
3,3,"[{'lemma': 'include', 'odds_ratio': 12.4234246..."
4,4,"[{'lemma': 'predict', 'odds_ratio': 12.9987887..."


In [18]:
(
    square_bin_plot_df.merge(bin_odds_df, on=["bin_id"]).to_json(
        Path("output") / Path("app_plots") / Path("pmc_square_plot.json"),
        orient="records",
        lines=False,
    )
)