# Calculate Odds Ratios for each Square Bin

In [1]:
%load_ext autoreload
%autoreload 2

from collections import Counter
import csv
import json
from pathlib import Path
import re

import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm_notebook

from annorxiver_modules.corpora_comparison_helper import (
    aggregate_word_counts,
    get_term_statistics
)

# Gather Paper Bins Dataframe

In [2]:
# Read the tab-separated table that assigns every paper to a square bin,
# then preview the first rows.
paper_bins_path = "output/paper_dataset/paper_dataset_tsne_square.tsv"
pmc_df = pd.read_csv(paper_bins_path, sep="\t")
pmc_df.head()

Unnamed: 0,dim1,dim2,journal,document,squarebin_id
0,-18.124933,-24.866093,Cancer_Inform,PMC2675495,0
1,-14.685319,-25.280441,Proteomics,PMC4230411,1
2,-10.54606,-24.52709,Chemphyschem,PMC5129479,2
3,-10.54606,-24.52709,Chembiochem,PMC4736454,2
4,-10.54606,-24.52709,ChemistryOpen,PMC5715300,2


In [3]:
# Folder from which the per-document `<document>.tsv` word-count files are read.
word_count_folder = Path("../pmc_corpus/pmc_word_counts/")

In [4]:
# Group the papers by square bin so each bin's documents can be processed together.
bin_group = pmc_df.groupby("squarebin_id")

In [5]:
# Load spaCy's small English pipeline and copy its default stop words into a
# list; the loops that follow use it to drop stop words from the counts.
spacy_nlp = spacy.load('en_core_web_sm')
stop_word_list = list(spacy_nlp.Defaults.stop_words)

In [6]:
# Build the corpus-wide lemma counts by accumulating the filtered word counts
# of every square bin.
global_word_counter = Counter()

# Loop invariants hoisted out of the per-bin / per-term work:
# - set membership is O(1), whereas scanning the stop-word list is O(n) per term
# - the resolved folder path is the same for every document
stop_words = set(stop_word_list)
resolved_word_count_folder = word_count_folder.resolve()

for name, group in tqdm_notebook(bin_group):
    # One word-count file per document assigned to this bin.
    files = [
        f"{resolved_word_count_folder}/{doc}.tsv"
        for doc in group.document.tolist()
    ]

    agg_word_count = aggregate_word_counts(files, disable_progressbar=True)

    # Keys are indexed as term[0] (kept as the lemma) and term[1] (entries
    # equal to 'SPACE' are dropped); stop words are removed as well.
    # NOTE(review): keying by term[0] alone means that if one lemma occurs
    # under several term[1] values, the later count overwrites the earlier one
    # instead of being summed -- confirm this is intended. The per-bin loop
    # later in this notebook builds its counts the same way, so change both
    # together or not at all.
    filtered_agg_word_count = {
        term[0]: agg_word_count[term]
        for term in agg_word_count
        if term[1] != 'SPACE' and term[0] not in stop_words
    }

    # Counter.update adds counts (it does not replace them).
    global_word_counter.update(filtered_agg_word_count)

HBox(children=(IntProgress(value=0, max=1478), HTML(value='')))




In [8]:
# For every square bin, compare the bin's lemma counts against the counts of
# the same lemmas in the rest of the corpus and write the resulting term
# statistics to one TSV file per bin.

# Loop invariants hoisted out of the per-bin / per-term work:
# - set membership is O(1), whereas scanning the stop-word list is O(n) per term
# - the resolved folder path is the same for every document
stop_words = set(stop_word_list)
resolved_word_count_folder = word_count_folder.resolve()

# to_csv does not create missing directories, so make sure the target exists.
Path("output/word_odds").mkdir(parents=True, exist_ok=True)

for bin_id, group in tqdm_notebook(bin_group):
    # One word-count file per document assigned to this bin.
    files = [
        f"{resolved_word_count_folder}/{doc}.tsv"
        for doc in group.document.tolist()
    ]

    agg_word_count = aggregate_word_counts(files, disable_progressbar=True)

    # Keep term[0] as the lemma; drop entries whose term[1] is 'SPACE', lemmas
    # whose repr contains a backslash (escape sequences or literal
    # backslashes), and stop words.
    # NOTE(review): keying by term[0] alone lets a later entry overwrite an
    # earlier one when a lemma occurs under several term[1] values -- confirm
    # this is intended; the global counter is built the same way.
    filtered_agg_word_count = {
        term[0]: agg_word_count[term]
        for term in agg_word_count
        if term[1] != 'SPACE'
        and "\\" not in repr(term[0])
        and term[0] not in stop_words
    }

    bin_counter = Counter(filtered_agg_word_count)

    # Counts of this bin's lemmas in all *other* bins. Counter subtraction
    # keeps only positive results, so lemmas found exclusively in this bin
    # are absent from remaining_words.
    remaining_words = (
        Counter({
            term: global_word_counter[term]
            for term in filtered_agg_word_count
        }) - bin_counter
    )

    bin_df = (
        pd.DataFrame.from_dict(
            dict(bin_counter),
            orient="index",
            columns=["count"]
        )
        .rename_axis("lemma")
        .reset_index()
    )

    # Background table restricted to lemmas that still have a positive count
    # outside this bin.
    background_df = (
        pd.DataFrame.from_dict(
            {
                key: remaining_words[key]
                for key in bin_counter
                if key in remaining_words
            },
            orient="index",
            columns=["count"]
        )
        .rename_axis("lemma")
        .reset_index()
    )

    # Calculate the odds ratio.
    # NOTE(review): the meaning of the positional 100 is defined by
    # get_term_statistics and is not visible here -- confirm. The keyword
    # spelling `psudeocount` is kept as-is because it must match the helper's
    # parameter name.
    word_odds_df = get_term_statistics(
        bin_df,
        background_df,
        100,
        psudeocount=1,
        disable_progressbar=True
    )

    word_odds_df.to_csv(
        f"output/word_odds/word_odds_bin_{bin_id}.tsv",
        sep="\t", index=False
    )

HBox(children=(IntProgress(value=0, max=1478), HTML(value='')))




# Insert Bin Word Associations in JSON File

In [2]:
# Load the square-bin plot records. The file is opened in a `with` block so
# the handle is closed once pandas has read it (the bare open() call that was
# passed inline before was never closed).
with open(
    Path("output")/
    Path("app_plots")/
    Path("pmc_square_plot.json")
) as plot_file:
    square_bin_plot_df = pd.read_json(plot_file)

square_bin_plot_df.head()

Unnamed: 0,x,y,xmin,xmax,ymin,ymax,count,bin_id,pc,journal
0,-18.275,-25.075,-18.7,-17.85,-25.5,-24.65,1,0,"[{'score': 0.3609571261, 'pc': '01'}, {'score'...",{'Cancer_Inform': 1}
1,-14.875,-25.075,-15.3,-14.45,-25.5,-24.65,1,1,"[{'score': 0.5555347419000001, 'pc': '01'}, {'...",{'Proteomics': 1}
2,-10.625,-24.225,-11.05,-10.2,-24.65,-23.8,38,2,"[{'score': 0.6018766422, 'pc': '01'}, {'score'...","{'Chemphyschem': 1, 'Chembiochem': 1, 'Chemist..."
3,-15.725,-23.375,-16.15,-15.3,-23.8,-22.95,2,3,"[{'score': 0.575093618, 'pc': '01'}, {'score':...","{'Glob_Chall': 1, 'Adv_Sci_(Weinh)': 1}"
4,-14.875,-23.375,-15.3,-14.45,-23.8,-22.95,5,4,"[{'score': 0.4791479219, 'pc': '01'}, {'score'...","{'Neth_Heart_J': 1, 'Genome_Biol': 2, 'J_Biol_..."


In [3]:
# For each bin listed in the plot table, read that bin's word-odds file and
# keep the 15 rows with the largest odds ratio as a list of
# {"lemma", "odds_ratio"} dictionaries. The outer list follows the order of
# square_bin_plot_df.bin_id.
lemma_bin_records = []
for bin_id in tqdm_notebook(square_bin_plot_df.bin_id.tolist()):
    odds_table = pd.read_csv(
        f"output/word_odds/word_odds_bin_{bin_id}.tsv",
        sep="\t"
    )

    ranked_table = odds_table.sort_values("odds_ratio", ascending=False)
    top_terms = ranked_table.head(15)[["lemma", "odds_ratio"]]

    bin_entries = []
    for lemma, odds_ratio in zip(top_terms.lemma, top_terms.odds_ratio):
        bin_entries.append({"lemma": lemma, "odds_ratio": odds_ratio})

    lemma_bin_records.append(bin_entries)

HBox(children=(IntProgress(value=0, max=1478), HTML(value='')))




In [8]:
# Attach each bin's top-lemma list as a `bin_odds` column and write the table
# back to the plot JSON file as a single array of records.
plot_json_path = (
    Path("output")
    / Path("app_plots")
    / Path("pmc_square_plot.json")
)
plot_with_odds_df = square_bin_plot_df.assign(bin_odds=lemma_bin_records)
plot_with_odds_df.to_json(plot_json_path, orient='records', lines=False)