In [81]:
import json
import codecs
import math
import numpy as np
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

In [97]:
with open('data/allMeSH_2022.json', "rb") as f:
    num_lines = sum(1 for _ in f)

num_lines

16218839

In [99]:
with codecs.open(json_file, 'r', encoding='utf-8', errors='ignore') as corpus:
    line_nr = 0
    for line in corpus:
        if line_nr > 16218829:
            print(line[:-2])
        line_nr += 1

{"journal":"Journal of bacteriology","meshMajor":["Cephalothin","Chloramphenicol","Drug Resistance, Microbial","Drug Stability","Enterobacter","Escherichia","Klebsiella","Pharmaceutical Preparations","Proteus","Salmonella","Shigella","Tetracycline"],"year":"1964","abstractText":"Wick, Warren E. (The Lilly Research Laboratories, Indianapolis, Ind.). Influence of antibiotic stability on the results of in vitro testing procedures. J. Bacteriol. 87:1162-1170. 1964.-Certain antibiotics undergo at least partial degradation under the conditions of in vitro testing procedures. With cephalothin used as an example, experimental evidence is presented to indicate the necessity for re-evaluation of results obtained from in vitro sensitivity testing methods for some antibiotics. The in vitro activity of cephalothin, tetracycline, and chloramphenicol against a variety of gram-negative bacteria is described. Plate counts demonstrate changes in the viable cell population over a 48-hr period in tubes of

In [80]:
# Ensure you have downloaded the necessary NLTK data files
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess(text):
    # Tokenize, remove stop words, and stem the text
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokens = word_tokenize(text.lower())
    tokens = [stemmer.stem(token) for token in tokens if token.isalnum() and token not in stop_words]
    return tokens

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/julian/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/julian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
json_file = 'data/allMeSH_2022.json'
list_counter = []

# showcase preprocessing
with codecs.open(json_file, 'r', encoding='utf-8', errors='ignore') as corpus:
    line_nr = 0
    for line in corpus:
        if line_nr > 0:
            line_json = json.loads(line[:-2])
            print(line_json["pmid"])
            print(preprocess(line_json["abstractText"]))
            list_counter.append(Counter(preprocess(line_json["abstractText"])))
            #print(line[:-2])
        line_nr += 1
        if line_nr >= 11:
            break

34823483
['background', 'worldwid', 'hypertens', 'disord', 'pregnanc', 'hdp', 'fetal', 'growth', 'restrict', 'fgr', 'preterm', 'birth', 'remain', 'lead', 'caus', 'matern', 'fetal', 'mortal', 'morbid', 'fetal', 'cardiac', 'deform', 'chang', 'first', 'sign', 'placent', 'dysfunct', 'associ', 'hdp', 'fgr', 'preterm', 'birth', 'addit', 'preterm', 'birth', 'like', 'associ', 'chang', 'electr', 'activ', 'across', 'uterin', 'muscl', 'therefor', 'fetal', 'cardiac', 'function', 'uterin', 'activ', 'use', 'earli', 'detect', 'complic', 'pregnanc', 'fetal', 'cardiac', 'function', 'uterin', 'activ', 'assess', 'echocardiographi', 'fetal', 'electrocardiographi', 'electrohysterographi', 'ehg', 'studi', 'aim', 'gener', 'refer', 'valu', 'ehg', 'paramet', 'second', 'trimest', 'pregnanc', 'investig', 'diagnost', 'potenti', 'paramet', 'earli', 'detect', 'hdp', 'fgr', 'preterm', 'longitudin', 'prospect', 'cohort', 'studi', 'elig', 'women', 'recruit', 'tertiari', 'care', 'hospit', 'primari', 'midwiferi', 'pract

In [100]:
def build_index(corpus_path):
    index = defaultdict(list)
    doc_ids = {}
    tf = {}
    df = Counter()
    total_docs = 0
    avg_dl = 0

    with codecs.open(corpus_path, 'r', encoding='utf-8', errors='ignore') as corpus:
        doc_id = -1
        for line in corpus:
            if doc_id > -1:
                line_json = json.loads(line[:-2])
                tokens = preprocess(line_json["abstractText"])

                doc_ids[doc_id] = line_json["pmid"]
                total_docs += 1
                avg_dl += len(tokens)

                # Calculate term frequency for the document
                tf[doc_id] = Counter(tokens)
                for token in set(tokens):
                    df[token] += 1
                    if doc_id not in index[token]:
                        index[token].append(doc_id)

            doc_id += 1
            if doc_id >= 100000:
                break
    # Calculate IDF: check with lecture slids which method
    idf = {token: math.log(total_docs / freq) for token, freq in df.items()}
    avg_dl = avg_dl / total_docs

    return index, doc_ids, tf, idf, avg_dl

In [101]:
index, doc_ids, tf, idf, avg_dl = build_index(json_file)

In [89]:
print(idf.get("hirschsprung", 0))

9.210340371976184


In [90]:
print(tf[0]["hirschsprung"])

0


In [91]:
def calc_scores(query, tf, idf, doc_ids, avg_dl, k1 = 1.5, b = 0.75):
    query_tokens = preprocess(query)

    scores = []

    nr_docs = len(doc_ids)
    for doc_id in range(nr_docs):
        score = 0
        for token in query_tokens:
            numerator = idf.get(token, 0) * tf[doc_id][token] * (k1 + 1)
            denominator = tf[doc_id][token] + k1 * (1-b+b*nr_docs/avg_dl)
            score += numerator / denominator
        scores.append(score)
    return scores

In [102]:
scores = calc_scores("Is Hirschsprung disease a mendelian or a multifactorial disorder?", tf, idf, doc_ids, avg_dl)

In [103]:
np.array(scores).argsort()[::-1][:100]

array([ 4393, 32519, 56329, 28015,  8112, 32430, 20705, 99219, 86489,
       12550,  1078,  8650, 18828, 24153,  5523,   675, 27617, 72717,
       30584, 14829,  7850, 17477, 51264, 78976, 93669,  4674, 27835,
       75720, 92379, 28803, 31679, 13747, 62475, 22590, 72817, 66776,
       34441, 73005, 89728, 52068,    46, 50776, 30623,  5406, 15981,
       35537, 11168, 10936, 19899, 31942, 81135, 76997, 72234, 49841,
        8006, 41076, 49050, 28025, 91050, 96974, 17654, 55831, 76223,
       67767, 43003, 14019, 89595, 49117, 73332, 75469, 96825, 27252,
       12992,   952, 97717, 43085, 51369, 24268, 77000, 31488, 26972,
        9523, 23727, 36188, 79966, 22853, 53691, 18348, 82363, 93782,
       40055, 76704, 28942, 13076, 96771, 20326, 13435, 23763, 83888,
       82954])

In [104]:
doc_ids[4393]

'34634250'