In [1]:
import os
import json

from typing import List

from collections import Counter
from dataclasses import dataclass
from dataclasses import field

from nltk.tokenize import word_tokenize

In [2]:
@dataclass
class SentCounter:
    lengths: Counter = field(default_factory=Counter)
    punctuation_marks: Counter = field(default_factory=Counter)
    brackets: Counter = field(default_factory=Counter)
    long_sents: List = field(default_factory=list)
    short_sents: List = field(default_factory=list)
    many_punctuations: List = field(default_factory=list)
    many_brackets: List = field(default_factory=list)
    total: int = 0

In [3]:
tagged_sent_path = os.path.join('data', 'tagged_sentences')

sent_counter = SentCounter()

for index in range(1, 1016):
    s_i = str(index)
    zeros = "0" * (4 - len(s_i))
    tagged_sent_file = f"tagged_pubmed20n{zeros + s_i}.json"
    file_path = os.path.join(tagged_sent_path, tagged_sent_file)
    with open(file_path, "r") as json_file:
        file_data = json.load(json_file)
    #count = 0
    for sent in file_data:
        sent_token = word_tokenize(sent["sentence"])
        punctuation_marks = 0
        number_brackets = 0
        for token in sent_token:
            if token in "!;,?.-–":
                punctuation_marks += 1
            if token in "()[]{}":
                number_brackets += 1
        sent_counter.lengths[f"{len(sent_token)}"] += 1
        sent_counter.punctuation_marks[f"{punctuation_marks}"] += 1
        sent_counter.brackets[f"{number_brackets}"] += 1
        if len(sent_token) >= 200:
            sent_counter.long_sents.append(sent)
        if len(sent_token) <= 5:
            sent_counter.short_sents.append(sent)
        if punctuation_marks >= 100:
            sent_counter.many_punctuations.append(sent)
        if number_brackets >= 50:
            sent_counter.many_brackets.append(sent)
        sent_counter.total += 1
        
        #if count <= 1:
        #    break

In [4]:
sent_length = {int(key): val for key, val in sent_counter.lengths.items()}
print(json.dumps(sent_length, indent=2, sort_keys=True))

{
  "3": 255,
  "4": 827,
  "5": 2145,
  "6": 4735,
  "7": 8280,
  "8": 13107,
  "9": 19596,
  "10": 27588,
  "11": 37387,
  "12": 47439,
  "13": 57660,
  "14": 65820,
  "15": 74065,
  "16": 80999,
  "17": 85405,
  "18": 88700,
  "19": 90850,
  "20": 91936,
  "21": 92302,
  "22": 92505,
  "23": 90599,
  "24": 89519,
  "25": 87609,
  "26": 85590,
  "27": 82909,
  "28": 80645,
  "29": 77904,
  "30": 75328,
  "31": 71557,
  "32": 68073,
  "33": 65488,
  "34": 61974,
  "35": 58401,
  "36": 54517,
  "37": 51577,
  "38": 48399,
  "39": 45246,
  "40": 42432,
  "41": 38701,
  "42": 36266,
  "43": 33578,
  "44": 30872,
  "45": 28460,
  "46": 26915,
  "47": 24412,
  "48": 22806,
  "49": 20685,
  "50": 19060,
  "51": 17450,
  "52": 16167,
  "53": 14862,
  "54": 13549,
  "55": 12315,
  "56": 11530,
  "57": 10713,
  "58": 9863,
  "59": 8976,
  "60": 8300,
  "61": 7708,
  "62": 7147,
  "63": 6368,
  "64": 6062,
  "65": 5511,
  "66": 5150,
  "67": 4792,
  "68": 4410,
  "69": 4261,
  "70": 3713,
  "71

In [5]:
sent_punctuations = {int(key): val for key, val in sent_counter.punctuation_marks.items()}
print(json.dumps(sent_punctuations, indent=2, sort_keys=True))

{
  "0": 2343,
  "1": 1246927,
  "2": 585424,
  "3": 415901,
  "4": 214316,
  "5": 114698,
  "6": 62459,
  "7": 35638,
  "8": 21071,
  "9": 12695,
  "10": 8338,
  "11": 5346,
  "12": 3729,
  "13": 2497,
  "14": 1732,
  "15": 1232,
  "16": 878,
  "17": 666,
  "18": 534,
  "19": 377,
  "20": 314,
  "21": 230,
  "22": 173,
  "23": 139,
  "24": 116,
  "25": 100,
  "26": 55,
  "27": 48,
  "28": 42,
  "29": 47,
  "30": 29,
  "31": 29,
  "32": 23,
  "33": 21,
  "34": 15,
  "35": 14,
  "36": 11,
  "37": 11,
  "38": 7,
  "39": 4,
  "40": 11,
  "41": 5,
  "42": 4,
  "43": 4,
  "44": 5,
  "45": 4,
  "46": 1,
  "47": 3,
  "48": 5,
  "49": 4,
  "50": 2,
  "51": 2,
  "52": 1,
  "54": 1,
  "55": 4,
  "56": 3,
  "58": 3,
  "59": 1,
  "61": 1,
  "63": 2,
  "64": 3,
  "72": 1,
  "75": 1,
  "82": 1,
  "84": 1,
  "87": 1,
  "88": 1,
  "91": 2,
  "93": 1,
  "97": 1,
  "106": 1,
  "108": 2,
  "110": 1,
  "112": 1,
  "114": 1,
  "115": 1,
  "116": 1,
  "117": 2,
  "118": 1,
  "120": 2,
  "122": 1,
  "129": 1

In [6]:
sent_brackets = {int(key): val for key, val in sent_counter.brackets.items()}
print(json.dumps(sent_brackets, indent=2, sort_keys=True))

{
  "0": 1757332,
  "1": 8970,
  "2": 523738,
  "3": 4428,
  "4": 230344,
  "5": 2353,
  "6": 103308,
  "7": 1365,
  "8": 49235,
  "9": 886,
  "10": 23794,
  "11": 512,
  "12": 12807,
  "13": 287,
  "14": 6966,
  "15": 211,
  "16": 4044,
  "17": 150,
  "18": 2535,
  "19": 79,
  "20": 1526,
  "21": 57,
  "22": 956,
  "23": 56,
  "24": 664,
  "25": 35,
  "26": 444,
  "27": 27,
  "28": 317,
  "29": 17,
  "30": 186,
  "31": 7,
  "32": 135,
  "33": 13,
  "34": 118,
  "35": 3,
  "36": 94,
  "37": 4,
  "38": 42,
  "39": 7,
  "40": 53,
  "41": 6,
  "42": 29,
  "43": 3,
  "44": 34,
  "45": 3,
  "46": 18,
  "47": 1,
  "48": 26,
  "49": 1,
  "50": 16,
  "51": 3,
  "52": 13,
  "54": 12,
  "55": 2,
  "56": 7,
  "58": 8,
  "60": 6,
  "62": 5,
  "64": 4,
  "65": 1,
  "66": 3,
  "68": 3,
  "69": 1,
  "70": 4,
  "71": 1,
  "72": 4,
  "74": 1,
  "76": 1,
  "78": 1,
  "80": 3,
  "84": 1,
  "88": 2,
  "92": 1,
  "96": 1,
  "98": 1,
  "102": 1,
  "112": 2,
  "126": 1,
  "162": 1
}


In [7]:
sent_counter.total

2738336

In [8]:
sent_counter.many_punctuations

[{'pmid': 12087878,
  'sentence': 'This issue focuses on the following selection of drugs: Abiciximab, acetylcholine chloride, acetylcysteine, alefacept, alemtuzumab, alicaforsen, alteplase, aminopterin, amoxicillin sodium, amphotericin B, anastrozole, argatroban monohydrate, arsenic trioxide, aspirin, atazanavir, atorvastatin, augmerosen, azathioprine; Benzylpenicillin, BMS-284756, botulinum toxin type A, botulinum toxin type B, BQ-123, budesonide, BXT-51072; Calcium folinate, carbamazepine, carboplatin, carmustine, ceftriaxone sodium, cefuroxime axetil, chorionic gonadotropin (human), cimetidine, ciprofloxacin hydrochloride, cisplatin, citalopram hydrobromide, cladribine, clarithromycin, clavulanic acid, clofarabine, clopidogrel hydrogensulfate, clotrimazole, CNI-1493, colesevelam hydrochloride, cyclophosphamide, cytarabine; Dalteparin sodium, daptomycin, darbepoetin alfa, debrisoquine sulfate, dexrazoxane, diaziquone, didanosine, docetaxel, donezepil, doxorubicin hydrochloride lipos

In [9]:
sent_counter.many_brackets

[{'pmid': 1327785,
  'sentence': "The following compounds were evaluated for their inhibitory activity against clinical strains of herpes simplex virus type 1 (HSV-1) and type 2 (HSV-2) in both primary rabbit kidney (PRK) and HeLa cell cultures: (S)-9-(3-hydroxy-2-phosphonylmethoxypropyl)adenine (HPMPA), 9-(2-phosphonylmethoxyethyl)adenine (PMEA), (S)-1-(3-hydroxy-2-phosphonylmethoxypropyl)cytosine (HPMPC), (RS)-9-(3-hydroxy-2-phosphonylmethoxypropyl)-2,6-diaminopurine (HPMPDAP), 5-(5-bromothien-2-yl)-2'-deoxyuridine (BTDU), 5-(5-chlorothien-2-yl)-2'-deoxyuridine (CTDU), 9-(2-deoxy-2-hydroxymethyl-beta-D-erythro-oxetanosyl)guanine (OXT-G), pentosan polysulfate, heparin, dextran sulfate (MW 10,000), acyclovir, 9-(2-hydroxyethoxymethyl)guanine (ACV), (E)-5-(2-bromovinyl)-2'-deoxyuridine (BVDU), 1-beta-D-arabinofuranosyl-(E)-5-(2-bromovinyl)-uracil (BVaraU), vidarabine (9-beta-D-arabinofuranosyladenine) (ara-A) and phosphonoformate (PFA).",
  'text_type': 'abstract',
  'drug_entities': ['

In [10]:
sent_counter.short_sents

[{'pmid': 5033,
  'sentence': '[Dopamine and shock.',
  'text_type': 'title',
  'drug_entities': ['dopamine'],
  'reaction': ['shock']},
 {'pmid': 5974,
  'sentence': 'injection of timolol.',
  'text_type': 'abstract',
  'drug_entities': ['timolol'],
  'reaction': ['injection']},
 {'pmid': 7135,
  'sentence': 'A controlled double-blind investigation.',
  'text_type': 'title',
  'drug_entities': ['double-blind'],
  'reaction': ['investigation']},
 {'pmid': 12378,
  'sentence': 'Calcium oxalate crystalluria.',
  'text_type': 'title',
  'drug_entities': ['calcium'],
  'reaction': ['crystalluria']},
 {'pmid': 12927,
  'sentence': 'Fosfomycin in pneumococcal meningitis.',
  'text_type': 'title',
  'drug_entities': ['fosfomycin'],
  'reaction': ['meningitis']},
 {'pmid': 15481,
  'sentence': 'Cardiac output remained unchanged.',
  'text_type': 'abstract',
  'drug_entities': ['cardiac'],
  'reaction': ['cardiac output']},
 {'pmid': 23526,
  'sentence': 'Ionised calcium during haemodialysis.',

In [11]:
sent_counter.long_sents

[{'pmid': 19118,
  'sentence': "In DS-rats, pindolol (10-50 mug/kg) produced a dose-dependent fall in blood pressure and elevation of resting heart rate.2 The hypotensive response and tachycardia produced by oral pindolol (50 mug/kg) in DS-rats were prevented by propranolol (5 mg/kg), suggesting that pindolol's effects are mediated by beta-adrenoceptor stimulation.3 After mecamylamine (10 mg/kg), oral pindolol (50 mug/kg) produced a further fall in blood pressure in DS-rats, suggesting that its hypotensive effects are probably mediated in the peripheral vasculature.4 Pretreatment with oral pindolol (10 or 50 mug/kg) resulted in a reduction of neuronally-induced tachycardia in pithed DS-rats; neuronally-evoked pressor effects were also antagonized by pindolol (50 mug/kg, orally).5 Whereas pindolol, 50 mug/kg orally or intraperitoneally, produced a marked and progressive hypotensive response of rapid onset (20 min) in DS-rats the same dose intravenously produced a smaller response of del

In [12]:
print("number sentences:", sent_counter.total)
print("long sentences:", len(sent_counter.long_sents))
print("short sentences:", len(sent_counter.short_sents))
print("many brackets:", len(sent_counter.many_brackets))
print("many punctuations", len(sent_counter.many_punctuations))

number sentences: 2738336
long sentences: 459
short sentences: 3227
many brackets: 110
many punctuations 28


In [13]:
max_3_token = 0
range_4_to_6 = 0
range_7_to_12 = 0
range_13_to_20 = 0
range_21_to_50 = 0
range_51_to_100 = 0
range_101_to_200 = 0
min_201 = 0

for sent_token, count in sent_counter.lengths.items():
    
    if int(sent_token) <= 3:
        max_3_token += count
    if 4 <= int(sent_token) <= 6:
        range_4_to_6 += count
    if 7 <= int(sent_token) <= 12:
        range_7_to_12 += count
    if 13 <= int(sent_token) <= 20:
        range_13_to_20 += count
    if 21 <= int(sent_token) <= 50:
        range_21_to_50 += count
    if 51 <= int(sent_token) <= 100:
        range_51_to_100 += count
    if 101 <= int(sent_token) <= 200:
        range_101_to_200 += count
    if  int(sent_token) >= 201:
        min_201 += count
print("##### Token Number #####")
print("range\t\tnumber")
print("<= 3\t\t", max_3_token)
print("4 - 6\t\t", range_4_to_6)
print("7 - 12\t\t", range_7_to_12)
print("13 - 20\t\t", range_13_to_20)
print("21 - 50\t\t", range_21_to_50)
print("51 - 100\t", range_51_to_100)
print("101 - 200\t", range_101_to_200)
print(">= 201\t\t", min_201)


##### Token Number #####
range		number
<= 3		 255
4 - 6		 7707
7 - 12		 153397
13 - 20		 635435
21 - 50		 1704329
51 - 100	 225569
101 - 200	 11198
>= 201		 446


In [14]:
range_10_to_20 = 0
range_21_to_50 = 0
min_51 = 0

for brack_num, count in sent_counter.brackets.items():
    
    if 10 <= int(brack_num) <= 20:
        range_10_to_20 += count
    if 21 <= int(brack_num) <= 50:
        range_21_to_50 += count
    if int(brack_num) >= 51:
        min_51 += count

print("#### Bracket Number ####")
print("range\t\tnumber")
print("10 - 20\t\t", range_10_to_20)
print("21-50\t\t", range_21_to_50)
print(">= 50\t\t", min_51)

#### Bracket Number ####
range		number
10 - 20		 52911
21-50		 3372
>= 50		 94


In [17]:
range_10_to_20 = 0
range_21_to_50 = 0
range_51_to_100 = 0
min_101 = 0

for punc_num, count in sent_counter.punctuation_marks.items():
    
    if 10 <= int(punc_num) <= 20:
        range_10_to_20 += count
    if 21 <= int(punc_num) <= 50:
        range_21_to_50 += count
    if 51 <= int(punc_num) <= 100:
        range_51_to_100 += count
    if int(punc_num) >= 101:
        min_101 += count

print("#### Punctuation Number ####")
print("range\t\tnumber")
print("10 - 20\t\t", range_10_to_20)
print("21-50\t\t", range_21_to_50)
print("51-100\t\t", range_21_to_50)
print(">= 101\t\t", min_101)

#### Punctuation Number ####
range		number
10 - 20		 25643
21-50		 1162
51-100		 1162
>= 101		 28


In [19]:
count_ttype = Counter()

for index in range(1, 1016):
    s_i = str(index)
    zeros = "0" * (4 - len(s_i))
    tagged_sent_file = f"tagged_pubmed20n{zeros + s_i}.json"
    file_path = os.path.join(tagged_sent_path, tagged_sent_file)
    with open(file_path, "r") as json_file:
        file_data = json.load(json_file)
    for sent_data in file_data:
        count_ttype[sent_data["text_type"]] += 1

In [21]:
count_ttype

Counter({'abstract': 2201386, 'title': 536950})

In [22]:
sum(count_ttype.values())

2738336

In [16]:
kb_path = os.path.join("data", "knowledge_base")
kb_file = os.path.join(kb_path, "rel_drug_react_triple_occ_all.json")
#with open()
    

SyntaxError: unexpected EOF while parsing (<ipython-input-16-9cffd789f2e2>, line 5)