In [158]:
import json
import random
import numpy as np
import pandas as pd
import pickle
import tqdm
import re
import os
import datetime

content_path = '/Users/feiwang/Documents/Projects/query_tool_contents'

### Data source 1: FinnGen endpoints

In [2]:
with open('/Users/feiwang/Documents/Data/endpoint_list.json') as f:
    endpoint_list = json.load(f)

In [4]:
endpoint_list = [endpoint.lower() for endpoint in set([i[0] for i in endpoint_list] + [i[1] for i in endpoint_list])]
endpoint_list = [re.sub('\s{0,1}\(.+\)\s{0,1}',' ',i) for i in endpoint_list] # remove all the (xxx)
endpoint_list = [re.sub('benign neoplasm: ','benign neoplasm of ',i) for i in endpoint_list]
endpoint_list = [re.sub('\s{0,1}&\s{0,1}',' and ',i) for i in endpoint_list]
endpoint_list = [re.sub('\s{0,1}/\s{0,1}',' or ',i) for i in endpoint_list]
endpoint_list = [re.sub('\s{0,1}\[.+\]\s{0,1}',' ',i).strip() for i in endpoint_list] # remove all the [xxx]. Instead, add xxx as a new disease?
endpoint_list = list(set(endpoint_list)) # 2904 -> 2862
endpoint_list = [i for i in endpoint_list if len(i.split(' ')) <= 12] # 2862 -> 2814

diseases_valid = endpoint_list[:len(endpoint_list)//10]
diseases_train = endpoint_list[len(endpoint_list)//10:]

### Data source 2: Phenocodes

In [11]:
endpoint_list = pd.read_csv('/Users/feiwang/Documents/Materials/query tool/phecode_definitions1.2.csv').phenotype.to_list()
endpoint_list = [i.lower() for i in endpoint_list] # 1866
diseases_valid = endpoint_list[:len(endpoint_list)//10]
diseases_train = endpoint_list[len(endpoint_list)//10:]

### Data source 4: Finnish version ICD10

In [4]:
onto = pd.read_excel('/Users/feiwang/Documents/Data/icd10_tautiluokitus_thl_08012020.xls',sheet_name='Sheet0', engine='openpyxl')
onto = [i[0].lower()+i[1:] for i in set(onto['A:Long_name'].dropna().tolist())] # 11608
diseases_valid = onto[:len(onto)//10]
diseases_train = onto[len(onto)//10:]

  warn("Workbook contains no default style, apply openpyxl's default")


In [8]:
pickle.dump(onto, open('onto.pickle', "wb" ))

## Define tags for NER

In [1]:
tag_map = {'B': 1, 'I': 2, 'O': 3, 'X': 4}

# tag_map = {
#     'o': 0,
#     'year': 1,
#     'year_num': 2,
#     'year_num1': 3,
#     'year_num2': 4,
#     'age': 5,
#     'age_num': 6,
#     'age_num1': 7,
#     'age_num2': 8,
#     'sex = 1': 9,
#     'sex = 2': 10,
#     'endpoint': 11
# }

## Setting up the conditions
### Sex

In [144]:
# sex options
# general - general questions; specific - questions for specific persons
# s - singular; p - plural; m - male; f - female
sex_general = {
    '': ['people','individuals','patients', 'kids', 'children', 'seniors', 'senior people', 'young people', 'adults', 'adolescents', 'teenagers'],
    'sex = 1': ['males', 'men', 'boys', 'old males', 'old men', 'young males', 'young men'],
    'sex = 2': ['females', 'women', 'girls', 'old females', 'old women', 'young females', 'young women']
} 

sex_specific = {
    '': ['an individual','a patient', 'a kid', 'a child', 'a senior', 'a person', 'my friend'],
    'sex = 1': ['a male', 'a man', 'a boy', 'my son', 'my husband', 'my father', 'my dad', 'my daddy', 'my grandpa', 'my grandfather', 'my male friend'],
    'sex = 2': ['a female', 'a woman', 'a girl', 'my daughter', 'my wife', 'my mother', 'my mom', 'my mommy', 'my grandma', 'my grandmother', 'my female friend']
}


def sex(specific_bool=False):
    '''
    general_specific_bool: boolean(0-general, 1-specific)
    return: string(one from sex options), list([], ['sex = 1'], ['sex = 2']), list(0,9,10)
    '''
    query = random.choice(list(sex_general.keys()))
    if specific_bool:
        question = random.choice(sex_specific[query])
    else:
        question = random.choice(sex_general[query])
    word_len = len(question.split(' '))
    query = [query] if query != '' else []
    return question, query

In [145]:
sex()

('seniors', [])

### Year

In [260]:
conj_year = {
    'year < ': ['before num','by num','ending in num'],
    'year > ': ['after num','since num','starting from num','starting in num', 'from num to now', 
                'in the last delta years', 'in the past delta years', 'in the recent delta years'],
    'year = ': ['during num','in num']
}
conj_years = ['between num1 and num2','from num1 to num2','during num1 and num2',
              'during a delta-year period starting in num1',
              'during a delta-year period ending by num2',
              'during a delta-year interval beginning num1 and ending num2']

years = list(range(2000,2023))

this_year = datetime.datetime.today().year

word_year = {'last year':['year = '+str(this_year-1)], 'a year ago':['year = '+str(this_year-1)], 'this year':['year = '+str(this_year)]}
word_years = ['delta years ago', 'delta years earlier']

def year():
    '''
    return string, list, list(1,2,3,4)
    '''
    ran_num = random.random()
    if ran_num < 0.3:
        ran_num1 = random.random()
        if ran_num < 0.5:
            question = random.choice(list(word_year.keys()))
            query = word_year[question]
        else:
            question = random.choice(word_years)
            num = random.choice(years)
            delta = this_year - num
            question = question.replace('delta', str(delta))
            query = ['year = '+str(num)]
    else:
        conj_type = random.choice(['', 'one point', 'two points'])
        if conj_type == 'one point':
            condition = random.choice(list(conj_year.keys()))
            conj = random.choice(conj_year[condition])
            num = random.choice(years)
            query = [condition+str(num)]
            delta = this_year - num
            question = conj.replace('num', str(num)).replace('delta', str(delta))
        elif conj_type == 'two points':
            num1 = random.choice(years)
            try:
                num2 = random.choice(years[years.index(num1)+1:])
            except IndexError: #Cannot choose from an empty sequence
                num2 = years[-1]+1 # in this case num1 is the biggest number in years list
            delta = num2 - num1
            query = ['year between '+str(num1)+' and '+str(num2)]
            conj = random.choice(conj_years)
            question = conj.replace('num1', str(num1)).replace('num2', str(num2)).replace('delta', str(delta))
        else:
            question, query = '',[]
        word_len = len(question.split(' '))
    return question, query

In [261]:
for i in range(5):
    print(year())

('ending in 2003', ['year < 2003'])
('a year ago', ['year = 2021'])
('in 2015', ['year = 2015'])
('this year', ['year = 2022'])
('ending in 2009', ['year < 2009'])


### Age

In [244]:
conj_age = {
    'age < ': ['below', 'under', 'less than', 'younger than'],
    'age > ': ['above', 'over', 'more than', 'older than'],
    'age = ': ['at', 'in']
}
conj_ages = ['between num1 and num2','from num1 to num2',
            'between num1 years old and num2 years old','from num1 years old to num2 years old']
ages = list(range(0,101))

def age(special_one_point=False):
    '''
    return string, list
    '''
    if special_one_point:
        num = random.choice(ages)
        question = ' '+str(num)+'-year-old '
        query = ['age = '+str(num)]
        return question, query
    conj_type = random.choice(['', 'one point', 'two points'])
    if conj_type == 'one point':
        condition = random.choice(list(conj_age.keys()))
        conj = random.choice(conj_age[condition])
        num = random.choice(ages)
        query = [condition+str(num)]
        question = conj+' '+str(num)+random.choice([' years old', ''])
    elif conj_type == 'two points':
        num1 = random.choice(ages)
        try:
            num2 = random.choice(ages[ages.index(num1)+1:])
        except IndexError: #Cannot choose from an empty sequence
            num2 = ages[-1]+1 # in this case num1 is the largest in ages list
        query = ['year between '+str(num1)+' and '+str(num2)]
        conj = random.choice(conj_ages)
        question = conj.replace('num1', str(num1))
        question = question.replace('num2', str(num2))
    else:
        question, query = '',[]
    return question, query

In [139]:
age()

('', [])

In [232]:
first = ["Chelm", "Elm", "El", "Bur", "En", "Eg", "Pem", "Pen", "Edg", "Sud", "Sod", "Hors", "Dur", "Sun", "Nort", "Brad", "Farn", "Barn", "Dart", "Hart", "South", "Shaft", "Blan", "Rock", "Alf", "Wy", "Marl", "Staf", "Wet", "Cas", "Stain", "Whit", "Stap", "Brom", "Wych", "Watch", "Win", "Horn", "Mel", "Cook", "Hurst", "Ald", "Shriv", "Kings", "Clere", "Maiden", "Leather", "Brack","Brain", "Walt", "Prest", "Wen", "Flit", "Ash"]
doubles = ["Bass", "Chipp", "Sodd", "Sudd", "Ell", "Burr", "Egg", "Emm", "Hamm", "Hann", "Cann", "Camm", "Camb", "Sund", "Pend", "End", "Warr", "Worr", "Hamp", "Roth", "Both", "Sir", "Cir", "Redd", "Wolv", "Mill", "Kett", "Ribb", "Dribb", "Fald", "Skell", "Chedd", "Chill", "Tipp", "Full", "Todd", "Abb", "Booth"]
postdoubles = ["ing", "en", "er"]
mid = ["bas", "ber", "stan", "ring", "den", "-under-", " on ", "en", "re", "rens", "comp", "mer", "sey", "mans"]
last = ["ford", "stoke", "ley", "ney",  "don", "den", "ton", "bury", "well", "beck", "ham", "borough", "side", "wick", "hampton", "wich", "cester", "chester", "ling", "moor", "wood", "brook", "port", "wold", "mere", "castle", "hall", "bridge", "combe", "smith", "field", "ditch", "wang", "over", "worth", "by", "brough", "low", "grove", "avon", "sted", "bourne", "borne", "thorne", "lake", "shot", "bage", "head", "ey", "nell", "tree", "down"]

def places():
    finished_name = ""
    pd = 0
    if(random.random()  > 0.4):
        finished_name = finished_name + random.choice(doubles)
        if(random.random()  > 0.6):
            finished_name = finished_name + random.choice(postdoubles)
            pd = 1
        else:
            finished_name = finished_name[0:len(finished_name) - 1]
    else:
        finished_name = finished_name + random.choice(first)

    if(random.random()  > 0.5 and not pd):
        if(finished_name.endswith("r") or finished_name.endswith("b")):
            if(random.random()  > 0.4):
                finished_name = finished_name + "ble"
            else:
                finished_name = finished_name + "gle"
        elif(finished_name.endswith("n") or finished_name.endswith("d")):
            finished_name = finished_name + "dle"
        elif(finished_name.endswith("s")):
            finished_name = finished_name + "tle"

    if(random.random()  > 0.7 and finished_name.endswith("le")):
        finished_name = finished_name + "s"

    elif(random.random()  > 0.5):
        if(finished_name.endswith("n")):
            if(random.random()  > 0.5):
                finished_name = finished_name + "s"
            else:
                finished_name = finished_name + "d"
        elif(finished_name.endswith("m")):
            finished_name = finished_name + "s"

    if(random.random()  > 0.7):
        finished_name = finished_name + random.choice(mid)
    finished_name = finished_name + random.choice(last)

    fix = finished_name.rpartition(' ')
    if(fix[1] == ' '):
        finished_name = fix[0] + ' ' + fix[2].capitalize()

    fix = finished_name.rpartition('-')
    if(fix[1] == '-'):
        finished_name = fix[0] + '-' + fix[2].capitalize()

    return finished_name

In [207]:
places()

'Wychbrook'

In [128]:
cut_yr = [0,1,5,15]

def get_cut_yr():
    '''
    return string - after xx years follow-up, ['cut_year = xx']
    '''
    cut = random.choice(cut_yr)
    if cut == 0:
        return '', ['cut_year = 0']
    elif cut == 1:
        ans = 'after a year follow-up' if random.random() < 0.5 else 'after one year follow-up'
        return ans, ['cut_year = 1']
    else:
        return 'after '+str(cut)+' years follow-up', ['cut_year = '+str(cut)]

### Possible question formats
#### Question 1
    'How many women older than 70 have been diagnosed with flatulence?',
    'How many women over the age of 70 got flatulence?',
    'How many women over 70 years old suffer from flatulence?',
     *'What proportion of females over 70 have flatulence?',

     'count(*)-long_registry',
     'age > 70, endpoint = flatulence, sex = female'],
    
    Question - input: How many individuals under 78 have been diagnosed with migraine before 2005?\n
    Answer - output: count; people, migraine, 1, 78, 2000, 2005\n\n"

In [241]:
def sentence_clean(sentence):
    sentence = re.sub(r'(^, )', '', sentence)
    sentence = re.sub(r' ([.|?]$)', r'\1', sentence)
    sentence = re.sub(r'aged  (diagnosed)', r'\1', sentence)
    sentence = sentence.replace('  ', ' ')
    return sentence

In [242]:
questions_1 = [
    'What is the number of <sex> who are <age> diagnosed with <disease> <year>?',
    "What's the number of <sex> who are <age> diagnosed with <disease> <year>?",
    'What is the number of <sex> aged <age> diagnosed with <disease> <year>?',
    "What's the number of <sex> aged <age> diagnosed with <disease> <year>?"
    'What is the number of <sex> who have <disease> <age> <year>?',
    "What's the number of <sex> who have <disease> <age> <year>",
    'What is the number of <sex> diagnosed <disease> <age> <year>?',
    "What's the number of <sex> diagnosed <disease> <age> <year>",
    '<year>, what is the number of <sex> who have <disease> <age>?',
    "<year>, what's the number of <sex> who have <disease> <age>?",
    '<year>, what is the number of <sex> who are <age> diagnosed with <disease>?',
    "<year>, what's the number of <sex> who are <age> diagnosed with <disease>?",
    'How many <sex> <age> have been diagnosed with <disease> <year>?',
    'How many <sex> <age> are diagnosed with <disease> <year>?',
    '<year>, how many <sex> <age> have been diagnosed with <disease>?',
    '<year>, how many <sex> <age> are diagnosed with <disease>?',
    'How many <sex> <age> suffer from <disease> <year>?',
    'How many <sex> diagnosed <disease> <age> <year>?',
    'How many <sex> aged <age> diagnosed with <disease> <year>?',
    '<year>, how many <sex> <age> suffer from <disease>?',
    'Frequency of <sex> with <disease> <age> <year>.',
    'In <sex> <age> , how many suffer from <disease> <year>?',
    'Among <sex> <age>, how many suffer from <disease> <year>?',
]

def question1():
    '''
    return string(question), list(conditions), string(columns seperated by ',')
    '''
    sex_q, sex_a = sex()
    age_q, age_a = age()
    year_q, year_a = year()
    q = random.choice(questions_1).replace('<sex>', sex_q).replace('<age>', age_q).replace('<year>', year_q)
    q = sentence_clean(q)
    a = sex_a+['endpoint IN (<disease>)']+age_a+year_a
    return q.capitalize(), a, 'counts'

In [262]:
for i in range(50):
    print(question1())

('Frequency of boys with <disease> over 4 years old between 2021 and 2022.', ['sex = 1', 'endpoint IN (<disease>)', 'age > 4', 'year between 2021 and 2022'], 'counts')
('How many young males from 59 years old to 66 years old suffer from <disease>?', ['sex = 1', 'endpoint IN (<disease>)', 'year between 59 and 66'], 'counts')
('Among males from 6 years old to 78 years old, how many suffer from <disease> after 2009?', ['sex = 1', 'endpoint IN (<disease>)', 'year between 6 and 78', 'year > 2009'], 'counts')
('In 2009, how many young women are diagnosed with <disease>?', ['sex = 2', 'endpoint IN (<disease>)', 'year = 2009'], 'counts')
('What is the number of males diagnosed <disease> more than 94 from 2001 to 2015?', ['sex = 1', 'endpoint IN (<disease>)', 'age > 94', 'year between 2001 and 2015'], 'counts')
("What's the number of people who are at 63 diagnosed with <disease> this year?", ['endpoint IN (<disease>)', 'age = 63', 'year = 2022'], 'counts')
('This year, what is the number of old

#### Question 2
    Question - What is the mean age at the first event for migraine among individuals over 71 from 2013 to 2017?\n
    Answer - output: mean age; people, migraine, 71, 100, 2013, 2017\n\n"

In [239]:
questions_2 = [
    'What is the <bar> age at the first event of <disease> <prep> <sex> <year>?',
    "What's the <bar> age at the first event of <disease> <prep> <sex> <year>?",
    '<year>, what is the <bar> age at the first event of <disease> <prep> <sex>?',
    "<year>, what's the <bar> age at the first event of <disease> <prep> <sex>?",
    'What is the <bar> age of <sex> who get diagnosed with <disease> for their 1st time <year>?',
    "What's the <bar> age of <sex> who get diagnosed with <disease> for their first time <year>?",
    '<year>, what is the <bar> age of <sex> who get diagnosed with <disease> for their 1st time?',
    "<year>, what's the <bar> age of <sex> who get diagnosed with <disease> for their first time?",
    'What is the <bar> age of <sex> with <disease> for the first time <year>?',
    "What's the <bar> age of <sex> with <disease> for the 1st time <year>?",
    '<year>, what is the <bar> age of <sex> with <disease> for the first time?',
    "<year>, what's the <bar> age of <sex> with <disease> for the 1st time?",
    'Normal <sex> age for initial <disease> diagnosis <year>.',
    'Normal <sex> age for initial diagnosis of <disease> <year>?',
    'For <sex> diagnosed with <disease>, how old are they at first diagnosis <year>?',
    'At what age are <sex> typically diagnosed with their first <disease> <year>?',
    'How old on average are the <sex> diagnosed with <disease> for the 1st time <year>?',
    '<year>, how old on average are the <sex> diagnosed with <disease> for the 1st time?',
    'What is the <bar> age of women who get diagnosed with <disease> for their first time <year>?',
    "What's the <bar> age of women who get diagnosed with <disease> for their first time <year>?",
    '<year>, what is the <bar> age of women who get diagnosed with <disease> for their first time?',
    "<year>, what's the <bar> age of women who get diagnosed with <disease> for their first time?"
]

def question2():
    '''
    return string(question), list(conditions), string(columns seperated by ',')
    '''
    num = random.random() # random float 0.0 <= x < 1.0
    bar = 'mean' if random.random() < 0.5 else 'average'
    prep = random.choice(['in','among','for'])
    if num < 0.2:
        q_list = [
            'What is the '+bar+' age at the first event for <disease>?',
            "What's the "+bar+' age at the first event for <disease>?'
        ]
        q = random.choice(q_list)
        a = ['endpoint IN (<disease>)']
    else: 
        sex_q, sex_a = sex()
        year_q, year_a = year()
        q = sentence_clean(random.choice(questions_2).replace('<sex>', sex_q).replace('<year>', year_q))
        a = sex_a+['endpoint IN (<disease>)']+year_a
    q = q.replace('<bar>', bar).replace('<prep>', prep)
    return q.capitalize(), a, 'age, counts'

In [189]:
for i in range(5):
    print(question2())

('During a 1-year period starting in 2018, what is the average age at the first event of <disease>  among  children?', ['endpoint IN (<disease>)', 'year between 2018 and 2019'])
("What's the mean age of old men who get diagnosed with <disease> for their first time?", ['sex = 1', 'endpoint IN (<disease>)'])
("What's the average age at the first event of <disease>  for  adolescents after 2000?", ['endpoint IN (<disease>)', 'year > 2000'])
("Ending in 2000, what's the average age of women who get diagnosed with <disease> for their first time?", ['sex = 2', 'endpoint IN (<disease>)', 'year < 2000'])
('What is the mean age of males who get diagnosed with <disease> for their 1st time by 2002?', ['sex = 1', 'endpoint IN (<disease>)', 'year < 2002'])


#### Question 3
    Question - What is the prevalence of coronary heart disease among women?\n
    Answer - output: prevalence; female, coronary heart disease\n\n"

In [264]:
a = pd.read_csv(content_path+'/key_figures_all_2022-03-22.csv', index_col=0)[['endpoint','prevalence_all','prevalence_female','prevalence_male']]
a.head()

Unnamed: 0,endpoint,prevalence_all,prevalence_female,prevalence_male
0,AB1TUBERCU_MILIARY,0.000166,0.000148,0.000155
1,AB1_ACTINOMYCOSIS,0.000138,0.000144,0.00013
2,AB1_AFRICAN_TRYPANOSOMIASIS,4e-06,3e-06,5e-06
3,AB1_AMOEBIASIS,0.000303,0.000316,0.000291
4,AB1_ANOGENITAL_HERPES_SIMPLEX,0.002541,0.003071,0.001513


In [289]:
questions_3 = [
    'What is the prevalence of <disease> <prep> <sex> <year>?',
    "What's the prevalence of <disease> <prep> <sex> <year>?",
    '<year>, what is the prevalence of <disease> <prep> <sex>?',
    "<year>, what's the prevalence of <disease> <prep> <sex>?",
    'Prevalence of <disease> <prep> <sex> <year>?',
    'Prevalence of <disease> <prep> <sex> <year>.',
    '<prep> <sex>, what is the prevalence of <disease>?',
    "<prep> <sex>, what's the prevalence of <disease>?",
#     'How common is <disease> <prep> <sex> <year>?',
#     'How common is <disease> in <place>?',
    'How prevalent is <disease> <prep> <sex> <year>?',
    'How prevalent is <disease> in <place>?',
    'What is the proportion of <sex> with <disease> <year>?',
    "What's the proportion of <sex> with <disease> <year>?"
]

def question3():
    '''
    return string(question), list(conditions), string(columns seperated by ',')
    '''
    num = random.random() # random float 0.0 <= x < 1.0
    prep = random.choice(['in','among','for'])
    if num < 0.5:
        q_list = [
            'What is the prevalence of <disease>?',
            "What's the prevalence of <disease>?",
            "What's <disease>'s prevalence?",
            "What is <disease>'s prevalence?",
            "How prevalent is <disease>?",
#             "How common is <disease>?",
            'Prevalence of <disease>?',
            'Prevalence of <disease>.',
            'Proportion of population with <disease>?',
            'Proportion of population with <disease>.'
        ]
        q = random.choice(q_list)
        a = ['endpoint IN (<disease>)']
        col = 'prevalence_all'
    else: 
        sex_q, sex_a = sex()
        year_q, year_a = year()
        q = sentence_clean(random.choice(questions_3).replace('<sex>', sex_q).replace('<year>', year_q))
        a = ['endpoint IN (<disease>)']
        col = 'prevalence_female' if sex_a == ['sex = 2'] else 'prevalence_male' if sex_a == ['sex = 1'] else 'prevalence_all'
    q = q.replace('<prep>', prep)
    num_ = random.random()
    q = re.sub(r'(prevalence)',r'unadjusted \1', q) if num_ < 0.4 else q
    return q.capitalize().replace('<place>', places()), a, col

In [281]:
re.sub(r'(prevalence)',r'unadjusted \1', 'What is the prevalence of <disease> in patients in the past 14 years?')

'What is the unadjusted prevalence of <disease> in patients in the past 14 years?'

In [230]:
question3()

('Prevalence of <disease>.', ['endpoint IN (<disease>)'], 'prevalence_all')

In [287]:
for i in range(5):
    print(question3())

("What's the prevalence of <disease>?", ['endpoint IN (<disease>)'], 'prevalence_all')
("What's the prevalence of <disease>?", ['endpoint IN (<disease>)'], 'prevalence_all')
('How common is <disease>?', ['endpoint IN (<disease>)'], 'prevalence_all')
('How common is <disease>?', ['endpoint IN (<disease>)'], 'prevalence_all')
("What's the unadjusted prevalence of <disease> for people last year?", ['endpoint IN (<disease>)'], 'prevalence_all')


#### Question 4
    Question - What is the incidence of eczema for females over 15 years?\n
    Answer - output: incidence; female, eczema\n\n"

In [271]:
a = pd.read_csv(content_path+'/cumulative_incidence_2022-04-14.csv', index_col=0)
a['sex'] = np.select([(a.sex == 'female'), (a.sex == 'male')], [2, 1])
a = pd.DataFrame(a.to_records())
a.head()

Unnamed: 0,age,sex,cumulinc,endpoint
0,0.0,2,0.0,ENTEROPATH_E_COLI
1,1.0,2,0.0,ENTEROPATH_E_COLI
2,2.0,2,0.0001,ENTEROPATH_E_COLI
3,3.0,2,0.0001,ENTEROPATH_E_COLI
4,4.0,2,0.0001,ENTEROPATH_E_COLI


In [296]:
questions_4 = [
    'What is the <key> of <disease> <prep> <sex> <age> <year>?',
    "What's the <key> of <disease> <prep> <sex> <age> <year>?",
    '<year>, what is the <key> of <disease> <prep> <sex> <age>?',
    "<year>, what's the <key> of <disease> <prep> <sex> <age>?",
    '<key> of <disease> <prep> <sex> <age> <year>?',
    '<key> of <disease> <prep> <sex> <age> <year>.',
    '<prep> <sex> <age>, what is the <key> of <disease> in <place>?',
    "<prep> <sex> <age>, what's the <key> of <disease>?",
#     'How common is <disease> <prep> <sex> <year>?',
#     'How common is <disease> in <place>?',
]

questions_4_special = [
    'What is the '
]

def question4():
    '''
    return string(question), list(conditions), string(columns seperated by ',')
    '''
    num = random.random() # random float 0.0 <= x < 1.0
    prep = random.choice(['in','among','for'])
    key_word = random.choice(['incidence', 'incidence rate', 'cumulative incidence', 'cumulative incidence rate', 
                              'accumulative incidence', 'accumulative incidence rate'])
    if num < 0.4:
        q_list = [
            'What is the <key> of <disease>?',
            "What's the <key> of <disease>?",
            "What's <disease>'s <key> in <place>?",
            "What is <disease>'s <key>?",
#             "How common is <disease>?",
            '<key> of <disease>?',
            '<key> of <disease>.',
            '<key> of <disease> in <place>.',
            'Incidence proportion of <disease>.',
            'Risk of <disease>.',
            'Attack rate of <disease>.'
        ]
        q = random.choice(q_list)
        a = ['endpoint IN (<disease>)']
    else: 
        year_q, year_a = year()
        if num < 0.7:
            sex_q, sex_a = sex()
            age_q, age_a = age()
            q = sentence_clean(random.choice(questions_3).replace('<sex>', sex_q).replace('<year>', year_q)).replace('<age>', age_q)
        else:
            sex_q, sex_a = sex(True)
            age_q, age_a = age(True)
            
        a = sex_a+['endpoint IN (<disease>)']+age_a
    q = q.replace('<prep>', prep).replace('<key>', key_word)
    return q.capitalize().replace('<place>', places()), a, 'age, sex, cumulinc'

In [None]:
for i in range(50):
    print(question4())

In [291]:
((0.06/1)+(0.04/1))/2

0.05

In [292]:
((0.13-0.06)/(1-0.06)+(0.09-0.04)/(1-0.04))/2

0.06327570921985816

In [294]:
(0.13+0.06)/2

0.095

In [295]:
(0.13-0.06)/(1-0.06)

0.07446808510638299

In [272]:
a.age.max()

103.0

In [None]:
sex_q, sex_a = sex(True)
age_q, age_a = age(True)
re.sub(r'([a|an|my]) ', r'\1'+age_q, sex_q)

In [32]:
#     print('sex_q',sex_q, 'age_q', age_q, 'year_q', year_q)    
#     a = 'count(*); long_registry; '+', '.join(age_a+['endpoint = <disease>']+sex_a+year_a)
#     a = ', '.join(age_a+['endpoint IN (<disease>)']+sex_a+year_a)

' 92-year-old '

In [140]:

    

    
def avgage_mortality(endpoint):
    '''
    return string(question), list(answer)
    
        'What is the average age of women who died after letrozole diagnosis?',
  'How old on average are women who died after letrozole?',
 'What is the average age of women who died after letrozole?',
 'On average, how old is a woman who died of the diagnosis of letrozole?',
  'What is the mean age of death regarding to disease?',
  'Considering disease, what is the age of death on average?',
  'What is the average age of men who died of the diagnosis of disease after five-year survival?',
  'After a five-year survival, what is the average age of men who died of disease?',
  'What is the average age of men who die of disease 5 years after their first diagnosis?',
  'For those men who died 5 years after the first event of disease, what is their mean age?',
  'On average, what is the age of men who died of the diagnosis of disease after a survival of fifteen years?',
    
    Question - What is the mean age of males who died 10 years after HIV diagnosis?\n
    Answer - output: mean age; people, migraine, 71, 100, 2013, 2017\n\n"
    '''
    num = random.random() # random float 0.0 <= x < 1.0
    bar = 'mean' if random.random() < 0.5 else 'average'

    sex_q, sex_a = get_sex()
    year_q, year_a = get_year()
    
    if num < 0.1:
        q_list = [
            'What is the '+bar+' age of death given the diagnosis of '+endpoint+'?',
            'What is the '+bar+' age of death after being diagnosed with '+endpoint+'?',
            'If one has '+endpoint+'. What is the '+bar+' age of death?'
        ]
        q = random.choice(q_list)
        a = 'avg(age); long_registry; endpoint='+endpoint
        return input_prefix+q+input_suffix+output_prefix+a+output_suffix
    else: 
        sex_q, sex_a = get_sex()
        year_q, year_a = get_year()
        if year_q == '':
            q_list = [
                bar.capitalize()+' age of death for '+sex_q+' get diagnosed with '+endpoint+'.',
                'Typical life expectancy of '+sex_q+' who get diagnosed with '+endpoint+'.',
                'On average, how old are '+sex_q+' who died of the diagnosis of '+endpoint+'?',
                'For '+sex_q+' having '+endpoint+', how many years do they normally live?',
                'For '+sex_q+' having '+endpoint+', how many years do they normally live for?',
                "After the diagnosis of "+endpoint+", what's the "+bar+" age of "+sex_q+" who survived for another 5 years?",
                'How old on average age are the '+sex_q+' who died of '+endpoint+'?',
                'What is the '+bar+' age of '+sex_q+' who deceased after the diagnosis of '+endpoint+'?',
                "What's the "+bar+' age of '+sex_q+' deceased after the diagnosis of '+endpoint+'?',
                'What is the '+bar+' age of '+sex_q+' who died of '+endpoint+'?',
                'What is the '+bar+' age of '+sex_q+' who died after the diagnosis of '+endpoint+'?',
                "What's the "+bar+' age of '+sex_q+' died after the diagnosis of '+endpoint+'?',
                "After being diagnosed with "+endpoint+", what's the "+bar+' age of '+sex_q+' who survived for another 5 years?'
            ]
        else:
            q_list = [
                bar.capitalize()+' age of death for '+sex_q+' get diagnosed with '+endpoint+year_q+'.',
                'Typical life expectancy of '+sex_q+' who get diagnosed with '+endpoint+year_q+'.',
                'On average, how old are '+sex_q+' who died of the diagnosis of '+endpoint+year_q+'?',
                'How old on average age are the '+sex_q+' who died of '+endpoint+year_q+'?',
                year_q.strip().capitalize()+', how old on average age are the '+sex_q+' who died of '+endpoint+'?',
                'What is the '+bar+' age of '+sex_q+' who deceased after the diagnosis of '+endpoint+year_q+'?',
                "What's the "+bar+' age of '+sex_q+' deceased after the diagnosis of '+endpoint+year_q+'?',
                'What is the '+bar+' age of '+sex_q+' who died of '+endpoint+year_q+'?',
                'What is the '+bar+' age of '+sex_q+' who died after the diagnosis of '+endpoint+year_q+'?',
                "What's the "+bar+' age of '+sex_q+' died after the diagnosis of '+endpoint+year_q+'?',
                year_q.strip().capitalize()+", what's the "+bar+' age of '+sex_q+' who died of '+endpoint+'?',
            ]
#     cut = random.choice(cut_yr)
#     if cut == 0:
#         q = 'What is the '+bar+' age of '+sex_q+' who died after '+endpoint+' diagnosis?'
#     elif cut == 1:
#         q = 'What is the '+bar+' age of '+sex_q+' who died '+random.choice([str(cut),'one','a'])+' year after '+endpoint+' diagnosis?'
#     else:
#         q = 'What is the '+bar+' age of '+sex_q+' who died '+str(cut)+' years after '+endpoint+' diagnosis?'
        q = random.choice(q_list)
        a = 'avg(age); mortality; '+', '.join(['endpoint = '+endpoint]+sex_a)
#         a = 'avg(age); mortality; '+', '.join(['cut_year = '+str(cut),'endpoint = '+endpoint]+sex_a)
        return input_prefix+q+input_suffix+output_prefix+a+output_suffix

def count_mortality(endpoint):
    '''
    return string(question), list(answer)
    
 'How many individuals under 72 have died 5 years after the diagnosis of nasal and breast cancer?',
 'How many people under 72 years of age died 5 years after diagnosis of sinus cancer?',
 *'Death rate of patients under 72 who were found to have nasal and sinus cancer five years earlier.',
 *'What is rate of mortality for those aged 71 years or less and had a nasal and sinus cancer diagnosis at least 5 years ago?',
 ['How many men between 20 and 21 died 5 years after they diagnosed with aciclovir?',
  'How many men aged from 20 to 21 has died 5 years after their diagnosis of aciclovir?',
  'How many men between 20 and 21 years have died 5 years after they got aciclovir?',
  'How many men between the ages of 20 and 21 die five years after being diagnosed with acetic acid?',
  'How many men from 20 to 21 years old have died after being diagnosed with acetic acid for 5 years?',
['How many women over 73 years old died 15 years after clonazepam diagnosis?',
  'How many women over 73 are dead 15 years after the diagnosis of clonazepam?',
  'How many old women over the age of 73 die 15 years after the diagnosis of Clonazepam?',
  'How many 73 years old or older grandmas died after 15 years of diagnosis of clonazepam?',
["What's the number of people below 72 years old if they died 5 years after nasal and sinus cancer diagnosis?",
  'How many people under 72 years old died after they suffered 5 years from nasal and sinus cancer?',
  'How many individuals under 72 have died 5 years after the diagnosis of nasal and breast cancer?',
  'What is the death toll of nasal and sinus cancer if these patients are younger than 72 and have survived for 5 years?',

    
    Question - How many females at 59 died 15 years after they diagnosed as asthma in 2012??\n
    Answer - output: count(*) ; motality ; endpoint = asthma , cut_year = 15 , age = 59 , year = 2012\n\n"
    '''
#     num = random.random() # random float 0.0 <= x < 1.0

    sex_q, sex_a = get_sex()
    age_q, age_a = get_age()
#     year_q, year_a = get_year()
    cut = random.choice(['0','1','5','10','15','five','ten','fifteen'])
    
    if cut == '0':
        q_list = [
            'How many '+sex_q+age_q+' died after '+endpoint+' diagnosis?',
            'How many '+sex_q+age_q+' died after they diagnosed with '+endpoint+'?',
            'How many '+sex_q+age_q+' died of '+endpoint+'?'
        ]
    elif cut == '1':
        q_list = [
            'How many '+sex_q+age_q+' died '+random.choice([cut,'one','a'])+' year after '+endpoint+' diagnosis?',
            'How many '+sex_q+age_q+' died '+random.choice([cut,'one','a'])+' year after they diagnosed with '+endpoint+'?'
        ]
    else:
        q_list = [
#             'How many '+sex_q+' with '+endpoint+' are dying '+cut+' years later, considering only those who are '+age_q+'?',
            'How many '+sex_q+age_q+' died '+cut+' years after '+endpoint+' diagnosis?',
            'How many '+sex_q+age_q+' died '+cut+' years after they diagnosed with '+endpoint+'?',
            'How many '+sex_q+age_q+' died of '+endpoint+' '+cut+' years after the first diagnosis?',
            'What was the number of '+sex_q+age_q+' who died '+cut+' years after having '+endpoint+'?',
            'What was the number of '+sex_q+age_q+' who died '+cut+' years after the diagnosis of '+endpoint+'?',
            'What is the number of '+sex_q+age_q+' who died '+cut+' years after being diagnosed with '+endpoint+'?',
            "What's the number of "+sex_q+age_q+' who died '+cut+' years after being diagnosed with '+endpoint+'?',
            'How many '+sex_q+age_q+' died after they suffered 5 years from '+endpoint+'?',
        ]
    
    q = random.choice(q_list)        
    a = 'count(*); mortality; '+', '.join(age_a+['cut_year = '+str(cut),'endpoint = '+endpoint]+sex_a)
    return input_prefix+q+input_suffix+output_prefix+a+output_suffix

def hr_cox(endpoint):
    '''
    return string(question), list(answer)
        
    Question - What is the hazard ratio of COPD given asthma after 2010?\n
    Answer - output: hr ; cox_hrs ; prior = asthma, outcome = COPD, year > 2010\n\n"
    '''
#     num = random.random() # random float 0.0 <= x < 1.0
    sex_dict = {
        'males':'male','females':'female','men':'man','women':'woman','people':'person','individuals':'person','patients':'patient'
    }

    sex_q, sex_a = get_sex()
    age_q, age_a = get_age()
    year_q, year_a = get_year()
    prior = 'stroke'#random.choice(diseases_train)
    prep = random.choice([' in ',' among ',' for '])#' in ' if random.random() < 0.5 else ' among '
    i_am_a = random.choice(['I am a ','This is a ',"I'm a "])
    
    if year_q == '':
        q_list = [
            'What is the hazard ratio of '+endpoint+' given '+prior+prep+sex_q+age_q+'?',
            "What's the hazard ratio of "+endpoint+' given '+prior+prep+sex_q+age_q+'?',
            'How much more likely are '+sex_q+age_q+' to develop '+endpoint+' if they are already diagnosed with '+prior+'?',
            'What extra risk of '+endpoint+' '+sex_q+age_q+' does a previous diagnosis of '+prior+' confer?',
            'How does a prior diagnosis of '+prior+' effect the risk of '+sex_q+age_q+' developing '+endpoint+'?'
            'If my '+sex_dict[sex_q]+' is'+age_q+' having '+prior+'. What is the risk ratio of '+endpoint+'?',
            'If my '+sex_dict[sex_q]+' is'+age_q+' diagnosed with '+prior+'. What is the risk ratio of having '+endpoint+'?',
            i_am_a+sex_dict[sex_q]+age_q+' with '+prior+'. What is my risk rate of having '+endpoint+'?',
        ]
    else:
        # What is the hazard ratio of COPD for those who had asthma after 2010?
        q_list = [
            'What is the hazard ratio of '+endpoint+' for those '+sex_q+age_q+' who had '+prior+year_q+'?',
            'What is the hazard ratio of '+endpoint+' given '+prior+prep+sex_q+age_q+year_q+'?',
            "What's the hazard ratio of "+endpoint+' given '+prior+prep+sex_q+age_q+year_q+'?',
            year_q.strip().capitalize()+", what is the hazard ratio of "+endpoint+' given '+prior+prep+sex_q+age_q+'?',
            year_q.strip().capitalize()+", what's the hazard ratio of "+endpoint+' given '+prior+prep+sex_q+age_q+'?'
            'How much more likely are '+sex_q+age_q+' to develop '+endpoint+' if they are already diagnosed with '+prior+year_q+'?',
            year_q.strip().capitalize()+', how much more likely are '+sex_q+age_q+' to develop '+endpoint+' if they are already diagnosed with '+prior+'?',
            year_q.strip().capitalize()+', what extra risk of '+endpoint+prep+sex_q+age_q+' does a previous diagnosis of '+prior+' confer?',
            'What extra risk of '+endpoint+prep+sex_q+age_q+' does a previous diagnosis of '+prior+' confer '+year_q+'?',
            'How does a prior diagnosis of '+prior+' effect the risk of '+sex_q+age_q+' developing '+endpoint+year_q+'?',
            year_q.strip().capitalize()+', how does a prior diagnosis of '+prior+' effect the risk of '+sex_q+age_q+' developing '+endpoint+'?'
        ]
        
    q = random.choice(q_list)     
    a = 'hr; cox_hrs; '+', '.join(age_a+['outcome = '+endpoint,'prior = '+prior]+sex_a+year_a)
    return input_prefix+q+input_suffix+output_prefix+a+output_suffix

def outcome_cox(endpoint):
    '''
    return string(question), list(answer)
    
    Question - Which diseases may be diagnosed after asthma for people between 25 to 65?\n
    Answer - output: hr ; cox_hrs ; prior = asthma, outcome = COPD, year > 2010\n\n"
    '''
    num = random.random() # random float 0.0 <= x < 1.0
    
    sex_dict = {
        'males':'male','females':'female','men':'man','women':'woman','people':'person','individuals':'person','patients':'patient'
    }

    sex_q, sex_a = get_sex()
    age_q, age_a = get_age()
    year_q, year_a = get_year()
    ep = random.choice(['disease','complication','comorbidity','illness','diagnosis','condition'])
    eps = random.choice(['diseases','complications','comorbidities','illnesses','diagnoses','conditions'])
    prep = random.choice([' in ',' among ',' for '])
    i_am_a = random.choice(['I am a ','This is a ',"I'm a "])
    
    if num < 0.2:
        q_list = [
            'My patient is a '+sex_dict[sex_q]+age_q+'. What is the most related '+ep+' given '+endpoint+year_q+'?',
            'What is the most related '+ep+' given '+endpoint+year_q+' if my patient is a '+sex_dict[sex_q]+age_q+'?',
            i_am_a+sex_dict[sex_q]+' with '+endpoint+'. What is the most possible '+ep+' I '+random.choice(['may','will','might'])+' get?',
            'Which '+ep+' may be diagnosed given '+endpoint+prep+sex_q+age_q+year_q+'?',
            'What is the most '+random.choice([' severe ',' possible '])+ep+prep+sex_q+age_q+year_q+' given '+endpoint+'?',
            'Given '+endpoint+', what is the strongest associated '+ep+prep+sex_q+age_q+year_q+'?',
            'Given '+endpoint+", what's the strongest associated "+ep+prep+sex_q+age_q+year_q+'?',
            'If '+sex_q+age_q+' are diagnosed with '+endpoint+', what '+ep+' are they most likely to suffer from in the period'+year_q+'?',
            'What are the most common '+ep+prep+age_q+' who had '+endpoint+year_q+'?',
            'Which '+ep+' is most commonly diagnosed'+prep+sex_q+age_q+year_q+'?',
        ]
        q = random.choice(q_list) 
        a = 'outcome; cox_hrs; '+', '.join(age_a+['prior = '+endpoint]+sex_a+year_a)+'; order by risk desc limit 1'
    elif num < 0.6:
        q_list = [
            'My patient is a '+sex_dict[sex_q]+age_q+'. What is the related '+eps+' given '+endpoint+year_q+'?',
            'What is the most related '+ep+' given '+endpoint+year_q+' if my patient is a '+sex_dict[sex_q]+age_q+year_q+'?',
            'Which '+eps+' may be diagnosed given '+endpoint+' for '+sex_q+age_q+year_q+'?',
            'What are the '+random.choice([' ',' severe ',' possible '])+eps+prep+sex_q+age_q+year_q+' given '+endpoint+'?',
            'Given '+endpoint+', what are the strongest associated '+eps+prep+sex_q+age_q+year_q+'?',
            'Given '+endpoint+", what're the strongest associated "+eps+prep+sex_q+age_q+year_q+'?',
            'If '+sex_q+age_q+' are diagnosed with '+endpoint+', what are they likely to suffer from in the period'+year_q+'?',
            'What are the most common '+eps+prep+age_q+' who had '+endpoint+year_q+'?',
            i_am_a+sex_dict[sex_q]+' with '+endpoint+'. What are the most possible '+eps+' this '+sex_dict[sex_q]+' will get?',
            'Which '+eps+' are most commonly diagnosed'+prep+sex_q+age_q+year_q+'?',
        ]
        q = random.choice(q_list) 
        a = 'outcome; cox_hrs; '+', '.join(age_a+['prior = '+endpoint]+sex_a+year_a)+'; order by risk desc limit 5'
    
    else:
        # What are the top 5 severe complications given cardiovascular diseases for men above 65?
        num_disease = random.choice([str(i) for i in range(2,11)]+['two','three','four','five','six','seven','eight','nine','ten'])
        q_list = [
            'My patient is a '+sex_dict[sex_q]+age_q+'. What is the '+num_disease+' most relevant '+eps+' given '+endpoint+year_q+'?',
            'What are the top '+num_disease+random.choice([' ',' severe ',' possible '])+eps+prep+sex_q+age_q+year_q+' given '+endpoint+'?',
            "What're the top "+num_disease+random.choice([' ',' severe ',' possible '])+eps+prep+sex_q+age_q+year_q+' given '+endpoint+'?',
            'Given '+endpoint+', what are the top '+num_disease+' strongest associated '+eps+prep+sex_q+age_q+year_q+'?',
            'Given '+endpoint+", what're the top "+num_disease+' strongest associated '+eps+prep+sex_q+age_q+year_q+'?',
            'If '+sex_q+age_q+' are diagnosed with '+endpoint+', what '+num_disease+' things are they likely to suffer from in the period'+year_q+'?',
            'What are the '+num_disease+' most common '+eps+prep+age_q+' who had '+endpoint+year_q+'?',
            i_am_a+sex_dict[sex_q]+' with '+endpoint+'. What are the '+num_disease+' most possible '+eps+' this '+sex_dict[sex_q]+' will get?',
            'Which '+num_disease+' '+eps+' are most commonly diagnosed'+prep+sex_q+age_q+year_q+'?',
        ]
        q = random.choice(q_list) 
        a = 'outcome; cox_hrs; '+', '.join(age_a+['prior = '+endpoint]+sex_a+year_a)+'; order by risk desc limit '+str(num_disease)
    
    return input_prefix+q+input_suffix+output_prefix+a+output_suffix



def prior_cox(endpoint):
    '''
    return string(question), list(answer)
     
    Question - Which diseases may be diagnosed before asthma?\n
    Answer - output: prior ; cox_hrs ; outcome = asthma ; order by risk desc limit 5\n\n"
    '''
#     num = random.random() # random float 0.0 <= x < 1.0
    ep = random.choice(['diseases','diagnoses','conditions','illnesses'])
    wh = 'what ' if random.random() < 0.5 else 'which '
    
    sex_q, sex_a = get_sex()
    age_q, age_a = get_age()
    year_q, year_a = get_year()
    
    if year_q == '':
        q_list = [
            'Prior to '+endpoint+', '+wh+ep+' are most likely to be diagnosed in '+sex_q+age_q+'?',
            wh.capitalize()+ep+' may be diagnosed before '+endpoint+' in '+sex_q+age_q+'?',
            wh.capitalize()+ep+' will '+sex_q+age_q+' have with before '+endpoint+'?',
        ]
    else:
        q_list = [
            'Prior to '+endpoint+', '+wh+ep+' are most likely to be diagnosed in '+sex_q+age_q+year_q+'?',
            wh.capitalize()+ep+' may be diagnosed before '+endpoint+' in '+sex_q+age_q+year_q+'?',
            year_q.strip().capitalize()+', '+wh+ep+' may be diagnosed before '+endpoint+' in '+sex_q+age_q+'?',
            wh.capitalize()+ep+' will '+sex_q+age_q+' have with before '+endpoint+year_q+'?',
            year_q.strip().capitalize()+', '+wh+ep+' will '+sex_q+age_q+' have with before '+endpoint+'?'
        ]
    q = random.choice(q_list)    
    a = 'prior; cox_hrs; '+', '.join(age_a+['outcome = '+endpoint]+sex_a+year_a)+'; order by risk desc limit 5'
    
    return input_prefix+q+input_suffix+output_prefix+a+output_suffix



In [126]:
re.findall('\d{4}','between 2001 and 2006')

['2001', '2006']

In [107]:
# data = [make_data1(i) for i in endpoint_list]+[make_data2(i) for i in endpoint_list]+[make_data3(i) for i in endpoint_list]
# data_train = [count_longre(i) for i in diseases_train]+\
#     [aveage_longre(i) for i in diseases_train]+\
#     [avgage_mortality(i) for i in diseases_train]+\
#     [count_mortality(i) for i in diseases_train]+\
#     [hr_cox(i) for i in diseases_train]+\
#     [prior_cox(i) for i in diseases_train]
# #     [outcome_cox(i) for i in diseases_train]+\

# data_valid = [count_longre(i) for i in diseases_valid]+\
#     [aveage_longre(i) for i in diseases_valid]+\
#     [avgage_mortality(i) for i in diseases_valid]+\
#     [count_mortality(i) for i in diseases_valid]+\
#     [hr_cox(i) for i in diseases_valid]+\
#     [prior_cox(i) for i in diseases_valid]
# #     [outcome_cox(i) for i in diseases_valid]+\

# train_data= []
# for i in data_train:
#     question = re.findall('input: (.+) \\n output: ',i)[0]
#     groups = i.split('; ')
#     group1 = re.findall('output: (.+)',groups[0])[0]
#     group2 = groups[1]
#     group3 = re.sub(' \\n\\n','',groups[2])
#     train_data.append([question, group1+'-'+group2, group3])    

# valid_data = []
# for i in data_valid:
#     question = re.findall('input: (.+) \\n output: ',i)[0]
#     groups = i.split('; ')
#     group1 = re.findall('output: (.+)',groups[0])[0]
#     group2 = groups[1]
#     group3 = re.sub(' \\n\\n','',groups[2])
#     valid_data.append([question, group1+'-'+group2, group3])    
# # order desc is not included in this version

In [141]:
choice_list = [count_longre, aveage_longre, avgage_mortality, count_mortality, hr_cox, prior_cox, outcome_cox]
data = [random.choice(choice_list)(i) for i in ['diabetes']*700]
data = list(set(data))

structures = []
for i in data:
    question = re.findall('input: (.+) \\n output: ',i)[0]
    groups = i.split('; ')
    group1 = re.findall('output: (.+)',groups[0])[0]
    group2 = groups[1]
    group3 = re.sub(' \\n\\n','',groups[2])
    structures.append([question, group1+'-'+group2, group3])    

In [351]:
len(structures)

684

In [353]:
structures[:3]

[['How many males less than 79 years old have been diagnosed with diabetes during 2008?',
  'count(*)-long_registry',
  'age < 79, endpoint = diabetes, sex = male, year = 2008'],
 ['During a 17 year period ending by 2020, how old on average age are the males who died of diabetes?',
  'avg(age)-mortality',
  'endpoint = diabetes, sex = male'],
 ['Prior to diabetes, what illnesses are most likely to be diagnosed in individuals after 2011?',
  'prior-cox_hrs',
  'outcome = diabetes, year > 2011']]

In [85]:
for i in data_valid[50:55]:
    print(i)

input: Before 2004, what extra risk of other and unspecified parapsoriasis in women more than 61 years old does a previous diagnosis of other diseases of anus and rectum confer? 
 output: hr; cox_hrs; age > 61, outcome = other and unspecified parapsoriasis, prior = other diseases of anus and rectum, sex = female, year < 2004 


input: In 2012, how many males from 29 to 99 are diagnosed with inflammatory disorders of breast? 
 output: count(*); long_registry; age between 29 and 99, endpoint = inflammatory disorders of breast, sex = male, year = 2012 


input: What is the average age of people who get diagnosed with type 2 diabetes for their 1st time  from 2002 to 2010? 
 output: avg(age); long_registry; endpoint = type 2 diabetes, year between 2002 and 2010 


input: During 2007, how many females are diagnosed with secondary thrombocytopenia? 
 output: count(*); long_registry; endpoint = secondary thrombocytopenia, sex = female, year = 2007 


input: What extra risk of other and unspeci

# Data augmentation
## <font color='green'>Back translation

In [145]:
questions = [i[0] for i in structures]

In [7]:
import googletrans
print(googletrans.LANGUAGES)

{'af': 'afrikaans', 'sq': 'albanian', 'am': 'amharic', 'ar': 'arabic', 'hy': 'armenian', 'az': 'azerbaijani', 'eu': 'basque', 'be': 'belarusian', 'bn': 'bengali', 'bs': 'bosnian', 'bg': 'bulgarian', 'ca': 'catalan', 'ceb': 'cebuano', 'ny': 'chichewa', 'zh-cn': 'chinese (simplified)', 'zh-tw': 'chinese (traditional)', 'co': 'corsican', 'hr': 'croatian', 'cs': 'czech', 'da': 'danish', 'nl': 'dutch', 'en': 'english', 'eo': 'esperanto', 'et': 'estonian', 'tl': 'filipino', 'fi': 'finnish', 'fr': 'french', 'fy': 'frisian', 'gl': 'galician', 'ka': 'georgian', 'de': 'german', 'el': 'greek', 'gu': 'gujarati', 'ht': 'haitian creole', 'ha': 'hausa', 'haw': 'hawaiian', 'iw': 'hebrew', 'he': 'hebrew', 'hi': 'hindi', 'hmn': 'hmong', 'hu': 'hungarian', 'is': 'icelandic', 'ig': 'igbo', 'id': 'indonesian', 'ga': 'irish', 'it': 'italian', 'ja': 'japanese', 'jw': 'javanese', 'kn': 'kannada', 'kk': 'kazakh', 'km': 'khmer', 'ko': 'korean', 'ku': 'kurdish (kurmanji)', 'ky': 'kyrgyz', 'lo': 'lao', 'la': 'lat

In [165]:
from googletrans import Translator
import time

In [166]:
def translate(sentences, src='en', dest='zh-cn'):
    translator = Translator()
    translator.raise_Exception = True
    encode, decode = [], []
    for i,t in enumerate(sentences): 
        try: 
            if (i != 0) & (i%150 == 0):
                time.sleep(5)
            encode.append(translator.translate(t, src=src, dest=dest).text)
            decode.append(translator.translate(encode[-1], src=dest, dest=src).text)
        except Exception as e: 
            print(i,e)
            break
    return encode, decode

def eur_trans(questions, dest):
    n = 0
    encodes, decodes = [], []
    while n < len(questions):
        encode_, decode_ = translate(questions[n:],dest=dest)
        encodes += encode_
        decodes += decode_
        if len(decode_) == 0:
            print('The number of quotas reaches maximum today :(')
            break
        n += len(decode_)
        print('n: ',n)
        if n < len(questions): time.sleep(20)
    print('This while loop is over.')
    return encodes, decodes

In [167]:
encodes_ja, decodes_ja = eur_trans(questions, 'ja')

0 'NoneType' object has no attribute 'group'
The number of quotas reaches maximum today :(
This while loop is over.


In [235]:
batch_cn, batch_fr, batch_fi, batch_it, batch_ja, batch_de = [], [], [], [], [], []
for i in range(len(decode1)):
    batch_cn.append([questions[i], encode1[i], decode1[i],''])
    batch_fr.append([questions[i], encode2[i], decode2[i],''])
    batch_fi.append([questions[i], encode3[i], decode3[i],''])
    batch_it.append([questions[i], encode4[i], decode4[i],''])
    batch_ja.append([questions[i], encode5[i], decode5[i],''])
    batch_de.append([questions[i], encode6[i], decode6[i],''])

In [87]:
import time, re
from selenium import webdriver
from selenium.webdriver.support.ui import Select

In [389]:
# to initiate in terminal for the 1st time use
# xattr -d com.apple.quarantine chromedriver
driver = webdriver.Chrome('/Users/feiwang/Documents/Projects/chromedriver')

In [199]:
def substitute(sentence):
    return re.sub(regex,'between '+re.findall(regex,b)[0][0]+' and '+re.findall(regex,b)[0][1],sentence)

def multi_translate(sentence, src_xpath, dest_xpath, translator):
    driver.find_element_by_xpath(src_xpath).clear()
#     if check_exists_by_xpath(delete_xpath): 
#         driver.find_element_by_xpath(delete_xpath).click()
    driver.find_element_by_xpath(src_xpath).send_keys(sentence)
    try:
        if translator == 'bing': 
            time.sleep(5)
            return driver.find_element_by_xpath(dest_xpath).get_attribute('value')
        elif translator == 'deepl': 
            time.sleep(3)
            return driver.find_element_by_xpath(dest_xpath).get_attribute('value')
        else: 
            time.sleep(3)
            return driver.find_element_by_xpath(dest_xpath).text
    except NoSuchElementException:
        time.sleep(5)
        if translator in ['bing','deepl']: 
            return driver.find_element_by_xpath(dest_xpath).get_attribute('value')
        else: return driver.find_element_by_xpath(dest_xpath).text

from selenium.common.exceptions import NoSuchElementException        
def check_exists_by_xpath(xpath):
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return True

### Youdao translate

In [152]:
def youdao_trans(questions):
    youdao_link = 'https://fanyi.youdao.com/'
    trans_link = '//*[@id="langSelect"]/span'
    encode_link = '//*[@id="languageSelect"]/li[3]/a'
    decode_link = '//*[@id="languageSelect"]/li[2]/a'
    src_xpath = '//*[@id="inputOriginal"]'
    dest_xpath = '//*[@id="transTarget"]/p/span'
#     delete_xpath = '//*[@id="inputDelete"]'

    encodes_youdao = []
    
    driver.get(youdao_link)
    
    driver.find_element_by_xpath(trans_link).click()
    time.sleep(1)
    driver.find_element_by_xpath(encode_link).click()
    
    for i in tqdm.tqdm(questions):
        ans = multi_translate(i,src_xpath,dest_xpath,'youdao')
#         print(ans)
        encodes_youdao.append(ans)
#     encodes_youdao = [multi_translate(i,src_xpath,dest_xpath,delete_xpath) for i in questions]
    
#     driver.find_element_by_xpath(trans_link).click()
#     time.sleep(2)
#     driver.find_element_by_xpath(decode_link).click()
    
#     for i in tqdm.tqdm(encodes_youdao):
#         ans = multi_translate(i,src_xpath,dest_xpath,'youdao')
# #         print(ans)
#         decodes_youdao.append(ans)
# #     decodes_youdao = [multi_translate(i,src_xpath,dest_xpath,delete_xpath) for i in encodes_youdao]
    
    return encodes_youdao

In [262]:
encodes_youdao = encodes_youdao[:105]

In [264]:
res = youdao_trans(questions[105:250])

100%|██████████| 145/145 [07:30<00:00,  3.11s/it]


### Bing translate

In [357]:
bing_link = 'https://www.bing.com/translator'
encode_link = '//*[@id="tta_srcsl"]'
decode_link = '//*[@id="tta_tgtsl"]'
src_xpath = '//*[@id="tta_input_ta"]'
dest_xpath = '//*[@id="tta_output_ta"]'

driver.get(bing_link)
Select(driver.find_element_by_xpath(encode_link)).select_by_visible_text('English')
Select(driver.find_element_by_xpath(decode_link)).select_by_visible_text('Chinese Simplified')
encodes_bing = [multi_translate(i,src_xpath,dest_xpath,'bing') for i in questions]

Select(driver.find_element_by_xpath(encode_link)).select_by_visible_text('Chinese Simplified')
Select(driver.find_element_by_xpath(decode_link)).select_by_visible_text('English')
decodes_bing = [multi_translate(i,src_xpath,dest_xpath,'bing') for i in encodes_bing]

In [159]:
for i in range(5):#len(encodes_bing)):
    print(questions[i])
    print(encodes_bing[i])
    print(decodes_bing[i])
    print()

How many males less than 79 years old have been diagnosed with diabetes during 2008?
2008年有多少79岁以下的男性被诊断出患有糖尿病？
How many men under the age of 79 were diagnosed with diabetes in 2008?

During a 17 year period ending by 2020, how old on average age are the males who died of diabetes?
在截至2020年的17年期间，死于糖尿病的男性的平均年龄是多少？
What is the average age of men who die of diabetes in the 17 years to 2020?

Prior to diabetes, what illnesses are most likely to be diagnosed in individuals after 2011?
在糖尿病之前，2011年以后哪些疾病最有可能在个人中诊断出来？
Before diabetes, which diseases are most likely to be diagnosed in individuals after 2011?

Frequency of individuals with diabetes below 51 years old by 2016.
到2016年，糖尿病患者年龄在51岁以下。
By 2016, people with diabetes are under the age of 51.

Prior to diabetes, which diagnoses are most likely to be diagnosed in females over 10 years old during a 2 year period ending by 2013?
在糖尿病之前，在截至2013年的2年期间，10岁以上的女性最有可能被诊断出糖尿病？
Before diabetes, women over the age of 10 were most likely to be dia

### DeepL translate

In [372]:
deepl_link = 'https://www.deepl.com/translator'
encode_link = '//*[@id="dl_translator"]/div[4]/div[4]/div[1]/div[1]/div/button'
encode_link_en = '//*[@id="dl_translator"]/div[4]/div[4]/div[1]/div[1]/div/div/button[7]'
encode_link_cn = '//*[@id="dl_translator"]/div[4]/div[4]/div[1]/div[1]/div/div/button[3]'
encode_link_jp = '//*[@id="dl_translator"]/div[4]/div[4]/div[1]/div[1]/div/div/button[15]'

decode_link = '//*[@id="dl_translator"]/div[4]/div[4]/div[3]/div[1]/div[2]/div[1]/button'
decode_link_cn = '//*[@id="dl_translator"]/div[4]/div[4]/div[3]/div[1]/div[2]/div[1]/div[2]/button[2]'
decode_link_en = '//*[@id="dl_translator"]/div[4]/div[4]/div[3]/div[1]/div[2]/div[1]/div[2]/button[6]'
decode_link_jp = '//*[@id="dl_translator"]/div[4]/div[4]/div[3]/div[1]/div[2]/div[1]/div[2]/button[15]'
src_xpath = '//*[@id="dl_translator"]/div[4]/div[4]/div[1]/div[2]/div[2]/textarea'
dest_xpath = '//*[@id="dl_translator"]/div[4]/div[4]/div[3]/div[3]/div[1]/textarea'

In [406]:
encodes_deepl, decodes_deepl = [], []

In [390]:
driver.get(deepl_link)

In [405]:
b = []
# driver.get(deepl_link)
# driver.find_element_by_xpath(encode_link).click()
# driver.find_element_by_xpath(encode_link_en).click()
# driver.find_element_by_xpath(decode_link).click()
# driver.find_element_by_xpath(decode_link_cn).click()
for i in questions1:
    b.append(multi_translate(i,src_xpath,dest_xpath,'deepl'))

In [407]:
encodes_deepl += b
len(encodes_deepl)

100

In [408]:
# driver.get(deepl_link)
# driver.find_element_by_xpath(encode_link).click()
# driver.find_element_by_xpath(encode_link_jp).click()
# driver.find_element_by_xpath(decode_link).click()
# driver.find_element_by_xpath(decode_link_en).click()
b = []
for i in encodes_deepl:
    b.append(multi_translate(i,src_xpath,dest_xpath,'deepl'))
decodes_deepl += b

In [410]:
decodes_deepl = b 
len(decodes_deepl)

100

In [403]:
len(encodes_deepl)

100

In [413]:
decodes_deepl_jp1 = decodes_deepl
decodes_deepl_jp += decodes_deepl
encodes_deepl_jp1 = encodes_deepl
encodes_deepl_jp += encodes_deepl

In [256]:
decodes_deepl_cn = decodes_deepl
encodes_deepl_cn = encodes_deepl

In [303]:
decodes_deepl_jp = decodes_deepl
encodes_deepl_jp = encodes_deepl

### Google translate

In [162]:
encode_link = 'https://translate.google.com/?sl=en&tl=zh-CN&op=translate'
decode_link = 'https://translate.google.com/?sl=zh-CN&tl=en&op=translate'
consent_xpath = '//*[@id="yDmH0d"]/c-wiz/div/div/div/div[2]/div[1]/div[4]/form/div[1]/div/button'
# regex = 'from ([\d]{,2}) to ([\d]{,2})'
# questions_ = [substitute(i) for i in questions]
# delete_xpath = '//*[@id="ow42"]/div/span/button'
src_xpath = '//*[@id="yDmH0d"]/c-wiz/div/div[2]/c-wiz/div[2]/c-wiz/div[1]/div[2]/div[2]/c-wiz[1]/span/span/div/textarea'
dest_xpath = '//*[@id="yDmH0d"]/c-wiz/div/div[2]/c-wiz/div[2]/c-wiz/div[1]/div[2]/div[2]/c-wiz[2]/div[5]/div/div[1]/span[1]/span/span'

driver.get(decode_link)
if check_exists_by_xpath(consent_xpath):
    driver.find_element_by_xpath(consent_xpath).click()
decodes_google_train2 = [multi_translate(i,src_xpath, dest_xpath, 'google') for i in tqdm.tqdm(encodes_bing)]

NameError: name 'encodes_bing_train' is not defined

In [None]:
decodes_google_train1 = [multi_translate(i,src_xpath, dest_xpath, 'google') for i in encodes_youdao]

### Merge data

In [414]:
structures_bt = [i for i in structures]
for i in range(len(structures)):
    structures_bt.append([decodes_bing[i],structures[i][1],structures[i][2]])
    structures_bt.append([decodes_deepl_jp[i],structures[i][1],structures[i][2]])
    structures_bt.append([decodes_deepl_cn[i],structures[i][1],structures[i][2]])
# structures_bt = list(zip(*structures_bt))

# data_df = pd.DataFrame({
#     'x':data_bt[0], 'x_bing':decodes_fr_train, 'x_deepl_cn':decodes_it_train, 'x_deepl_ja':decodes_ja_train, 
#     'y1':data_bt[1], 'y2':data_bt[2]
# })

# random.shuffle(data_bt)

In [424]:
len(diseases_valid)*2

562

In [422]:
# train_data = [[
#     re.sub('stroke',diseases_train[i+150],re.sub('diabetes',diseases_train[i],structures_bt[i][0])),
#     re.sub('stroke',diseases_train[i+150],re.sub('diabetes',diseases_train[i],structures_bt[i][1])),
#     re.sub('stroke',diseases_train[i+150],re.sub('diabetes',diseases_train[i],structures_bt[i][2])),
# ] for i in range(len(structures_bt))]

train_data = []
for i in range(len(diseases_train)*3):
    qna = random.choice(structures_bt)
    prior = random.choice(diseases_train)
    q = re.sub('stroke',prior,re.sub('diabetes',(diseases_train*3)[i],qna[0]))
    a1 = re.sub('stroke',prior,re.sub('diabetes',(diseases_train*3)[i],qna[1]))
    a2 = re.sub('stroke',prior,re.sub('diabetes',(diseases_train*3)[i],qna[2]))
    train_data.append([q,a1,a2])

valid_data = []
for i in range(len(diseases_valid)*3):
    qna = random.choice(structures_bt)
    prior = random.choice(diseases_valid)
    q = re.sub('stroke',prior,re.sub('diabetes',(diseases_valid*3)[i],qna[0]))
    a1 = re.sub('stroke',prior,re.sub('diabetes',(diseases_valid*3)[i],qna[1]))
    a2 = re.sub('stroke',prior,re.sub('diabetes',(diseases_valid*3)[i],qna[2]))
    valid_data.append([q,a1,a2])
    
# train_data = list(zip(*train_data)) # 7599
# valid_data = list(zip(*valid_data)) # 843

In [435]:
with open('custom_data_train.txt','w') as f:
    json.dump(train_data,f)

In [434]:
len(train_data)

7599

In [747]:
train = list(zip(*train_data_bt))
valid = list(zip(*valid_data_bt))
train_X, train_y = train[0], train[1]
valid_X, valid_y = valid[0], valid[1]

In [789]:
len(train[1])

2910

In [792]:
data_X = train[0] + valid[0]
data_y = train[1] + valid[1]

In [793]:
len(data_X)

3228

In [794]:
data_len = [*range(len(data_X))]

In [868]:
passed_endpoints = []
passed_indice = []
failed_indice = []

for i in data_X:
    sub_endpoints = []
    for j in diseases:
        if j in i.lower():
            sub_endpoints.append(j)
    if len(sub_endpoints) == 0:
        failed_indice.append(data_X.index(i))
    else:
        passed_endpoints.append(sub_endpoints)
        passed_indice.append(data_X.index(i))

In [866]:
# index_failed = [i for i in endpoints if type(i) == int]
# index_passed = [i for i in endpoints if type(i) == str] #list(set(data_len) - set(index_failed))

In [867]:
len(endpoints) == len(data_X)

True

In [869]:
trainx = [data_X[i] for i in passed_indice]
trainy1 = [data_y[i] for i in passed_indice]
trainy2 = [passed_endpoints[passed_indice.index(i)] for i in passed_indice]

validx = [data_X[i] for i in failed_indice]
validy1 = [data_y[i] for i in failed_indice]

In [873]:
trainx = [re.sub('in people','in the population',i) if 'in people' in i else i for i in trainx]
validx = [re.sub('in people','in the population',i) if 'in people' in i else i for i in validx]

In [888]:
with open('custom_data.json','w') as f:
    json.dump({'train_X':trainx,'train_y1':trainy1,'train_y2':trainy2,'valid_X':validx,'valid_y1':validy1},f)

In [824]:
len(endpoints)

3228

In [897]:
trainx.index('How many men are at age 18 after they were diagnosed with Ebola virus disease?')

10

In [None]:
# Which diseases may be diagnosed before heart failure for females from 10 to 95 from 2018 to 2019?

In [886]:
trainx = trainx[:43]+["What diseases can I have diagnosed before heart failure if I'm a female from 10-95 years old in 2018-2019?"]+trainx[44:]

In [841]:
endpoints[960]

['duloxetine']

In [890]:
trainy2[960]

['alcohol-related liver disease', "addison's disease", 'liver disease']

In [891]:
import pandas as pd

In [894]:
train_data = pd.DataFrame({'train_X':trainx,'train_y1':trainy1,'train_y2':trainy2})
valid_data = pd.DataFrame({'valid_X':validx,'valid_y1':validy1})

In [895]:
def save_xls(list_dfs, xls_path):
    with pd.ExcelWriter(xls_path) as writer:
        for n, df in enumerate(list_dfs):
            df.to_excel(writer,'sheet%s' % n)
        writer.save()

In [896]:
save_xls([train_data,valid_data], 'custom_data.xlsx')

In [933]:
save_xls([train_df,valid_df], 'custom_data_bt.xlsx')

In [940]:
len(train_df)

485

In [None]:
disease_name changed, 

In [951]:
train_datalist_copy = train_datalist

In [941]:
train_datalist=[]

In [1035]:
train_df.iloc[32,:].tolist() # not yet

["What is the average age of women who died after alzheimer's disease diagnosis?",
 "What is the average age of women deceased after the diagnosis of Alzheimer's disease?",
 "What is the average age of women who died after the diagnosis of Alzheimer's disease?",
 "What is the average age of died women after diagnosing Alzheimer's disease?",
 "What is the average age of women who die after being diagnosed with Alzheimer's disease?",
 "How old is the average age of women who died after the diagnosis of Alzheimer's disease?",
 'avg(age)-mortality',
 "cut_year = 0, endpoint = alzheimer's disease, sex = female"]

In [1053]:
data_list = [train_df.iloc[i,:].tolist() for i in range(32,len(train_df))]
# data_list

In [1050]:
[valid_df.iloc[18,:].tolist(),valid_df.iloc[19,:].tolist(),valid_df.iloc[20,:].tolist(),valid_df.iloc[21,:].tolist(),valid_df.iloc[22,:].tolist()]

[['Given mebendazole, what are the strongest associated comorbidities for women over 58 years old between 2019 and 2020?',
  'Give Mebendazole, what are the strongest comorbidities associated with women over 58 between 2019 and 2020?',
  'Given Mebendazole, what are the strongest associated comorbidities for women over 58 years between 2019 and 2020?',
  'Considering Mebendazole, what is the strongest related labor force for women over 58 years or older between 2019 and 2020?',
  'Given Mebendazor, what is the strongest correlation among women over the age of 58 between 2019 and 2020?',
  "During 2016 to 2020, how is the average age of 29-year-old children's first fever?",
  'outcome-cox_hrs',
  'age > 58, prior = mebendazole, sex = female, year between 2019 and 2020'],
 ['How many individuals under 73 years old died one year after fentanyl diagnosis?',
  'How many people under 73 years old died one year after the diagnosis of fentanyl?',
  'How many individuals under the age of 73 hav

In [None]:
'How many women between 1 and 56 have been diagnosed with corona virus?',
  'How many women aged from 1 to 56 are suffering from corona virus?',
  'What is the number of women between 1 and 56 diagnosed with corona virus?',
  'How many women are diagnosed corona virus if they are between 1 year old and 56 years old?',
  'How many women between the ages of 1 and 56 are diagnosed with overeating?',
  'How many women from 1 to 56 were diagnosed as corona virus?',

In [None]:
'What is the average age at the first event of paracetamol among individuals below 7 years old in 2016?',
  'What is the average age for the first paracetamol event among children under 7 in 2016?',
  'In 2016, what is the average age at the first paracetamol event in kids under 7 years old?',
  "During 2016, what's the mean age for first Paracetamor diagnosis for children under 7 years old?",
  'What is the average age of children below the age of 7 when they had first paracetamol diagnosis in 2016?',
  'What is the average age of children below 7 years old who got paracetamol for the first time in 2016?',
  'avg(age)-long_registry',
  'age < 7, endpoint = paracetamol, year = 2016'],

In [1049]:
this = [
['What is the hazard ratio of chloramphenicol for those females above 67 years old who had diazepam?',
  'If my patient is above 67 having diazepam. What is her risk ratio of chloramphenicol?',
  'If my patient is above 67 and was diagnosed with diazepam. What is her risk ratio of having chloramphenicol??',
  'What is the hazard ratio of chloramphenicol diagnosis if my patient is above 67 and was diagnosed with diazepam?',
  'What is the risk ratio of having chloramphenicol if I am above 67 years old with diazepam?',
  'For women with diazepam and over 67 years old, what is the risk ratio of chloramphenicol?',
  'hr-cox_hrs',
  'age > 67, outcome = chloramphenicol, prior = diazepam, sex = female'],
 ['What is the hazard ratio of irritable bowel syndrome in ibs given heparinoid for individuals over 89 years old between 2010 and 2011?',
  'What is the danger ratio of irritable intestine syndrome at IBS given the heparinoid for people over 89 between 2010 and 2011?',
  'between 2010 and 2011, What is the danger relationship of irritable bowel syndrome in IBS given eparinoid for individuals over 89 years between 2010 and 2011?',
  'What is the hazard ratio of IBS hypertherminal syndrome given heparinoids for individuals over 89 in the year of 2010 and 2011?',
  'Between 2010 and 2011, for people over the age of 89, what is their risk ratio of having irritable bowel syndrome in ibs after they have heparinoid?',
  'For seniors over 67 years old, what is the risk ratio of having irritable bowel syndrome in ibs if they have heparinoid??',
  'hr-cox_hrs',
  'age > 89, outcome = irritable bowel syndrome in ibs, prior = heparinoid, year between 2010 and 2011'],
 ['What is the hazard ratio of clobetasone for those men above 96 years old who had tramadol in 2009?',
  'What is the danger ratio of Clobésone for old men over 96 who had Tramadol in 2009?',
  "What's the risk ratio of Clobetasone if I have Tramadol in 2009? I'm an old man above 96",
  'What is the hazard ratio of Clobetasone given Tramador during 2009? He is over 96 years old?',
  'What is the risk ratio of Clobetasone to men over the age of 96 who had cumado in 2009?',
  'He is older than 96 years old in 2009, so what is his hazard ratio of clobetasone if he has tramadol already?',
  'hr-cox_hrs',
  'age > 96, outcome = clobetasone, prior = tramadol, sex = male, year = 2009'],
 ['Given leg cramps, what are the top 6 strongest associated complications for people over 55 years old before 2014?',
  'If my patients are above 55. What are the 6 strongest complications associated with leg cramps before 2014?',
  'Given leg cramps, what are the first 6 strongest complications associated with people over 55 years before 2014?',
  'What is the strongest relevant complications of leg cramps by 2014 if we are over 55 years old?',
  'Given leg cramps, what are the top 6 related complications by 2014? We are some people over the age of 55.',
  'What is the risk ratio of leg cramps by 2014 if my patients are all above 55 years old?',
  'outcome-cox_hrs',
  'age > 55, prior = leg cramps, year < 2014'],
 ['What is the mean age at the first event of fever between 2016 and 2020?',
  'What is the average age at the first fever event between 2016 and 2020?',
  'between 2016 and 2020, What is the average age for the first fever diagnosis between 2016 and 2020?',
  "between 2016 and 2020, What's the average age for the first fever diagnosis?",
  'how old on average are these people when they had first paracetamol diagnosis between 2016 and 2020?',
  "During 2016 to 2020, how old are these patients who had their first fever diagnosis?",
  'avg(age)-long_registry',
  'endpoint = fever, year between 2016 and 2020']
]
train_datalist += this

In [None]:
'What is the average age of men who died 5 years after obstructive sleep apnoea diagnosis?',
  'What is the average age of men who died 5 years after the diagnosis of obstructive sleep apnea?',
  "What's males's average life expectancy 5 years after their first diagnosis of obstructive sleep apnoea?",
  "What's the average life expectancy of men after a five-year survival of obstructive sleep apnoea?",
  'What is the average age of men who died after a five year survival of obstructive sleep apnea?',
  'What is the average age of men who die five years after diagnosis of obstructive sleep apnea?',
  'How old is the average age of men in obstructive sleep apnea after 5 years?',

'On average, how old are these women who died 5 years after pseudoqualine diagnosis?',

In [1054]:
with open('custom_data_bt.json','w') as f:
    json.dump(train_datalist,f)

In [1023]:
train_datalist = train_datalist[:-1]

In [1003]:
train_datalist = [['What are the top 5 comorbidities for males between 13 and 93 between 2002 and 2012 given clarithromycin?',
  'What are the 5 main comorbidities for men between 13 and 93 between 2002 and 2012 if they have clarithromycin?',
  'During 2002 and 2012, what are the first 5 comorbidities for males between 13 and 93 who have Claritromycin?',
  'Between 2002 and 2012, what is the top 5 diseases if a male aged from 13 to 93 years old has clarithromycin?',
  'What were the five combinations of clarithromycin for those men aged 13 to 93 between 2002 and 2012?',
  'From 2002 to 2012, what is the five main prolblems for males having clarithromycin, given age range between 13 and 93?',
  'outcome-cox_hrs',
  'age between 13 and 93, prior = clarithromycin, sex = male, year between 2002 and 2012'],
 ['Which diseases may be diagnosed before budesonide rectal foam and enemas for men at 36 years old after 2019?',
  'What diseases can be diagnosed before the rectal foam of budesonide among males  at age 36 after 2019?',
  'After 2019, what diseases can be diagnosed for those men at 36 years who already have rectal foam of Budesonide and enemies?',
  'Which disease may be diagnosed before budesonide rectal foam and enema for 36-year-old men since 2019?',
  'What diseases can be diagnosed before budson rectal foam and enemas in the male population aged 36 after 2019?',
  'Which diseases can be diagnosed since 2019 for a 36-year-old male with rectal foam of Budesonide and enemies?',
  'prior-cox_hrs',
  'age = 36, outcome = budesonide rectal foam and enemas, sex = male, year > 2019'],
 ['What is the average age of women who died after letrozole diagnosis?',
  'What is the average age of women deceased after the diagnosis of letrozole?',
  'What is the average age of women who died after the diagnosis of Letrozole?',
  'How old on average are women who died after letrozole?',
  'What is the average age of women who died after letrozole?',
  'On average, how old is a woman who died of the diagnosis of letrozole?',
  'avg(age)-mortality',
  'cut_year = 0, endpoint = letrozole, sex = female'],
 ['How many women above 70 years old have been diagnosed with flatulence?',
  'How many women older than 70 have been diagnosed with flatulence?',
  'How many women over 70 are diagnosed with flatulence?',
  'What is the number of women over 70 years old diagnosed with the flatulence?',
  'How many women over the age of 70 got flatulence?',
  'How many women over 70 years old suffer from flatulence?',
  'count(*)-long_registry',
  'age > 70, endpoint = flatulence, sex = female'],
 ['What is the hazard ratio of osteoporosis given ewing sarcoma for children under 9 years old after 2018?',
  "What's the risk of osteoporosis given to Ewing Sarcoma among children under 9 years after 2018?",
  'What is the risk ratio of osteoporosis if a kid under 9 diagnosed with ewing sarcoma after 2018?',
  'What is the hazard ratio of osteoporosis given the diagnosis of ewing sarcoma for nine-year-old people since 2018?',
  'After 2018, what is the risk of osteoporosis in children under 9 years old given ewing sarcoma?',
  'After 2018, how risky is osteoporosis for kids under the age of 9 if they have ewing sarcoma?',
  'hr-cox_hrs',
  'age < 29, outcome = osteoporosis, prior = ewing sarcoma, year > 2018'],
 ['Which diseases may be diagnosed before salbutamol inhaler for men below 66 years old between 2003 and 2008?',
  'In year 2003-2008, what diseases can be diagnosed before the Salbutamol inhaler in men under 66?',
  'What diseases can be diagnosed before Salbutamol inhaler for men under 66 between 2003 and 2008?',
  'Regarding to the men younger than 66 years old, which disease may be diagnosed before the Salbutamol inhaler from 2003 to 2008?',
  'What diseases could men under the age of 66 get diagnosed before they had inhaling salbutamol between 2003 and 2008?',
  'From 2003 to 2008, what diseases can be diagnosed before salbutamol inhaler for males younger than the age of 66?',
  'prior-cox_hrs',
  'age < 66, outcome = salbutamol inhaler, sex = male, year between 2003 and 2008'],
 ['How many males over 69 years old have been diagnosed with lisinopril before 2018?',
  'How many men over 69 were diagnosed with Lisinopril before 2018?',
  'How many males above 69 have been diagnosed with Lisinopril before 2018?',
  'What is the number of men over 69 years old before 2018 who have been diagnosed with Lisinopril?',
  'How many men over the age of 69 got diagnosed with lisinopril by 2018?',
  'Before 2018, how many men over 69 years old were diagnosed with lisinopril?',
  'count(*)-long_registry',
  'age > 69, endpoint = lisinopril, sex = male, year < 2018'],
 ['What is the mean age of men when they died after autistic spectrum disorder?',
  'What is the average age of men who deseased after an autistic spectrum disorder diagnosis?',
  'What is the average age of men who died after the diagnosis of autistic spectrum disturbance?',
  'What is the average age of men who died with ASD?',
  'How old on average age are the men who died of Asd diagnosis?',
  'What is age on average for those males who died with autistic spectrum disorder?',
  'avg(age)-mortality',
  'cut_year = 0, endpoint = autistic spectrum disorder, sex = male'],
 ['How many men between 20 and 21 died 5 years after they diagnosed with aciclovir?',
  'How many men aged from 20 to 21 has died 5 years after their diagnosis of aciclovir?',
  'How many men between 20 and 21 years have died 5 years after they got aciclovir?',
  'What is the number of 20-year-old men and 21-year-old men who died five years after being diagnosed with Acyclovir?',
  'How many men between the ages of 20 and 21 die five years after being diagnosed with acetic acid?',
  'How many men from 20 to 21 years old have died after being diagnosed with acetic acid for 5 years?',
  'count(*)-mortality',
  'age between 20 and 21, cut_year = 5, endpoint = aciclovir, sex = male'],
 ['How many women under 72 years old have been diagnosed with prednisolone?',
  'How many women under 72 have been diagnosed with prednisolone?',
  'How many women under 72 have been diagnosed with prednisolone?',
  'What is the number of women under 72 years old are diagnosed with prednisolone?',
  'How many women under the age of 72 have been diagnosed with Plinysoron?',
  'How many women under 72 years old are diagnosed with Pulinielon?',
  'count(*)-long_registry',
  'age < 72, endpoint = prednisolone, sex = female'],
 ['What is the average age of men who died 5 years after liver tumours diagnosis?',
  "After the diagnosis of liver tumors, what's the mean age of men who survived for another 5 years?",
  'What is the average age of men who died of the diagnosis of liver tumors after five-year survival?',
  'After a five-year survival, what is the average age of men who died of liver tumors?',
  'What is the average age of men who die of liver tumors 5 years after their first diagnosis?',
  'For those men who died 5 years after the first event of liver tumors, what is their mean age?',
  'avg(age)-mortality',
  'cut_year = 5, endpoint = liver tumours, sex = male'],
 ['Which diseases may be diagnosed before sildenafil for men at 11 years old?',
  'For 11-year-old kids, what diseases can be diagnosed first if they later get diagnosed with sildenafil?',
  'My patient is 11 years old. What diseases can he have before being diagnosed with Sildenafil?',
  'My little patient is 11 years old with sildenafil. Which diseases may he have before that?',
  'What diseases can be diagnosed before sildenafil if I have a boy at 11-year-old?',
  'What diseases can I diagnose before sildenafil if I am an 11-year-old male?',
  'prior-cox_hrs',
  'age = 11, outcome = sildenafil, sex = male'],
 ['What is the mean age of males who died 15 years after urinary tract infection diagnosis?',
  "After the diagnosis of urinary tract infection, what's the mean age of men who have survived for another 15 years?",
  "What is the average age of men who deseased 15 years after an urinary tract infection?",
  'On average, what is the average age of men who died of the diagnosis of urinary tract infection after a survival of fifteen years?',
  'What is the average age of men who stay alive for 15 years after they are diagnosed with urinary tract infection?',
  'How old on average age are men who died after 15 years of urinary tract infection?',
  'avg(age)-mortality',
  'cut_year = 15, endpoint = urinary tract infection, sex = male'],
 ['Which diseases may be diagnosed before diabetes for females under 31 years old after 2013?',
  'Since 2013, what diseases can women under 31 get diagnosed if they have diabetes later?',
  'After 2013, what diseases can be diagnosed for females under 31 years old if later they are diagnosed with diabetes?',
  "Which disease may be diagnosed before women's diabetes under 31 years of age after 2013?",
  'Before diabetes, what diseases could be diagnosed in women under the age of 31 after 2013?',
  'I am female below 31. After 2013, what diseases I may have if some time later I find I have diabetes?',
  'prior-cox_hrs',
  'age < 31, outcome = diabetes, sex = female, year > 2013'],
 ["What's the hazard ratio of headaches for those women below 41 years old who had vertigo between 2005 and 2019?",
  'Between 2005 and 2019, what is the danger ratio of headaches if a women under the age of 41 already had Vertigo?',
  'What is the danger of headache for those women under 41 years who had vertigo between 2005 and 2019?',
  'Given vertigo, what is the danger ratio of headaches for women younger 41 from 2005 to 2019?',
  'What is the risk of headaches among women under the age of 41 who have dizziness between 2005 and 2019?',
  "What is the dangerous ratio of women's headaches if they are under the 41 years old and dizzy during 2005 to 2019?",
  'hr-cox_hrs',
  'age < 41, outcome = headaches, prior = vertigo, sex = female, year between 2005 and 2019'],
 ["What's the number of people below 72 years old if they died 5 years after nasal and sinus cancer diagnosis?",
  'How many people under 72 years old died after they suffered 5 years from nasal and sinus cancer?',
  'How many individuals under 72 have died 5 years after the diagnosis of nasal and breast cancer?',
  'What was the number of individuals under 72 years old who died five years after the diagnosis of the nose and the sinus cancer?',
  'What is the death toll of nasal and sinus cancer if these patients are younger than 72 and have survived for 5 years?',
  'How many people under 72 years of age died 5 years after diagnosis of sinus cancer?',
  'count(*)-mortality',
  'age < 72, cut_year = 5, endpoint = nasal and sinus cancer'],
 ['How many individuals under 30 years old have been diagnosed with hyperglycaemia in high blood sugar in 2020?',
  'How many people under 30 were diagnosed with hyperglycemia in high blood glucose in 2020?',
  'How many individuals under the age of 30 were diagnosed with hyperglycemia in high blood sugar in 2020?',
  'What is the number of persons who were under 30 and were diagnosed with hyperglycemia in 2020?',
  'During 2020, how many people below the age of 30 got diagnosed with hyperglycemia?',
  'Do you know how many people in 2020 have been diagnosed with high blood sugar if they are younger than 30?',
  'count(*)-long_registry',
  'age < 30, endpoint = hyperglycaemia in high blood sugar, year = 2020'],
 ["What's the average age of individuals who died 15 years after tuberculosis in tb diagnosis?",
  "After bing diagnosed with Tuberculosis, what's the age on average of people who survived for 15 years more?",
  "What is the average age of people who died 15 years after Tuberculosis in the diagnosis of TB?",
  'What is the average age of individuals who die of tuberculosis in tb 15 years after their first diagnosis of this disease?',
  'For those who die of tuberculosis fifteen years after their first diagnosis, what is their average age?',
  'How old is the average age of people who died after 15 years of tuberculosis diagnosis?',
  'avg(age)-mortality',
  'cut_year = 15, endpoint = tuberculosis in tb'],
 ['Which diseases may be diagnosed given tranexamic acid for individuals over 49 years old after 2010?',
  'What diseases can be diagnosed after transeexamic acid for people over 49 years after 2010?',
  'Which diseases can be diagnosed with after being diagnosed tranexamic acid for individuals over 49 years after 2010?',
  'For those who are older than 49, which disease they can have if they already have tranexamic acid since 2010?',
  'Since 2010, what diseases may be diagnosed by people over the age of 49 who are given transsine?',
  'After 2010, some people over 49 may diagnose with amucine. Which disease they may have later?',
  'outcome-cox_hrs',
  'age > 49, prior = tranexamic acid, year > 2010'],
 ['What is the hazard ratio of osteoarthritis for those females between 92 and 99 who had cirrhosis between 2005 and 2020?',
  'What is the danger ratio of osteoarthritis for these women between 92 and 99 with cirrhosis between 2005 and 2020?',
  'During 2005-2020, what is the risk rate of osteoarthritis if an old lady between 92-99 with cirrhosis?',
  'What is the risk rate of deformer arthrosis if a women between 92-99 with cirrhosis during 2005-2020?',
  'If an old grandma who younger than 99 but older than 92 gets diagnosed with cirrhosis between 2005 and 2020. What is her risk of osteoarthritis?',
  'What is the dangerous ratio of osteoarthritis considering females aged between 92-99 from 2005 to 2020 given cirrhosis?',
  'hr-cox_hrs',
  'age between 92 and 99, outcome = osteoarthritis, prior = cirrhosis, sex = female, year between 2005 and 2020'],
 ['What is the average age of people who died after dysphagia in swallowing problems diagnosis?',
  'If one has dysphagia in swallowing problems. What is the average age of death?',
  'What is the average age of death given diagnosis of swallowing problems?',
  'What is the average age of death after being diagnosed with dysphagia in swallowing problems?',
  'What is the mean age of death regarding to dysphagia in swallowing problems?',
  'Considering dysphagia in swallowing problems, what is the age of death on average?',
  'avg(age)-mortality',
  'cut_year = 0, endpoint = dysphagia in swallowing problems'],
 ["Given mirtazapine, what're the strongest associated comorbidities for men over 99 years old in 2006?",
  'Given the Mirtazapine, what are the strongest associated comorbidities for senior males above 99 in 2006?',
  'Given Mirtazapine, what are the strongest associated comorbidities in 2006 for men over 99?',
  'Considering Meltazapine, what is the strongest related labor work for men over 99 in 2006?',
  'Given Mirtazapin, what was the strongest associated comormission among men over the age of 99 in 2006?',
  'What is the strongest related diseases in 2006 if men over 99 years old were diagnosed with mirtazapine?',
  'outcome-cox_hrs',
  'age > 99, prior = mirtazapine, sex = male, year = 2006'],
 ['What is the mean age at the first event of common heart conditions among women?',
  'What is the average age at the first event of common heart status among women?',
  'What is the average age of women who get diagnosed with common heart conditions for their first time?',
  'What is the average age when women get diagnosed with common heart conditions for the first time?',
  'What is the average age of women with common heart disease for the 1st time?',
  'How old on average are the women diagnosed with common heart disease for the 1st time?',
  'avg(age)-long_registry',
  'endpoint = common heart conditions, sex = female']]