In [41]:
import pandas as pd
import os
import glob
import numpy as np
from snorkel.labeling.model import LabelModel
from sklearn.metrics import classification_report

def reorder_columns(df, cols_in_front):
    """Reorder columns in a pandas dataframe so that the columns in cols_in_front are in front.
    """
    columns = list(df.columns)
    for col in cols_in_front:
        columns.remove(col)
    columns = cols_in_front + columns
    return df[columns]

def lf_results_reported(path='./CITT/'):
    df = pd.read_csv(path + 'calculated_values.txt', sep='|', low_memory=False)
    df['lf'] = df['were_results_reported'] == 't'
    df['lf'] = df['lf'].astype('int')
    df = reorder_columns(df, ['nct_id', 'lf'])
    return df

def lf_num_sponsors(path='./CITT/'):
    df = pd.read_csv(path + 'sponsors.txt', sep='|')
    df = df.groupby('nct_id')['name'].count().reset_index()
    df['lf'] = df['name'] > df['name'].quantile(.5)
    df['lf'] = df['lf'].fillna(-1).astype('int')
    df = reorder_columns(df, ['nct_id', 'lf'])
    return df

def lf_num_patients(path='./CITT/'):
    df = pd.read_csv(path + 'outcome_counts.txt', sep='|', low_memory=False)    
    df = df.groupby('nct_id').sum().reset_index() # pd df (NCTID, values, num_patients)
    df['lf'] = df['count'] > df['count'].quantile(.5)
    df['lf'] = df['lf'].fillna(-1).astype('int')
    df = reorder_columns(df, ['nct_id', 'lf'])
    return df

def lf_patient_drop(path='./CITT/'):
    # patient dropout
    df = pd.read_csv(os.path.join(path, 'drop_withdrawals.txt'), sep='|')
    df = df.groupby('nct_id').sum().reset_index() # pd df (NCTID, values, patient_drop)
    df['lf'] = df['count'] < df['count'].quantile(.5)
    df['lf'] = df['lf'].fillna(-1).astype('int')
    df = reorder_columns(df, ['nct_id', 'lf'])
    return df

def lf_sites(path='./CITT/'):
    # sites
    df = pd.read_csv(os.path.join(path, 'facilities.txt'), sep='|')
    df = df.groupby('nct_id')['name'].count().sort_values(ascending=False).reset_index()
    df = df.groupby('nct_id').mean().reset_index() # pd df (NCTID, values, sites)
    df['lf'] = df['name'] > df['name'].quantile(.5)
    df['lf'] = df['lf'].fillna(-1).astype('int')
    df = reorder_columns(df, ['nct_id', 'lf'])
    return df

def lf_pvalues(path='./CITT/'):
    # pvalues
    path = './CITT/'
    df = pd.read_csv(os.path.join(path, 'outcome_analyses.txt'), sep='|', low_memory=False)
    df['lf'] = df['p_value'] < .05 # 89406
    df['lf'] = df['lf'].astype('object')
    df.loc[df['p_value'].isna(), ['lf']] = pd.NA
    # df
    df = df[['nct_id', 'lf', 'p_value']]
    df = df.groupby('nct_id').mean().reset_index() # pd df (NCTID, values, pvalues)
    df['lf'] = df['lf'].apply(lambda x: np.round(x, 0) if pd.notna(x) else x)
    df['lf'] = df['lf'].fillna(-1).astype('int')
    df = reorder_columns(df, ['nct_id', 'lf'])
    return df

def lf_update_more_recent(path='./CITT/'):
    df = pd.read_csv(os.path.join(path, 'studies.txt'), sep='|', low_memory=False)
    df['last_update_submitted_date'] = pd.to_datetime(df['last_update_submitted_date'])
    df['completion_date'] = pd.to_datetime(df['completion_date'])
    df['update_days'] = (df['last_update_submitted_date'] - df['completion_date']).dt.days
    df['lf'] = df['update_days'].apply(lambda x: x > 0 if pd.notna(x) else x)
    df['lf'] = df['lf'].fillna(-1).astype('int')
    df = reorder_columns(df, ['nct_id', 'lf']) 
    return df

def lf_death_ae(path='./CITT/'):
    df = pd.read_csv(path+'reported_event_totals.txt', sep = '|')
    df = df[df['event_type'] == 'deaths'].fillna(0)
    df = df.groupby('nct_id')['subjects_affected'].sum().reset_index()
    df['lf'] = df['subjects_affected'] < 1
    df['lf'] = df['lf'].astype('int')
    df = reorder_columns(df, ['nct_id', 'lf'])
    return df

def lf_serious_ae(path='./CITT/'):
    df = pd.read_csv(path+'reported_event_totals.txt', sep = '|')
    df = df[df['event_type'] == 'serious'].fillna(0)
    df = df.groupby('nct_id')['subjects_affected'].sum().reset_index()
    df['lf'] = df['subjects_affected'] < 1
    df['lf'] = df['lf'].astype('int')
    df = reorder_columns(df, ['nct_id', 'lf'])
    return df

def lf_all_ae(path='./CITT/'):
    df = pd.read_csv(path+'reported_event_totals.txt', sep = '|').fillna(0)
    df = df.groupby('nct_id')['subjects_affected'].sum().reset_index()
    df['lf'] = df['subjects_affected'] < df['subjects_affected'].quantile(.5)
    df['lf'] = df['lf'].astype('int')
    df = reorder_columns(df, ['nct_id', 'lf'])
    return df

def lf_amendments(path='./stock_price/labels_and_tickers.csv'):
    df = pd.read_csv(path)
    df['lf'] = df['amendment_counts'] > df['amendment_counts'].quantile(.5)
    df['lf'] = df['lf'].astype('int')
    df = reorder_columns(df, ['nct_id', 'lf'])
    return df
    
def lf_stock_price(path='./stock_price/labels_and_tickers.csv'):
    df = pd.read_csv(path)
    df['lf'] = df['Slope'] > 0
    df['lf'] = df['lf'].astype('int')
    df = reorder_columns(df, ['nct_id', 'lf'])
    return df
    
def lf_linkage(path='./Trial Linkage/'):
    df1 = pd.read_csv(path+'Extracted trial outcomes/Phase 1_Early Phase 1_trial_linkage_outcome_df.csv')
    df2 = pd.read_csv(path+'Extracted trial outcomes/Phase 2_Phase 1_Phase 2_trial_linkage_outcome_df.csv')
    df3 = pd.read_csv(path+'Extracted trial outcomes/Phase 3_Phase 2_Phase 3_trial_linkage_outcome_df.csv')
    df = pd.concat([df1, df2, df3])
    df.rename(columns={'Unnamed: 0': 'nct_id'}, inplace=True)
    df['lf'] = 0
    df.loc[df['outcome']=='Not sure',['lf']] = 1
    df.loc[df['outcome']=='Success', ['lf']] = 1
    df = reorder_columns(df, ['nct_id', 'lf'])
    return df


def get_lfs(path='./CITT/'):
    dfs = [lf_results_reported(path=path), 
           lf_num_sponsors(path=path),
           lf_num_patients(path=path), 
           lf_patient_drop(path=path), 
           lf_sites(path=path), 
           lf_pvalues(path=path),
           lf_update_more_recent(path=path),
           lf_death_ae(path=path),
           lf_serious_ae(path=path),
           lf_all_ae(path=path),
           lf_amendments(),
           lf_stock_price(),
           lf_linkage()]

    all_ids = set() # set of all nct_ids
    for df in dfs:
        all_ids = all_ids | set(df['nct_id'])

    all_df = pd.DataFrame(all_ids, columns=['nct_id']) # combine all dfs
    for i, df in enumerate(dfs):
        all_df = pd.merge(all_df, df.iloc[:,:2].rename(columns={'lf':'lf'+str(i)}), on='nct_id', how='left')
    all_df = all_df.fillna(-1)
    # all_df.iloc[:,1:] = all_df.iloc[:,1:].astype('int')
    return all_df

df = get_lfs()


In [42]:
# L = np.array([[0, 0, -1], [-1, 0, 1], [1, -1, 0]])
# Y_dev = [0, 1, 0]
L = df.iloc[:,1:].values.astype('int')
label_model = LabelModel(verbose=False)
# label_model.fit(L)
# label_model.fit(L, Y_dev=Y_dev, seed=2020, lr=0.05)
# label_model.fit(L, class_balance=[0.67, 0.33], seed=0)
label_model.fit(L, class_balance=[0.5, 0.5], lr=0.05, seed=0)
pred = label_model.predict(L)
print('predicted label distribution', np.unique(pred, return_counts=True))
df['pred'] = pred
df['pred'] = df['pred'].astype('int')

# classification report
path = './clinical-trial-outcome-prediction/data/'
all_files = glob.glob(os.path.join(path, "phase*train.csv")) + glob.glob(os.path.join(path, "phase*valid.csv"))
hint = pd.concat((pd.read_csv(f) for f in all_files))
hint.rename(columns={'nctid': 'nct_id'}, inplace=True)

for phase in ['phase 1', 'phase 2', 'phase 3']:
    hint_subset = hint[hint['phase'].str.contains(phase)]

    combined = pd.merge(hint_subset, df, on='nct_id', how='left')
    combined = combined.dropna(subset=['pred'])
    combined = combined[combined['pred'] != -1]
    print(phase, hint_subset.shape, combined.shape)
    report = classification_report(combined['label'], combined['pred'], output_dict=True)
    print(report['1'])

100%|██████████| 100/100 [00:00<00:00, 1677.16epoch/s]


predicted label distribution (array([0, 1]), array([310351, 169410]))
phase 1 (1596, 10) (1632, 24)
{'precision': 0.5614258434118395, 'recall': 0.9566160520607375, 'f1-score': 0.7075812274368231, 'support': 922.0}
phase 2 (5065, 10) (5225, 24)
{'precision': 0.4873267326732673, 'recall': 0.9669941060903733, 'f1-score': 0.6480579328505596, 'support': 2545.0}
phase 3 (3617, 10) (3860, 24)
{'precision': 0.6515311510031679, 'recall': 0.9887820512820513, 'f1-score': 0.7854869509866327, 'support': 2496.0}


In [127]:
import pandas as pd
import numpy as np
news_df = pd.read_csv('./stock_news_logs/news.csv')
news_title_embedding = np.load('./news_title_embeddings.npy')
print(news_df.shape, news_title_embedding.shape)

In [35]:
import os
import pickle
import json
import pandas as pd
import numpy as np

path = './CITT/'
# df = pd.read_csv(path + 'studies.txt', sep='|', low_memory=False)
# df = pd.read_csv(path + 'sponsors.txt', sep='|')
# df = pd.read_csv(path + 'drop_withdrawals.txt', sep='|', low_memory=False)
# df = pd.read_csv(path + 'reported_events.txt', sep='|', low_memory=False)
# df = pd.read_csv(path + 'outcome_counts.txt', sep='|', low_memory=False)
# df = pd.read_csv('./labels_amendments_ae_serious_death.csv')
df1 = pd.read_csv('./Trial Linkage/Extracted trial outcomes/Phase 1_Early Phase 1_trial_linkage_outcome_df.csv')
df2 = pd.read_csv('./Trial Linkage/Extracted trial outcomes/Phase 2_Phase 1_Phase 2_trial_linkage_outcome_df.csv')
df3 = pd.read_csv('./Trial Linkage/Extracted trial outcomes/Phase 3_Phase 2_Phase 3_trial_linkage_outcome_df.csv')
df = pd.concat([df1, df2, df3])
df.rename(columns={'Unnamed: 0': 'NCTID'}, inplace=True)
df['lf'] = 0
df.loc[df['outcome']=='Not sure',['lf']] = -1
df.loc[df['outcome']=='Success', ['lf']] = 1
# print(np.unique(df['outcome'], return_counts=True))

In [5]:
import os
import pickle
import json

# os.listdir('./stock_news_logs/_names/')
with open('./filtered_ticker_dict_642.pkl', 'rb') as f:
    ticker_dict = pickle.load(f)
ticker_dict_inv = {v: k for k, v in ticker_dict.items()}
# log_dir = './stock_news_logs/_names/'
# for ticker in os.listdir('./stock_news_logs/'):
#     if os.path.exists(f'./stock_news_logs/{ticker}/news.json'):
#         with open(f'./stock_news_logs/{ticker}/news.json', 'rb') as f:
#             name_dict = json.load(f)
#         name = ticker_dict_inv[ticker].lower()
#         os.makedirs(log_dir+ name.lower(), exist_ok=True)
#         with open(log_dir + name.lower() + '/news.json', 'w') as f:
#             json.dump(name_dict, f)

    # ticker_dict.update(name_dict)
    # ticker_dict_inv = {v: k for k, v in ticker_dict.items()}


{'GILD': 'Gilead Sciences',
 'NVS': 'Novartis',
 'ABBV': 'AbbVie',
 'PFE': 'Pfizer',
 'PHAT': 'Phathom Pharmaceuticals, Inc.',
 'ZBH': 'Zimmer Biomet',
 'BSX': 'Boston Scientific Corporation',
 'INCY': 'Incyte Corporation',
 'NVO': 'Novo Nordisk A/S',
 'NBIX': 'Neurocrine Biosciences',
 'SNY': 'Sanofi',
 'BCEL': 'Atreca, Inc.',
 'AZN': 'AstraZeneca',
 'BGNE': 'BeiGene',
 'TAK': 'Takeda',
 'BLUE': 'bluebird bio',
 'LLY': 'Eli Lilly and Company',
 'DXCM': 'DexCom, Inc.',
 'BDX': 'Becton, Dickinson and Company',
 'AMGN': 'Amgen',
 'RDY': "Dr. Reddy's Laboratories Limited",
 'BMY': 'Bristol-Myers Squibb',
 'DRRX': 'Durect',
 'BIIB': 'Biogen',
 'CAH': 'Cardinal Health',
 'ABT': 'Abbott',
 'MMM': '3M',
 'SIGA': 'SIGA Technologies',
 'MRUS': 'Merus N.V.',
 'BNTX': 'BioNTech SE',
 'CARA': 'Cara Therapeutics, Inc.',
 'SWTX': 'SpringWorks Therapeutics, Inc.',
 'ANTX': 'AN2 Therapeutics, Inc',
 'LYEL': 'Lyell Immunopharma, Inc.',
 'EW': 'Edwards Lifesciences',
 'AAPL': 'Apple Inc.',
 'CERS': 'Cer

In [23]:
import os

log_dir = './stock_news_logs/_names/'
failed = []
for name in sorted(os.listdir(log_dir)):
    if not os.path.exists(log_dir + name + '/news.json'):
        print(name)
        failed.append(name)


adir, a servier group company
alk-abelló a
ascendis pharma a
ascendis pharma endocrinology division a
brainfarma industria química e farmacêutica s
coloplast a
evaxion biotech a
h. lundbeck a
laboratório teuto brasileiro s
nordic bioscience a
novo nordisk a
pharmacosmos a


In [None]:
list(news_df['title'].values)

# weighted average of the RELEVANT TRIALS, publications may be before completion date due to multisite, international trials, 
# need to dig into examples and show in the paper
# https://pathos.com/pipeline/ is example of a company that does this
# interpretability is key, need to emphasize this in the dataset creation
# show table statictis of diseases, correlations of different labels based on disease
# phase 1 should be noisier
# need to keep refreshing the pipleine, monthly basis? Integrated dashboard, ask gen ai for interfacing

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
data_path = './CITT/'

sponsors = pd.read_csv(data_path + 'sponsors.txt', sep='|')
studies = pd.read_csv(data_path + 'studies.txt', sep='|', low_memory=False)

studies['study_first_submitted_date'] = pd.to_datetime(studies['study_first_submitted_date'])
sponsors = pd.merge(sponsors, studies[['nct_id', 'phase', 'study_first_submitted_date']], on='nct_id', how='left')
sponsors = sponsors[sponsors['agency_class']=='INDUSTRY']
sponsors.dropna(inplace=True)
sponsors = sponsors[sponsors['phase'].str.contains('Phase 3')]

# len(sponsors[sponsors['agency_class']=='INDUSTRY']['name'].unique()) #15277

# # top sponsors and their cumulated trial coverage
# num_sponsors = []
# coverage = []
# for i in range(100, len(sponsors['name'].unique()), 100):
#     top_sponsors = sponsors['name'].value_counts().head(i)
#     coverage_ = top_sponsors.sum() / sponsors['name'].value_counts().sum()
#     num_sponsors.append(i)
#     coverage.append(coverage_)
# plt.scatter(num_sponsors, coverage, label='Cumulated trial coverage')
# # plt.xscale('log')
# plt.xlabel('Number of top sponsors')
# plt.ylabel('Cumulated trial coverage')
# plt.grid()

top_sponsors = sponsors['name'].value_counts().head(1000)
coverage_ = top_sponsors.sum() / sponsors['name'].value_counts().sum()
combined = pd.merge(top_sponsors.reset_index(),
                    sponsors.groupby('name')['study_first_submitted_date'].min().reset_index(),
                    on='name', how='left')
# print(top_sponsors)

with open('./filtered_ticker_dict_642.pkl', 'rb') as f:
# with open('./ticker_dict_211.pkl', 'rb') as f:
    ticker_dict = pickle.load(f)
ticker_df = pd.DataFrame(ticker_dict.items(), columns=['name', 'ticker'])
combined = pd.merge(combined, ticker_df, on='name', how='left')



In [None]:
print(combined.isna().sum())
combined.to_csv('top_sponsors.csv', index=False)

In [None]:
import os

for name in combined['name']:
    date = combined[combined['name']==name]['study_first_submitted_date'].min()
    print(date.year, date.month, date.day)
    os.makedirs('./stock_news_logs/_names/' + name.lower(), exist_ok=True)

In [None]:
studies['study_first_submitted_date'].isna().sum()

In [None]:
import datetime
#convert to datetime
def convert_to_datetime(date_str):
# Mon, 20 May 1996 07:00:00 GMT
    try:
        return datetime.datetime.strptime(date_str, '%a, %d %b %Y %H:%M:%S %Z')
    except:
        return pd.NA

with open('./filtered_ticker_dict_642.pkl', 'rb') as f:
# with open('./ticker_dict_211.pkl', 'rb') as f:
    ticker_dict = pickle.load(f)

all_company_dfs = []
for company, ticker in ticker_dict.items():
    if not os.path.exists(os.path.join('./stock_news_logs', ticker, 'news.json')):     # if exists
        continue
    with open(os.path.join('./stock_news_logs', ticker, 'news.json'), 'rb') as f:
        news = json.load(f)
    # print(company, ticker, news)

    all_titles = []
    all_descriptions = []
    all_dates = []
    all_publishers = []
    for k in news.keys():
        if len(news[k]) > 0:
            # print(k, news[k][0].keys())
            for i in range(len(news[k])):
                print(news[k][i]['published date'])
                date = convert_to_datetime(news[k][i]['published date'])
                print(date)
                all_dates.append(date)
                all_titles.append(news[k][i]['title'])
                all_descriptions.append(news[k][i]['description'])
                all_publishers.append(news[k][i]['publisher']['title'])

    df = pd.DataFrame({'date': all_dates, 'title': all_titles, 'description': all_descriptions, 'publisher': all_publishers})
    df['ticker'] = ticker
    all_company_dfs.append(df)
all_company_dfs = pd.concat(all_company_dfs)

In [None]:
# # look at orange book
# import pandas as pd
# # df = pd.read_csv('./drugbank_all_drug_links.csv.zip')
# drugbank_approved = pd.read_csv('./drugbank/drugbank_approved_drug_links.csv.zip')

# exclusivity = pd.read_csv('./EOBZIP_2024_02/exclusivity.txt', sep='~')
# patent = pd.read_csv('./EOBZIP_2024_02/patent.txt', sep='~')
# products = pd.read_csv('./EOBZIP_2024_02/products.txt', sep='~')
# drug_vocab = pd.read_csv('./drugbank/drugbank vocabulary.csv')

# drugbank_approved['Name'] = drugbank_approved['Name'].str.lower()
# drugbank_approved_names = set(drugbank_approved['Name'].unique())

# products[['Ingredient', 'Trade_Name']] = products[['Ingredient', 'Trade_Name']].apply(lambda x: x.str.lower().str.strip())
# products['Ingredient'] = products['Ingredient'].str.split(';')
# products['Ingredient'] = products['Ingredient'].apply(lambda x: [y.strip() for y in x] if isinstance(x, list) else [])
# unique_ingredients = set(products['Ingredient'].explode().unique())

# products['Trade_Name'] = products['Trade_Name'].str.split(' and ')
# products['Trade_Name'] = products['Trade_Name'].apply(lambda x: [y.strip() for y in x] if isinstance(x, list) else [])
# unique_trade_names = set(products['Trade_Name'].explode().unique())

# drug_vocab['Synonyms'] = drug_vocab['Synonyms'].str.split('|')
# drug_vocab['Synonyms'] = drug_vocab['Synonyms'].apply(lambda x: [y.lower().strip() for y in x] if isinstance(x, list) else [])

# all_synonyms = set(drug_vocab['Synonyms'].explode().dropna().unique())
# common_names = set(drug_vocab['Common name'].str.lower().unique())
# all_names = common_names.union(all_synonyms)

# print('unique_ingredients', len(unique_ingredients)) # 2264
# print(len(unique_ingredients.intersection(all_synonyms))) # 1088
# print(len(unique_ingredients.intersection(common_names))) # 1095
# print(len(unique_ingredients.intersection(all_names))) # 1183

# print('unique_trade_names', len(unique_trade_names)) # 7071
# print(len(unique_trade_names.intersection(all_synonyms))) # 669
# print(len(unique_trade_names.intersection(common_names))) # 629
# print(len(unique_trade_names.intersection(all_names))) # 701

# print('drugbank_approved_names', len(drugbank_approved_names)) # 4389
# print(len(drugbank_approved_names.intersection(unique_ingredients))) # 1074
# print(len(drugbank_approved_names.intersection(unique_trade_names))) # 624

In [None]:
# import sys
# sys.path.append('./GNews/')
# from gnews import GNews

# google_news = GNews()
# pakistan_news = google_news.get_news('Pakistan')
# print(pakistan_news[0])    


In [None]:
# import yfinance as yf

# msft = yf.Ticker("MSFT")

# # get all stock info
# msft.info

# # get historical market data
# hist = msft.history(period="1mo")

# # show meta information about the history (requires history() to be called first)
# msft.history_metadata

# # show actions (dividends, splits, capital gains)
# msft.actions
# msft.dividends
# msft.splits
# msft.capital_gains  # only for mutual funds & etfs

# # show share count
# msft.get_shares_full(start="2022-01-01", end=None)

# # show financials:
# # - income statement
# msft.income_stmt
# msft.quarterly_income_stmt
# # - balance sheet
# msft.balance_sheet
# msft.quarterly_balance_sheet
# # - cash flow statement
# msft.cashflow
# msft.quarterly_cashflow
# # see `Ticker.get_income_stmt()` for more options

# # # show holders
# # msft.major_holders
# # msft.institutional_holders
# # msft.mutualfund_holders
# # msft.insider_transactions
# # msft.insider_purchases
# # msft.insider_roster_holders

# # # show recommendations
# # msft.recommendations
# # msft.recommendations_summary
# # msft.upgrades_downgrades

# # Show future and historic earnings dates, returns at most next 4 quarters and last 8 quarters by default. 
# # Note: If more are needed use msft.get_earnings_dates(limit=XX) with increased limit argument.
# msft.earnings_dates

# # # show ISIN code - *experimental*
# # # ISIN = International Securities Identification Number
# # msft.isin

# # # show options expirations
# # msft.options

# # # show news
# # msft.news

# # # get option chain for specific expiration
# # opt = msft.option_chain('YYYY-MM-DD')
# # # data available via: opt.calls, opt.puts

In [None]:
# import requests
# def get_ticker(company_name):
#     res = requests.get(url="https://query2.finance.yahoo.com/v1/finance/search", 
#                        params={
#                            "q": company_name, 
#                            }, 
#                        headers={
#                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
#                            }
#                            )
#     data = res.json()
#     return data
#     # print(data)
#     # company_code = data['quotes'][0]['symbol']
#     # return company_code

# data = get_ticker('')
# data.keys() #dict_keys(['explains', 'count', 'quotes', 'news', 'nav', 'lists', 'researchReports', 'screenerFieldResults', 'totalTime', 'timeTakenForQuotes', 'timeTakenForNews', 'timeTakenForAlgowatchlist', 'timeTakenForPredefinedScreener', 'timeTakenForCrunchbase', 'timeTakenForNav', 'timeTakenForResearchReports', 'timeTakenForScreenerField', 'timeTakenForCulturalAssets'])


In [None]:
# import json

# # with open("./tmp/NCT00102336_pubmed_abs.json", 'r') as f:
# with open("./tmp/NCT00102336_pubmed_abs.json", 'r') as f:
#     d = json.load(f)

# # print(len(d['References']))
# # print(d['References'][0].keys())
# # for i in range(len(d['References'])):
# #     print(i, d['References'][i]['Reference type'])

# prompt = '''
# You are given the PubMed abstract for a clinical trial. Your task is to use summarize important values of the trial into a json format. After summarization, you must predict the trial outcome.

# Guidelines:
# - **Completeness**: Ensure there are no missing statistical tests and descriptions in json output.
# - **Data Verification**: Before concluding the final answer, always verify that your observations align with the original trial description. Do not create any new information.

# Output Format:
# {
#     "description": <string of text summary of the trial outcome>,
#     "extracted features": [
#         {
#         "description: <string, text describing the feature extracted: e.g. "platelet response", "number of participants", "confidence interval", "p-value", "number", "study design">
#         "value": <float or string of the values of above description>
#         }, ... # can repeat as many times as needed
#     ]
#     "outcome": <string, must either "succeed", "fail", "unsure">,
#     "outcome reasoning": <string, reasoning as to why you predicted the outcome. Most trials succeed if the primary p-value < 0.05>
# }

# Notes for final output:
# - Ensure the final answer format is a valid json dictionary. 
# - Ensure all "value" are of float or string ONLY  

# Here is the abstract: 

# [ABSTRACT]

# Begin!
# '''

# abs = d['References'][5]['Abstract']

# print(prompt.replace("[ABSTRACT]", abs))
# # print(prompt)