In [1]:
import pandas as pd
import numpy as np
import random 
import math
import logging
import time 

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from scipy.spatial import distance

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

  from .autonotebook import tqdm as notebook_tqdm


### Create Paired Sampled Data

In [2]:
df_raw = pd.read_csv("/mnt/workspace/data/var_doc_complete.csv")

In [3]:
np.random.default_rng(seed=42)

Generator(PCG64) at 0x7F550230C8C0

In [4]:
df = df_raw[["variable_description", "concept", "study"]].dropna().drop_duplicates().reset_index(drop=True)
df

Unnamed: 0,variable_description,concept,study
0,Atrial fibrillation,AFIB,ARIC
1,Incident AF,AFIB,ARIC
2,time to incident af,AFIB,ARIC
3,DAYS SINCE EXAM 1,AFIB,ARIC
4,ATRIAL FIBRILLATION,AFIB,Framingham
...,...,...,...
880,"Weight (pounds), Exam 6",WGT,Framingham
881,"Weight (pounds), Exam 7",WGT,Framingham
882,"Weight (pounds), Exam 8",WGT,Framingham
883,"Weight (pounds), Exam 9",WGT,Framingham


In [5]:
df_conc = df[["concept"]].rename(columns={"concept": "variable_description"})
df_conc["concept"] = df_conc["variable_description"]
df_conc["study"] = 'None'
df_conc = df_conc.drop_duplicates().reset_index(drop=True)
df_conc

Unnamed: 0,variable_description,concept,study
0,AFIB,AFIB,
1,AGE,AGE,
2,ALCOHOL,ALCOHOL,
3,ANYCHOLMED,ANYCHOLMED,
4,ASPIRIN,ASPIRIN,
...,...,...,...
59,TRIG,TRIG,
60,VALVDIS,VALVDIS,
61,VEGETABLES,VEGETABLES,
62,VISDAY,VISDAY,


In [6]:
df = pd.concat([df, df_conc], axis=0, ignore_index=True).drop_duplicates(subset=['variable_description', 'concept', 'study'])
df

Unnamed: 0,variable_description,concept,study
0,Atrial fibrillation,AFIB,ARIC
1,Incident AF,AFIB,ARIC
2,time to incident af,AFIB,ARIC
3,DAYS SINCE EXAM 1,AFIB,ARIC
4,ATRIAL FIBRILLATION,AFIB,Framingham
...,...,...,...
944,TRIG,TRIG,
945,VALVDIS,VALVDIS,
946,VEGETABLES,VEGETABLES,
947,VISDAY,VISDAY,


In [7]:
# ensure protobuf = 3.20.x
from biobert_embedding.embedding import BiobertEmbedding
biobert = BiobertEmbedding(model_path='/mnt/workspace/packages/biobert/')

2024-08-08 17:13:37.286013: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-08-08 17:13:38.327711: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda-11.6/lib64/
2024-08-08 17:13:38.327840: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda-11.6/lib64/


In [8]:
df_p = df.copy()
df_p['biobert_embedding'] = df_p['variable_description'].apply(lambda sentence: biobert.sentence_vector(sentence))

In [9]:
df_p

Unnamed: 0,variable_description,concept,study,biobert_embedding
0,Atrial fibrillation,AFIB,ARIC,"[tensor(0.1640), tensor(0.1292), tensor(0.2356..."
1,Incident AF,AFIB,ARIC,"[tensor(-0.0823), tensor(-0.1281), tensor(-0.3..."
2,time to incident af,AFIB,ARIC,"[tensor(0.2017), tensor(-0.1166), tensor(-0.38..."
3,DAYS SINCE EXAM 1,AFIB,ARIC,"[tensor(0.4667), tensor(-0.3528), tensor(-0.15..."
4,ATRIAL FIBRILLATION,AFIB,Framingham,"[tensor(0.1640), tensor(0.1292), tensor(0.2356..."
...,...,...,...,...
944,TRIG,TRIG,,"[tensor(0.1032), tensor(0.0132), tensor(0.0905..."
945,VALVDIS,VALVDIS,,"[tensor(0.4729), tensor(-0.0575), tensor(-0.09..."
946,VEGETABLES,VEGETABLES,,"[tensor(0.3033), tensor(-0.2493), tensor(-0.44..."
947,VISDAY,VISDAY,,"[tensor(0.2013), tensor(-0.2242), tensor(-0.23..."


In [10]:
df_p.to_pickle("/mnt/workspace/data/pickle/predictor_data.pkl")

In [11]:
df.study.value_counts()

Framingham    409
ARIC          315
MESA          161
None           64
Name: study, dtype: int64

In [12]:
# Create first dataframe that consists of all 1s. This will maximise all the matches.
concept_map = {}
for concept in df['concept'].unique():
    concept_map[concept] = df[df['concept'] == concept][['variable_description', 'study']]

variable_name_list = []
resample_variable_name_list = []
variable_concept_name = []
resample_variable_concept_name = []
variable_study_name = []
resample_variable_study_name = []

for concept in concept_map:
    n = concept_map[concept].shape[0]
    for i in range(n):
        count = 0
        for j in range(i+1,n):
            if concept_map[concept].iloc[i]['variable_description'] != concept_map[concept].iloc[j]['variable_description']:
                variable_name_list.append(concept_map[concept].iloc[i]['variable_description'])
                variable_concept_name.append(concept)
                variable_study_name.append(concept_map[concept].iloc[i]['study'])
                resample_variable_name_list.append(concept_map[concept].iloc[j]['variable_description'])
                resample_variable_concept_name.append(concept)
                resample_variable_study_name.append(concept_map[concept].iloc[j]['study'])
    
tp = pd.DataFrame({
    'variable_description_1': variable_name_list,
    'concept_name_1': variable_concept_name ,
    'study_1': variable_study_name,
    'variable_description_2': resample_variable_name_list,
    'concept_name_2': resample_variable_concept_name,
    'study_2': resample_variable_study_name
})

print(tp.shape)
tp = tp.drop_duplicates()
print(tp.shape)
tp.loc[:, 'match'] = 1
tp.reset_index(drop=True, inplace=True)

tp

(15876, 6)
(15876, 6)


Unnamed: 0,variable_description_1,concept_name_1,study_1,variable_description_2,concept_name_2,study_2,match
0,Atrial fibrillation,AFIB,ARIC,Incident AF,AFIB,ARIC,1
1,Atrial fibrillation,AFIB,ARIC,time to incident af,AFIB,ARIC,1
2,Atrial fibrillation,AFIB,ARIC,DAYS SINCE EXAM 1,AFIB,ARIC,1
3,Atrial fibrillation,AFIB,ARIC,ATRIAL FIBRILLATION,AFIB,Framingham,1
4,Atrial fibrillation,AFIB,ARIC,D205-026-ATRIAL-FIBRILLATION,AFIB,Framingham,1
...,...,...,...,...,...,...,...
15871,"Weight (pounds), Exam 8",WGT,Framingham,WEIGHT (lbs),WGT,MESA,1
15872,"Weight (pounds), Exam 8",WGT,Framingham,WGT,WGT,,1
15873,"Weight (pounds), Exam 9",WGT,Framingham,WEIGHT (lbs),WGT,MESA,1
15874,"Weight (pounds), Exam 9",WGT,Framingham,WGT,WGT,,1


In [13]:
tp['concept_name_1'].equals(tp['concept_name_2'])

True

In [14]:
df["variable_description"].isna().any()

False

In [15]:
# Create a new dataframe consisting of non-matches and attempt to get close to a 45/55 split between matches/non-matches
new_df = pd.DataFrame(columns=["variable_description", "concept_name", "study_name", "variable_description_sampled", "concept_name_sampled", "study_name_sampled", "Equal"])
concept_list_tp = df['concept'].unique()

for concept in concept_list_tp:
    # get the number of rows in tp and add 20% to account for accidental same concept matches
    num_rows_tp = round(tp[tp['concept_name_1'] == concept].shape[0] * 3)
    
    # Loop through new_df_length times
    for i in range(num_rows_tp):
        # Randomly select a row from the original dataframe that has the concept
        row = df[df['concept'] == concept].sample().iloc[0]

        # Randomly select a variable_description from the original dataframe that does not have the concept
        sampled_row = df[df['concept'] != concept].sample().iloc[0]
        
        # Get the corresponding "Concept Name" for the sampled value
        if pd.notna(sampled_row['variable_description']):
            concept_name_2 = sampled_row['concept']

        if concept_name_2 == 'Nan':
            print('Caught Nan Value')

        # Check if the concept name is equal to the original concept name
        equal = int(row["concept"] == concept_name_2)

        # Add a new row to the new dataframe
        new_df = new_df.append({"variable_description": row["variable_description"], "concept_name": row["concept"], "study_name": row["study"], "variable_description_sampled": sampled_row["variable_description"], "concept_name_sampled": concept_name_2, "study_name_sampled": sampled_row["study"], "Equal": equal}, ignore_index=True)
    
new_df

Unnamed: 0,variable_description,concept_name,study_name,variable_description_sampled,concept_name_sampled,study_name_sampled,Equal
0,Incident AF,AFIB,ARIC,TRIG,TRIG,,0
1,DAYS SINCE EXAM 1,AFIB,ARIC,MEDICATION & CONCENTRATION Q4M09A,INSULIN,ARIC,0
2,time to incident af,AFIB,ARIC,"CARROTS, COOKED:(servings/week)",VEGETABLES,Framingham,0
3,ECG-RHYTHM,AFIB,Framingham,Time to CHF or End of Year 2015 (days),HXCVD,MESA,0
4,ECG-RHYTHM,AFIB,Framingham,MEDS - ANTI-CHOL -OTHER,ANYCHOLMED,Framingham,0
...,...,...,...,...,...,...,...
47623,"Weight (pounds), Exam 4",WGT,Framingham,SYSTOLIC BP - PHYSICIAN - 2ND READING,SYSBP2,Framingham,0
47624,"Weight (pounds), Exam 1",WGT,Framingham,MEDICATION & CONCENTRATION Q4M10A,INSULIN,ARIC,0
47625,WGT,WGT,,CUP OF ICEBERG OR HEAD LETTUCE,VEGETABLES,Framingham,0
47626,WEIGHT (lbs),WGT,MESA,MEDICATION B NAME Q5A,INSULIN,ARIC,0


In [16]:
new_df.Equal.value_counts()

0    47628
Name: Equal, dtype: int64

In [17]:
new_df = new_df[new_df['Equal'] == 0]
new_df.shape

(47628, 7)

In [18]:
# Rename the columns
new_df = new_df.rename(columns={"variable_description":"variable_description_1", "concept_name": "concept_name_1", "study_name": "study_1", "variable_description_sampled":"variable_description_2" , "concept_name_sampled": "concept_name_2", "study_name_sampled": "study_2", "Equal": "match"})
# remove duplicate fields that result because the data in itself has multiple of the same mappings
tn = new_df.drop_duplicates()

In [19]:
tn.shape

(45342, 7)

In [20]:
# append the zero dataset to the one dataset
result = pd.concat([tp, tn], axis=0, ignore_index=True)
result = result.drop_duplicates()
result

Unnamed: 0,variable_description_1,concept_name_1,study_1,variable_description_2,concept_name_2,study_2,match
0,Atrial fibrillation,AFIB,ARIC,Incident AF,AFIB,ARIC,1
1,Atrial fibrillation,AFIB,ARIC,time to incident af,AFIB,ARIC,1
2,Atrial fibrillation,AFIB,ARIC,DAYS SINCE EXAM 1,AFIB,ARIC,1
3,Atrial fibrillation,AFIB,ARIC,ATRIAL FIBRILLATION,AFIB,Framingham,1
4,Atrial fibrillation,AFIB,ARIC,D205-026-ATRIAL-FIBRILLATION,AFIB,Framingham,1
...,...,...,...,...,...,...,...
61213,"Weight (pounds), Exam 4",WGT,Framingham,SYSTOLIC BP - PHYSICIAN - 2ND READING,SYSBP2,Framingham,0
61214,"Weight (pounds), Exam 1",WGT,Framingham,MEDICATION & CONCENTRATION Q4M10A,INSULIN,ARIC,0
61215,WGT,WGT,,CUP OF ICEBERG OR HEAD LETTUCE,VEGETABLES,Framingham,0
61216,WEIGHT (lbs),WGT,MESA,MEDICATION B NAME Q5A,INSULIN,ARIC,0


In [21]:
result['concept_name_1'].equals(result['concept_name_2'])

False

In [22]:
final_data = result
final_data

Unnamed: 0,variable_description_1,concept_name_1,study_1,variable_description_2,concept_name_2,study_2,match
0,Atrial fibrillation,AFIB,ARIC,Incident AF,AFIB,ARIC,1
1,Atrial fibrillation,AFIB,ARIC,time to incident af,AFIB,ARIC,1
2,Atrial fibrillation,AFIB,ARIC,DAYS SINCE EXAM 1,AFIB,ARIC,1
3,Atrial fibrillation,AFIB,ARIC,ATRIAL FIBRILLATION,AFIB,Framingham,1
4,Atrial fibrillation,AFIB,ARIC,D205-026-ATRIAL-FIBRILLATION,AFIB,Framingham,1
...,...,...,...,...,...,...,...
61213,"Weight (pounds), Exam 4",WGT,Framingham,SYSTOLIC BP - PHYSICIAN - 2ND READING,SYSBP2,Framingham,0
61214,"Weight (pounds), Exam 1",WGT,Framingham,MEDICATION & CONCENTRATION Q4M10A,INSULIN,ARIC,0
61215,WGT,WGT,,CUP OF ICEBERG OR HEAD LETTUCE,VEGETABLES,Framingham,0
61216,WEIGHT (lbs),WGT,MESA,MEDICATION B NAME Q5A,INSULIN,ARIC,0


In [23]:
ratio = final_data['match'].value_counts(normalize=True)
ratio

0    0.740665
1    0.259335
Name: match, dtype: float64

### Sanity Checks

In [24]:
# check if there are atleast one value in 0 and one value in 1
g = result.groupby('concept_name_1')['match'].apply(lambda x: list(np.unique(x)))
g_list = g.tolist()
# convert values of each group into a list
groups = result.groupby('concept_name_1')['match'].apply(list)
# groups store in a new  column called listvalues
sol1 = groups.reset_index(name = 'listvalues')
# show the dataframe
sol1

Unnamed: 0,concept_name_1,listvalues
0,AFIB,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,AGE,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,ALCOHOL,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,ANYCHOLMED,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,ASPIRIN,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...
59,TRIG,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
60,VALVDIS,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
61,VEGETABLES,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
62,VISDAY,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [25]:
# count the number of times each 0 and 1 appear for that particular list
group_list = sol1['listvalues'].tolist()
unique_vals = []

for i in group_list:
    a = np.unique(np.array(i))
    unique_vals.append(a)
frequency_unique_values = []

for j in group_list:
    frequency_unique_value = [(i,j.count(i)) for i in set(j)]
    frequency_unique_values.append(frequency_unique_value)

sol2 = pd.DataFrame({
    'concept_name_1': sol1['concept_name_1'],
    'raw_values': sol1['listvalues'],
    'unique values': unique_vals,
    'frequency unique values': frequency_unique_values
})

sol2

Unnamed: 0,concept_name_1,raw_values,unique values,frequency unique values
0,AFIB,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1]","[(0, 507), (1, 171)]"
1,AGE,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1]","[(0, 501), (1, 171)]"
2,ALCOHOL,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1]","[(0, 355), (1, 120)]"
3,ANYCHOLMED,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1]","[(0, 2988), (1, 1035)]"
4,ASPIRIN,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1]","[(0, 811), (1, 276)]"
...,...,...,...,...
59,TRIG,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1]","[(0, 356), (1, 120)]"
60,VALVDIS,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1]","[(0, 165), (1, 55)]"
61,VEGETABLES,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1]","[(0, 5598), (1, 1953)]"
62,VISDAY,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1]","[(0, 312), (1, 105)]"


In [26]:
for i in range(sol2.shape[0]):
    print(sol2.loc[i, 'concept_name_1'], ' : ', sol2.loc[i, 'frequency unique values'])

AFIB  :  [(0, 507), (1, 171)]
AGE  :  [(0, 501), (1, 171)]
ALCOHOL  :  [(0, 355), (1, 120)]
ANYCHOLMED  :  [(0, 2988), (1, 1035)]
ASPIRIN  :  [(0, 811), (1, 276)]
BASE_CVD  :  [(0, 84), (1, 28)]
BASE_STROKE  :  [(0, 63), (1, 21)]
BG  :  [(0, 133), (1, 45)]
BMI  :  [(0, 358), (1, 120)]
CARSTEN  :  [(0, 135), (1, 45)]
CENSDAY  :  [(0, 30), (1, 10)]
CREAT  :  [(0, 233), (1, 78)]
CURRSMK  :  [(0, 565), (1, 190)]
DAYS_SINCE_EXAM1  :  [(0, 3), (1, 1)]
DEATH_IND  :  [(0, 18), (1, 6)]
DEATH_IND_T2  :  [(0, 3), (1, 1)]
DIAB  :  [(0, 505), (1, 171)]
DIABP  :  [(0, 83), (1, 28)]
DIABP1  :  [(0, 135), (1, 45)]
DIABP2  :  [(0, 135), (1, 45)]
EDUCLEV  :  [(0, 18), (1, 6)]
EVENT  :  [(0, 3), (1, 1)]
EVENT_DESC  :  [(0, 3), (1, 1)]
EVENT_T2_O  :  [(0, 107), (1, 36)]
EVENT_VAL_C  :  [(0, 133), (1, 45)]
FAM_INCOME  :  [(0, 30), (1, 10)]
FASTING_12HR  :  [(0, 18), (1, 6)]
FASTING_8HR  :  [(0, 18), (1, 6)]
FASTING_BG  :  [(0, 196), (1, 66)]
FH_STROKE  :  [(0, 45), (1, 15)]
FRUITS  :  [(0, 2042), (1, 703)]

### Data Repetition Checks

In [27]:
# check whether the combination of variable_description_1 and variable_description_2 is unique (no rows repeated)
a = [[final_data.loc[row, 'variable_description_1'], final_data.loc[row, 'variable_description_2']] for row in range(len(final_data))]
print(len(a))
a = np.array(a)
a.sort(axis=1)
b = np.ascontiguousarray(a).view(
    np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
)
_, inv, ct = np.unique(b, return_inverse=True, return_counts=True)
print(ct[inv] == 1)

61218
[ True  True  True ...  True  True  True]


In [28]:
pd.Series(ct[inv] == 1).value_counts()

True     48426
False    12792
dtype: int64

In [29]:
final_data['unique'] = ct[inv]
final_data.head()

Unnamed: 0,variable_description_1,concept_name_1,study_1,variable_description_2,concept_name_2,study_2,match,unique
0,Atrial fibrillation,AFIB,ARIC,Incident AF,AFIB,ARIC,1,1
1,Atrial fibrillation,AFIB,ARIC,time to incident af,AFIB,ARIC,1,1
2,Atrial fibrillation,AFIB,ARIC,DAYS SINCE EXAM 1,AFIB,ARIC,1,1
3,Atrial fibrillation,AFIB,ARIC,ATRIAL FIBRILLATION,AFIB,Framingham,1,1
4,Atrial fibrillation,AFIB,ARIC,D205-026-ATRIAL-FIBRILLATION,AFIB,Framingham,1,1


In [30]:
# check whether the combination of variable_description_1, concept_name_1, variable_description_2, concept_name_2 is unique (no rows repeated)
a = [[final_data.loc[row, 'variable_description_1'], final_data.loc[row, 'concept_name_1'], final_data.loc[row, 'variable_description_2'], final_data.loc[row, 'concept_name_2']] for row in range(len(final_data))]
print(len(a))
a = np.array(a)
a.sort(axis=1)
b = np.ascontiguousarray(a).view(
    np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
)
_, inv, ct = np.unique(b, return_inverse=True, return_counts=True)
print(ct[inv] == 1)

61218
[ True  True  True ...  True  True  True]


In [31]:
pd.Series(ct[inv] == 1).value_counts()

True     58890
False     2328
dtype: int64

In [32]:
pd.Series(ct[inv]).value_counts()

1    58890
2     2316
3       12
dtype: int64

In [33]:
final_data['unique_all'] = ct[inv]
final_data.head()

Unnamed: 0,variable_description_1,concept_name_1,study_1,variable_description_2,concept_name_2,study_2,match,unique,unique_all
0,Atrial fibrillation,AFIB,ARIC,Incident AF,AFIB,ARIC,1,1,1
1,Atrial fibrillation,AFIB,ARIC,time to incident af,AFIB,ARIC,1,1,1
2,Atrial fibrillation,AFIB,ARIC,DAYS SINCE EXAM 1,AFIB,ARIC,1,1,1
3,Atrial fibrillation,AFIB,ARIC,ATRIAL FIBRILLATION,AFIB,Framingham,1,1,1
4,Atrial fibrillation,AFIB,ARIC,D205-026-ATRIAL-FIBRILLATION,AFIB,Framingham,1,1,1


In [34]:
final_data[final_data['unique_all'] != 1]

Unnamed: 0,variable_description_1,concept_name_1,study_1,variable_description_2,concept_name_2,study_2,match,unique,unique_all
15885,Atrial Fibrillation Diagnosis (via ICD10 Code),AFIB,MESA,"Age (years), Exam 8",AGE,Framingham,0,2,2
15929,"Time between exam visits, in days",AFIB,MESA,FIBRATES,ANYCHOLMED,MESA,0,5,2
15930,ATRIAL FIBRILLATION / FLUTTER BY NOVACODE,AFIB,MESA,MEDICATION & CONCENTRATION Q4M05A,INSULIN,ARIC,0,2,2
15945,Atrial Fibrillation (via ICD10 Code): Time fro...,AFIB,MESA,"SPINACH, COLLARDS, GREENS Q20",VEGETABLES,ARIC,0,2,2
15955,Incident AF,AFIB,ARIC,ONE TOMATO OR HALF CUP,VEGETABLES,Framingham,0,2,2
...,...,...,...,...,...,...,...,...,...
61093,WEIGHT TO THE NEAREST LB Q4,WGT,ARIC,Med 25: Scanned UPC med name,INSULIN,ARIC,0,2,2
61148,WEIGHT TO THE NEAREST LB Q4,WGT,ARIC,"Time between exam visits, in days",HXHRTD,MESA,0,2,2
61149,Weight (kg),WGT,ARIC,Med 2: Scanned UPC med name,INSULIN,ARIC,0,2,2
61176,WEIGHT TO THE NEAREST LB Q4,WGT,ARIC,Med 15: Therapeutic Class Code,STATIN,ARIC,0,2,2


In [35]:
final_data[final_data['unique_all'] != 1].match.value_counts()

0    2328
Name: match, dtype: int64

In [36]:
final_data.iloc[16000, 3]

'ATC CODE FOR THIRD DRUG IN COMPOUND'

In [37]:
final_data[((final_data['variable_description_1'] == 'Incident AF') & (final_data['variable_description_2'] == 'MEDICATION P NAME                 Q19A')) | ((final_data['variable_description_1'] == 'MEDICATION P NAME                 Q19A') & (final_data['variable_description_2'] == 'Incident AF'))]

Unnamed: 0,variable_description_1,concept_name_1,study_1,variable_description_2,concept_name_2,study_2,match,unique,unique_all


In [38]:
final_data = final_data[final_data['unique_all'] == 1]
final_data = final_data.drop(columns=['unique', 'unique_all'])
final_data.shape

(58890, 7)

### Save the file

In [39]:
# save the file
final_data.to_csv("/mnt/workspace/data/paired_sampled_custom_data.csv", index=False)

### Embed the data

In [40]:
df = pd.read_csv("/mnt/workspace/data/paired_sampled_custom_data.csv")
df.shape

(58890, 7)

In [41]:
df['biobert_embedding_1'] = df['variable_description_1'].apply(lambda sentence: biobert.sentence_vector(sentence))
df['biobert_embedding_2'] = df['variable_description_2'].apply(lambda sentence: biobert.sentence_vector(sentence))
df

Unnamed: 0,variable_description_1,concept_name_1,study_1,variable_description_2,concept_name_2,study_2,match,biobert_embedding_1,biobert_embedding_2
0,Atrial fibrillation,AFIB,ARIC,Incident AF,AFIB,ARIC,1,"[tensor(0.1640), tensor(0.1292), tensor(0.2356...","[tensor(-0.0823), tensor(-0.1281), tensor(-0.3..."
1,Atrial fibrillation,AFIB,ARIC,time to incident af,AFIB,ARIC,1,"[tensor(0.1640), tensor(0.1292), tensor(0.2356...","[tensor(0.2017), tensor(-0.1166), tensor(-0.38..."
2,Atrial fibrillation,AFIB,ARIC,DAYS SINCE EXAM 1,AFIB,ARIC,1,"[tensor(0.1640), tensor(0.1292), tensor(0.2356...","[tensor(0.4667), tensor(-0.3528), tensor(-0.15..."
3,Atrial fibrillation,AFIB,ARIC,ATRIAL FIBRILLATION,AFIB,Framingham,1,"[tensor(0.1640), tensor(0.1292), tensor(0.2356...","[tensor(0.1640), tensor(0.1292), tensor(0.2356..."
4,Atrial fibrillation,AFIB,ARIC,D205-026-ATRIAL-FIBRILLATION,AFIB,Framingham,1,"[tensor(0.1640), tensor(0.1292), tensor(0.2356...","[tensor(0.4104), tensor(-0.1078), tensor(0.116..."
...,...,...,...,...,...,...,...,...,...
58885,"Weight (pounds), Exam 4",WGT,Framingham,SYSTOLIC BP - PHYSICIAN - 2ND READING,SYSBP2,Framingham,0,"[tensor(0.2072), tensor(0.1461), tensor(0.0303...","[tensor(0.3342), tensor(-0.1605), tensor(0.153..."
58886,"Weight (pounds), Exam 1",WGT,Framingham,MEDICATION & CONCENTRATION Q4M10A,INSULIN,ARIC,0,"[tensor(0.1886), tensor(0.0736), tensor(0.1184...","[tensor(0.0180), tensor(-0.2530), tensor(-0.05..."
58887,WGT,WGT,,CUP OF ICEBERG OR HEAD LETTUCE,VEGETABLES,Framingham,0,"[tensor(0.0061), tensor(-0.2963), tensor(-0.07...","[tensor(-0.0267), tensor(-0.4042), tensor(-0.1..."
58888,WEIGHT (lbs),WGT,MESA,MEDICATION B NAME Q5A,INSULIN,ARIC,0,"[tensor(-0.0515), tensor(-0.1388), tensor(0.01...","[tensor(0.2078), tensor(-0.2129), tensor(0.214..."


In [47]:
# save the embedded data as a .pkl object
df = df.reset_index(drop=False).rename(columns={'index': 'idx'})
df.to_pickle("/mnt/workspace/data/pickle/paired_embedded_data.pkl")

### Create training datasets

In [2]:
df = pd.read_pickle("/mnt/workspace/data/pickle/paired_embedded_data.pkl")
df.shape

(58890, 10)

In [3]:
df_train = df.sample(n=47000, random_state=44)
idx_train = df_train['idx'].unique()
print(len(idx_train))

47000


In [4]:
df_test = df[~df['idx'].isin(idx_train)]
idx_test = df_test['idx'].unique()
print(len(idx_test))

11890


In [5]:
list(set(idx_train).intersection(set(idx_test)))

[]

In [6]:
df_train.match.value_counts() / len(df_train) * 100

0    73.170213
1    26.829787
Name: match, dtype: float64

In [7]:
df_test.match.value_counts() / len(df_test) * 100

0    72.531539
1    27.468461
Name: match, dtype: float64

In [8]:
df_train.to_pickle("/mnt/workspace/data/pickle/paired_embedded_train_data.pkl")
df_test.to_pickle("/mnt/workspace/data/pickle/paired_embedded_test_data.pkl")