In [0]:
%pip install featexp
%pip install transformers torch

In [0]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report
from pyspark.sql.functions import datediff, col
from pyspark.sql.functions import col, explode_outer, lpad, regexp_replace
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import posexplode, concat, lit
import seaborn as sns


# Remove pandas' cap on the number of columns rendered, so wide frames show every column.
pd.options.display.max_columns = None

In [0]:
# Read the modelling input into a Spark DataFrame: rows from `diagnosis` flagged
# admit_diagnosis_ind = 'Y', joined to `enrollment` (gender, year of birth) and to
# `procedure` (line_charge, summed into `paid_amt`), grouped by every non-aggregated
# selected column (`group by all`).
# NOTE(review): both joins match on patient_id only, so each diagnosis row is paired with
# every enrollment row and every procedure row of that patient. The sum therefore runs over
# all joined procedure rows for the patient (repeated per matching enrollment row), not just
# the lines of this claim. Presumably the procedure join should also match on claim_id and
# enrollment should be reduced to one row per patient — TODO confirm against the table schemas.
# NOTE(review): `paid_amt` is a sum of `line_charge`; confirm that charges (rather than a
# paid/allowed amount) are what is intended.
raw=spark.sql(""" select diagnosis.claim_id,diagnosis.patient_id,diagnosis.date_service,diagnosis.date_service_end,diagnosis_code,enrollment.patient_gender,patient_year_of_birth,sum(procedure.line_charge) as paid_amt 
from healthverity_claims_sample_patient_dataset.hv_claims_sample.diagnosis 
inner join healthverity_claims_sample_patient_dataset.hv_claims_sample.enrollment on diagnosis.patient_id = enrollment.patient_id
inner join healthverity_claims_sample_patient_dataset.hv_claims_sample.procedure on diagnosis.patient_id = procedure.patient_id
where admit_diagnosis_ind='Y'
group by all""")
# Rename the service dates and the diagnosis code to the column names the later cells use
# (admission_dt, discharge_dt, diag_cd).
raw = raw.withColumnRenamed("date_service", "admission_dt").withColumnRenamed("date_service_end", "discharge_dt").withColumnRenamed("diagnosis_code","diag_cd")
# Show the resulting frame in the notebook output.
display(raw)

In [0]:
from transformers import AutoTokenizer, AutoModel
import torch

# Pretrained BioBERT checkpoint; the tokenizer and the encoder are both loaded from the same id.
BIOBERT_CHECKPOINT = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(BIOBERT_CHECKPOINT)
model = AutoModel.from_pretrained(BIOBERT_CHECKPOINT)

In [0]:
# ICD-10-CM diagnosis code (written without the decimal point) -> description text.
# The description is the text that gets embedded for the code, so it has to describe the
# code it is keyed by; several entries previously carried the description of a different
# code (e.g. Z43.2 is ileostomy, not gastrostomy) and have been aligned with the ICD-10-CM titles.
code_desc = {
    "S066X0D": "Traumatic subarachnoid hemorrhage without loss of consciousness, subsequent encounter",
    "F1120": "Opioid dependence, uncomplicated",
    "Z3483": "Encounter for supervision of other normal pregnancy, third trimester",
    "K440": "Diaphragmatic hernia with obstruction, without gangrene",
    "M75102": "Unspecified rotator cuff tear or rupture of left shoulder, not specified as traumatic",
    "I69354": "Hemiplegia and hemiparesis following cerebral infarction affecting left non-dominant side",
    "L97929": "Non-pressure chronic ulcer of unspecified part of left lower leg with unspecified severity",
    "R531": "Weakness",
    "K5720": "Diverticulitis of large intestine with perforation and abscess without bleeding",
    "I214": "Non-ST elevation (NSTEMI) myocardial infarction",
    "K9189": "Other postprocedural complications and disorders of digestive system",
    "I495": "Sick sinus syndrome",
    "S82122D": "Displaced fracture of lateral condyle of left tibia, subsequent encounter for closed fracture with routine healing",
    "T85730A": "Infection and inflammatory reaction due to ventricular intracranial (communicating) shunt, initial encounter",
    "M238X2": "Other internal derangements of left knee",
    "R45851": "Suicidal ideations",
    "G9341": "Metabolic encephalopathy",
    "K8510": "Biliary acute pancreatitis without necrosis or infection",
    "R509": "Fever, unspecified",
    "Z432": "Encounter for attention to ileostomy",
}

In [0]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Re-instantiates the same BioBERT tokenizer/encoder pair that the earlier
# "Load BioBERT" cell already assigns to `tokenizer` and `model`.
biobert_model_id = "dmis-lab/biobert-base-cased-v1.1"
tokenizer, model = (
    AutoTokenizer.from_pretrained(biobert_model_id),
    AutoModel.from_pretrained(biobert_model_id),
)

def get_embedding(text):
    """Embed a piece of text with the module-level BioBERT ``tokenizer``/``model``.

    The text is tokenized (truncated to at most 64 tokens) and run through the
    model without gradient tracking. The last-layer hidden state at position 0
    (the first token of the sequence) is used as the embedding.

    Args:
        text: The text to embed, e.g. a diagnosis-code description.

    Returns:
        numpy.ndarray: for a single input string, a 1-D array whose length is
        the model's hidden size.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=64)
    with torch.no_grad():  # inference only, no autograd graph needed
        outputs = model(**inputs)
        # Take position 0 of every sequence, then drop only the batch axis
        # (a bare squeeze() would also collapse any other size-1 axis).
        first_token_state = outputs.last_hidden_state[:, 0, :].squeeze(0)
    return first_token_state.cpu().numpy()

# Embed every description once, up front, and key the resulting vector by its diagnosis code.
dx_embeddings = dict(zip(code_desc.keys(), map(get_embedding, code_desc.values())))

# Length of a single embedding vector, read off the first stored entry.
embedding_size = tuple(dx_embeddings.values())[0].shape[0]

In [0]:
def codes_to_vec(codes_str):
    """Turn a '|'-separated string of diagnosis codes into one embedding vector.

    Each code is looked up in the module-level ``dx_embeddings``; the vectors of
    the codes that are found are averaged element-wise.

    Args:
        codes_str: One or more diagnosis codes separated by ``|``. Whitespace
            around each code is ignored. Non-string values (``None``, ``NaN``)
            are treated as "no codes".

    Returns:
        numpy.ndarray: the mean of the matching embeddings, or a zero vector of
        length ``embedding_size`` when no code is known or the value is missing.
    """
    # A missing diagnosis arrives from pandas as None/NaN, which has no .split().
    if not isinstance(codes_str, str):
        return np.zeros(embedding_size)

    codes = (code.strip() for code in codes_str.split('|'))
    vecs = [dx_embeddings[code] for code in codes if code in dx_embeddings]
    if vecs:
        return np.mean(vecs, axis=0)
    # Fallback: none of the codes has a precomputed embedding.
    return np.zeros(embedding_size)

# Bring the Spark result into pandas for feature engineering.
df = raw.toPandas()
df['admission_dt'] = pd.to_datetime(df['admission_dt'])
df['discharge_dt'] = pd.to_datetime(df['discharge_dt'])

# Feature engineering: length of stay in whole days between admission and discharge.
# The difference is deliberately not cast with .astype('int'): a missing admission or
# discharge date yields NaN here, and casting NaN to int raises, which would also make
# the null count below impossible to be anything but 0. Without the cast the column is
# int64 when every date is present and float64 (with NaN) otherwise.
df['length_of_stay'] = (df['discharge_dt'] - df['admission_dt']).dt.days
print("Number of null length_of_stay values:", df['length_of_stay'].isnull().sum())

# One embedding vector per row, derived from the row's diagnosis code string.
df['dx_vec'] = df['diag_cd'].apply(codes_to_vec)
# Rows whose diagnosis code is exactly F1120, kept for the similarity check further down.
df1 = df[df['diag_cd'] == 'F1120']

In [0]:
# Assemble the model input matrix: column 0 is length_of_stay, the remaining columns are
# the diagnosis embedding. Built column-wise rather than row-by-row with iterrows(),
# which is slow on large frames and re-boxes every row into a Series.
los_column = df['length_of_stay'].astype('float64').to_numpy().reshape(-1, 1)
dx_matrix = np.vstack(df['dx_vec'].tolist())
feature_matrix = np.hstack([los_column, dx_matrix])
# Report the shape instead of dumping the whole matrix into the cell output.
print(feature_matrix.shape)  # (num_claims, 1 + embedding_size)

In [0]:
from numpy import dot
from numpy.linalg import norm

# Cosine similarity between the embeddings of the first two rows of df1.
# NOTE(review): df1 only holds rows whose diag_cd is 'F1120', and dx_vec is derived from
# diag_cd alone, so both vectors come from the same code and the result is expected to be
# ~1.0. Compare rows with different codes if a meaningful similarity is wanted.
if len(df1) < 2:
    # Fewer than two matching claims: there is no pair to compare.
    cos_sim = float('nan')
    print('Cosine similarity: need at least two F1120 rows, found', len(df1))
else:
    vec1 = df1.iloc[0]['dx_vec']
    vec2 = df1.iloc[1]['dx_vec']
    norm_product = norm(vec1) * norm(vec2)
    # A zero vector (code without an embedding) has no direction; avoid dividing by zero.
    cos_sim = dot(vec1, vec2) / norm_product if norm_product else float('nan')
    print('Cosine similarity:', cos_sim)