In [1]:
import pandas as pd
import os
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import bokeh
import os
import openai
from openai import OpenAI
import backoff
import numpy as np
from sklearn.cluster import KMeans
import backoff
import requests 

In [2]:
# Load the phecode definitions table. Per the preview below, each row carries a
# 'code', a free-text 'desc' column and numeric columns V1..V1000.

file = 'phecode_definitions1.2.csv'
df = pd.read_csv(filepath_or_buffer=file)

# Plain-text preview of the first five rows
print(df.head(n=5))



    code                                               desc        V1  \
0   A000  Cholera due to Vibrio cholerae 01, biovar chol...  6.751494   
1   A001    Cholera due to Vibrio cholerae 01, biovar eltor  6.357161   
2   A009                               Cholera, unspecified  1.358162   
3  A0100                         Typhoid fever, unspecified  6.660007   
4  A0101                                 Typhoid meningitis -0.423517   

         V2        V3         V4         V5         V6        V7         V8  \
0 -9.367534  0.946792  -0.417494  -6.610719  -5.803976  4.901303  17.314949   
1 -8.226274  0.466905  -0.155222  -7.021581  -5.988095  4.512167  16.899925   
2  7.949538 -3.219061  17.735281 -11.883527   3.825686  8.894238  10.111987   
3 -1.436555  3.958940   4.435757  -9.614423 -17.550453  2.361521  14.385811   
4 -9.884587  5.500570   5.316119  -9.436379  -4.031093  4.294637  10.534754   

   ...      V991       V992       V993       V994       V995       V996  \
0  ...  6.6

In [3]:
# Show the loaded DataFrame as the cell's output (the rendered table is truncated
# by pandas, as the '...' rows/columns in the output indicate).
df

Unnamed: 0,code,desc,V1,V2,V3,V4,V5,V6,V7,V8,...,V991,V992,V993,V994,V995,V996,V997,V998,V999,V1000
0,A000,"Cholera due to Vibrio cholerae 01, biovar chol...",6.751494,-9.367534,0.946792,-0.417494,-6.610719,-5.803976,4.901303,17.314949,...,6.678226,-3.577215,10.398252,12.429912,-14.509845,-8.267488,12.233472,-1.180893,-15.889337,7.507463
1,A001,"Cholera due to Vibrio cholerae 01, biovar eltor",6.357161,-8.226274,0.466905,-0.155222,-7.021581,-5.988095,4.512167,16.899925,...,6.407214,-2.095692,9.208505,12.745779,-14.219962,-8.189637,12.772043,-1.471267,-15.656559,7.080588
2,A009,"Cholera, unspecified",1.358162,7.949538,-3.219061,17.735281,-11.883527,3.825686,8.894238,10.111987,...,7.626040,7.197934,-1.783137,12.962646,-4.951687,-3.298330,-2.842717,-13.370962,-8.831286,11.067048
3,A0100,"Typhoid fever, unspecified",6.660007,-1.436555,3.958940,4.435757,-9.614423,-17.550453,2.361521,14.385811,...,-1.434052,-14.457172,2.433803,14.079353,-17.832670,-10.446936,9.236184,6.195607,-0.361466,17.960640
4,A0101,Typhoid meningitis,-0.423517,-9.884587,5.500570,5.316119,-9.436379,-4.031093,4.294637,10.534754,...,-1.171579,-3.829593,1.782171,13.485106,-8.543977,-6.823440,5.657843,2.142302,-12.782821,14.822734
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72745,Z9981,Dependence on supplemental oxygen,13.006083,2.561383,4.902829,19.406019,-7.360718,-4.099954,3.660315,9.042940,...,5.510718,3.156777,0.963714,6.968931,-5.211675,-12.235018,-11.902325,-0.522480,-12.369491,9.650954
72746,Z9989,Dependence on other enabling machines and devices,2.486253,6.834556,1.008566,29.345808,-8.330998,31.544598,10.595522,-1.860485,...,18.430723,12.717903,-11.426613,7.517748,10.086684,2.336244,-34.155296,-25.693632,-12.438920,2.885945
72747,U070,Vaping-related disorder,2.385478,7.763225,-0.055976,31.533873,-10.729116,25.868855,7.149197,-1.838361,...,19.027412,11.371456,-8.908945,4.630046,9.390279,2.654673,-29.656097,-24.912519,-15.096435,-1.085403
72748,U071,COVID-19,4.063068,12.206376,-3.824706,31.331560,-9.118613,30.780144,11.287815,-1.461173,...,22.628250,18.745651,-9.407323,6.929297,12.659636,8.924576,-37.658791,-31.318447,-12.418530,0.935496


In [4]:
# Configure the OpenAI client with the key stored in the OPENAI_API_KEY
# environment variable (the value is None when the variable is not set).

openai.api_key = os.getenv("OPENAI_API_KEY")

In [5]:
# Embedding helper: calls the OpenAI embeddings endpoint and retries transient failures

# Errors treated as transient and therefore retried. The openai>=1.0 client (the one
# that exposes `openai.embeddings.create`) raises its own exception classes, not
# `requests` exceptions, so those must be listed for the retry to ever trigger.
# `requests.exceptions.RequestException` is kept for backward compatibility.
_RETRYABLE_ERRORS = (
    openai.RateLimitError,
    openai.APIConnectionError,  # also covers its subclass openai.APITimeoutError
    openai.InternalServerError,
    requests.exceptions.RequestException,
)


@backoff.on_exception(backoff.expo, _RETRYABLE_ERRORS, max_tries=8)
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for a single piece of text.

    Parameters
    ----------
    text : str
        Text to embed. Newlines are replaced with spaces before the request.
    model : str, optional
        Name of the OpenAI embedding model to use.

    Returns
    -------
    The ``embedding`` field of the first item in the API response.

    Raises
    ------
    Any of ``_RETRYABLE_ERRORS`` if the call still fails after 8 attempts with
    exponential backoff; other exceptions propagate immediately without retry.
    """
    text = text.replace("\n", " ")
    try:
        return openai.embeddings.create(input=[text], model=model).data[0].embedding
    except _RETRYABLE_ERRORS as e:
        print(f"An error occurred: {e}")
        raise  # Re-raise the exception to trigger backoff

        

In [6]:
# Smoke test: embed a single row before running the whole table

# The loaded table has no 'phenotype' column (its columns are code, desc, V1..V1000);
# the text descriptions are in 'desc'.
first_row_embedding = get_embedding(df['desc'].iloc[0])

# Create the 'embeddings' column filled with None (object dtype), so that an
# individual cell can hold a whole embedding list
df['embeddings'] = None

# Store the embedding on the first row. Use the first index label rather than a
# literal 0 so this targets the same row as .iloc[0] above.
df.at[df.index[0], 'embeddings'] = first_row_embedding

In [7]:

# Embed every description and store the vectors in a new 'embeddings' column.
# The text lives in 'desc' (the loaded table has no 'phenotype' column).
# get_embedding is called once per row, i.e. one API request per description.
df['embeddings'] = df['desc'].apply(get_embedding)


In [8]:
# Persist the DataFrame (including the 'embeddings' column) to a pickle file so the
# embeddings can be reloaded later for the t-SNE analysis.
df.to_pickle(path='phecode_gpt_embedding.pkl')
