# Create Ontology Recommendations

This notebook makes use of the precomputed embeddings for the RP1 codebook and the NCIT ontologies to generate recommended ontologies to map study variables to. See https://web.csag.uct.ac.za/hub/hub/user-redirect/lab/tree/heat_center/code/data_harmonisation/health_ontologies/03_recommendation_engine_workflow.ipynb for an annotated script.

In [1]:
import json
import os
import time
from pathlib import Path

import fsspec
import numpy as np
import openai
import pandas as pd
from openai.embeddings_utils import get_embedding

In [2]:
from scipy import spatial

def calculate_cosine_similarity(embedding1, embedding2):
    """Return the cosine distance between two embedding vectors.

    Despite the function's name, the value is exactly what
    ``scipy.spatial.distance.cosine`` returns, i.e. a *distance*
    (1 - cosine similarity): smaller values mean the vectors are more alike.

    Parameters
    ----------
    embedding1, embedding2 : array-like
        The two 1-D vectors to compare.

    Returns
    -------
    float
        Cosine distance between ``embedding1`` and ``embedding2``.
    """
    return spatial.distance.cosine(embedding1, embedding2)

You will need an OpenAI API key to use this code.
Once you have a key and an organisation name, set the environment variables by running the following commands in your terminal:

```bash
export OPENAI_API_KEY=YOUR_API_KEY
export OPENAI_ORG_NAME=YOUR_ORG_Name
```

In [12]:
# Model passed to get_embedding for every study variable.
# NOTE(review): presumably the same model that produced the precomputed
# codebook embeddings; distances are only meaningful if so -- confirm.
embedding_model = "text-embedding-ada-002"

# Credentials are read from the environment so that no secret is stored in
# the notebook itself.
OpenAI_api_key = os.getenv("OPENAI_API_KEY")
# NOTE(review): the organisation name is read here but nothing in this
# notebook passes it on to the openai client.
OpenAI_org_name = os.getenv("OPENAI_ORG_NAME")

# Fail here with a clear message rather than part-way through the embedding
# loop, where a missing key would otherwise surface as an API error.
if not OpenAI_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment that starts "
        "this notebook's kernel before running the notebook."
    )
openai.api_key = OpenAI_api_key

In [4]:
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook; the option's default value is 'warn'.
pd.set_option("mode.chained_assignment", None)

In [5]:
# Filesystem handle used by the cells below to list the data directory
# (fs.ls) and to test whether metadata files exist (fs.exists).
# NOTE(review): the empty protocol string presumably resolves to fsspec's
# default local filesystem -- confirm against the fsspec documentation.
fs = fsspec.filesystem("")

Get the available studies:

In [6]:
# Every entry directly under ../data/ is a candidate study; keep the last
# path component and skip hidden entries (names starting with a dot).
avail_studies = [
    entry_name
    for entry_name in (listed_path.rsplit('/', 1)[-1] for listed_path in fs.ls('../data/'))
    if entry_name[0] != '.'
]

Keep only the studies that already have a `variables.csv` file, in case we have yet to create one for a particular study:

In [7]:
# Drop any study whose metadata/variables.csv has not been created yet.
avail_studies = [
    candidate
    for candidate in avail_studies
    if fs.exists(f'../data/{candidate}/metadata/variables.csv')
]

In [13]:
def _load_codebook(path='../codebook/codebook_with_embeddings.csv'):
    """Read the codebook CSV and turn its stored embeddings into numpy arrays.

    Returns the codebook DataFrame with the
    "Variable Name (EFO,SNOMED,NCIT)" column renamed to "Name" and every
    cell of the "embeddings" column converted from its string form to a
    numpy array.
    """
    codebook = pd.read_csv(path)
    # NOTE(security): eval() executes whatever Python expression is stored in
    # each CSV cell, so only run this against a codebook file you trust.
    # If the cells are plain list literals, ast.literal_eval or json.loads
    # would parse them without executing code -- TODO confirm and switch.
    codebook["embeddings"] = codebook.embeddings.apply(eval).apply(np.array)  # convert string to numpy array
    return codebook.rename(columns={"Variable Name (EFO,SNOMED,NCIT)": "Name"})


def _rank_codebook(study_embedding, codebook):
    """Return the distance from one study embedding to every codebook row, nearest first."""
    codebook_distances = codebook.embeddings.apply(
        lambda x: calculate_cosine_similarity(study_embedding, x)
    )
    # A stable sort keeps tied rows in codebook order, so the ranking does not
    # depend on which variables or studies were processed earlier.
    return codebook_distances.sort_values(kind="mergesort")


# Studies that already have a recommendations file are skipped, so re-running
# this cell does not repeat the embedding requests for them.
pending_studies = [
    study
    for study in avail_studies
    if not fs.exists(f'../data/{study}/metadata/variables_with_recomendations.csv')
]

# The codebook is identical for every study: load and parse it once, and only
# when there is at least one study left to process.
df_codebook = _load_codebook() if pending_studies else None

for study in pending_studies:
    print(study)
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the frame returned by read_csv.
    df = pd.read_csv(f'../data/{study}/metadata/variables.csv')[['var','description']].copy()

    # Get vector embeddings for the study: embed the description when it is a
    # string, otherwise fall back to the variable name.
    df['name_to_embed'] = [
        description if isinstance(description, str) else var_name
        for var_name, description in zip(df['var'], df['description'])
    ]
    df["embeddings"] = df.name_to_embed.apply(lambda x: get_embedding(x, engine=embedding_model))

    recomendations = []
    distances = []
    url_reformatted = []
    for study_var in df['embeddings']:
        ranked_distances = _rank_codebook(study_var, df_codebook)
        ranked_rows = ranked_distances.index

        # Names and distances cover the whole codebook, nearest first.
        recomendations.append(list(df_codebook['Name'].loc[ranked_rows]))
        distances.append(list(ranked_distances))

        # Only the three nearest ontology codes are kept; ':' is replaced by
        # '_' to match the NCIT.owl format.
        nearest_codes = df_codebook['Ontology Code'].loc[ranked_rows[:3]]
        url_reformatted.append([code.replace(':', '_') for code in nearest_codes])

    df['codebook_recomendations'] = recomendations
    df['codebook_distances'] = distances
    df['codebook_urls'] = url_reformatted #to match NCIT.owl format

    # Save the result next to the study's variables.csv.
    df.to_csv(f'../data/{study}/metadata/variables_with_recomendations.csv')

    del df

# Release the codebook (and its parsed embeddings) once all studies are done.
del df_codebook

Europe_CH_SIB
