# Create Ontology Recommendations

This notebook does two things:

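- 1: fetch vector embeddings for the target variables and the input dataset variables, using their descriptions (or the variable name when a description is missing).
- 2: calculate the cosine distance (1 − cosine similarity) from each dataset variable to each target variable. This is used to order the target-variable recommendations for each dataset variable, closest first.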

In [1]:
import ast
import os
from pathlib import Path

import fsspec
import numpy as np
import openai
import pandas as pd
from openai.embeddings_utils import get_embedding

In [20]:
from scipy import spatial

def _parse_embedding(embedding):
    """Return `embedding` as a sequence of numbers, parsing it first if it is text."""
    if isinstance(embedding, str):
        # literal_eval only accepts Python literals, so text loaded from a CSV
        # can no longer run arbitrary code the way eval() allowed.
        return ast.literal_eval(embedding)
    return embedding


def calculate_cosine_similarity(embedding1, embedding2):
    """Return the cosine distance between two embedding vectors.

    Despite the function name, the value returned is SciPy's cosine
    *distance* (``1 - cosine similarity``): 0 means the vectors point the
    same way and larger values mean less similar. The name is kept so that
    existing callers keep working.

    Parameters
    ----------
    embedding1, embedding2 : str or sequence of float
        Either the textual form of a list of numbers (e.g. ``"[0.1, 0.2]"``)
        or an already-parsed sequence / array of numbers.

    Returns
    -------
    float
        ``scipy.spatial.distance.cosine`` of the two vectors.

    Raises
    ------
    ValueError, SyntaxError
        If a string argument is not a plain Python literal.
    """
    similarity = spatial.distance.cosine(
        _parse_embedding(embedding1), _parse_embedding(embedding2)
    )
    return similarity

You will need to get an OpenAI key to use this code. 

In [5]:
# Embedding model passed to every get_embedding call in this notebook.
embedding_model = "text-embedding-ada-002"
# Read the key from the OPENAI_API_KEY environment variable so a real secret
# never has to be saved inside the notebook; the '####' placeholder remains
# as the fallback when the variable is not set.
OpenAI_api_key = os.environ.get('OPENAI_API_KEY', '####')
# NOTE(review): the organisation name is not referenced by any visible cell.
OpenAI_org_name = '#####'
openai.api_key = OpenAI_api_key

In [7]:
# Turn off pandas' chained-assignment check (its default setting is 'warn').
pd.set_option("mode.chained_assignment", None)

In [9]:
# Filesystem handle used by the cells below for existence checks and directory listings.
# NOTE(review): the empty protocol string presumably resolves to the local
# filesystem — confirm against the fsspec documentation.
fs = fsspec.filesystem("")

Get embeddings for target variables:

In [12]:
# Embed every target variable once and cache the result as a CSV; the
# embedding calls are skipped entirely when the cached file already exists.
if not fs.exists('output/target_variables_with_embeddings.csv'):
    df = pd.read_csv('../input/target_variables.csv')
    # to_csv does not create missing directories, so make sure the cache
    # folder exists before spending time on embedding requests.
    fs.makedirs('output', exist_ok=True)
    # Embed the description when it is text; otherwise (e.g. a missing value)
    # fall back to the variable name.
    name_to_embed = [
        description if isinstance(description, str) else variable_name
        for description, variable_name in zip(df['description'], df['variable_name'])
    ]
    df['name_to_embed'] = name_to_embed
    df["embeddings"] = df['name_to_embed'].apply(lambda x: get_embedding(x, engine=embedding_model))
    df.to_csv('output/target_variables_with_embeddings.csv', index=False)
# TODO: when the cached file exists, check that its descriptions still match
# ../input/target_variables.csv and re-embed if they have changed.

Get available studies:

In [14]:
# Each sub-directory of ../input/ is treated as one study.
study_dirs = [entry for entry in fs.ls('../input/') if fs.isdir(entry)]
# Keep only the final path component of each directory ...
study_names = [path.split('/')[-1] for path in study_dirs]
# ... and leave out hidden folders (names starting with a dot).
avail_studies = [name for name in study_names if name[0] != '.']
avail_studies

['Europe_CH_SIB', 'H3Africa']

Get embeddings for the dataset_variables

In [15]:
# Embed the variables of every study once and cache each result as a CSV;
# studies whose cached file already exists are skipped.
for study in avail_studies:
    if not fs.exists(f'output/{study}_variables_with_embeddings.csv'):
        df = pd.read_csv(f'../input/{study}/dataset_variables.csv')
        # to_csv does not create missing directories, so make sure the cache
        # folder exists before spending time on embedding requests.
        fs.makedirs('output', exist_ok=True)
        # Embed the description when it is text; otherwise (e.g. a missing
        # value) fall back to the variable name.
        name_to_embed = [
            description if isinstance(description, str) else variable_name
            for description, variable_name in zip(df['description'], df['variable_name'])
        ]
        df['name_to_embed'] = name_to_embed
        df["embeddings"] = df['name_to_embed'].apply(lambda x: get_embedding(x, engine=embedding_model))
        df.to_csv(f'output/{study}_variables_with_embeddings.csv', index=False)
    # TODO: when the cached file exists, check that its descriptions still
    # match the study's dataset_variables.csv and re-embed if they have changed.

Get cosine distances and create recommendations:

In [21]:
# For every study, rank all target variables by cosine distance to each study
# variable and cache the ranking as a CSV; studies whose recommendations file
# already exists are skipped.
for study in avail_studies:
    if not fs.exists(f'output/{study}_variables_with_recommendations.csv'):
        study_df = pd.read_csv(f'output/{study}_variables_with_embeddings.csv')
        target_df = pd.read_csv('output/target_variables_with_embeddings.csv')

        # Embeddings come back from the CSV as text. Parse each target vector
        # once per study rather than once per (study variable, target variable)
        # pair, and use literal_eval so the text is never executed as code.
        target_vectors = [ast.literal_eval(raw_vector) for raw_vector in target_df['embeddings']]
        # NOTE(review): targets without a description were embedded by their
        # variable name, so their entry here is presumably a missing value —
        # consider recommending from 'name_to_embed' instead.
        target_descriptions = target_df['description'].tolist()

        recommendations = []
        distances = []
        for raw_study_vector in study_df['embeddings']:
            study_vector = ast.literal_eval(raw_study_vector)
            study_distances = np.array(
                [spatial.distance.cosine(study_vector, target_vector) for target_vector in target_vectors],
                dtype=float,
            )
            # Closest target first; the stable sort keeps tied targets in the
            # order of the target file, so the output is reproducible.
            ranking = np.argsort(study_distances, kind='stable')
            recommendations.append([target_descriptions[position] for position in ranking])
            # tolist() yields plain Python floats, so the CSV holds bare numbers.
            distances.append(study_distances[ranking].tolist())
        study_df['target_recommendations'] = recommendations
        study_df['target_distances'] = distances
        study_df.to_csv(f'output/{study}_variables_with_recommendations.csv', index=False)
    # TODO: when the recommendations file exists, check that it still matches
    # the current study and target embeddings and rebuild it if they changed.