# embed

Notebook to explore embedding large number of text fragments using openai

In [1]:
import os
import gzip
import openai
import dotenv
import pandas as pd
from pathlib import Path
from openai import OpenAI 
from loguru import logger

dotenv.load_dotenv("../.env")

api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(
    api_key=api_key
)

def get_embeddings(texts):
    client = openai.OpenAI()  # Create an OpenAI client
    model = "text-embedding-ada-002"  # Use the latest embedding model
    response = client.embeddings.create(
        input=texts,
        model=model
    )
    # embeddings = [data['embedding'] for data in response['data']]
    return response.model_dump()['data'] # type: ignore 

In [2]:
#
# get the data set
#
signatures_path = Path("../data/sigs.csv")
df = pd.read_csv(signatures_path)
logger.info("loaded signatures data: {} rows {} columns", df.shape[0], df.shape[1])
old_name = df.columns[0]
df.rename(columns={old_name:"cell_id"}, inplace=True)

[32m2024-06-19 16:38:12.257[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mloaded signatures data: 9370 rows 3 columns[0m


In [3]:
def batch_generator(lst, batch_size=500):
    """
    Generator that yields batches of a given size from a list.
    
    :param lst: List of arbitrary length.
    :param batch_size: Number of elements in each batch (default is 500).
    """
    for i in range(0, len(lst), batch_size):
        yield lst[i:i + batch_size]


In [4]:
s = df.signature
print(len(s))

9370


In [5]:

print(df.index[0], df.signature[0])
print(df.shape)

0 RPL11 CEP350 GNLY PTPN4 SMARCA5 KIAA0825 ORC5 SARAF PDCD4 ABLIM1 FNBP4 SLC38A1 ZC3H13 CTDSPL2 NF1
(9370, 3)


In [6]:
import requests
import json
def get_embedding(input):
    if not isinstance(input, str):
        return []
    try:
        response = requests.post(
            'https://api.openai.com/v1/embeddings',
            headers={'Authorization': f'Bearer {openai.api_key}'},
            json={'input': input, 'model': 'text-embedding-ada-002'}
        )
        return response.json()['data'][0]['embedding']
    except Exception as e:
        logger.error("error reading embedding from openai ({})", e)
        return []


In [7]:
import numpy as np

edf = df
edf['embedding'] = ''

for i in range(df.shape[0]):
    try:
        e = get_embedding(edf.signature[i])
        edf.embedding[i] = json.dumps(e)
    except Exception as e:
        logger.error("ran into an exception at item {} (exeption {})", i, e)
edf.head()

[32m2024-06-19 16:38:35.115[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mget_embedding[0m:[36m14[0m - [31m[1merror reading embedding from openai ('data')[0m
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edf.embedding[i] = json.dumps(e)
[32m2024-06-19 16:38:35.245[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mget_embedding[0m:[36m14[0m - [31m[1merror reading embedding from openai ('data')[0m
[32m2024-06-19 16:38:35.383[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mget_embedding[0m:[36m14[0m - [31m[1merror reading embedding from openai ('data')[0m
[32m2024-06-19 16:38:35.518[0m | [31m[1mERROR   [0m | [36m__main__[0m:[36mget_embedding[0m:[36m14[0m - [31m[1merror reading embedding from openai ('data')[0m
[32m2024-06-19 16:38:35.650[0m | [31m[1mERROR   [0m | [36m__main_

KeyboardInterrupt: 

In [82]:
df

Unnamed: 0,cell_id,cluster,signature,embedding
0,TGACCAAGTAGACAAA,0,RPL11 CEP350 GNLY PTPN4 SMARCA5 KIAA0825 ORC5 ...,"[-0.021236828, 0.0102644665, 0.017603861, -0.0..."
1,CGGCCATAGCGAGCGA,0,CDKAL1 RPS13 ITFG1 LINC01934 APPL1 TRA2B ARL15...,"[-0.024834119, 0.0010850973, -0.011615959, -0...."
2,AACCCGCAGCATGTTA,0,UBR5 SUMF1 SPATA5 PPP2R5C ITGA4 WWP1 PHF20L1 C...,"[-0.014364182, 0.00015998675, -0.0018439151, -..."
3,AATCATCCAGTTTACG,0,RIPOR2 ATXN1 HIBADH JAZF1 PDE3B ARID1A TUT4 TR...,"[-0.0128802, -0.0067781447, -0.0024146063, -0...."
4,TGAAGGATCGTTACTT,0,RPS7 RTN4 GNLY GSK3B RAB28 RPL34 FNIP1 ARID1B ...,"[-0.004522995, -0.011238755, 0.014591605, -0.0..."
...,...,...,...,...
9365,TTACGTTTCCCTGACT,18,FOXP1 NCOA2 SMAP2 TET2 DOCK2 RIPOR2 NAMPT JMJD...,
9366,CTAACCTGTAAACAAG,18,USP24 CTSS LYST KANSL1L NAMPT RPL41 PPP1R12A B...,
9367,AATTGCCAGAGAGCCG,18,RNF13 LYN MTSS1 CELF2 NEAT1 PHF20 MIS18BP1 PAF...,
9368,CTGTATTTCCATAATG,18,LYN NEAT1 ATP2B1 PRKCB DPYD SCLT1 MAP3K1 CD44 ...,


In [55]:
for cell_no in range(df.shape[0]):
    if cell_no % 1000 == 0:
        print(cell_no, df.signature[cell_no])


# sum =0
# count = 0
# for sig in s:
#     if isinstance(sig,str):
#         sum += len(sig)
#     count += 1
# print(sum, count)

0 RPL11 CEP350 GNLY PTPN4 SMARCA5 KIAA0825 ORC5 SARAF PDCD4 ABLIM1 FNBP4 SLC38A1 ZC3H13 CTDSPL2 NF1
1000 SP100 PTMA LCP1 EIF4G3 PRKACB HNRNPU CLK1 XRCC5 ARL4C SNRK MSL2 HNRNPDL SEC31A CWC27 SREK1 STK38 FAM126A AOAH MTSS1 DOCK8 RFX3 MEGF9 PIP4K2A SRGN ZMYM2 PCNX1 HSP90AA1 ACTG1 DIDO1 UBE2G2
2000 ANKIB1 CD53 CDC42SE2 NFX1 TNRC6C TXNIP CLASP1 TBC1D1 PARP8 FYN AOAH SYNE2 SRSF11 FAM102B C1orf21 SMYD3 UBR3 STAT4 VPS8 NEK1 DOCK2 ATXN1 RIPOR2 AKAP9 TAFA2 ZNF254 PPP1R16B
3000 CDC14A LINC00486 TBC1D5 TNIK ST6GAL1 CPEB3 ZRANB2 AL136456.1 NEK7 GCC2 FYB1 RPL37 GZMK VPS13B PABPC1 EMSY AC092821.3 PPP1R12A IL32 CYTH1 SON
4000 OGA CEP83 RPLP1 CLTC RAB10 BIRC6 SLC4A7 RAPGEF6 UBAC2 USP34 RBPJ DOCK2 RIPOR2 RNF38 PARG RPLP2 FTH1 FAU SLC38A1 TAFA2 RAB8B SSH2 RPL38 ANKRD12
5000 RALGPS2 SH3BGRL PRDM2 EIF4G3 PDE4B VAV3 MAN1A2 GDAP2 FCRL1 YPEL5 TLK1 DOCK10 PSMD1 UBE2E2 SIDT1 STIM2 ARHGAP24 CAMK2D PLEKHG1 SNX9 JAZF1 ELMO1 MTSS1 ETS1 TCP11L2 PAN3 RB1 FNDC3A HERPUD1 WWOX
6000 RAP1A LMBRD1 ATP8A1 ZFAND6 AOAH RORA S

In [8]:
import openai
import asyncio
import aiohttp

# Initialize the OpenAI API key
openai.api_key = api_key

async def get_embeddings(session, texts):
    response = await session.post(
        'https://api.openai.com/v1/embeddings',
        headers={'Authorization': f'Bearer {openai.api_key}'},
        json={'input': texts, 'model': 'text-embedding-ada-002'}
    )
    return await response.json()

async def process_signature_list(text_list):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for i in range(0, len(text_list), 16):
            batch = text_list[i:i+16]
            tasks.append(get_embeddings(session, batch))
        
        results = await asyncio.gather(*tasks)
        
        # Combine embeddings from all batches
        embeddings = []
        for result in results:
            if isinstance(result,dict) and 'data' in result:
                embeddings.extend(result['data'])
            else:
                logger.error("Encountered invalid embedding")
                embeddings.extend([])
        return embeddings



In [9]:
from typing import Any

import json

async def process_signatures_in_batches(signatures: list[str], batch_size=500) -> list[dict[Any,Any]]:
    """Take a list gene signatures as space separated strings and submit them to 
    OpenAI in batches to produce a list of embeddings in json format

    Args:
        signatures (list[list[str]]): The gene signatures in the form ["XYZ JGK", "NNC DDDD", ...]

    Returns:
        list[float]: The embeddings in the form [[123, 345, ...],[5423, 232, ... ]]
    """
    batches = batch_generator(signatures, batch_size=batch_size)
    result = []
    for batch in batches:
        # logger.info("processing batch of size {}", len(batch))
        embed_batch = await(process_signature_list(batch))
        result.extend(embed_batch)
    return result


In [20]:

from os import PathLike


async def load_embeddings(path: PathLike, filename: PathLike) -> list[list[str]]:
    path = Path(path)
    filename = Path(filename)
    if not path.exists() or not path.is_dir():
        raise ValueError('{} is not a valid directory', path)
    json_path = path.joinpath(filename)
    if json_path.suffix == '.gz':
        embeddings_json_compressed =  json_path # Path("../data/embeddings.json.gz")
        embeddings_json = None
    elif json_path.suffix == '.json':
        embeddings_json = json_path
        embeddings_json_compressed = Path(json_path.as_posix() + '.gz')
    else:
        raise ValueError('filename must end in .gz or .json not {}', filename.suffix)
    
    if embeddings_json_compressed and embeddings_json_compressed.exists():
        logger.info("found compressed json file: {}", embeddings_json_compressed)
        with gzip.open(embeddings_json_compressed, 'rt') as f:
            e_json = f.read()
        embeddings = json.loads(e_json)
    elif embeddings_json and embeddings_json.exists():
        logger.info("loading uncompressed json file: {}", embeddings_json)
        with open(embeddings_json, "rt") as f:
            e_json = f.read()
        embeddings = json.loads(e_json)
    else:
        logger.info('generating new embeddings json file -- this will take a while...')
        embeds = await process_signatures_in_batches(signatures,16)
        embeddings = []
        for e in embeds:
            embeddings.append(e['embedding'])
        logger.info("processed {} embeddings", len(embeddings))
        e_json = json.dumps(embeddings)
        with gzip.open(embeddings_json_compressed, "wt",encoding='utf-8') as f:
            f.write(e_json)
    return embeddings 

embeddings = await load_embeddings("../data", "embeddings.json") # type: ignore


[32m2024-06-18 19:40:26.521[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_embeddings[0m:[36m21[0m - [1mfound compressed json file: ../data/embeddings.json.gz[0m
  embeddings = await load_embeddings("../data", "embeddings.json")


In [23]:
embedding_json_vector = [json.dumps(e) for e in embeddings]
print(len(embedding_json_vector))


9354


In [25]:
df.shape

(9370, 3)