In [1]:
import os
from scipy.spatial import distance
import plotly.express as px
from sklearn.cluster import KMeans
from umap import UMAP
from openai import OpenAI
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from pathlib import Path, PurePath
import tiktoken
from IPython.display import clear_output
import math
from  concurrent.futures import ThreadPoolExecutor
import warnings

# Suppress the PerformanceWarning category exposed by pandas' PyTables module.
# NOTE(review): presumably raised when Embedding.store() writes the object-typed
# "__embedding" column to HDF5 — confirm before relying on this.
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)


In [2]:
# Show the kernel's working directory: the relative data paths used in the cells
# below are resolved against it. (Call os.chdir(...) here first if the notebook
# has to run from a different directory.)
os.getcwd()


'/Users/gheiss/workspaces/jupyter/embedding'

In [3]:
# Simple progress display
def progress(total, stop = 100):
    """Generator that redraws an "i / total (pct%)" counter on every step.

    Each ``next()`` call clears the current cell output (IPython's
    ``clear_output``), prints the counter and, while the run is unfinished,
    advances it by one.

    Args:
        total: Number of steps that correspond to 100 %. May be fractional; it
            is rounded up for display. A value <= 0 is reported as finished
            right away instead of raising ZeroDivisionError.
        stop: Percentage at which the counter counts as finished.

    Yields:
        None while the percentage is below ``stop``; True once it is reached.
        After that the counter no longer advances: every further ``next()``
        redraws the same line and yields True again.

    Note:
        A Python generator cannot be advanced by two threads at the same time
        (the second ``next()`` raises "ValueError: generator already
        executing"), so drive it from a single thread.
    """
    i = 1
    while True:
        # Nothing to count for a non-positive total: treat it as complete.
        pct = i / total * 100 if total > 0 else stop
        clear_output(wait=True)
        print(f"{i} / {math.ceil(total)} ({min(round(pct,1),100)}%)")

        if pct >= stop:
            yield True
        else:
            i += 1
            yield
            
def chunker(seq, size):
    """Yield consecutive positional slices of ``seq`` holding at most ``size`` rows.

    ``seq`` must support ``len()`` and ``.iloc`` slicing (a pandas Series or
    DataFrame). The final slice is shorter when ``len(seq)`` is not a multiple
    of ``size``.
    """
    row_count = len(seq)
    yield from (seq.iloc[start:start + size] for start in range(0, row_count, size))

class Embedding:
    """A DataFrame whose text column is embedded with an OpenAI embedding model.

    An instance is filled by ``from_csv``, ``from_xls``, ``from_array`` or
    ``load``. Afterwards ``self.df`` holds the source rows plus two columns:
    ``"__embedding"`` (one embedding per row) and ``"__tcnt"`` (token count of
    the text). ``self.column`` names the text column. ``store`` / ``load``
    persist the frame in an HDF5 cache file next to the source file.
    """

    # One client shared by all instances; it is created when the class body runs.
    # NOTE(review): presumably reads its API key from the environment — confirm.
    client = OpenAI()

    # Heuristic used when no chunksize is given: texts per request is
    # _TOKEN_BUDGET / (longest text in tokens) * _TOKEN_BUDGET_FACTOR.
    _TOKEN_BUDGET = 8192
    _TOKEN_BUDGET_FACTOR = 0.7
    # OpenAI's embeddings endpoint documents a limit of 2048 inputs per request.
    _MAX_INPUTS_PER_REQUEST = 2048

    @staticmethod
    def to_cache_path(orig_path):
        """Return the absolute cache path for ``orig_path`` (suffix replaced by ".mbd.h5")."""
        return Path(orig_path).absolute().with_suffix(".mbd.h5")

    @staticmethod
    def _filter_rows(df, column, contains):
        """Return the rows of ``df`` whose ``column`` is present and matches ``contains``.

        ``contains`` is handed to ``Series.str.contains`` (a regular expression
        by default); the empty string matches every row. Missing values are
        dropped first because they would otherwise produce a non-boolean mask.
        """
        present = df.dropna(subset=[column])
        mask = present[column].astype(str).str.contains(contains)
        return present[mask]

    @staticmethod
    def _similarities(input, target):
        """Cosine similarities with one row per ``input`` text and one column per ``target`` text."""
        return cosine_similarity(
            input.df["__embedding"].tolist(),
            target.df["__embedding"].tolist(),
        )

    @classmethod
    def match_embeddings(cls, input, target):
        """Pair every ``input`` text with its most similar ``target`` text.

        Side effect: the result is also written to "testresult.csv" in the
        current working directory (";"-separated, without index).

        Returns:
            DataFrame with columns "in" (input text) and "out" (best matching
            target text, stripped), or None if either side has no rows.
        """
        if len(input.df) == 0 or len(target.df) == 0:
            return

        best_positions = cls._similarities(input, target).argmax(axis=1)

        # argmax yields row *positions*; target.df may carry a non-contiguous
        # index after filtering or sampling, so label-based lookup would be wrong.
        target_texts = target.df[target.column]
        hits = [target_texts.iloc[pos].strip() for pos in best_positions]

        res = pd.DataFrame()
        res["in"] = input.df.loc[:, input.column]
        res["out"] = hits
        res.to_csv("testresult.csv", sep=";", index=False)
        return res

    def __init__(self, model = "text-embedding-3-small"):
        """Remember the model name and look up its tiktoken encoding.

        Args:
            model: Name of the OpenAI embedding model used for all requests.
        """
        self.model = model
        self.encoding = tiktoken.encoding_for_model(model)
        # Filled in by from_csv / from_xls / from_array / load.
        self.path = None
        self.cache = None
        self.column = None
        self.df = None

    def store(self):
        """Write ``self.df`` plus source path and column name to the HDF5 cache file.

        Returns:
            self, to allow chaining.

        Raises:
            ValueError: if no cache path is known (e.g. after ``from_array``).
        """
        if self.cache is None:
            raise ValueError("No cache path set: load the data with from_csv, from_xls or load before calling store().")
        with pd.HDFStore(self.cache, 'w') as store:
            store.put('data', self.df)
            # The path is saved as a plain string so the metadata does not depend
            # on pathlib's platform-specific classes.
            store.get_storer('data').attrs.metadata = {"path": str(self.path), "column": self.column}
            print(f"Stored Embedding to {self.cache}")
        return self

    def load(self, filename, contains=""):
        """Load embeddings previously cached for the source file ``filename``.

        Args:
            filename: Path of the original source file; the cache file is
                derived from it with ``to_cache_path``.
            contains: Pattern a row's text must match to be kept (see
                ``Series.str.contains``); rows without text are dropped.

        Returns:
            self, to allow chaining.
        """
        cache_path = self.to_cache_path(filename)
        print(f"Loading Embedding from {cache_path}")
        with pd.HDFStore(cache_path, "r") as store:
            frame = store.get("data")
            metadata = store.get_storer('data').attrs.metadata
        # Remember the cache location so a loaded instance can be stored again.
        self.cache = cache_path
        # Path() accepts both the string written by store() and a Path object
        # found in older cache files.
        self.path = Path(metadata["path"])
        self.column = metadata["column"]
        self.df = self._filter_rows(frame, self.column, contains)
        print(f"Loaded {len(self.df)} embeddings.")
        return self


    def count_tokens(self, text_to_embed):
        """Return how many tokens ``text_to_embed`` has under the model's encoding."""
        return len(self.encoding.encode(text_to_embed))

    def get_embeddings(self, texts_to_embed):
        """Request embeddings for ``texts_to_embed`` from the OpenAI API.

        Args:
            texts_to_embed: A single string or an iterable of strings.

        Returns:
            List with one embedding per entry of the API response, in response order.
        """
        if not isinstance(texts_to_embed, str):
            # Send a plain list so the request payload does not depend on how
            # the client serialises pandas objects.
            texts_to_embed = list(texts_to_embed)
        response = self.client.embeddings.create(
            model=self.model,
            input=texts_to_embed,
            encoding_format="float"
        )
        return [item.embedding for item in response.data]

    def process_batch(self, batch_df, prog):
        """Embed the rows of ``batch_df`` and write the results into ``self.df``.

        Args:
            batch_df: Slice of ``self.df``; its index labels select the rows to fill.
            prog: Progress generator advanced once after the batch, or None to skip.
        """
        texts = batch_df[self.column].astype(str).tolist()
        embeddings = self.get_embeddings(texts)
        for label, embedding in zip(batch_df.index, embeddings):
            self.df.at[label, "__embedding"] = embedding
        if prog is not None:
            next(prog)

    def from_csv(self, path, delimiter=";", column="text", nsamples=0, contains="", dryrun=False, chunksize=0, parallel=20):
        """Read a CSV file and embed its ``column``; see ``process_df`` for the other arguments.

        Returns:
            self, to allow chaining.
        """
        self.path = Path(path).absolute()
        self.cache = self.to_cache_path(self.path)
        self.column = column
        self.df = pd.read_csv(self.path, sep=delimiter)
        print(f"Reading from {self.path}")
        # `contains` has to be forwarded, otherwise the filter is silently ignored.
        self.process_df(nsamples=nsamples, contains=contains, dryrun=dryrun, chunksize=chunksize, parallel=parallel)
        return self

    def from_xls(self, path, sheet="", column="text", nsamples=0, contains="", dryrun=False, chunksize=0, parallel=20):
        """Read one sheet of an Excel file and embed its ``column``.

        Args:
            sheet: Sheet name; when left empty the first sheet is read.

        See ``process_df`` for the remaining arguments.

        Returns:
            self, to allow chaining.
        """
        self.path = Path(path).absolute()
        self.cache = self.to_cache_path(self.path)
        self.column = column
        with pd.ExcelFile(self.path) as xls:
            # An empty name is not a valid sheet; fall back to the first sheet.
            self.df = pd.read_excel(xls, sheet_name=sheet if sheet else 0)
        print(f"Reading from {self.path}")
        self.process_df(nsamples=nsamples, contains=contains, dryrun=dryrun, chunksize=chunksize, parallel=parallel)
        return self

    def from_array(self, texts, nsamples=0, contains="", dryrun=False, chunksize=0, parallel=20):
        """Embed in-memory ``texts``; they are kept in a column named "__text".

        No source or cache path is set, so ``store`` is not available afterwards.

        Returns:
            self, to allow chaining.
        """
        self.column = "__text"
        self.df = pd.DataFrame()
        self.df[self.column] = texts
        self.process_df(nsamples=nsamples, contains=contains, dryrun=dryrun, chunksize=chunksize, parallel=parallel)
        return self

    @classmethod
    def _auto_chunksize(cls, max_tokens):
        """Derive the number of texts per request from the longest text's token count.

        The result is clamped to at least 1 (very long texts would otherwise
        give 0, which is not a usable step size) and to the per-request input limit.
        """
        budgeted = int(cls._TOKEN_BUDGET / max(max_tokens, 1) * cls._TOKEN_BUDGET_FACTOR)
        return min(max(budgeted, 1), cls._MAX_INPUTS_PER_REQUEST)

    def process_df(self, nsamples=0, contains="", dryrun=False, chunksize=0, parallel=20):
        """Filter ``self.df``, count tokens and fetch the embeddings in parallel batches.

        Args:
            nsamples: If > 0, keep only a random sample of that many rows.
            contains: Pattern a row's text must match to be kept; rows without
                text are dropped.
            dryrun: If true, only filter and count tokens; no API calls.
            chunksize: Texts per API request; 0 (or less) derives it from the
                longest text.
            parallel: Maximum number of concurrent requests.

        Raises:
            Any exception raised while embedding a batch is re-raised here
            instead of being lost inside the worker thread.
        """
        frame = self._filter_rows(self.df, self.column, contains)
        if nsamples > 0:
            frame = frame.sample(nsamples)
        # Work on an independent copy so adding columns never touches a slice.
        self.df = frame.copy()
        # None keeps the column object-typed so each cell can later hold a list.
        self.df["__embedding"] = None
        self.df["__tcnt"] = self.df[self.column].astype(str).apply(self.count_tokens)
        if not dryrun and len(self.df) > 0:
            if chunksize <= 0:
                chunksize = self._auto_chunksize(self.df["__tcnt"].max())
            print("chunksize: ", chunksize)
            batches = list(chunker(self.df, chunksize))
            prog = progress(len(batches))
            with ThreadPoolExecutor(max_workers=parallel) as executor:
                futures = [executor.submit(self.process_batch, batch_df, None) for batch_df in batches]
                for future in futures:
                    # result() re-raises a worker's exception; the progress
                    # generator is advanced here, from a single thread only.
                    future.result()
                    next(prog)

        print(f"Nuber of tokens: {self.df['__tcnt'].sum()}")
        print(f"Created {len(self.df)} embeddings.")


    @classmethod
    def similarity_matrix(cls, input, target):
        """Return all pairwise cosine similarities as a labelled DataFrame.

        Returns:
            DataFrame indexed by the input texts with one column per target
            text, or None if either side has no rows.
        """
        if len(input.df) == 0 or len(target.df) == 0:
            return
        return pd.DataFrame(
            cls._similarities(input, target),
            columns=list(target.df[target.column]),
            index=list(input.df[input.column]),
        )


In [4]:
# Embed the target category paths (CSV) and the categories to be re-mapped
# (Excel sheet "finalV2"); each result is cached next to its source file.
zds_latest_06 = (
    Embedding()
    .from_csv(
        "../precitooldata/zds_latest/zds_categories_with_path_06.csv",
        column='categoryPathShort',
    )
    .store()
)
broken_cats = (
    Embedding()
    .from_xls(
        "../precitooldata/EDEfalscheKategorien.xlsx",
        sheet="finalV2",
        column="ede",
    )
    .store()
)

# Disabled alternative: reload the cached embeddings and match them directly.
#broken_cats = Embedding().load("../precitooldata/EDEfalscheKategorien.xlsx")
#Embedding.match_embeddings(broken_cats, zds_latest_06)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/gheiss/workspaces/jupyter/embedding/../precitooldata/zds_latest/zds_categories_with_path_06.csv'

In [None]:

# Reload the cached embeddings of the target categories (no API calls).
target_cats = Embedding().load(
    "../precitooldata/zds_latest/zds_categories_with_path_06.csv"
)
# Disabled variants: restrict the targets to one branch, then run the matching.
#target_cats = Embedding().load("../precitooldata/zds_latest/zds_categories_with_path.csv", contains="69_Betriebseinrichtung")
#Embedding.match_embeddings(broken_cats, target_cats)

In [6]:
# Toy example: similarity matrix between two small word lists.
# NOTE(review): contains="asda" matches none of the four words, so `bla` ends
# up without rows and similarity_matrix() returns None — confirm this is intended.
bla = Embedding(model="text-embedding-3-large").from_array(
    ["maus", "haus", "laus", "ratte"],
    nsamples=0,
    contains="asda",
)
blub = Embedding(model="text-embedding-3-large").from_array(
    ["nagetier", "wohnzimmer"],
)
Embedding.similarity_matrix(bla, blub)

1 / 1 (100%)
Nuber of tokens: 6
Created 2 embeddings.


In [None]:
# Embed both colour lists with the large model and cache them next to the CSVs.
target_colors = (
    Embedding(model="text-embedding-3-large")
    .from_csv("embedding/targetcolors.csv", nsamples=0)
    .store()
)
test_colors = (
    Embedding(model="text-embedding-3-large")
    .from_csv("embedding/testcolors.csv", column="search", nsamples=0)
    .store()
)

# Read the cached embeddings back in.
target = Embedding().load("embedding/targetcolors.csv")
test = Embedding().load("embedding/testcolors.csv")

# match_embeddings writes testresult.csv; only the similarity matrix, being the
# last expression of the cell, is displayed.
Embedding.match_embeddings(test, target)
Embedding.similarity_matrix(test, target)