In [1]:
import os
from scipy.spatial import distance
import plotly.express as px
from sklearn.cluster import KMeans
from umap import UMAP
from openai import OpenAI
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from pathlib import Path, PurePath
import tiktoken
from IPython.display import clear_output
import math
from  concurrent.futures import ThreadPoolExecutor
import warnings

warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)


In [2]:
# NOTE(review): the listing returned by os.listdir() is discarded here - a cell
# only displays the value of its last expression, which is os.getcwd() below.
os.listdir()
# Optional switch into a project sub-directory (disabled).
#os.chdir("diesunddas")
# Show the working directory; the relative data paths used further down are
# resolved against it.
os.getcwd()


'/Users/gheiss/workspaces/jupyter'

In [None]:
# Simple progress display
def progress(total, stop = 100):
    """Generator that prints a progress line each time it is advanced.

    Every ``next()`` clears the current cell output and prints
    ``"<step> / <ceil(total)> (<percent>%)"``.

    total: Number of steps that make up 100%; may be fractional (it is rounded
           up for display only). A total of 0 or less counts as already complete.
    stop:  Percentage at which the work is considered finished.

    Yields ``None`` while the percentage is below ``stop`` and ``True`` once it
    is reached. From then on the step counter is no longer increased, so every
    further ``next()`` re-prints the final line and yields ``True`` again.

    A generator must not be advanced by several threads at the same time
    (Python raises "generator already executing"), so call ``next()`` from one
    thread only.
    """
    i = 1
    while True:
        # Guard against division by zero: nothing to do means nothing left to do.
        pct = i / total * 100 if total > 0 else 100
        clear_output(wait=True)
        print(f"{i} / {math.ceil(total)} ({min(round(pct,1),100)}%)")

        if pct >= stop:
            yield True
        else:
            i += 1
            yield
            
def chunker(seq, size):
    """Yield consecutive positional slices of ``seq`` with at most ``size`` rows each.

    seq:  Object supporting ``len()`` and ``.iloc`` slicing (pandas Series/DataFrame).
    size: Maximum number of rows per slice; the last slice may be shorter.
    """
    starts = range(0, len(seq), size)
    yield from (seq.iloc[start:start + size] for start in starts)

class Embedding:
    """Embeddings for one text column of a table, created through the OpenAI embeddings API.

    An instance is filled by one of the ``from_*`` builders or by ``load()``.
    The builders keep the source rows in ``self.df`` and add two columns:
    ``__embedding`` (the vector, or "" after a dry run) and ``__tcnt`` (token
    count of the embedded text). ``self.column`` names the embedded text column.
    """

    # Shared OpenAI client. It is created on first API use instead of at class
    # definition time, so defining the class, dry runs and load() do not need
    # API credentials.
    client = None

    # Token budget and safety factor used to estimate how many texts fit into one request.
    _TOKEN_BUDGET = 8192
    _FILL_FACTOR = 0.7
    # The embeddings endpoint rejects input arrays with more than 2048 entries.
    _MAX_INPUTS_PER_REQUEST = 2048

    def __init__(self, model = "text-embedding-3-small"):
        """Creates a new (empty) embedding with the given model"""

        self.model = model
        self.encoding = tiktoken.encoding_for_model(model)
        self.df = None
        self.column = None
        self.path = None
        self.cache = None

    def __get_client(self):
        """Return the shared OpenAI client, creating it on first use."""
        if self.client is None:
            Embedding.client = OpenAI()
        return self.client

    def __count_tokens(self, text_to_embed):
        """Return the number of tokens of ``text_to_embed`` under the model's encoding."""
        return len(self.encoding.encode(text_to_embed))

    def __get_embeddings(self, texts_to_embed):
        """Request embeddings for a list of texts; returns one vector per text, in input order."""
        response = self.__get_client().embeddings.create(
            model=self.model,
            input=texts_to_embed,
            encoding_format="float"
        )
        # Each response item carries the position of its input; order by it explicitly.
        ordered = sorted(response.data, key=lambda d: d.index)
        return [d.embedding for d in ordered]

    def __process_batch(self, batch_df):
        """Embed the texts of one batch (runs in a worker thread, touches no shared state)."""
        return self.__get_embeddings(batch_df[self.column].astype(str).tolist())

    def __process_df(self, nsamples=0, contains="", dryrun=False, chunksize=0, parallel=20):
        """Filter/sample ``self.df``, count tokens and (unless ``dryrun``) fetch the embeddings."""
        # Rows without text cannot be embedded and would break the boolean mask below.
        df = self.df.dropna(subset=[self.column])
        # Copy so that adding columns does not write into a slice of the source frame.
        df = df[df[self.column].astype(str).str.contains(contains)].copy()
        if nsamples > 0:
            # Never ask for more rows than are left after filtering.
            df = df.sample(min(nsamples, len(df)))
        df["__embedding"] = ""
        df["__tcnt"] = df[self.column].astype(str).apply(self.__count_tokens)
        self.df = df

        if not dryrun and len(self.df) > 0:
            if chunksize == 0:
                longest = max(int(self.df["__tcnt"].max()), 1)
                chunksize = int(self._TOKEN_BUDGET / longest * self._FILL_FACTOR)
                # At least one text per request, at most what the API accepts.
                chunksize = min(max(chunksize, 1), self._MAX_INPUTS_PER_REQUEST)
            print("chunksize: ", chunksize)
            prog = progress(len(self.df)/chunksize)
            # Create the client once, before any worker thread needs it.
            self.__get_client()
            embeddings = []
            with ThreadPoolExecutor(max_workers=parallel) as executor:
                futures = [executor.submit(self.__process_batch, batch_df)
                           for batch_df in chunker(self.df, chunksize)]
                # Collect in submission order on the calling thread: result()
                # re-raises worker errors, and both the progress generator and
                # the frame are only touched from here.
                for future in futures:
                    embeddings.extend(future.result())
                    next(prog)
            self.df["__embedding"] = pd.Series(embeddings, index=self.df.index)

        print(f"Number of tokens: {self.df['__tcnt'].sum()}")
        print(f"Created {len(self.df)} embeddings.")


    def store(self):
        """
        Stores the embedding alongside the input file the embedding was created from, with the suffix .mbd.h5

        Raises ValueError if the embedding has no source file (e.g. built with from_array()).
        Returns self.
        """

        if getattr(self, "cache", None) is None:
            raise ValueError("Embedding has no source file, so there is no cache path to store it to.")
        with pd.HDFStore(self.cache, 'w') as store:
            store.put('data', self.df)
            store.get_storer('data').attrs.metadata = {"path": self.path, "column": self.column}
            print(f"Stored Embedding to {self.cache}")
        return self

    def load(self, filename, contains=""):
        """
        Loads an embedding that was previously stored.

        filename: Path of the original input file; the cache file next to it is read.
        contains: Only keep lines whose text matches this pattern.
        Returns self.
        """

        cache = self.to_cache_path(filename)
        print(f"Loading Embedding from {cache}")
        with pd.HDFStore(cache, "r") as store:
            self.df = store.get("data")
            metadata = store.get_storer('data').attrs.metadata
            self.path = metadata["path"]
            self.column = metadata["column"]
            self.df = self.df.dropna(subset=[self.column])
            self.df = self.df[self.df[self.column].astype(str).str.contains(contains)]
        # Remember the cache location so that a loaded embedding can be stored again.
        self.cache = cache
        print(f"Loaded {len(self.df)} embeddings.")
        return self

    def from_csv(self, path, delimiter=";", column="text", nsamples=0, contains="", dryrun=False, chunksize=0, parallel=20):
        """
        Build an embedding from the given CSV file.

        column: The named column in the CSV file to get the text to embed from.
        nsamples: Only use nsamples random lines to build the embedding.
        contains: Only use lines containing the given text (passed to pandas str.contains, i.e. treated as a pattern).
        dryrun: Do not call any APIs
        chunksize: How many lines are included in a single API calls. If omitted or 0, the optimal chunksize is estimated automatically.
        parallel: Number of parallel requests.
        Returns self.
        """

        self.path = Path(path).absolute()
        self.cache = self.to_cache_path(path)
        self.column = column
        self.df = pd.read_csv(self.path, sep=delimiter)
        print(f"Reading from {self.path}")
        self.__process_df(nsamples=nsamples, contains=contains, dryrun=dryrun, chunksize=chunksize, parallel=parallel)
        return self

    def from_xls(self, path, sheet="", column="text", nsamples=0, contains="", dryrun=False, chunksize=0, parallel=20):
        """
        Build an embedding from the given excel file.

        Same arguments as from_csv() plus:
        sheet: Name of the sheet to read. If omitted or empty, the first sheet is read.
        """

        self.path = Path(path).absolute()
        self.cache = self.to_cache_path(path)
        self.column = column
        with pd.ExcelFile(path) as xls:
            # An empty sheet name matches no sheet; fall back to the first one.
            self.df = pd.read_excel(xls, sheet_name=sheet or 0)
        print(f"Reading from {self.path}")
        self.__process_df(nsamples=nsamples, contains=contains, dryrun=dryrun, chunksize=chunksize, parallel=parallel)
        return self

    def from_array(self, texts, nsamples=0, contains="", dryrun=False, chunksize=0, parallel=20):
        """
        Build an embedding from an array.

        Same arguments as from_csv(). The result has no source file and therefore cannot be stored.
        """

        # No backing file: make sure a previously used path/cache is not reused by store().
        self.path = None
        self.cache = None
        self.column = "__text"
        self.df = pd.DataFrame()
        self.df[self.column] = texts
        self.__process_df(nsamples=nsamples, contains=contains, dryrun=dryrun, chunksize=chunksize, parallel=parallel)
        return self

    @staticmethod
    def to_cache_path(orig_path):
        """Return the absolute cache path for an input file: same location and name, suffix .mbd.h5"""
        return Path(orig_path).absolute().with_suffix(".mbd.h5")

    @classmethod
    def match_embeddings(cls, input, target):
        """
        For every text in ``input`` find the most similar text in ``target`` (cosine similarity).

        Returns a DataFrame with the columns "in" and "out" and also writes it to
        "testresult.csv" in the working directory. Returns None if either side is empty.
        """
        if len(input.df) == 0 or len(target.df) == 0:
            return

        similarities = cosine_similarity (
            input.df.loc[:,"__embedding"].tolist(),
            target.df.loc[:,"__embedding"].tolist())
        # argmax yields row POSITIONS within target.df; after filtering or
        # sampling these differ from the index labels, so look up by position.
        maxes = similarities.argmax(axis=1)
        hits = [text.strip() for text in target.df[target.column].iloc[maxes]]

        res = pd.DataFrame()
        res["in"] = input.df.loc[:,input.column]
        res["out"] = hits
        res.to_csv("testresult.csv", sep=";", index=False)
        return res

    @classmethod
    def similarity_matrix(cls, input, target):
        """
        Return the cosine similarities of all input/target pairs as a DataFrame
        (rows: input texts, columns: target texts), or None if either side is empty.
        """
        if len(input.df) == 0 or len(target.df) == 0:
            return
        similarities = cosine_similarity (
            input.df.loc[:,"__embedding"].tolist(),
            target.df.loc[:,"__embedding"].tolist())
        df = pd.DataFrame(similarities, columns=list(target.df[target.column]), index=list(input.df[input.column]))
        return df


In [4]:
# Embed the category paths and the list of wrong categories; each result is
# cached next to its input file (suffix .mbd.h5).
zds_latest_06 = Embedding().from_csv(
    "../precitooldata/zds_latest/zds_categories_with_path_06.csv",
    column='categoryPathShort',
).store()
broken_cats = Embedding().from_xls(
    "../precitooldata/EDEfalscheKategorien.xlsx",
    sheet="finalV2",
    column="ede",
).store()

# Alternative (disabled): reuse the cached embedding and match it against the categories.
#broken_cats = Embedding().load("../precitooldata/EDEfalscheKategorien.xlsx")
#Embedding.match_embeddings(broken_cats, zds_latest_06)

In [None]:

# Reload the cached category embeddings (no API call).
target_cats = Embedding().load(
    "../precitooldata/zds_latest/zds_categories_with_path_06.csv"
)
# Disabled variants: restrict the targets to one category branch / run the matching.
#target_cats = Embedding().load("../precitooldata/zds_latest/zds_categories_with_path.csv", contains="69_Betriebseinrichtung")
#Embedding.match_embeddings(broken_cats, target_cats)

In [11]:
# Small demo: similarity of four German nouns to two candidate concepts.
bla = Embedding(model="text-embedding-3-large").from_array(
    ["maus", "haus", "laus", "ratte"]
)
blub = Embedding(model="text-embedding-3-large").from_array(
    ["nagetier", "wohnzimmer"]
)
print(Embedding.similarity_matrix(bla, blub))

1 / 1 (100%)
Nuber of tokens: 6
Created 2 embeddings.
       nagetier  wohnzimmer
maus   0.448305    0.266702
haus   0.263480    0.401299
laus   0.143407    0.176122
ratte  0.635840    0.324984


In [8]:
# One-off creation of the colour embeddings (disabled; the cached files are loaded below).
#target_colors = Embedding(model="text-embedding-3-large").from_csv("embedding/targetcolors.csv", nsamples=0).store()
#test_colors = Embedding(model="text-embedding-3-large").from_csv("embedding/testcolors.csv", column="search", nsamples=0).store()

# Load both cached embeddings.
target = Embedding().load("embedding/targetcolors.csv")
test = Embedding().load("embedding/testcolors.csv")

# Best target colour per search term, then the full similarity table.
print(Embedding.match_embeddings(test, target))
print(Embedding.similarity_matrix(test, target))

Loading Embedding from /Users/gheiss/workspaces/jupyter/embedding/targetcolors.mbd.h5
Loaded 18 embeddings.
Loading Embedding from /Users/gheiss/workspaces/jupyter/embedding/testcolors.mbd.h5
Loaded 79 embeddings.
                in        out
0             blau       blue
1             grau       grey
2             gelb     yellow
3            beige      beige
4             weiß      white
..             ...        ...
74      türkisblau  turquoise
75  vintage indigo  turquoise
76           weiss      white
77           white      white
78          yellow     yellow

[79 rows x 2 columns]
                   black      grey     beige     white      blue     brown  \
blau            0.342835  0.409396  0.381652  0.330763  0.561244  0.345590   
grau            0.347526  0.630660  0.449944  0.355024  0.323822  0.393050   
gelb            0.281950  0.378548  0.408017  0.329555  0.324214  0.323200   
beige           0.331640  0.434184  0.857109  0.381737  0.331725  0.453607   
weiß         