### DESCRIPTION:
    Find duplicates using embeddings – use Azure OpenAI to find similarities between pieces of text.
### REQUIREMENTS:
    Create a `.env` file containing your OpenAI API key and save it in the root directory of this project.


In [1]:
from openai.embeddings_utils import cosine_similarity
from dotenv import load_dotenv
import utils as utils
import tiktoken
from tenacity import retry, wait_random_exponential, stop_after_attempt

In [2]:
# Load settings from the project's .env file (see REQUIREMENTS) into the environment.
load_dotenv()
# Project helper from utils; the returned object is used below as the OpenAI
# client (calc_embeddings calls openai.Embedding.create on it).
# NOTE(review): presumably reads the API key/endpoint from the environment — confirm in utils.
openai = utils.init_OpenAI()

# encoding for text-embedding-ada-002
embedding_encoding = "cl100k_base"  
# Look up the tiktoken encoding registered under the name above.
# NOTE(review): `encoding` is not referenced in the cells visible here — confirm it is still needed.
encoding = tiktoken.get_encoding(embedding_encoding)

In [5]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def calc_embeddings(text):
    """Request an embedding for ``text`` from the configured embedding deployment.

    Newlines in ``text`` are replaced by single spaces before the request is
    sent. The call is wrapped in a tenacity ``retry`` that makes at most six
    attempts, waiting a randomized, exponentially growing interval
    (``min=1``, ``max=20``) between them.

    Args:
        text: The string to embed.

    Returns:
        The ``"embedding"`` entry of the first item in the response's
        ``"data"`` list.
    """
    # Newlines can negatively affect embedding quality, so flatten them out.
    flattened = " ".join(text.split("\n"))
    api_result = openai.Embedding.create(
        input=[flattened],
        engine=utils.OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    )
    # Exactly one input was sent, so only the first data item is relevant.
    first_item = api_result["data"][0]
    return first_item["embedding"]

In [6]:
# Report template: the two texts being compared and their similarity score.
response = """
    Text1: {txt1}
    Text2: {txt2}
    cosine_similarity: {cosine_similarity}
"""

# sample1 is the reference text; the others drift progressively further from it.
sample1 = "Automatic Chicken Coop Door Opener Hen House Light Sensor Closer,"
sample2 = "Automatic Chicken Coop Door Opener Hen House Light Sensor Closer Opene,"
sample3 = "Automatic Chicken Coop Door Opener Hen House Light and some blah blah"
sample4 = "totally different text"

# One embedding request per sample, issued in sample order.
embed1, embed2, embed3, embed4 = (
    calc_embeddings(sample) for sample in (sample1, sample2, sample3, sample4)
)

# Score the reference sample against each of the other samples and print a report for each.
for other_text, other_embed in (
    (sample2, embed2),
    (sample3, embed3),
    (sample4, embed4),
):
    result = cosine_similarity(embed1, other_embed)
    print(response.format(txt1=sample1, txt2=other_text, cosine_similarity=result))


    Text1: Automatic Chicken Coop Door Opener Hen House Light Sensor Closer,
    Text2: Automatic Chicken Coop Door Opener Hen House Light Sensor Closer Opene,
    cosine_similarity: 0.9956954434535107


    Text1: Automatic Chicken Coop Door Opener Hen House Light Sensor Closer,
    Text2: Automatic Chicken Coop Door Opener Hen House Light and some blah blah
    cosine_similarity: 0.923000174626097


    Text1: Automatic Chicken Coop Door Opener Hen House Light Sensor Closer,
    Text2: totally different text
    cosine_similarity: 0.7076560470812693

