In [None]:
pip install openai num2words matplotlib plotly scipy scikit-learn pandas tiktoken

In [None]:
import openai
import os
import re
import requests
import sys
from num2words import num2words
import os
import pandas as pd
import numpy as np
from openai.embeddings_utils import get_embedding, cosine_similarity, get_embeddings
import tiktoken
import torch

# Reproducibility: one seed shared by NumPy's global RNG and PYTHONHASHSEED.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here probably only affects child processes - confirm this is the intent.
seed = 24
os.environ["PYTHONHASHSEED"] = str(seed)
np.random.seed(seed)

# Directory of the input corpus and directory of the Azure OpenAI (AOAI) pickle.
DATA_PATH = './obesity_data/'
AOAI_PATH = './aoai/'

alldocs_df = pd.read_pickle(DATA_PATH + '/alldocs_df.pkl')

# Credentials are read from the environment; both are None when the variable is unset.
API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
RESOURCE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")



In [None]:
# Point the openai SDK at the Azure OpenAI resource.
openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = "2022-12-01"

# Request the list of deployments on the resource. The api-version is taken from the
# SDK setting above so the two values cannot drift apart.
url = openai.api_base + "/openai/deployments?api-version=" + openai.api_version

# Without a timeout an unreachable endpoint would block the notebook indefinitely.
r = requests.get(url, headers={"api-key": API_KEY}, timeout=30)

#print(r.text)

In [None]:
#Very minimal cleansing as discussed in the AOAI tutorial
def normalize_text(s, sep_token = " \n "):
    """Lightly clean a text string.

    Steps, in order: collapse every run of whitespace (newlines included) into a
    single space and trim; delete literal ". ," sequences; replace ".." and ". ."
    with a single "."; trim again.

    Args:
        s: Text to clean.
        sep_token: Unused; kept so callers that pass it keep working.

    Returns:
        The cleaned string.
    """
    s = re.sub(r'\s+', ' ', s).strip()
    # The dot must be escaped. Unescaped it matches ANY character, which deleted the
    # character in front of every " ," (e.g. "80 , x" became "8 x").
    s = re.sub(r"\. ,", "", s)
    s = s.replace("..", ".")
    s = s.replace(". .", ".")
    # No separate "\n" removal is needed: the whitespace collapse above already
    # turned every newline into a space.
    return s.strip()

# Clean every document; the function is passed directly instead of through a lambda.
alldocs_df['text_clean'] = alldocs_df["text"].apply(normalize_text)

In [None]:
#Count tokens per cleaned document for Azure Open AI; documents are not split because they are expected to fit within the token limit
tokenizer = tiktoken.get_encoding("cl100k_base")
alldocs_df['n_tokens'] = alldocs_df["text_clean"].apply(lambda doc: len(tokenizer.encode(doc)))

# Number of documents at or above 8192 tokens, then the corpus-wide token total.
print('# too big:', int((alldocs_df['n_tokens'] >= 8192).sum()))
print('Total Number of Tokens:', alldocs_df['n_tokens'].sum())


In [None]:
#This retrieves one embedding per cleaned document.  Since there is a cost, the call is commented out.
#To use it, the two environment variables AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT must be set.
#alldocs_df['ada_v2'] = alldocs_df["text_clean"].apply(lambda x : get_embedding(x, engine = 'text-embedding-ada-002')) 
# engine should be set to the deployment name you chose when you deployed the text-embedding-ada-002 (Version 2) model
#alldocs_df.to_pickle(AOAI_PATH + '/alldocs_df_aoai.pkl') 

#The embedding call is commented out because there is a charge - the saved pkl file is loaded instead to do the analysis.
#This replaces alldocs_df entirely with the pickled frame.
#NOTE(review): unpickling can execute arbitrary code - only load a pkl file you produced yourself.
alldocs_df = pd.read_pickle(AOAI_PATH + '/alldocs_df_aoai.pkl') 


In [None]:
#Split each cleaned document into sentences
from nltk.tokenize import sent_tokenize
##Sentences - we can't do data cleansing until after sentence tokenized
# NOTE(review): sent_tokenize relies on NLTK's Punkt model data - confirm it is downloaded in this environment.
alldocs_df['sentence_tokenized'] = alldocs_df['text_clean'].apply(sent_tokenize) # one list of sentences per document

# Sentences per document and the largest count (used later as the padded length).
alldocs_df['sentence_count'] = alldocs_df['sentence_tokenized'].apply(len)
sentence_max_aoai = alldocs_df['sentence_count'].max()
print('Max Sentences:', sentence_max_aoai)


#need to create tokens add '\n' to reach max_sentences
#def token_and_pad_sentence(input_sentences, sentence_max):
#    pad_spaces = sentence_max - len(input_sentences)
#    result = input_sentences
#    if pad_spaces > 0:
#        for i in range(pad_spaces):
#            result.append('\n')

#alldocs_df_expanded['sentence_tokenized'] = alldocs_df_expanded['sentence_tokenized'].apply(lambda x: token_and_pad_sentence(x, sentence_max))

In [None]:
#Tokenizer for the sentence-level token counts for Azure Open AI (same cl100k_base encoding as the document-level count);
#don't plan on splitting as they all fit within token limit
tokenizer = tiktoken.get_encoding("cl100k_base")

def get_sentence_tokens(input_sentences):
    """Return the total number of tokens across all sentences.

    Each sentence is encoded on its own with the module-level `tokenizer` and
    the lengths of the encodings are added up; an empty input yields 0.
    """
    return sum(len(tokenizer.encode(sentence)) for sentence in input_sentences)

# Per-document token total, computed sentence by sentence.
alldocs_df['n_sent_tokens'] = alldocs_df["sentence_tokenized"].apply(get_sentence_tokens)

# Number of documents at or above 8192 tokens, then the corpus-wide token total.
print('# too big:', int((alldocs_df['n_sent_tokens'] >= 8192).sum()))
print('Total Number of Tokens:', alldocs_df['n_sent_tokens'].sum())


In [None]:
#Now get the sentence embeddings
import time

# Running count of get_padded_embeddings calls; it is incremented there and is only
# used to label the progress and error messages that get printed.
batch = 0

def process_sentence(sentence, engine='text-embedding-ada-002', max_retries=5, retry_delay=60):
    """Fetch the embedding of one sentence, retrying when the call fails.

    Args:
        sentence: Text passed to `get_embedding`.
        engine: Deployment name forwarded to `get_embedding`. Defaults to the
            name that was previously hard-coded.
        max_retries: Number of retries after the first failed attempt
            (default 5, i.e. at most 6 attempts in total).
        retry_delay: Seconds to sleep before each retry (default 60).

    Returns:
        Whatever `get_embedding` returns, or None when every attempt raised.
        Callers must be prepared for the None case; no exception is raised.
    """
    failures = 0

    while True:
        try:
            return get_embedding(sentence, engine=engine)
        except Exception as e:
            # `batch` is the module-level counter; it only labels the message.
            print(f'Exception {batch} {str(e)}')
            failures += 1
            if failures > max_retries:
                print('Too many retries')
                return None
            print('Sleeping')
            time.sleep(retry_delay)

def get_padded_embeddings(input_sentences, sentence_max, embedding_dim=1536):
    """Embed every sentence of one document into a fixed-size, zero-padded array.

    Args:
        input_sentences: Sequence of sentence strings for one document.
        sentence_max: Number of rows in the result.
        embedding_dim: Length of one embedding vector (default 1536, the value
            that was previously hard-coded).

    Returns:
        Array of shape (sentence_max, embedding_dim). Row i holds the result of
        `process_sentence` for sentence i; the remaining rows are all zeros.
        An empty `input_sentences` yields an all-zero array.

    Raises:
        IndexError: if there are more sentences than `sentence_max` rows.

    Side effects:
        Increments the module-level `batch` counter and prints a progress line.
    """
    global batch

    # Rows past the last sentence are never written, so they stay zero. No explicit
    # padding pass is needed, and an empty document no longer fails on an unbound
    # loop index.
    output_array = np.zeros((sentence_max, embedding_dim))

    batch = batch + 1
    size = len(input_sentences)
    print(f"Running batch {batch}:Size:{size}")

    for idx, sentence in enumerate(input_sentences):
        # NOTE(review): process_sentence returns None once its retries are exhausted;
        # that value is assigned as-is (presumably stored as NaN) - confirm downstream handling.
        output_array[idx,:] = process_sentence(sentence)

    return output_array

#df_test = alldocs_df.head(2).copy()
#df_test['ada_v2_sent'] = df_test["sentence_tokenized"].apply(lambda x: get_padded_embeddings(x, sentence_max_aoai))
#print(df_test['ada_v2_sent'])

#alldocs_df['ada_v2_sent'] = alldocs_df["sentence_tokenized"].apply(lambda x: get_padded_embeddings(x, sentence_max_aoai))
#alldocs_df.to_pickle(AOAI_PATH + '/alldocs_df_aoai.pkl') 

#The embedding call is commented out because there is a charge - the saved pkl file is loaded instead to do the analysis.
#NOTE(review): unpickling can execute arbitrary code - only load a pkl file you produced yourself.
alldocs_df = pd.read_pickle(AOAI_PATH + '/alldocs_df_aoai.pkl') 


In [None]:
#There were a couple retries.  Count the documents whose sentence-embedding array is not (381, 1536); 0 means all shapes are correct
sum(emb.shape != (381, 1536) for emb in alldocs_df['ada_v2_sent'])

