In [7]:
import autorootcwd
import pandas as pd
import openai
import json
from tqdm import tqdm
import umap
import matplotlib.pyplot as plt
import plotly.express as px

from src.embeddings import AngleEmbedding

# Category description expansion

We use a language model to generate keywords for each category with `hierarchy == 1`, and combine them with the original name and description into an expanded description. We then embed these expanded descriptions and save the result to `data/taxonomy/taxonomy_expanded.csv` and `data/taxonomy/taxonomy_expanded.json`.

In [3]:
# OpenAI API client; used by get_embedding() and generate() in this notebook.
client = openai.Client()
# Embedding wrapper imported from src.embeddings; used to compute a second
# embedding for each final description.
angle = AngleEmbedding()

In [4]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The string to embed; it is sent as a single-element batch.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    batch = [text]
    result = client.embeddings.create(model=model, input=batch)
    first_item = result.data[0]
    return first_item.embedding


def generate(system, prompt, model="gpt4", temperature=0.0, return_json=False):
    """Run one chat completion and return the text of the first choice.

    Args:
        system: Content of the system message.
        prompt: Content of the user message.
        model: Short alias of the chat model, either ``"gpt35"`` or ``"gpt4"``.
        temperature: Sampling temperature forwarded to the API.
        return_json: When truthy, request the ``json_object`` response format.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        ValueError: If ``model`` is not one of the supported aliases.
    """
    # Alias -> concrete model identifier sent to the API.
    known_models = (
        ("gpt35", "gpt-3.5-turbo-1106"),
        ("gpt4", "gpt-4-1106-preview"),
    )
    resolved = next((full for alias, full in known_models if model == alias), None)
    if resolved is None:
        raise ValueError("model must be 'gpt35' or 'gpt4'")

    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ]
    request = dict(
        model=resolved,
        messages=conversation,
        temperature=temperature,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        n=1,
    )
    if return_json:
        request["response_format"] = {"type": "json_object"}

    first_choice = client.chat.completions.create(**request).choices[0]
    return first_choice.message.content


def generate_keywords(category, description, prompt_path="prompts/keywords.txt"):
    """Ask the chat model for keywords describing a category.

    The prompt template is read from ``prompt_path``, stripped of surrounding
    whitespace, and filled in with ``category`` and ``description`` via
    ``str.format``. The model is asked for a JSON object and the value under
    its ``"keywords"`` key is returned as one comma-separated string.

    Args:
        category: Category name substituted for ``{category}`` in the template.
        description: Category description substituted for ``{description}``.
        prompt_path: Path of the prompt template file. Defaults to the
            location the notebook has always used.

    Returns:
        The keywords joined with ``", "``.

    Raises:
        json.JSONDecodeError: If the model's answer is not valid JSON.
        KeyError: If the JSON object has no ``"keywords"`` key.
    """
    system = "You are a cautious assistant. You carefully follow instructions. You are designed to output JSON."

    with open(prompt_path, "r", encoding="utf-8") as file:
        # str.strip() returns a new string; the result must be kept, otherwise
        # the template's leading/trailing blank lines end up in the prompt.
        template = file.read().strip()
    prompt = template.format(category=category, description=description)

    answer = generate(system=system, prompt=prompt, model="gpt4", temperature=1.0, return_json=True)
    keywords = json.loads(answer)["keywords"]

    # A bare string would otherwise be joined character by character.
    if isinstance(keywords, str):
        return keywords
    # Coerce items so a stray non-string entry does not break the join.
    return ", ".join(str(keyword) for keyword in keywords)


def generate_example_articles(category, description):
    """Placeholder for generating example articles for a category.

    Not implemented yet: both arguments are ignored and ``None`` is returned.
    """
    # TODO: implement.
    return None


def generate_descriptions(category, description):
    """Placeholder for generating extended descriptions for a category.

    Not implemented yet: both arguments are ignored and ``None`` is returned.
    """
    # TODO: implement.
    return None

In [21]:
# Load the full taxonomy table.
taxonomy = pd.read_csv('data/taxonomy/taxonomy.csv')
print(taxonomy.shape)

# Keep only the rows with hierarchy == 1 (presumably the top-level categories).
# `.copy()` makes this an independent frame: new columns are added to it later,
# and assigning into a boolean-filtered slice raises SettingWithCopyWarning.
taxonomy_high_label = taxonomy[taxonomy['hierarchy'] == 1].copy()
print(taxonomy_high_label.shape)

# Parallel lists (same row order) used to drive the generation loops.
category_name_list = taxonomy_high_label['name'].tolist()
category_description_list = taxonomy_high_label['description'].tolist()
print(len(category_name_list), len(category_description_list))

(937, 8)
(17, 8)
17 17


In [7]:
# One generate_keywords() call per (name, description) pair, in list order.
# Results are appended as they arrive, so a failure part-way through still
# leaves the keywords collected so far in `keywords_list`.
category_pairs = list(zip(category_name_list, category_description_list))
keywords_list = []
progress = tqdm(category_pairs, desc="Generating Keywords")
for name, text in progress:
    keywords_list.append(generate_keywords(category=name, description=text))

Generating Keywords: 100%|██████████| 17/17 [07:52<00:00, 27.78s/it]


In [8]:
# Display the generated keyword strings (one comma-separated string per category).
keywords_list

['performing arts, visual arts, literary arts, music, theater, dance, painting, sculpture, photography, film, fashion, culinary arts, architecture, museums, galleries, concerts, exhibitions, festivals, crafts, design, poetry, literature, novels, opera, ballet, jazz, rock music, pop culture, street art, graffiti, modern art, classical music, creative writing, stand-up comedy, art history, artificial intelligence in art, cultural heritage, art conservation, art education, film industry, cinematography, animation, digital art, game design, arts criticism, arts funding, arts policy, arts therapy, cultural festivals, folk arts, indie music, art auctions, art fairs, art movements, fine arts, artistic expression, creative industries, entertainment law, cultural events, celebrity culture, art awards, art grants, art competitions, avant-garde, choreography, artistic collaboration, cultural identity, world music, collectible arts, artistic innovation, street performances, arts advocacy, art coll

In [9]:
# Template for the text that gets embedded; surrounding newlines are stripped
# after formatting.
final_description_form = """
Name: {category}
Description: {description}
Keywords: {keywords}
"""

# Render the expanded description for every category up front.
final_category_descriptions = [
    final_description_form.format(category=name, description=text, keywords=kw).strip()
    for name, text, kw in zip(category_name_list, category_description_list, keywords_list)
]

# Embed each description with both models; the two result lists stay aligned
# with `final_category_descriptions`.
embeddings = []
embeddings_angle = []
for rendered in tqdm(final_category_descriptions, desc="Generating Embeddings"):
    ada_vector = get_embedding(text=rendered)
    angle_vector = angle.get_embedding(rendered)
    embeddings.append(ada_vector)
    embeddings_angle.append(angle_vector)

Generating Embeddings: 100%|██████████| 17/17 [01:38<00:00,  5.77s/it]


In [10]:
# Display the rendered "Name / Description / Keywords" texts that were embedded.
final_category_descriptions

['Name: arts, culture and entertainment\nDescription: Matters pertaining to the advancement and refinement of the human mind, of interests, skills, tastes and emotions \nKeywords: performing arts, visual arts, literary arts, music, theater, dance, painting, sculpture, photography, film, fashion, culinary arts, architecture, museums, galleries, concerts, exhibitions, festivals, crafts, design, poetry, literature, novels, opera, ballet, jazz, rock music, pop culture, street art, graffiti, modern art, classical music, creative writing, stand-up comedy, art history, artificial intelligence in art, cultural heritage, art conservation, art education, film industry, cinematography, animation, digital art, game design, arts criticism, arts funding, arts policy, arts therapy, cultural festivals, folk arts, indie music, art auctions, art fairs, art movements, fine arts, artistic expression, creative industries, entertainment law, cultural events, celebrity culture, art awards, art grants, art co

In [24]:
# Serialise each embedding to a string so it can be stored in one CSV/JSON field.
embeddings_str = [str(emb) for emb in embeddings]
# NOTE(review): assumes each Angle embedding is a flat 1-D sequence of numbers;
# confirm against AngleEmbedding.get_embedding.
embeddings_angle_str = [str(list(emb)) for emb in embeddings_angle]

# `assign` returns a new frame instead of writing into the boolean-filtered
# slice of `taxonomy`, which avoids SettingWithCopyWarning. The name is rebound
# so later cells keep seeing the added columns; re-running the cell simply
# overwrites the same columns.
taxonomy_high_label = taxonomy_high_label.assign(
    keywords=keywords_list,
    final_description=final_category_descriptions,
    final_description_ada_embedding=embeddings_str,
    final_description_angle_embedding=embeddings_angle_str,
    # Series are aligned on the row index, so only the filtered rows are kept.
    angle_embedding_name=taxonomy['angle_embedding_name'],
    angle_embedding_description=taxonomy['angle_embedding_description'],
)

# Persist the expanded taxonomy in both formats.
taxonomy_high_label.to_csv('data/taxonomy/taxonomy_expanded.csv', index=False)
taxonomy_high_label.to_json('data/taxonomy/taxonomy_expanded.json', orient="records", indent=4)

In [16]:
# Reduce the dimensionality of the embeddings using UMAP.
# UMAP is stochastic; fixing the seed makes the 2-D layout reproducible across
# kernel restarts.
UMAP_SEED = 42
reducer = umap.UMAP(random_state=UMAP_SEED)
embedding_umap = reducer.fit_transform(embeddings)

# Create an interactive scatter plot using Plotly, labelling each point with
# its category name.
fig = px.scatter(x=embedding_umap[:, 0], y=embedding_umap[:, 1], title='UMAP Embedding Space of Final Category Descriptions', text=category_name_list)
fig.update_layout(xaxis_title="UMAP 1", yaxis_title="UMAP 2")
fig.show()

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
