In [1]:
import os
import dotenv

# Load environment variables via python-dotenv before any cell reads os.environ;
# the embedding cell below looks up its API key there.
dotenv.load_dotenv()

True

In [2]:
from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

# Embedding backend used by the cells below; the API key is read from the environment.
embed_model = CohereEmbedding(
    model_name="embed-english-v3.0",
    cohere_api_key=os.environ["COHERE_API_KEY"],
)
# Alternative backend (OpenAI), currently disabled:
# embed_model = OpenAIEmbedding(api_key=os.environ["OPENAI_API_KEY"], model="text-embedding-3-small")

[nltk_data] Downloading package punkt_tab to /Users/jeremy.herzog/.pye
[nltk_data]     nv/versions/3.11.4/lib/python3.11/site-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

import nest_asyncio; nest_asyncio.apply();

text_a = """
schools located in the south
"""

text_b = """
RUTGERS UNIVERSITY
"""

text_a_embedding = await embed_model.aget_text_embedding(text_a)
text_b_embedding = await embed_model.aget_text_embedding(text_b)

cosine_similarity([text_a_embedding], [text_b_embedding])

array([[0.38467444]])

In [14]:
column_names = [
    "University of Rochester",
    "University of Florida - Main Campus",
    "University of Dayton",
    "California Polytechnic State University",
    "Baylor University",
    "Pomona College",
    "Iowa State University",
    "University of California, Santa Barbara",
    "Boston College",
    "College of the Holy Cross",
    "Babson College",
    "University of California, San Diego",
    "The University of North Carolina at Charlotte",
    "University of Michigan",
    "Duke University",
    "Brown University",
    "Williams College",
    "University of Richmond",
    "Lafayette College",
    "University of Nevada, Las Vegas",
    "Harvard University",
    "Marquette University",
    "Clemson University"
]


node_text = """
Which schools are meet the following geographic preference criteria?

* The South
"""

column_embeddings = await embed_model.aget_text_embedding_batch(texts=column_names)
text_embedding = await embed_model.aget_text_embedding(node_text)

cosine_sims = cosine_similarity([text_embedding], column_embeddings)
column_sims = []

for text_sims in cosine_sims:
    for j, text_sim in enumerate(text_sims):
        column_sims.append((column_names[j], text_sim))

column_sims

[('University of Rochester', 0.351378258943712),
 ('University of Florida - Main Campus', 0.3736843434440817),
 ('University of Dayton', 0.35141424456430037),
 ('California Polytechnic State University', 0.34711328641674155),
 ('Baylor University', 0.3592961124255179),
 ('Pomona College', 0.36098847484518576),
 ('Iowa State University', 0.3462973014566052),
 ('University of California, Santa Barbara', 0.3524067163775816),
 ('Boston College', 0.3604330517649026),
 ('College of the Holy Cross', 0.3578833288028056),
 ('Babson College', 0.3709132226844557),
 ('University of California, San Diego', 0.36263622391756867),
 ('The University of North Carolina at Charlotte', 0.3171878842252406),
 ('University of Michigan', 0.36660349028630757),
 ('Duke University', 0.30815139321352125),
 ('Brown University', 0.36234695227315067),
 ('Williams College', 0.3659580377666173),
 ('University of Richmond', 0.35537008648104446),
 ('Lafayette College', 0.37132901025272097),
 ('University of Nevada, Las V

In [15]:
import numpy as np

# Index of the best-scoring entry in the first (only) row of similarities.
max_idx = cosine_sims.argmax(axis=1)[0]
print(column_sims[max_idx])

('percent_who_had_gpa_between_1_0_and_1_99', 0.601285067489364)


In [6]:
from sklearn.cluster import affinity_propagation
from sklearn.metrics.pairwise import euclidean_distances

# Similarity matrix for affinity propagation: the negative squared Euclidean
# distance between every pair of 1-D similarity scores.
S = -euclidean_distances(cosine_sims[0][:, None], squared=True)
cluster_centers_indices, labels = affinity_propagation(S, random_state=0)
print(cluster_centers_indices)

# Bucket the (name, score) pairs by the cluster label assigned to each position.
groups = {}
for position, cluster_label in enumerate(labels):
    groups.setdefault(cluster_label, []).append(column_sims[position])

groups
# Open question: which group holds the max cosine similarity? -> try to fill those columns

[ 0 12 21]


{0: [('degrees_offered_certificate', 0.2548387515735656),
  ('degrees_offered_diploma', 0.30530242081179576),
  ('degrees_offered_associate', 0.29446668639876394),
  ('degrees_offered_bachelor', 0.3227382133176331),
  ('degrees_offered_postbachelors_certificate', 0.284919736175609),
  ('degrees_offered_master', 0.3043138847067871),
  ('degrees_offered_doctoral_degree_research_scholarship', 0.2297179743241619),
  ('full_time_total_degree_seeking_men', 0.22121428494004153),
  ('full_time_total_degree_seeking_women', 0.23410032738982012),
  ('part_time_total_degree_seeking_men', 0.21642238978583161),
  ('part_time_total_degree_seeking_women', 0.23503888857584093)],
 1: [('score_range_sat_math_200_299_percent', 0.4452242797935162),
  ('score_range_act_composite_30_36_percent', 0.37070506928716707),
  ('score_range_act_english_30_36_percent', 0.38795881369784346),
  ('score_range_act_math_30_36_percent', 0.4243276535445679),
  ('score_range_act_composite_24_29_percent', 0.37615780459367826)

In [16]:
from sklearn.cluster import MeanShift

# Cluster the 1-D similarity scores with MeanShift (default parameters).
scores_column = cosine_sims[0].reshape(-1, 1)
clustering = MeanShift().fit(scores_column)

# Bucket the (name, score) pairs by the cluster label assigned to each position.
groups = {}
for position, cluster_label in enumerate(clustering.labels_):
    groups.setdefault(cluster_label, []).append(column_sims[position])

# Show only the cluster that contains the best-scoring entry.
groups[clustering.labels_[max_idx]]

[('percent_who_had_gpa_of_4', 0.5632066704592831),
 ('percent_who_had_gpa_between_3_75_and_3_99', 0.5828313567541883),
 ('percent_who_had_gpa_between_3_50_and_3_74', 0.5917935769697924),
 ('percent_who_had_gpa_between_3_25_and_3_49', 0.5940490744743664),
 ('percent_who_had_gpa_between_3_00_and_3_24', 0.5854920860592666),
 ('percent_who_had_gpa_between_2_50_and_2_99', 0.5834674695539106),
 ('percent_who_had_gpa_between_2_0_and_2_49', 0.5881357542287242),
 ('percent_who_had_gpa_between_1_0_and_1_99', 0.601285067489364)]

In [22]:
from pydantic import BaseModel, Field
from typing import Literal
import json

# Pydantic model whose four fields (str / int / float / bool) are all optional
# and default to None; it is used below to print the JSON schema pydantic
# generates for such fields.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose — a docstring would presumably be emitted as the schema "description"
# and change the printed output; confirm before adding one.
class Column(BaseModel):
    name: str | None = None
    age: int | None = None
    salary: float | None = None
    has_children: bool | None = None
    # Disabled alternative field layout (column name + datatype literal):
    # name: str = Field(description="Column name")
    # datatype: Literal["str", "int", "float", "bool"] = Field(
    #     description="Datatype of values for column", default="str"
    # )

# Pretty-print the generated JSON schema for inspection.
print(json.dumps(Column.model_json_schema(), indent=2))

{
  "properties": {
    "name": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "title": "Name"
    },
    "age": {
      "anyOf": [
        {
          "type": "integer"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "title": "Age"
    },
    "salary": {
      "anyOf": [
        {
          "type": "number"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "title": "Salary"
    },
    "has_children": {
      "anyOf": [
        {
          "type": "boolean"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "title": "Has Children"
    }
  },
  "title": "Column",
  "type": "object"
}
