In [1]:
import os
import dotenv

# Load environment variables from a .env file so that the API keys read via
# os.environ in later cells are available. Left as the cell's last expression,
# so its return value is echoed as the cell output.
dotenv.load_dotenv()

True

In [2]:
from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

# Embedding backend shared by the similarity cells below. The API key is read
# from the environment, so no credential is stored in the notebook itself.
embed_model = CohereEmbedding(
    cohere_api_key=os.environ["COHERE_API_KEY"],
    model_name="embed-english-v3.0",
)
# Alternative backend (OpenAI), kept for quick comparison; uncomment to switch:
# embed_model = OpenAIEmbedding(api_key=os.environ["OPENAI_API_KEY"], model="text-embedding-3-small")

[nltk_data] Downloading package punkt_tab to /Users/jeremy.herzog/.pye
[nltk_data]     nv/versions/3.11.4/lib/python3.11/site-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

import nest_asyncio; nest_asyncio.apply();

text_a = """
schools located in the south
"""

text_b = """
RUTGERS UNIVERSITY
"""

text_a_embedding = await embed_model.aget_text_embedding(text_a)
text_b_embedding = await embed_model.aget_text_embedding(text_b)

cosine_similarity([text_a_embedding], [text_b_embedding])

array([[0.38467444]])

In [17]:
# Candidate labels that get embedded and ranked against the query text:
# school names, most of them followed by a mailing address.
# NOTE(review): the variable name, together with the `percent_who_had_gpa_*`
# labels in the saved outputs of other cells, suggests this list held table
# column names in an earlier run -- confirm which run the saved outputs
# belong to before relying on them.
column_names = [
    "University of Rochester, Rochester, NY 14627",
    "University of Florida - Main Campus, Gainesville, FL 32611",
    "University of Dayton, 300 College Park, Dayton, OH 45469",
    "California Polytechnic State University, San Luis Obispo, CA 93407",
    "Baylor University, 1311 S 5th St, Waco, TX 76706",
    "Pomona College, 333 N College Way, Claremont, CA 91711",
    "Iowa State University, Ames, IA 50011",
    "University of California, Santa Barbara, Santa Barbara, CA 93106",
    "Boston College, 140 Commonwealth Avenue, Chestnut Hill, MA 02467",
    "College of the Holy Cross, 1 College St, Worcester, MA 01610",
    "Babson College, 231 Forest St, Wellesley, MA 02457",
    "University of California, San Diego, 9500 Gilman Dr, La Jolla, CA 92093",
    "The University of North Carolina at Charlotte, 9201 University City Blvd, Charlotte, NC 28223",
    "University of Michigan, Ann Arbor, MI 48109",
    "Duke University, Durham, NC 27708",
    "Brown University, Providence, RI 02912",
    "Williams College, 880 Main St, Williamstown, MA 01267",
    "University of Richmond, 28 Westhampton Way, Richmond, VA 23173",
    "Lafayette College, 730 High St, Easton, PA 18042",
    "University of Nevada, Las Vegas, 4505 S Maryland Pkwy, Las Vegas, NV 89154",
    "Harvard University, Cambridge, MA 02138",
    "Marquette University, 1250 W Wisconsin Ave, Milwaukee, WI 53233",
    "Clemson University, Clemson, SC 29634"
]


node_text = """
Which schools are meet the following geographic preference criteria?

* The South
"""

column_embeddings = await embed_model.aget_text_embedding_batch(texts=column_names)
text_embedding = await embed_model.aget_text_embedding(node_text)

cosine_sims = cosine_similarity([text_embedding], column_embeddings)
column_sims = []

for text_sims in cosine_sims:
    for j, text_sim in enumerate(text_sims):
        column_sims.append((column_names[j], text_sim))

column_sims

[('University of Rochester, Rochester, NY 14627', 0.3718114642452711),
 ('University of Florida - Main Campus, Gainesville, FL 32611',
  0.41855892149962964),
 ('University of Dayton, 300 College Park, Dayton, OH 45469',
  0.33614491603543634),
 ('California Polytechnic State University, San Luis Obispo, CA 93407',
  0.34621457590205384),
 ('Baylor University, 1311 S 5th St, Waco, TX 76706', 0.3975875087893648),
 ('Pomona College, 333 N College Way, Claremont, CA 91711',
  0.3594790523269663),
 ('Iowa State University, Ames, IA 50011', 0.4000651362003228),
 ('University of California, Santa Barbara, Santa Barbara, CA 93106',
  0.3868263156527376),
 ('Boston College, 140 Commonwealth Avenue, Chestnut Hill, MA 02467',
  0.29119175035175304),
 ('College of the Holy Cross, 1 College St, Worcester, MA 01610',
  0.3309509261492706),
 ('Babson College, 231 Forest St, Wellesley, MA 02457', 0.3330177106772869),
 ('University of California, San Diego, 9500 Gilman Dr, La Jolla, CA 92093',
  0.329

In [15]:
import numpy as np

# Position of the highest similarity within the first (query) row of the
# similarity matrix; used to look up the matching (label, score) pair.
max_idx = np.argmax(cosine_sims[0])
print(column_sims[max_idx])

('percent_who_had_gpa_between_1_0_and_1_99', 0.601285067489364)


In [19]:
from sklearn.cluster import affinity_propagation
from sklearn.metrics.pairwise import euclidean_distances

# Treat each similarity score as a 1-D point (one row per candidate).
score_column = cosine_sims[0].reshape(-1, 1)

# Affinity matrix: negated squared Euclidean distance between scores, so the
# closer two scores are, the larger (less negative) their affinity.
S = -euclidean_distances(score_column, squared=True)

cluster_centers_indices, labels = affinity_propagation(S, random_state=0)
print(cluster_centers_indices)

# Bucket the (label, score) pairs by the cluster id assigned to each position.
groups = {}
for i, label in enumerate(labels):
    groups.setdefault(label, []).append(column_sims[i])

groups
# max cosine similarity is in what group? -> try to fill those columns

[ 4  5  8 11 22]


{1: [('University of Rochester, Rochester, NY 14627', 0.3718114642452711),
  ('California Polytechnic State University, San Luis Obispo, CA 93407',
   0.34621457590205384),
  ('Pomona College, 333 N College Way, Claremont, CA 91711',
   0.3594790523269663),
  ('The University of North Carolina at Charlotte, 9201 University City Blvd, Charlotte, NC 28223',
   0.3534074668737264),
  ('University of Richmond, 28 Westhampton Way, Richmond, VA 23173',
   0.3634712470895129),
  ('Lafayette College, 730 High St, Easton, PA 18042', 0.3659924841456075)],
 0: [('University of Florida - Main Campus, Gainesville, FL 32611',
   0.41855892149962964),
  ('Baylor University, 1311 S 5th St, Waco, TX 76706', 0.3975875087893648),
  ('Iowa State University, Ames, IA 50011', 0.4000651362003228),
  ('University of California, Santa Barbara, Santa Barbara, CA 93106',
   0.3868263156527376),
  ('University of Michigan, Ann Arbor, MI 48109', 0.41401448870413415),
  ('Duke University, Durham, NC 27708', 0.39548

In [16]:
from sklearn.cluster import MeanShift

# Cluster the similarity scores (as a single-feature column) with MeanShift
# using its default constructor arguments.
score_column = cosine_sims[0].reshape(-1, 1)
clustering = MeanShift().fit(score_column)

# Bucket the (label, score) pairs by the cluster id assigned to each position.
groups = {}
for i, label in enumerate(clustering.labels_):
    groups.setdefault(label, []).append(column_sims[i])

# Display only the cluster containing the top-scoring entry; `max_idx` must
# already be defined by the argmax cell.
groups[clustering.labels_[max_idx]]

[('percent_who_had_gpa_of_4', 0.5632066704592831),
 ('percent_who_had_gpa_between_3_75_and_3_99', 0.5828313567541883),
 ('percent_who_had_gpa_between_3_50_and_3_74', 0.5917935769697924),
 ('percent_who_had_gpa_between_3_25_and_3_49', 0.5940490744743664),
 ('percent_who_had_gpa_between_3_00_and_3_24', 0.5854920860592666),
 ('percent_who_had_gpa_between_2_50_and_2_99', 0.5834674695539106),
 ('percent_who_had_gpa_between_2_0_and_2_49', 0.5881357542287242),
 ('percent_who_had_gpa_between_1_0_and_1_99', 0.601285067489364)]

In [22]:
from pydantic import BaseModel, Field
from typing import Literal
import json

# Scratch model used to inspect the JSON schema pydantic generates for
# optional scalar fields (str / int / float / bool, each defaulting to None).
# Documented with comments rather than a class docstring so that the printed
# schema stays exactly as recorded in the cell output.
class Column(BaseModel):
    name: str | None = Field(default=None)
    age: int | None = Field(default=None)
    salary: float | None = Field(default=None)
    has_children: bool | None = Field(default=None)
    # Earlier draft of the model, kept for reference:
    # name: str = Field(description="Column name")
    # datatype: Literal["str", "int", "float", "bool"] = Field(
    #     description="Datatype of values for column", default="str"
    # )

# Pretty-print the generated schema with a two-space indent.
column_schema = Column.model_json_schema()
print(json.dumps(column_schema, indent=2))

{
  "properties": {
    "name": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "title": "Name"
    },
    "age": {
      "anyOf": [
        {
          "type": "integer"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "title": "Age"
    },
    "salary": {
      "anyOf": [
        {
          "type": "number"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "title": "Salary"
    },
    "has_children": {
      "anyOf": [
        {
          "type": "boolean"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "title": "Has Children"
    }
  },
  "title": "Column",
  "type": "object"
}
