In [1]:
import os
import dotenv

# Load variables from a .env file into the process environment so that later
# cells can read API keys through os.environ instead of hardcoding them.
dotenv.load_dotenv()

True

In [24]:
from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

# Embedding backend shared by the similarity cells below; the API key is read
# from the environment rather than written into the notebook.
embed_model = CohereEmbedding(
    cohere_api_key=os.environ["COHERE_API_KEY"],
    model_name="embed-english-v3.0",
)
# Alternative backend, kept for side-by-side comparison:
# embed_model = OpenAIEmbedding(api_key=os.environ["OPENAI_API_KEY"], model="text-embedding-3-small")

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

import nest_asyncio; nest_asyncio.apply();

text_a = """
approximate_date_costs_available
"""

text_b = """
G0. Net Price Calculator URL\nPlease provide the URL of your institution's net price calculator:\n\n[_____________________________]\n\nFor the following sections, please provide 2023-2024 academic year costs of attendance for the following categories that are applicable to your institution.\n\nIf your institution's 2023-2024 academic year costs of attendance are not available at this time, please select the checkbox below and enter the approximate date (i.e. MM/DD) when your institution's final 2023-2024 academic year costs of attendance will be available.\n\nTuition and Fee Data Provided are: [Firm and Final]\n\n[ ] 2023-2024 academic costs not currently available\nApproximate date costs will be available: [________________]
"""

text_a_embedding = await embed_model.aget_text_embedding(text_a)
text_b_embedding = await embed_model.aget_text_embedding(text_b)

cosine_similarity([text_a_embedding], [text_b_embedding])

array([[0.51945307]])

In [4]:
# Candidate target column names to score against a document section.
# The list order matters: later cells index `column_sims` and the similarity
# row by position, so both stay aligned with this list.
column_names = [
    "degrees_offered_certificate",
    "degrees_offered_diploma",
    "degrees_offered_associate",
    "degrees_offered_bachelor",
    "degrees_offered_postbachelors_certificate",
    "degrees_offered_master",
    "degrees_offered_doctoral_degree_research_scholarship",
    "score_range_sat_math_200_299_percent",
    "score_range_act_composite_30_36_percent",
    "score_range_act_english_30_36_percent",
    "score_range_act_math_30_36_percent",
    "score_range_act_composite_24_29_percent",
    "score_range_act_english_24_29_percent",
    "full_time_total_degree_seeking_men",
    "full_time_total_degree_seeking_women",
    "part_time_total_degree_seeking_men",
    "part_time_total_degree_seeking_women",
    "percent_who_had_gpa_of_4",
    "percent_who_had_gpa_between_3_75_and_3_99",
    "percent_who_had_gpa_between_3_50_and_3_74",
    "percent_who_had_gpa_between_3_25_and_3_49",
    "percent_who_had_gpa_between_3_00_and_3_24",
    "percent_who_had_gpa_between_2_50_and_2_99",
    "percent_who_had_gpa_between_2_0_and_2_49",
    "percent_who_had_gpa_between_1_0_and_1_99"
]

# Markdown text of one document section (high school GPA ranges) that is
# embedded and compared against every entry in `column_names`.
# The table is split into two markdown tables in this sample; the text is
# embedded exactly as written here.
node_text = """
## C11. High School Grade Point Ranges

Percentage of all enrolled, degree-seeking, first-time, first-year students who had high school grade-point averages within each of the following ranges (using 4.0 scale).

1. Report information only for those students from whom you collected high school GPA.

2. If you are able to report GPA ranges separately for students that also submitted at least one test score versus those who did not submit a test score, please do so in the respective columns. If you are unable to report these data, please report the ranges for all students.

| Score Range | Percent of students who submitted scores | Percent of students who did not submit scores | Percent of all enrolled students |
|-------------|------------------------------------------|-----------------------------------------------|----------------------------------|
| Percent who had GPA of 4.0 | 33.8% | | |
| Percent who had GPA between 3.75 and 3.99 | 32.5% | | |


| Percent who had GPA between 3.50 and 3.74 | 18.9% |   |   |
|-------------------------------------------|-------|---|---|
| Percent who had GPA between 3.25 and 3.49 | 9.4%  |   |   |
| Percent who had GPA between 3.00 and 3.24 | 3.7%  |   |   |
| Percent who had GPA between 2.50 and 2.99 | 1.6%  |   |   |
| Percent who had GPA between 2.0 and 2.49  | 0.2%  |   |   |
| Percent who had GPA between 1.0 and 1.99  | 0.0%  |   |   |
| Percent who had GPA below 1.0             |       |   |   |
| Total                                     | 100.0%| 0.0% | 0.0% |

Percent of all enrolled students was previously collected. Reporting by submitted test score is new. If available, please report all three segments of students.
"""

column_embeddings = await embed_model.aget_text_embedding_batch(texts=column_names)
text_embedding = await embed_model.aget_text_embedding(node_text)

cosine_sims = cosine_similarity([text_embedding], column_embeddings)
column_sims = []

for text_sims in cosine_sims:
    for j, text_sim in enumerate(text_sims):
        column_sims.append((column_names[j], text_sim))

column_sims

[('degrees_offered_certificate', 0.2548387515735656),
 ('degrees_offered_diploma', 0.30530242081179576),
 ('degrees_offered_associate', 0.29446668639876394),
 ('degrees_offered_bachelor', 0.3227382133176331),
 ('degrees_offered_postbachelors_certificate', 0.284919736175609),
 ('degrees_offered_master', 0.3043138847067871),
 ('degrees_offered_doctoral_degree_research_scholarship', 0.2297179743241619),
 ('score_range_sat_math_200_299_percent', 0.4452242797935162),
 ('score_range_act_composite_30_36_percent', 0.37070506928716707),
 ('score_range_act_english_30_36_percent', 0.38795881369784346),
 ('score_range_act_math_30_36_percent', 0.4243276535445679),
 ('score_range_act_composite_24_29_percent', 0.37615780459367826),
 ('score_range_act_english_24_29_percent', 0.3961174608746474),
 ('full_time_total_degree_seeking_men', 0.22121428494004153),
 ('full_time_total_degree_seeking_women', 0.23410032738982012),
 ('part_time_total_degree_seeking_men', 0.21642238978583161),
 ('part_time_total_de

In [15]:
import numpy as np

# Index of the highest-scoring column for the single embedded text (row 0).
max_idx = cosine_sims[0].argmax()
print(column_sims[max_idx])

('percent_who_had_gpa_between_1_0_and_1_99', 0.601285067489364)


In [6]:
from sklearn.cluster import affinity_propagation
from sklearn.metrics.pairwise import euclidean_distances

# Treat each similarity score as a 1-D sample and build the similarity matrix
# passed to affinity propagation: the negative squared Euclidean distance
# between every pair of scores.
S = -euclidean_distances(cosine_sims[0].reshape(-1, 1), squared=True)
cluster_centers_indices, labels = affinity_propagation(S, random_state=0)
print(cluster_centers_indices)

# Bucket the (column name, score) pairs by the cluster label of each score.
groups = {}
for idx, cluster_label in enumerate(labels):
    groups.setdefault(cluster_label, []).append(column_sims[idx])

groups
# max cosine similarity is in what group? -> try to fill those columns

[ 0 12 21]


{0: [('degrees_offered_certificate', 0.2548387515735656),
  ('degrees_offered_diploma', 0.30530242081179576),
  ('degrees_offered_associate', 0.29446668639876394),
  ('degrees_offered_bachelor', 0.3227382133176331),
  ('degrees_offered_postbachelors_certificate', 0.284919736175609),
  ('degrees_offered_master', 0.3043138847067871),
  ('degrees_offered_doctoral_degree_research_scholarship', 0.2297179743241619),
  ('full_time_total_degree_seeking_men', 0.22121428494004153),
  ('full_time_total_degree_seeking_women', 0.23410032738982012),
  ('part_time_total_degree_seeking_men', 0.21642238978583161),
  ('part_time_total_degree_seeking_women', 0.23503888857584093)],
 1: [('score_range_sat_math_200_299_percent', 0.4452242797935162),
  ('score_range_act_composite_30_36_percent', 0.37070506928716707),
  ('score_range_act_english_30_36_percent', 0.38795881369784346),
  ('score_range_act_math_30_36_percent', 0.4243276535445679),
  ('score_range_act_composite_24_29_percent', 0.37615780459367826)

In [16]:
from sklearn.cluster import MeanShift

# Cluster the similarity scores (one 1-D sample per column) with MeanShift
# using its default parameters.
clustering = MeanShift().fit(cosine_sims[0].reshape(-1, 1))

# Bucket the (column name, score) pairs by their assigned cluster label.
groups = {}
for idx, cluster_label in enumerate(clustering.labels_):
    groups.setdefault(cluster_label, []).append(column_sims[idx])

# Show only the cluster that contains the top-scoring column (`max_idx`).
groups[clustering.labels_[max_idx]]

[('percent_who_had_gpa_of_4', 0.5632066704592831),
 ('percent_who_had_gpa_between_3_75_and_3_99', 0.5828313567541883),
 ('percent_who_had_gpa_between_3_50_and_3_74', 0.5917935769697924),
 ('percent_who_had_gpa_between_3_25_and_3_49', 0.5940490744743664),
 ('percent_who_had_gpa_between_3_00_and_3_24', 0.5854920860592666),
 ('percent_who_had_gpa_between_2_50_and_2_99', 0.5834674695539106),
 ('percent_who_had_gpa_between_2_0_and_2_49', 0.5881357542287242),
 ('percent_who_had_gpa_between_1_0_and_1_99', 0.601285067489364)]

In [22]:
from pydantic import BaseModel, Field
from typing import Literal
import json

# Sample pydantic model with one optional field per primitive type; the cell
# prints its JSON schema to show how `T | None = None` fields are rendered.
# NOTE: documented with `#` comments on purpose - pydantic copies a class
# docstring into the schema as "description", which would change the output.
class Column(BaseModel):
    # Each field accepts its type or None and defaults to None.
    name: str | None = None
    age: int | None = None
    salary: float | None = None
    has_children: bool | None = None
    # Alternative field layout (column name + datatype), currently disabled:
    # name: str = Field(description="Column name")
    # datatype: Literal["str", "int", "float", "bool"] = Field(
    #     description="Datatype of values for column", default="str"
    # )

# Pretty-print the generated JSON schema for inspection.
print(json.dumps(Column.model_json_schema(), indent=2))

{
  "properties": {
    "name": {
      "anyOf": [
        {
          "type": "string"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "title": "Name"
    },
    "age": {
      "anyOf": [
        {
          "type": "integer"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "title": "Age"
    },
    "salary": {
      "anyOf": [
        {
          "type": "number"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "title": "Salary"
    },
    "has_children": {
      "anyOf": [
        {
          "type": "boolean"
        },
        {
          "type": "null"
        }
      ],
      "default": null,
      "title": "Has Children"
    }
  },
  "title": "Column",
  "type": "object"
}
