In [33]:
# import dependencies
import sqlite3
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

In [2]:
# connect to database
# Load the whole hd2021_summary table into a DataFrame, then release the
# SQLite handle. No visible later cell queries the database again, and an
# unclosed connection would otherwise stay open until the kernel shuts down.
connection = sqlite3.connect("../../data/colleges.sqlite")
try:
    colleges_df = pd.read_sql_query("SELECT * FROM hd2021_summary", connection)
finally:
    # runs even when the query raises, so the handle is never leaked
    connection.close()

In [3]:
# separate strings and numbers
# Build one boolean per column of colleges_df (in column order) from its dtype.
# is_num is True only for columns whose dtype is exactly int64 or float64.
is_num = [dtype == np.int64 or dtype == np.float64 for dtype in colleges_df.dtypes]
# NOTE(review): this mask is True for every column -- a dtype cannot equal both
# int64 and float64, so `!= ... or != ...` always holds (`and` was probably
# intended). string_colleges_df therefore contains every column of colleges_df.
# A later cell selects "UNITID" from string_colleges_df; presumably that column
# is numeric, so tightening this mask needs that lookup changed at the same
# time -- TODO confirm.
is_string = [dtype != np.int64 or dtype != np.float64 for dtype in colleges_df.dtypes]

# select columns with the boolean masks built above
number_colleges_df = colleges_df.loc[:, is_num]
string_colleges_df = colleges_df.loc[:, is_string]

In [192]:
# load the Universal Sentence Encoder (v4) from its TF Hub URL;
# later cells call `use` directly on a list of strings
USE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
use = hub.load(USE_URL)
loaded_message = f"use {USE_URL} is loaded"
print(loaded_message)

use https://tfhub.dev/google/universal-sentence-encoder/4 is loaded


In [178]:
# get specific column(s)
# Read the two columns straight from colleges_df. string_colleges_df currently
# holds every column of colleges_df (its dtype mask is all True), so this
# selects exactly the same data without depending on UNITID being classified
# as a "string" column.
largest_programs_with_unitid = colleges_df[["UNITID", "Largest_Program"]].dropna()

# get the programs
largest_programs = list(largest_programs_with_unitid["Largest_Program"])
# get the corresponding unitids (same row order as largest_programs)
corresponding_unitids = list(largest_programs_with_unitid["UNITID"])

# remove repeats, keeping first-seen order: list(set(...)) would give a
# hash-dependent order that can change between kernel runs, which makes the
# "first title" inspected further down non-reproducible
unique_programs = list(dict.fromkeys(largest_programs))
print(f"{len(unique_programs)} programs")

439 programs


In [179]:
# clean program titles
# The cleaning step is currently the identity: every title is passed through
# unchanged (replacing "/" with " or " was tried and is disabled).
cleaned_titles = list(unique_programs)

# cleaned title -> original program name, so results can be mapped back
reverse_lookup = dict(zip(cleaned_titles, unique_programs))

print(f"{len(cleaned_titles)} cleaned titles")

439 cleaned titles


In [180]:
# create embeddings
# `use` is called once on the full list of titles; the result is converted to
# nested Python lists so that each title maps to its own list of numbers.
embedded_largest_program_names = use(cleaned_titles)
list_embeddings = np.array(embedded_largest_program_names).tolist()
title_embeddings = {
    title: vector for title, vector in zip(cleaned_titles, list_embeddings)
}

# show the first (title, embedding) pair as a sanity check
first_pair = list(title_embeddings.items())[0]
print(first_pair)

('Chemical Technology/Technician', [0.053433630615472794, 0.012734547257423401, 0.07266387343406677, 0.07103799283504486, 0.05384865030646324, 0.04526801407337189, -0.03725350275635719, -0.008688163943588734, -0.025469521060585976, -0.005902777425944805, 0.06701875478029251, 0.0018895810935646296, 0.02368696965277195, -0.04996219277381897, 0.06732191890478134, -0.07621824741363525, -0.05051932856440544, 0.023175032809376717, 0.03569447994232178, -0.04907894507050514, 0.015320664271712303, -0.04967855289578438, 0.06433694064617157, -0.03585246577858925, 0.024994678795337677, -0.035441651940345764, -0.02435644157230854, 0.05605313181877136, -0.0170806385576725, -0.04664447158575058, -0.05016355216503143, 0.06571034342050552, -0.007335019297897816, 0.014117647893726826, -0.05200963467359543, 0.020886318758130074, -0.051836367696523666, -0.09217800945043564, 0.05636177957057953, 0.0752955824136734, -0.06813608855009079, 0.015087056905031204, -0.036215584725141525, -0.06731798499822617, 0.0

In [181]:
# Pairwise similarity between program titles.
# For every title, build a dict mapping  inner-product score -> other title.
# The title itself is part of the inner loop, so its own score is included.
# (A single pair can be checked by hand with
#  round(np.inner(first_embedding, second_embedding) * 100).)
#
# NOTE(review): the inner dict is keyed by the score, so two titles with
# exactly the same score against `title` collapse into one entry (the later
# one wins). The layout is kept because the following cell looks titles up by
# score.

# Convert each embedding list to an array once. np.inner would otherwise
# re-convert both Python lists on every one of the n*n calls; the computed
# scores are unchanged.
embedding_vectors = {
    title: np.asarray(embedding) for title, embedding in title_embeddings.items()
}

closenessの全体 = {}
for title, embedding in embedding_vectors.items():
    closeness = {}
    for other_title, other_embedding in embedding_vectors.items():
        closeness[np.inner(embedding, other_embedding)] = other_title
    closenessの全体[title] = closeness

In [191]:
# inspect one title: take the first entry of closenessの全体, rank its scores
# from highest to lowest, and show the ten best scores followed by the titles
# stored under those scores
values_for_specific = list(closenessの全体.values())[0]
top_values = sorted(values_for_specific.keys(), reverse=True)
ten_best = top_values[:10]
print(ten_best)
print([values_for_specific[score] for score in ten_best])

[0.9999999773144765, 0.43703822086467964, 0.4283497553888696, 0.4022658914297902, 0.39451638857308646, 0.3935044294425424, 0.38350726423826587, 0.3831380374732279, 0.3780588889569155, 0.3775495890015259]
['Chemical Technology/Technician', 'Zoology/Animal Biology', 'Molecular Medicine', 'Horticultural Science', 'Biochemistry', 'Physical Therapy/Therapist', 'Medical Reception/Receptionist', 'Metal Fabricator', 'Holistic/Integrative Health', 'Diesel Mechanics Technology/Technician']
