This experiment is to take each Mindtools skill and match all of our best content from the library to it, and store the scores in a spreadsheet.

In [1]:
import header

import random
import json
import numpy as np
import pandas as pd
from django.db import transaction
from tqdm import tqdm
from v0.index import VectorIndex
from v0.models import Content, MindtoolsSkillGroup

DJANGO SETTINGS IN DEBUG
[v0.index] 2022-08-29 15:12:38,069 INFO MainThread [index:init_indexes:174] | Initializing indexes..
[v0.index] 2022-08-29 15:12:38,070 INFO MainThread [index:init_indexes:204] | 0 indexes specified to be built at startup.


In [2]:
# generate index
skill_group_index = VectorIndex(MindtoolsSkillGroup.objects.all())

[v0.VectorIndex_MindtoolsSkillGroup] 2022-08-29 15:12:38,771 INFO MainThread [index:_generate_index:45] | Generated index for MindtoolsSkillGroup with a total of 12 vectors in 0.3882s


In [3]:
# Load content from our database dump
df = pd.read_csv('data/content.csv')

# df = df[df.provider.eq('hbr')]
df = df[df.provider == 'hbr'].reset_index(drop=True).copy()
print(df.shape)

skill_group_pks = list(MindtoolsSkillGroup.objects.all().values_list('pk', flat=True))

# Add the skill groups as columns to store the similarity scores
df = df.reindex(columns=list(df.columns) + skill_group_pks)
print(list(df.columns))

(2932, 6)
['title', 'uuid', 'mindtools_skill_group', 'embedding_all_mpnet_base_v2', 'provider', 'url', 'Time Management', 'Stress Management', 'Creativity Tools', 'Project Management', 'Strategy Tools', 'Team Management', 'Career Skills', 'Communication Skills', 'Decision Making', 'Problem Solving', 'Learning Skills', 'Leadership Skills']


In [6]:
# Do the actual matching batched, much faster
query_vectors = np.array([np.array(json.loads(x)).astype(np.float32) for x in df['embedding_all_mpnet_base_v2']]).astype(np.float32)
similarity_values, indices = skill_group_index.index.search(query_vectors, k=len(skill_group_pks))

for i, indice in tqdm(enumerate(indices), total=len(indices)):
    for j, skill_group_indice in enumerate(indice):
        df.at[i, skill_group_pks[skill_group_indice]] = similarity_values[i, j]

100%|██████████| 2932/2932 [00:00<00:00, 22830.25it/s]


In [7]:
# Store and plot
df_pruned = df.filter(['title', 'uuid', 'provider', 'url']+skill_group_pks, axis=1)
df_pruned.to_excel('data/mindtools_experiments/mindtools_content_similarity_hbr.xlsx', index=False)