This experiment is to take each Mindtools skill and match all of our best content from the library to it, and store the scores in a spreadsheet.

In [9]:
import header

import random
import json
import numpy as np
import pandas as pd
from django.db import transaction
from tqdm import tqdm
from v0.index import VectorIndex
from v0.models import Content, MindtoolsSkillGroup

In [2]:
# generate index
skill_group_index = VectorIndex(MindtoolsSkillGroup.objects.all())

[v0.VectorIndex_MindtoolsSkillGroup] 2022-08-23 14:18:25,705 INFO MainThread [index:_generate_index:46] | Generated index for MindtoolsSkillGroup with a total of 11 vectors in 0.1061s


In [7]:
# Load content from our database dump
df = pd.read_csv('data/content.csv')

skill_group_pks = list(MindtoolsSkillGroup.objects.all().values_list('pk', flat=True))

# Add the skill groups as columns to store the similarity scores
df = df.reindex(columns=list(df.columns) + skill_group_pks)
print(list(df.columns))

['title', 'uuid', 'mindtools_skill_group', 'embedding_all_mpnet_base_v2', 'provider', 'url', 'Time Management', 'Stress Management', 'Creativity Tools', 'Project Management', 'Strategy Tools', 'Team Management', 'Career Skills', 'Communication Skills', 'Decision Making', 'Problem Solving', 'Learning Skills']


In [14]:
# Do the actual matching batched, much faster
query_vectors = np.array([np.array(json.loads(x)).astype(np.float32) for x in df['embedding_all_mpnet_base_v2']]).astype(np.float32)
similarity_values, indices = skill_group_index.index.search(query_vectors, k=len(skill_group_pks))

for i, indice in tqdm(enumerate(indices), total=len(indices)):
    for j, pk in enumerate(skill_group_pks):
        df.at[i, pk] = similarity_values[i, j]

100%|██████████| 11074/11074 [00:01<00:00, 8953.32it/s]


In [18]:
# Store and plot
df_pruned = df.filter(['title', 'uuid', 'provider', 'url']+skill_group_pks, axis=1)
df.to_excel('data/mindtools_experiments/mindtools_content_similarity_full.xlsx', index=False)
df_pruned.to_excel('data/mindtools_experiments/mindtools_content_similarity_pruned.xlsx', index=False)