In [1]:
import header

import random
import numpy as np
import pandas as pd
from django.db import transaction
from tqdm import tqdm
from v0.index import VectorIndex
from v0.models import Content, MindtoolsSkillGroup, MindtoolsSkillSubgroup

DJANGO SETTINGS IN DEBUG
[v0.index] 2022-08-18 14:29:07,190 INFO MainThread [index:init_indexes:167] | Initializing indexes..
[v0.index] 2022-08-18 14:29:07,761 INFO MainThread [index:init_indexes:195] | No indexes to build


In [2]:
# Build one VectorIndex per Mindtools taxonomy model, subgroups first and then
# groups. Each index is constructed from the full, unfiltered queryset.
subgroup_queryset = MindtoolsSkillSubgroup.objects.all()
skill_subgroup_index = VectorIndex(subgroup_queryset)

group_queryset = MindtoolsSkillGroup.objects.all()
skill_group_index = VectorIndex(group_queryset)

[v0.VectorIndex_MindtoolsSkillSubgroup] 2022-08-18 14:29:08,207 INFO MainThread [index:_generate_index:46] | Generated index for MindtoolsSkillSubgroup with a total of 87 vectors in 0.1793s
[v0.VectorIndex_MindtoolsSkillGroup] 2022-08-18 14:29:08,259 INFO MainThread [index:_generate_index:46] | Generated index for MindtoolsSkillGroup with a total of 11 vectors in 0.0307s


In [31]:
# Load (pk, embedding) pairs for every Content row whose
# `embedding_all_mpnet_base_v2` field is not NULL.
contents = list(
    Content.objects
    .exclude(embedding_all_mpnet_base_v2__isnull=True)
    .values_list('pk', 'embedding_all_mpnet_base_v2')
)

# Split the pairs into two parallel tuples. Explicit comprehensions are used
# instead of `zip(*contents)` so that an empty result yields two empty tuples
# rather than raising ValueError when unpacking.
content_pks = tuple(pk for pk, _embedding in contents)
content_embeddings = tuple(embedding for _pk, embedding in contents)
print(len(contents))

59717


In [32]:
# Do the actual matching batched, much faster
# Stack every content embedding into one float32 matrix with a single
# conversion, instead of casting each row to float32 and then casting the
# stacked array a second time.
query_vectors = np.asarray([embedding for _pk, embedding in contents], dtype=np.float32)

# Query the skill-group index once for the whole batch, asking for the single
# best match (k=1) per content row.
values, indices = skill_group_index.index.search(query_vectors, k=1)

# Each row of `indices` holds the positions of that query's matches; with k=1
# only the first entry exists. Translate that position back to a skill group pk.
skill_group_results_pk = [skill_group_index.pks[match_row[0]] for match_row in indices]

In [38]:
# group for faster updates
skill_group_pks = list(MindtoolsSkillGroup.objects.all().values_list('pk', flat=True))

# Bucket content pks by their matched skill group in a single pass over the
# match results, instead of re-scanning every result once per skill group.
# Every known skill group gets a (possibly empty) list, in content order.
results_grouped = {skill_group_pk: [] for skill_group_pk in skill_group_pks}
for content_pk, matched_group_pk in zip(content_pks, skill_group_results_pk):
    bucket = results_grouped.get(matched_group_pk)
    # A match whose pk is not among the current skill groups is left out of
    # the grouping, exactly as the per-group equality scan did.
    if bucket is not None:
        bucket.append(content_pk)

for skill_group_pk in skill_group_pks:
    print(f'{skill_group_pk}: {len(results_grouped[skill_group_pk])} articles')

# Build the summary frame once from complete columns rather than growing it
# row by row. The index stays keyed by skill group pk; it is not written to the CSV.
df = pd.DataFrame(
    {
        'Mindtools Skill Group': skill_group_pks,
        'Article Count': [len(results_grouped[pk]) for pk in skill_group_pks],
    },
    index=skill_group_pks,
)

df.to_csv('data/mindtools_experiments/skill_group_article_count.csv', index=False)

print('Total articles:', len(contents))

Time Management: 5402 articles
Stress Management: 2120 articles
Creativity Tools: 6482 articles
Project Management: 4681 articles
Strategy Tools: 7144 articles
Team Management: 12013 articles
Career Skills: 8151 articles
Communication Skills: 5777 articles
Decision Making: 4257 articles
Problem Solving: 2803 articles
Learning Skills: 887 articles
Total articles: 59717


In [36]:
# Persist the assignments with one bulk UPDATE per skill group rather than one
# query per article. All groups are written inside a single transaction so a
# failure part-way through cannot leave a mix of old and new assignments.
total_updated = 0
with transaction.atomic():
    for skill_group_pk in tqdm(skill_group_pks, total=len(skill_group_pks)):
        # QuerySet.update() returns the number of rows matched by the filter.
        updated_count = Content.objects.filter(
            pk__in=results_grouped[skill_group_pk]
        ).update(mindtools_skill_group=skill_group_pk)
        total_updated += updated_count

# Report the overall row count so it can be compared with the article total.
print('Total articles updated:', total_updated)

100%|██████████| 11/11 [00:12<00:00,  1.09s/it]
