In [1]:
import header

import random
import numpy as np
import pandas as pd
from django.db import transaction
from tqdm import tqdm
from v0.index import VectorIndex
from v0.models import Content, MindtoolsSkillGroup, MindtoolsSkillSubgroup

DJANGO SETTINGS IN DEBUG
[v0.index] 2022-08-19 11:48:26,160 INFO MainThread [index:init_indexes:167] | Initializing indexes..
[v0.index] 2022-08-19 11:48:27,110 INFO MainThread [index:init_indexes:195] | No indexes to build


In [2]:
# generate index
skill_subgroup_index = VectorIndex(MindtoolsSkillSubgroup.objects.all())
skill_group_index = VectorIndex(MindtoolsSkillGroup.objects.all())

[v0.VectorIndex_MindtoolsSkillSubgroup] 2022-08-19 11:48:27,728 INFO MainThread [index:_generate_index:46] | Generated index for MindtoolsSkillSubgroup with a total of 87 vectors in 0.1984s
[v0.VectorIndex_MindtoolsSkillGroup] 2022-08-19 11:48:27,786 INFO MainThread [index:_generate_index:46] | Generated index for MindtoolsSkillGroup with a total of 11 vectors in 0.033s


In [3]:
contents = list(Content.objects.exclude(embedding_all_mpnet_base_v2__isnull=True).values_list('pk', 'embedding_all_mpnet_base_v2'))
content_pks, content_embeddings = zip(*contents)
print(len(contents))

59717


In [7]:
# Do the actual matching batched, much faster
query_vectors = np.array([np.array(x[1]).astype(np.float32) for x in contents]).astype(np.float32)
similarity_values, indices = skill_group_index.index.search(query_vectors, k=1)
skill_group_results_pk = [skill_group_index.pks[indice[0]] for indice in indices]
skill_group_results_value = [value[0] for value in similarity_values]

In [8]:
# we want to do the analysis by skill group, so we need to group by skill group
skill_group_pks = list(MindtoolsSkillGroup.objects.all().values_list('pk', flat=True))
results_grouped = dict.fromkeys(skill_group_pks)

df = pd.DataFrame(columns=['mindtools_skill_group', 'similarities', 'content_pks', 'count'])

for skill_group_pk in skill_group_pks:
    # Do the grouping
    # essentially we are iterating over the results, which are pairs of indices and the pk of the group it got matched to,
    # then seeing if the pk of the group is the same as the current group we are looking at, if it is, we add the pk of the article to the pks_in_group list
    pks_in_group, similarity_values_in_group = zip(*[(content_pks[i], similarity_values[i][0])  for i, skill_group in enumerate(skill_group_results_pk) if skill_group == skill_group_pk])
    results_grouped[skill_group_pk] = pks_in_group
    df.loc[skill_group_pk] = [skill_group_pk, pd.Series(similarity_values_in_group), pks_in_group, len(pks_in_group)]

In [9]:
 # clear memory
del query_vectors
del skill_group_results_pk
del skill_group_results_value
del contents
del content_pks
del content_embeddings

In [11]:
df.to_excel('skill_group_similarity.xlsx')

df_flipped = df.filter(['mindtools_skill_group', 'similarities']).T

df_flipped.to_excel('skill_group_similarity_flipped.xlsx')

ax = df_flipped.plot.hist(column=['similarities'], by='mindtools_skill_group', bins=20)

# Now that we have the grouped results, we can do the analysis
# df.to_excel('skill_group_article_count.csv', index=False)

print('Total articles:', len(contents))

                   mindtools_skill_group  \
Time Management          Time Management   
Stress Management      Stress Management   
Creativity Tools        Creativity Tools   
Project Management    Project Management   
Strategy Tools            Strategy Tools   

                                                         similarities  \
Time Management     0       0.313193
1       0.198143
2       0.05...   
Stress Management   0       0.153420
1       0.211083
2       0.23...   
Creativity Tools    0       0.426946
1       0.229233
2       0.28...   
Project Management  0       0.502471
1       0.329220
2       0.06...   
Strategy Tools      0       0.116625
1       0.251954
2       0.12...   

                                                          content_pks  count  
Time Management     (67f171ad-c1ea-41c8-9ae7-34b8385f5b60, 2523a77...   5402  
Stress Management   (13713cfa-1ed4-4dcd-8ab1-0bbf2013d216, 08b24dd...   2120  
Creativity Tools    (0cf3d410-baf7-488f-9b99-6c31125a3b72, 

TypeError: no numeric data to plot