This notebook compiles a graphable object of similar artists.

**Note** The code has been moved to `../bin`

In [4]:
from gensim.models import Word2Vec
from gensim.parsing.preprocessing import preprocess_string
from pathlib import Path

In [5]:
def import_categories(paths):
    """
    Read artist-name files into a dictionary keyed by category.

    paths: iterable of file paths (str or Path). Each file holds one artist
           name per line and is read as UTF-8.

    Returns object

    categories {
        '<file name up to its first dot>': {
            'names': [ ...<artists>]
        }
    }

    Names are stripped of surrounding whitespace and blank lines are skipped.
    If two paths share a category name, the later file's names replace the
    earlier ones.
    """
    categories = {
        # <file name>: {'names': [ ...<artists>]}
    }

    for path in paths:
        # Let pathlib find the final path component so this works with any
        # path separator, then keep everything before the first dot.
        category_name = Path(path).name.split('.')[0]

        with open(path, mode='r', encoding='utf-8') as file:
            stripped_lines = [line.strip() for line in file]

        category = categories.setdefault(category_name, {})
        # A blank line is not an artist; dropping it keeps empty names out of
        # the later token counts.
        category['names'] = [name for name in stripped_lines if name]
    return categories

The above results in an object of categories, each one with a list of artists. 

Now, using `preprocess_string`, the names can be tokenized.

That list will then get run through the model to check which names exist in the model and which don't.

In [6]:
def format_name(names):
    """
    Turn each name into a single token.

    Every name is passed through `preprocess_string` and the pieces it
    returns are concatenated with no separator, e.g. the notebook's model
    holds tokens such as 'williambaziot'.

    names: iterable of artist names.

    Returns a list with one token per name, in the same order.
    """
    tokens = []
    for name in names:
        pieces = preprocess_string(name)
        tokens.append("".join(pieces))
    return tokens

def tokenize_names(categories):
    """
    Add a 'tokens' list to every category, built from its 'names' list.

    categories: dict whose values each hold a 'names' list.

    The dictionary is modified in place and also returned.
    """
    for category in categories.values():
        # Reset the list first, then fill it with the formatted names.
        tokens = category['tokens'] = []
        tokens.extend(format_name(category['names']))
    return categories


In [7]:
# Find every category file, read its artist names, then add the tokenized forms.
paths = Path('../data/historical-categories/').glob('*.txt')
categories = tokenize_names(import_categories(paths))

Now we have an object for categories with full names and their tokenized forms.

```python

categories = {
    'ab-ex-artists': {
        'names': [ ... ],
        'tokens': [ ... ]
    }, 
    ...
}

```

In [9]:
# Load the saved Word2Vec model; the cells below read its vocabulary and similarity lists.
model = Word2Vec.load('../data/models/moma-combos.model')

Now that I have the lists of names, it's time to compare them. What I want to know is, how is each category represented in the model?

In [19]:
def print_category_relevance(C, model_tokens):
    """
    Count how many of each category's tokens the model knows, then print a summary.

    C:            dict of categories; every value needs a 'tokens' list. Each
                  value gains 'tokens_in_model' and 'tokens_in_category'
                  (C is modified in place).
    model_tokens: iterable of the tokens in the model's vocabulary.

    Prints one line each for the 'ab-ex-artists', 'impressionist-artists' and
    'conceptual-artists' categories; raises KeyError if one is missing from C.
    Returns None.
    """
    # Build the lookup once: membership in a set is constant time, whereas
    # scanning the vocabulary list for every token is not.
    known_tokens = set(model_tokens)

    for category in C.values():
        tokens = category['tokens']
        category['tokens_in_model'] = sum(1 for t in tokens if t in known_tokens)
        category['tokens_in_category'] = len(tokens)

    # Resolve all reported categories before printing anything, so a missing
    # category fails without emitting a partial report.
    reported = [
        ('Ab Ex', C['ab-ex-artists']),
        ('Impressionist', C['impressionist-artists']),
        ('Conceptual', C['conceptual-artists']),
    ]

    print('<category name>: <model> / <category>)')
    for label, category in reported:
        print(f'{label}: {category["tokens_in_model"]} / {category["tokens_in_category"]}')

    
# Every token in the model's vocabulary, used to check which category tokens the model knows.
# NOTE(review): `wv.vocab` was removed in gensim 4.0 (replaced by `wv.key_to_index`) — confirm the installed gensim version.
model_tokens = list(model.wv.vocab)

print_category_relevance(categories, model_tokens)

<category name>: <model> / <category>)
Ab Ex: 76 / 122
Impressionist: 14 / 52
Conceptual: 42 / 137


\begin{equation} A_m \cap A_c = A_{cm} \end{equation}

**$A_m$**: artists in the model

**$A_c$**: artists in the category

**$A_{cm}$**: intersection of $A_m$ and $A_c$

For each artist that appears in both the category and the model, I want to:

1. get the N most similar artists (A_ms)
2. record the share of A_ms that also belongs to A_cm, i.e. |A_ms ∩ A_cm| / N
3. count any incoming references from other A_cm


In [22]:
def graph_artists(tokens, model, top_n_similar=10):
    """
    Link each artist in a category to its nearest neighbours from the same category.

    tokens: A list of tokenized artist names from a category.
            tokens in category must be the same as in the model.
    model:  The Word2Vec model
    top_n_similar: How many nearest neighbours to request per artist.

    Returns object

    graph {
        '<token>': {
            similar: set of neighbours that are also in `tokens`,
            in_category_mean: <float>, len(similar) / top_n_similar
        }
    }

    Tokens missing from the model, and tokens with no in-category neighbour,
    are left out of the graph.
    """
    graph = {}

    # Sets make the two membership tests below constant time instead of a
    # scan over the whole vocabulary / category for every lookup.
    # NOTE(review): `wv.vocab` was removed in gensim 4.0 (replaced by
    # `wv.key_to_index`) — confirm the installed gensim version.
    vocab = set(model.wv.vocab)
    category_tokens = set(tokens)

    for a in tokens:
        if a not in vocab:
            continue

        similar_a = model.wv.most_similar(a, topn=top_n_similar)
        in_category = {s_a for s_a, _ in similar_a if s_a in category_tokens}

        # Only add a node once it is known to have in-category neighbours.
        # This avoids inserting it and then relying on dict.popitem(), which
        # removes "the last inserted item" rather than this token's node.
        if not in_category:
            continue

        graph[a] = {
            'similar': in_category,
            'in_category_mean': len(in_category) / top_n_similar,
        }

    return graph

# Build the similarity graph for the 'ab-ex-artists' category using the default of 10 neighbours.
graph = graph_artists(categories['ab-ex-artists']['tokens'], model)
# Bare expression so the notebook displays the resulting dictionary.
graph

{'albertalcalai': {'similar': {'louischanker'}, 'in_category_mean': 0.1},
 'williambaziot': {'similar': {'jamebrook',
   'marktobei',
   'theodorostamo',
   'theodorroszak'},
  'in_category_mean': 0.4},
 'louisbourgeoi': {'similar': {'gracehartigan', 'herbertferber', 'jamerosati'},
  'in_category_mean': 0.3},
 'ernestbrigg': {'similar': {'josephgoto'}, 'in_category_mean': 0.1},
 'jamebrook': {'similar': {'charlselig',
   'hanhofmann',
   'raoulhagu',
   'williambaziot'},
  'in_category_mean': 0.4},
 'alexandcalder': {'similar': {'arshilgorki',
   'davidsmith',
   'herbertferber',
   'jacksonpollock',
   'theodorroszak',
   'willemkoon'},
  'in_category_mean': 0.6},
 'giorgiocavallon': {'similar': {'estebanvicent',
   'leekrasner',
   'michaelgoldberg',
   'morriloui',
   'philipguston'},
  'in_category_mean': 0.5},
 'johnchamberlain': {'similar': {'twombl'}, 'in_category_mean': 0.1},
 'josephcornel': {'similar': {'gracehartigan',
   'helenfrankenthal',
   'reinhardt',
   'robertmotherw