In [5]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import uuid

# Data

In [6]:
with open("../data/ismir2021.json", "r") as f:
    ismir_df = pd.read_json(f)

ismir_df['combined'] = ismir_df.apply(lambda x: f'title: {x["title"]}, abstract: {x["abstract"]}', axis=1)
ismir_df['uuid'] = ismir_df.apply(lambda x: uuid.uuid5(uuid.NAMESPACE_DNS, x['combined']), axis=1)

ismir_df

Unnamed: 0,title,author,year,pages,abstract,ee,extra,combined,uuid
0,Four-way Classification of Tabla Strokes with ...,"[Rohit M A, Amitrajit Bhattacharjee, Preeti Rao]",2021,19-26,Motivated by musicological applications of the...,https://archives.ismir.net/ismir2021/paper/000...,{'takeaway': 'Automatic transcription for data...,title: Four-way Classification of Tabla Stroke...,1bea36cc-16a4-592e-880d-0b5569291dae
1,A Contextual Latent Space Model: Subsequence M...,[Taketo Akama],2021,27-34,Some generative models for sequences such as m...,https://archives.ismir.net/ismir2021/paper/000...,{'takeaway': 'Latent space models and context ...,title: A Contextual Latent Space Model: Subseq...,68669618-ff82-5ffd-9793-afda49d690a6
2,OMR-assisted transcription: a case study with ...,"[María Alfaro-Contreras, David Rizo, Jose M. I...",2021,35-41,Most of the musical heritage is only available...,https://archives.ismir.net/ismir2021/paper/000...,{'takeaway': 'This paper contributes to a bett...,title: OMR-assisted transcription: a case stud...,2a5f889a-ce76-5629-9a74-8ce9d63c0e09
3,Deeper Convolutional Neural Networks and Broad...,[Stefan A Baumann],2021,42-49,"In recent years, complex convolutional neural ...",https://archives.ismir.net/ismir2021/paper/000...,"{'takeaway': 'Deeper, more complex Convolution...",title: Deeper Convolutional Neural Networks an...,7a44de35-e9d7-5699-9ae2-5bba5d78a199
4,The Music Performance Markup Format and Ecosystem,[Axel Berndt],2021,50-57,Music Performance Markup (MPM) is a new XML fo...,https://archives.ismir.net/ismir2021/paper/000...,{'takeaway': 'The paper introduces the Music P...,title: The Music Performance Markup Format and...,9c6d8787-a89b-5b14-bd9d-5751b1a4627c
...,...,...,...,...,...,...,...,...,...
99,Composer Classification With Cross-Modal Trans...,"[Daniel Yang, Timothy Tsai]",2021,802-809,This paper studies composer style classificati...,https://archives.ismir.net/ismir2021/paper/000...,{'takeaway': 'We improve composer classificati...,title: Composer Classification With Cross-Moda...,52dff9f6-8f92-5e1c-87e4-7776c492f701
100,Aligning Unsynchronized Part Recordings to a F...,"[Daniel Yang, Kevin Ji, Timothy Tsai]",2021,810-817,This paper explores an application that would ...,https://archives.ismir.net/ismir2021/paper/000...,{'takeaway': 'We propose a way to align a set ...,title: Aligning Unsynchronized Part Recordings...,ce65817b-bfb7-5e79-818f-1509bf42d5fe
101,ADTOF: A large dataset of non-synthetic music ...,"[Mickael Zehren, Marco Alunno, Paolo Bientinesi]",2021,818-824,The state-of-the-art methods for drum transcri...,https://archives.ismir.net/ismir2021/paper/000...,{'takeaway': 'To fight data paucity in the fie...,title: ADTOF: A large dataset of non-synthetic...,ae41534d-b5c6-5859-9a8d-f7e9e23817eb
102,Learn by Referencing: Towards Deep Metric Lear...,"[Huan Zhang, Yiliang Jiang, Tao Jiang, Hu Peng]",2021,825-832,The excellence of human singing is an importan...,https://archives.ismir.net/ismir2021/paper/000...,{'takeaway': 'We proposed a metric-learning ba...,title: Learn by Referencing: Towards Deep Metr...,296c9dea-6809-5571-b26c-80b80d66b8e2


# Embedding Model

https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

In [7]:
# all-MiniLM-L6-v2

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

model_id = 'sentence-transformers/all-MiniLM-L6-v2'
def get_embeddings(input: list[str]):

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModel.from_pretrained(model_id)

    encoded_input = tokenizer(input, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        model_output = model(**encoded_input)

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings

embeddings = get_embeddings(list(ismir_df['combined']))

# Pinecone

In [8]:
import os
import pinecone
import time
from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv()


True

### Index Init

In [9]:
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENVIRONMENT")
)

In [10]:
pinecone.whoami()

WhoAmIResponse(username=None, user_label=None, projectname='d66c2c2')

In [11]:
x = dir(pinecone)
for a in x:
    print(x)

['ApiAttributeError', 'ApiException', 'ApiKeyError', 'ApiTypeError', 'ApiValueError', 'CollectionDescription', 'Config', 'DeleteRequest', 'DeleteResult', 'DescribeIndexStatsRequest', 'DescribeIndexStatsResponse', 'FetchResponse', 'FetchResult', 'ForbiddenException', 'Index', 'IndexDescription', 'InfoResult', 'NotFoundException', 'OpenApiException', 'PineconeException', 'PineconeProtocolError', 'ProtobufAny', 'QueryRequest', 'QueryResponse', 'QueryResult', 'QueryVector', 'RpcStatus', 'ScoredVector', 'ServiceException', 'SingleQueryResults', 'SparseValues', 'UnauthorizedException', 'UpdateRequest', 'UpsertRequest', 'UpsertResponse', 'UpsertResult', 'Vector', 'VersionResponse', 'WhoAmIResponse', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', 'config', 'configure_index', 'core', 'create_collection', 'create_index', 'delete_collection', 'delete_index', 'describe_collection', 'describe_index', 'exceptions',

In [18]:
index_name = 'ismir2021'

if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=len(embeddings[0]),
        metric='cosine' # dependent on model
    )
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

In [19]:
index = pinecone.Index(index_name)
index.describe_index_stats()
index

<pinecone.index.Index at 0x16911d510>

### Upsert

In [20]:
batch_size = 32

for i in tqdm(range(0, len(ismir_df), batch_size)):
    i_end = min(len(ismir_df), i+batch_size)
    batch = ismir_df.iloc[i:i_end]
    
    ids = [f"{x['uuid']}" for _, x in batch.iterrows()]
    values = [x for x in embeddings[i:i_end]]
    metadata = [
        {
            'combined': x['combined'],
            'title': x['title'],
            'authors': x['author'],
            'year': x['year'],
            'abstract': x['abstract'],
            'ee': x['ee'],
            'takeaway': x['extra']['takeaway'],
            'best_paper_candidate_bool': x['extra']['best_paper_candidate'],
            'subject_area_primary': x['extra']['subject_area_primary']
        } for _, x in batch.iterrows()]
    
    index.upsert(
        vectors=[
            {
                'id': ids[i],
                'values': values[i].numpy().tolist(), # embeddings
                'metadata': metadata[i]
            } for i in range(len(ids))]
        )
    

  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:04<00:00,  1.00s/it]


In [21]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.00104,
 'namespaces': {'': {'vector_count': 104}},
 'total_vector_count': 104}

# Query Test

In [23]:
queries = ["This study investigates automatic classification methods for tabla strokes, motivated by musicological applications and the acoustic correspondence with Western drum types. The study compares transfer learning on a pre-trained multiclass CNN drums model with 1-way models trained separately for each tabla stroke class. The 1-way models perform better overall, while adapted 3-way models show improved performance for the scarcest target class, and data augmentation strategies are explored for enhanced model robustness."]
q_embeddings = get_embeddings(queries)

index.query(
    vector=q_embeddings[0].numpy().tolist(),
    top_k=3,
    filter={}
)

{'matches': [{'id': '1bea36cc-16a4-592e-880d-0b5569291dae',
              'score': 0.934879959,
              'values': []},
             {'id': 'fb484b14-3a12-5ec0-b691-303a56a8de17',
              'score': 0.62498337,
              'values': []},
             {'id': '52dff9f6-8f92-5e1c-87e4-7776c492f701',
              'score': 0.614866853,
              'values': []}],
 'namespace': ''}