# Initialization

In [1]:
# Prints a fixed greeting (presumably a quick check that the kernel is alive).
print("Hello there.")

Hello there.


In [2]:
# Colab specific data upload
!unzip cran.zip

Archive:  cran.zip
replace cran/cran.all.1400? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [3]:
# Auto re-import .py files: with autoreload mode 2, imported modules are
# reloaded before each cell executes, so edits to .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Plotting with plt: render matplotlib figures inline in the cell output.
%matplotlib inline

In [4]:
import numpy as np
import pandas as pd
import pprint

from preprocessing import *
from synonym_enrich import *
from cosine_sim import *
from baseline import *
from validate import *

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
!pip install --upgrade gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

Requirement already up-to-date: gensim in /usr/local/lib/python3.7/dist-packages (4.0.1)




# Validation Core

In [7]:
# Keyword arguments for one validate() run; every entry is forwarded as-is.
params = dict(
    # Callables used to parse the collection and to read relevance judgements.
    data_parse_fnt=parseDocs,
    retrieval_fnt=read_cran_relevancy,
    # Argument lists / keyword arguments handed over together with the callables.
    doc_args=["cran/cran.all.1400"],
    query_args=["cran/cran.qry"],
    retrieval_args=dict(path="cran/cranqrel", relevancy_threshold=4),
    # Preprocessing steps, listed in order.
    preprocessings=[tokenize_and_clean, lemmatize],
    use_tfidf=True,
    use_lsi=False,
    d=5,
    embedding=None,
    # Alternative embeddings (maybe load the model inside validate() instead?):
    #   embedding=Word2Vec.load("w2v.model"),
    #   embedding=Word2Vec.load("d2v.model"),
    k=10,
    test_size=0.4,
)

# NOTE(review): the recorded output of this cell shows mAP ~= 4.23; a mean
# average precision is normally within [0, 1] -- confirm how validate()
# normalises the score.
pprint.pprint(validate(**params))

{'mAP': 4.233686067019401}


# Other code

Kept just for inspiration; the cell below has not been executed in this notebook.

In [None]:
def compute_average_results(closest_songs, train_index, test_index):
    """Computes average metrics for a model based on n closest songs for a test set

    Each test song whose index is within ``closest_songs`` is scored against
    the training songs of its own artist; the per-song scores are then averaged.

    Parameters
    ----------
    closest_songs : a list of lists
        a list of length equal to the number of songs in a query matrix
        each element contains the n closest songs from a training collection to that song
    train_index : dict {artist: list of song indices}
        indices of songs in training collection assigned to artists
    test_index : dict {artist: list of song indices}
        indices of songs in test collection assigned to artists

    Returns
    -------
    average_precision: float
        average precision based on all test songs
    average_recall: float
        average recall based on all test songs
    average_accuracy: float
        average accuracy based on all test songs
    average_error: float
        average error based on all test songs
    average_f_measure: float
        average f_measure based on all test songs

    Notes
    -----
    Undefined ratios are counted as 0 instead of raising ``ZeroDivisionError``:
    precision when nothing was retrieved, recall when the artist has no
    training songs, accuracy/error when the training collection is empty.
    If no test song could be evaluated, all five averages are 0.0.
    A test artist that is absent from ``train_index`` is treated as having no
    training songs.
    """
    # Total number of training songs; denominator of accuracy and error.
    n_train = sum(len(songs) for songs in train_index.values())

    precision = recall = accuracy = error = f_measure = 0.0
    n_tests = 0

    for artist, song_indices in test_index.items():
        # Per-artist data is loop-invariant across that artist's test songs.
        own_songs = train_index.get(artist, [])
        own_lookup = set(own_songs)
        other_songs = [
            x for a, songs in train_index.items() if a != artist for x in songs
        ]

        for song_idx in song_indices:
            # Test songs without a retrieval result are skipped entirely.
            if song_idx >= len(closest_songs):
                continue

            n_tests += 1
            q_result = closest_songs[song_idx]
            retrieved = set(q_result)  # O(1) membership instead of list scans

            tp = sum(x in own_lookup for x in q_result)
            fp = sum(x not in own_lookup for x in q_result)
            fn = sum(x not in retrieved for x in own_songs)
            tn = sum(x not in retrieved for x in other_songs)

            p = tp / (tp + fp) if tp + fp else 0.0
            r = tp / (tp + fn) if tp + fn else 0.0
            precision += p
            recall += r
            if n_train:
                accuracy += (tp + tn) / n_train
                error += (fp + fn) / n_train
            if p + r > 0:
                f_measure += (2 * p * r) / (p + r)

    if n_tests == 0:
        # Nothing was evaluated; avoid dividing by zero below.
        return 0.0, 0.0, 0.0, 0.0, 0.0

    return (
        precision / n_tests,
        recall / n_tests,
        accuracy / n_tests,
        error / n_tests,
        f_measure / n_tests,
    )