In [None]:
#|default_exp pickle

# Exporting to pickle

> This module contains the functions to convert all the embeddings to a time series format, group them together and export them as a pickle file

In [None]:
#|export
import os
import numpy as np
import pickle
import string
from numpy import dot
from numpy.linalg import norm

**NOTE** The module has 2 main functions:
* `create_dic` which creates individual `.pkl` files (one for each chapter of the book) based on chapter breakpoints given by the user. Use it when the dataset is very large, since visualizing the entire heatmap at once does not give a lot of information.
* `create_dict_whole_book` creates a single `.pkl` file for the entire book. To be used when the dataset is relatively small in size, i.e. 2000 - 2500 sentences.



In [None]:
#|export
def label(arg):
    """
    Maps a model abbreviation to its human-readable display name.

    Returns `None` when `arg` is not one of the known abbreviations.
    """
    full_names = {
        # sentence-embedding models
        'dcltr_base': "DeCLUTR Base",
        'dcltr_sm': "DeCLUTR Small",
        'distil': "DistilBERT",
        'if_FT': "InferSent FastText",
        'if_glove': "InferSent GloVe",
        'roberta': "RoBERTa",
        'use': "USE",
        'mpnet': 'MPNet',
        'minilm': 'MiniLM',
        'xlm': 'XLM',
        # lexical variants (several abbreviations share one display name)
        'new_lex': 'Lexical Vectors',
        'lex_vect': 'Lexical Vectors',
        'lex_vect_corr_ts': 'Lexical Vectors (Corr)',
        'old_lex': 'Lexical Weights',
        'lexical_wt': 'Lexical Weights',
        'lexical_wt_ssm': 'Lexical Weights',
    }
    return full_names.get(arg)

In [None]:
#|export
def cos_sim(a, b):
    """
    Cosine similarity of two vectors: their dot product divided by
    the product of their norms.
    """
    inner_product = dot(a, b)
    magnitude_product = norm(a) * norm(b)
    return inner_product / magnitude_product

In [None]:
#|export 
def create_dict_whole_book(embedding_path, k):
    """
    Collects every `.npy` file in `embedding_path` into one dictionary and
    pickles it as `<parent_dir>_whole.pkl` in the current working directory.

    Files whose names end in `_vect.npy`, `_wt.npy` or `_corr_ts.npy` are
    stored exactly as loaded. Every other `.npy` file is stored as the result
    of `successive_similarities(embed, k)`.

    The pickled object has the form `{0: {label: array, ...}}`, where each
    label comes from `create_label_whole_book`. Files that map to the same
    label overwrite one another in `os.listdir` order.

    embedding_path: directory holding the `.npy` embedding files
    k: forwarded unchanged to `successive_similarities`
    """
    # Appending an empty component guarantees a trailing separator, so the
    # directory's own name is picked up whether or not the caller supplied one.
    parent_dir = os.path.basename(os.path.dirname(os.path.join(embedding_path, '')))

    # These suffixes mark arrays that are stored as-is (no similarity pass).
    stored_as_is = ('_vect.npy', '_wt.npy', '_corr_ts.npy')

    sub_dict = {}
    for fx in os.listdir(embedding_path):
        if not fx.endswith('.npy'):
            continue

        # Load each file exactly once; os.path.join does not depend on the
        # caller having added a trailing slash.
        embed = np.load(os.path.join(embedding_path, fx))
        _, method = get_embed_method_and_name(fx[:-4])
        name = create_label_whole_book(method, parent_dir)

        if fx.endswith(stored_as_is):
            if fx.endswith('_corr_ts.npy'):
                print('Found Lex Corr', name)
            sub_dict[name] = embed
        else:
            sub_dict[name] = successive_similarities(embed, k)

    mdict = {0: sub_dict}
    # Context manager ensures the pickle file is flushed and closed.
    with open(parent_dir + '_whole.pkl', 'wb') as pkl_file:
        pickle.dump(mdict, pkl_file)

In [None]:
#|export
def create_label_whole_book(method, parent_dir):
    """
    Returns the label for a whole-book entry: the full model name that
    `label` gives for `method`.

    `parent_dir` is accepted but currently unused.
    """
    full_name = label(method)
    # Alternative format of Book name + Method (not in use):
    # parent_dir.title() + ' ' + label(method)
    return full_name


In [None]:
#|export
def create_label(index, method, parent_dir):
    """
    Builds a label of the form 'Book <index + 1> <Parent Dir> <model name>',
    where the directory name is title-cased and the model name comes from
    `label(method)`.
    """
    model_name = label(method)
    pieces = ['Book ' + str(index + 1), parent_dir.title(), model_name]
    return " ".join(pieces)


In [None]:
#|export
def get_embed_method_and_name(
    fname, # name of the file
    )->(str, str): # name of file, embedding method
    """
    Splits `fname` on the marker '_cleaned_' and returns a pair:
    the last whitespace-separated word of the text before the first
    marker, and the text after the last marker.
    """
    pieces = fname.split('_cleaned_')
    before_marker, after_marker = pieces[0], pieces[-1]
    last_word = before_marker.split()[-1]
    return last_word, after_marker