In [None]:
#|default_exp pickle

# Exporting to pickle

> This module contains functions that convert the embeddings to a time-series format, group them together, and export the result as a pickle file.

In [None]:
#|export
import os
import numpy as np
import pickle
import string
from numpy import dot
from numpy.linalg import norm
from fastcore.xtras import *
from fastcore.script import *

**NOTE** The module has 2 main functions:
* `create_dic`, which creates individual `.pkl` files (one for each chapter of the book) based on the chapter breakpoints given by the user. Use it when the dataset is very large, because visualizing the entire heatmap at once does not give a lot of information.
* `create_dict_whole_book`, which creates a single `.pkl` file for the entire book. Use it when the dataset is relatively small, i.e. 2000 - 2500 sentences.



In [None]:
# |export
def label(
    method:str # abbreviation of the embedding method
    ):
    """
    Look up the display name that belongs to a method abbreviation.
    Gives `None` when the abbreviation is not one of the known keys.
    """
    full_names = dict(
        dcltr_base="DeCLUTR Base",
        dcltr_sm="DeCLUTR Small",
        distil="DistilBERT",
        if_FT="InferSent FastText",
        if_glove="InferSent GloVe",
        roberta="RoBERTa",
        use="USE",
        new_lex='Lexical Vectors',
        old_lex='Lexical Weights',
        lexical_wt='Lexical Weights',
        lexical_wt_ssm='Lexical Weights',
        lex_vect='Lexical Vectors',
        lex_vect_corr_ts='Lexical Vectors (Corr)',
        mpnet='MPNet',
        minilm='MiniLM',
        xlm='XLM',
    )
    if method in full_names:
        return full_names[method]
    return None

In [None]:
#|export
def cos_sim(
    a: np.ndarray, # vector 1 
    b: np.ndarray, # vector 2
    ):
    """
    Cosine similarity of two vectors: their dot product divided by
    the product of their norms.
    """
    inner_product = dot(a, b)
    magnitude_product = norm(a) * norm(b)
    return inner_product / magnitude_product

In [None]:
#|export
from pathlib import Path

In [None]:
#|export 
def successive_similarities(embeddings, k):
    """
    Cosine similarity between each embedding and the one `k` positions
    after it, collected into a list ordered by the first index.
    """
    pair_count = len(embeddings) - k
    return [
        cos_sim(embeddings[start], embeddings[start + k])
        for start in range(pair_count)
    ]

In [None]:
#|export 
@call_parse
def create_dict_whole_book(
    embedding_path:str = '.', # path to the embeddings
    k:int=1, # consecutive index
    ):
    """
    Create pkl for time series from embeddings.

    Loads every `.npy` file found directly inside `embedding_path` and writes
    one pickle to `<embedding_path>/pkl/<book_name>_whole.pkl`. The pickle
    holds `{0: {key: value}}`. Files whose stem ends in `_vect`, `_wt` or
    `_corr_ts` are stored as loaded under their stem. Every other file is
    reduced to the cosine similarities between embeddings `k` positions apart
    and stored under its method label (or under the raw method abbreviation
    when no label is known). Returns `None` early, after printing a hint,
    when no `.npy` file is found.
    """
    p = Path(embedding_path).absolute()
    book_name = p.stem.replace('_', ' ').title()

    parent_dir = os.path.basename(os.path.dirname(embedding_path))
    sub_dict = {}

    files = globtastic(p, recursive=False, file_glob='*.npy').map(Path)
    flen = len(files)
    if flen < 1:
        print(f'Found {flen} embeddings')
        print('Check `embedding path` and try again')
        return

    print(f'Book Name: {book_name}')
    print(f'Found {flen} methods')
    print('-'*45)

    # Files with these stem suffixes are stored as loaded, without computing
    # a similarity series.
    raw_suffixes = ('_vect', '_wt', '_corr_ts')

    for f in files:
        name = f.stem
        embed = np.load(f)
        # NOTE: this rebinds `book_name`, so the output file is named after
        # the prefix of the last file processed, not the directory title
        # printed above. Kept as-is because the file name is observable.
        book_name, method = get_embed_method_and_name(name)

        if name.endswith(raw_suffixes):
            if name.endswith('_corr_ts'):
                print('Found Lex Corr', name)
            sub_dict[name] = embed
        else:
            ts = successive_similarities(embed, k)
            # `label` yields None for an unknown abbreviation; fall back to
            # the raw method string so unrecognised methods keep distinct
            # keys instead of overwriting each other under `None`.
            name = create_label_whole_book(method, parent_dir) or method
            print(f'Found {name}')
            sub_dict[name] = ts

    mdict = {0: sub_dict}

    new_path = p/'pkl'
    new_path.mkdir(exist_ok = True)
    # The context manager makes sure the handle is flushed and closed.
    with open(new_path/f'{book_name}_whole.pkl', 'wb') as pkl_file:
        pickle.dump(mdict, pkl_file)
    print('-'*45)
    print(f'Saved pkl at {new_path}')

In [None]:
#|export
def create_label_whole_book(method, parent_dir):
    """
    Label used for a method in the whole-book pickle.

    Only the method's full name is returned; `parent_dir` is accepted
    but not used in the current format.
    """
    method_name = label(method)
    return method_name

    # Alternative format (book name + method), currently disabled:
    # return parent_dir.title() + ' ' + label(method)


In [None]:
#|export
def create_label(index, method, parent_dir):
    """
    Build a label of the form 'Book <index + 1> <Parent Dir> <Method Name>'.

    The number shown is `index + 1` and `parent_dir` is title-cased. When
    `label` does not recognise `method`, the raw `method` string is used in
    place of the full name so the label can still be built.
    """
    # `label` returns None for unknown abbreviations, which previously made
    # the string concatenation raise TypeError.
    met = label(method) or method
    return " ".join(['Book', str(index + 1), parent_dir.title(), met])


In [None]:
#|export
def get_embed_method_and_name(
    fname, # name of the file
    )->(str, str): # name of file, embedding method
    """
    Split `fname` on '_cleaned_' and return two pieces: the last
    whitespace-separated token of the text before the first '_cleaned_',
    and the text after the last '_cleaned_'. Without any '_cleaned_' in
    `fname`, both pieces are derived from the whole name.
    """
    head, *rest = fname.split('_cleaned_')
    method = rest[-1] if rest else head
    return head.split()[-1], method

In [None]:
#| local
%cd ~

/home/deven


In [None]:
#| local
# Example run with k=1, so each embedding is compared with the next one; the
# relative path is resolved against the current working directory.
create_dict_whole_book('embeddings/A_Modest_Proposal', 1)

Book Name: A Modest Proposal
Found 10 methods
---------------------------------------------
Found DeCLUTR Small
Found RoBERTa
Found InferSent GloVe
Found InferSent FastText
Found DistilBERT
Found XLM
Found MPNet
Found USE
Found DeCLUTR Base
Found MiniLM
---------------------------------------------
Saved pkl at /home/deven/embeddings/A_Modest_Proposal/pkl
