In [225]:
import numpy as np
import pandas as pd
import json
import os
import pinecone
import itertools
import redis
from tqdm import tqdm
from ast import literal_eval
import glob

In [252]:
def execute_for_all_files():
    """Upload embeddings and metadata for every data file in ``~/Desktop/ap-data``.

    Each regular file in that folder is read with ``pd.read_csv`` and handed
    first to ``schematize_and_upload_embeddings`` and then to
    ``schematize_and_upload_metadata``.

    Files are processed in sorted path order so that the "File N" numbering
    printed below is reproducible between runs. Sub-directories are skipped,
    because ``pd.read_csv`` cannot read them.
    """
    data_dir = os.path.join(os.path.expanduser('~'), 'Desktop', 'ap-data')
    # glob yields entries in arbitrary order and also matches directories,
    # so sort the matches and keep only regular files.
    file_paths = sorted(
        path for path in glob.glob(os.path.join(data_dir, '*'))
        if os.path.isfile(path)
    )

    for index, file in enumerate(file_paths):
        print(f"Processing File {index+1}")
        dataframe = pd.read_csv(file)
        print(f"Embedding Upload for File {index+1}:")
        schematize_and_upload_embeddings(dataframe)

        print(f"Metadata Upload for File {index+1}:")
        schematize_and_upload_metadata(dataframe)

# Run the whole pipeline. NOTE: the upload helpers this calls are defined in
# cells further down the notebook, so those cells must be executed first.
execute_for_all_files()

Processing File 0
Embedding Upload for File 0:


10it [00:00, 419.71it/s]


Metadata Upload for File 0:


4it [00:01,  2.71it/s]


Processing File 1
Embedding Upload for File 1:


10it [00:00, 339.30it/s]


Metadata Upload for File 1:


4it [00:01,  2.53it/s]


# Supporting Code

In [239]:
def preprocess_embeddings(dataframe):
    """Extract the id/vector pairs from a raw data frame.

    Keeps only the ``doi`` and ``embedding`` columns, renames them to ``id``
    and ``values``, and parses each ``values`` entry with ``literal_eval``.
    The input frame is left untouched; a new frame is returned.
    """
    id_and_vector = dataframe[['doi', 'embedding']].rename(
        columns={'doi': 'id', 'embedding': 'values'}
    )
    parsed_vectors = id_and_vector['values'].apply(literal_eval)
    id_and_vector['values'] = parsed_vectors
    return id_and_vector

In [240]:
def batches(iterable, batch_size=100):
  """Yield successive lists of at most ``batch_size`` items from ``iterable``.

  The final list may be shorter; nothing is yielded for an empty iterable.
  """
  source = iter(iterable)
  while True:
      piece = list(itertools.islice(source, batch_size))
      if not piece:
          # Source exhausted: stop the generator.
          return
      yield piece

def batch_upload(vector_list, pinecone_index, batch_size):
  """Upsert ``vector_list`` into ``pinecone_index`` one batch at a time.

  The vectors are split with ``batches`` into groups of at most
  ``batch_size`` and each group is sent with its own ``upsert`` call.
  """
  grouped_vectors = batches(vector_list, batch_size=batch_size)
  for group in grouped_vectors:
    pinecone_index.upsert(vectors=group)

In [241]:
def async_batch_upload(vector_list, batch_size, index_name="embedding-db", pool_threads=30):
    """Upsert vectors into a Pinecone index using asynchronous requests.

    Parameters
    ----------
    vector_list : iterable
        Items to upsert; split with ``batches`` into groups of ``batch_size``.
    batch_size : int
        Maximum number of vectors per upsert request.
    index_name : str, optional
        Name of the Pinecone index to write to. Defaults to ``"embedding-db"``,
        the value that used to be hard-coded here.
    pool_threads : int, optional
        Value forwarded as ``pool_threads`` to ``pinecone.Index``. Defaults to
        30, the value that used to be hard-coded here.

    Returns
    -------
    list
        The result of calling ``.get()`` on every asynchronous upsert, in the
        order the batches were submitted.
    """
    with pinecone.Index(index_name, pool_threads=pool_threads) as index:
        # Submit every batch first without waiting on any of them ...
        async_results = [
            index.upsert(vectors=id_vectors_chunk, async_req=True)
            for id_vectors_chunk in tqdm(batches(vector_list, batch_size=batch_size))
        ]

        # ... then collect the results while the index context is still open.
        return [async_result.get() for async_result in async_results]


# Executable

In [242]:
def schematize_and_upload_embeddings(unprocessed_data):
    """Convert a raw data frame to (id, values) tuples and upload them to Pinecone.

    The Pinecone API key is read from the ``PINECONE_API_KEY`` environment
    variable instead of being committed in the notebook. The environment name
    can be overridden with ``PINECONE_ENVIRONMENT`` and defaults to
    ``"us-west1-gcp"``.

    Parameters
    ----------
    unprocessed_data : pandas.DataFrame
        Frame passed to ``preprocess_embeddings``.

    Returns
    -------
    list
        Whatever ``async_batch_upload`` returns for the uploaded batches.

    Raises
    ------
    RuntimeError
        If ``PINECONE_API_KEY`` is not set.
    """
    # SECURITY: the key that was previously hard-coded here has been exposed
    # in source and should be rotated.
    api_key = os.environ.get("PINECONE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the PINECONE_API_KEY environment variable before uploading embeddings."
        )
    pinecone.init(
        api_key=api_key,
        environment=os.environ.get("PINECONE_ENVIRONMENT", "us-west1-gcp"),
    )

    data = preprocess_embeddings(unprocessed_data)
    # Plain tuples without the frame index: one (id, values) pair per row.
    data_itertuples = list(data.itertuples(index=False, name=None))
    return async_batch_upload(data_itertuples, 100)

# Other Embedding Code

In [243]:
def schematize_and_write_embeddings(unprocessed_data, vectors_per_file):
    """Write the preprocessed embeddings to disk in files of ``vectors_per_file`` rows.

    The frame returned by ``preprocess_embeddings`` is split into consecutive,
    non-overlapping slices. Each slice is written as a JSON array of records
    to ``~/Desktop/embeddings-folder/file-<offset>.csv``, where ``<offset>``
    is the position of the slice's first row.

    Parameters
    ----------
    unprocessed_data : pandas.DataFrame
        Frame passed to ``preprocess_embeddings``.
    vectors_per_file : int
        Maximum number of rows written to each file.
    """
    data = preprocess_embeddings(unprocessed_data)

    out_dir = os.path.join(os.path.expanduser('~'), 'Desktop', 'embeddings-folder')
    # Create the target folder on first use instead of failing in open().
    os.makedirs(out_dir, exist_ok=True)

    for i in range(0, len(data), vectors_per_file):
        # Positional, end-exclusive slice. The previous label-based
        # ``.loc[i:i+n]`` is end-inclusive, so it wrote n+1 rows per file and
        # duplicated each boundary row in the next file.
        df = data.iloc[i:i + vectors_per_file]
        # converts from pandas dataframe directly to json
        json_data = json.loads(df.to_json(orient='records'))
        # NOTE(review): the content is JSON although the file name ends in
        # ".csv"; the name is kept unchanged for compatibility with consumers.
        with open(os.path.join(out_dir, f"file-{i}.csv"), "w") as f:
            json.dump(json_data, f)

# Metadata Stuff

In [244]:
def dict_chunks(data, size):
    """Yield sub-dictionaries of ``data`` holding at most ``size`` entries each.

    Keys are consumed in the dictionary's iteration order; the last chunk may
    be smaller than ``size``.
    """
    remaining_keys = iter(data)
    for _offset in range(0, len(data), size):
        chunk_keys = itertools.islice(remaining_keys, size)
        yield dict((key, data[key]) for key in chunk_keys)

In [245]:
def process_metadata(dataframe):
    """Split a data frame into its DOIs and per-row JSON metadata strings.

    Returns a tuple ``(doi_list, json_list)``: ``doi_list`` holds the values
    of the ``doi`` column, and ``json_list`` holds one JSON string per row
    built from every column except ``embedding`` and ``doi``.
    """
    dois = dataframe['doi'].tolist()
    metadata_only = dataframe.drop(columns=['embedding', 'doi'])
    records = json.loads(metadata_only.to_json(orient='records'))
    serialized = []
    for record in records:
        serialized.append(json.dumps(record))
    return (dois, serialized)

def schematize_and_upload_metadata(dataframe, batch_size=300):
    """Upload per-DOI metadata to Redis in batches.

    The rows are turned into ``{doi: json_string}`` pairs via
    ``process_metadata`` and written with one ``MSET`` per batch of at most
    ``batch_size`` keys.

    Connection settings come from the environment so that no secret lives in
    the notebook: ``REDIS_PASSWORD`` (required), ``REDIS_HOST`` and
    ``REDIS_PORT`` (optional, defaulting to the previously hard-coded
    endpoint).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame passed to ``process_metadata``.
    batch_size : int, optional
        Maximum number of keys per ``MSET`` call.

    Raises
    ------
    RuntimeError
        If ``REDIS_PASSWORD`` is not set.
    """
    # SECURITY: the password that was previously hard-coded here has been
    # exposed in source and should be rotated.
    password = os.environ.get("REDIS_PASSWORD")
    if not password:
        raise RuntimeError(
            "Set the REDIS_PASSWORD environment variable before uploading metadata."
        )

    doi_list, json_list = process_metadata(dataframe)

    r = redis.Redis(
        host=os.environ.get("REDIS_HOST", 'global-sterling-marlin-30591.upstash.io'),
        # The port is passed as an integer rather than a string.
        port=int(os.environ.get("REDIS_PORT", "30591")),
        password=password,
        ssl=True,
    )

    for chunk in tqdm(dict_chunks(dict(zip(doi_list, json_list)), batch_size)):
        r.mset(chunk)

# Deleting all Data

In [246]:
# pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment="us-west1-gcp")
# index = pinecone.Index("embedding-db")
# index.delete(delete_all=True)

In [247]:
# r = redis.Redis(
#     host= 'global-sterling-marlin-30591.upstash.io',
#     port= '30591',
#     password= os.environ["REDIS_PASSWORD"], ssl=True)
# r.flushdb()