# Embed the articles and upsert them to Pinecone

In [41]:
import sys
sys.path.append("../")

from importlib import reload
import wikipedia_embedder
reload(wikipedia_embedder)

from wikipedia_embedder import WikiEmbedder


# --- Files and index ---------------------------------------------------------
input_parquet_path = "df_english_filtered_articles.cache.parquet"
checkpoint_path = "df_checkpoint.parquet"
pinecone_index_name = "english-wiki"

# --- Embedding settings ------------------------------------------------------
embedding_model = "multilingual-e5-large"
# One knob, two effects: how many items are embedded at a time and how many
# are upserted to Pinecone at a time. A value around 50 is a good choice.
batch_size = 50

# Build the embedder from the settings above, then print its start-up summary.
wiki_embedder = WikiEmbedder(
    input_parquet_path,
    checkpoint_path,
    pinecone_index_name,
    embedding_model=embedding_model,
    batch_size=batch_size,
)
wiki_embedder.print_init_info()


2024-12-08 11:08:13,010 - INFO - Discovering subpackages in _NamespacePath(['/Users/einar/git/hafsteinn/together_rag/venv/lib/python3.10/site-packages/pinecone_plugins'])
2024-12-08 11:08:13,012 - INFO - Looking for plugins in pinecone_plugins.inference
2024-12-08 11:08:13,012 - INFO - Installing plugin inference into Pinecone
2024-12-08 11:08:15,504 - INFO - Found 49680 fully processed articles
2024-12-08 11:08:15,505 - INFO - Processing remaining 320 articles (including partially processed)



Original number of articles: 50000
Total number of chunks: 753048
Embedding model: multilingual-e5-large
Batch size: 50
Checkpoint path: df_checkpoint.parquet
N rows in checkpoint: 752728
N unique articles in checkpoint: 49680
N articles to process: 320
n checkpoint + n top process (sanity check): 50000


In [42]:
"""
NOTE: a full run takes around 8 hours for 50K articles of the English Wikipedia.
"""

# Controls how many articles are processed at a time (passed straight through
# to `process_articles` below as its `articles_per_batch` argument).
articles_per_batch = 10

# Run the processing on the embedder built in the first cell.
# NOTE(review): the init log of the first cell reports already-processed
# articles found in the checkpoint, so this presumably resumes from the
# checkpoint rather than starting over — confirm in wikipedia_embedder.
wiki_embedder.process_articles(articles_per_batch=articles_per_batch)

2024-12-08 11:08:19,723 - INFO - Processing articles 0 to 9
2024-12-08 11:08:19,724 - INFO - N chunks in batch: 10
2024-12-08 11:08:20,487 - INFO - Processed batch 1: Embeddings 0 to 49
2024-12-08 11:08:21,116 - INFO - Completed processing batch. Total processed chunks: 752738
2024-12-08 11:08:21,116 - INFO - Processing articles 10 to 19
2024-12-08 11:08:21,117 - INFO - N chunks in batch: 10
2024-12-08 11:08:21,662 - INFO - Processed batch 1: Embeddings 0 to 49
2024-12-08 11:08:22,040 - INFO - Completed processing batch. Total processed chunks: 752748
2024-12-08 11:08:22,040 - INFO - Processing articles 20 to 29
2024-12-08 11:08:22,040 - INFO - N chunks in batch: 10
2024-12-08 11:08:22,520 - INFO - Processed batch 1: Embeddings 0 to 49
2024-12-08 11:08:22,827 - INFO - Completed processing batch. Total processed chunks: 752758
2024-12-08 11:08:22,828 - INFO - Processing articles 30 to 39
2024-12-08 11:08:22,828 - INFO - N chunks in batch: 10
2024-12-08 11:08:23,325 - INFO - Processed ba

# Inspect the checkpoint

In [21]:
"""
For reference, init info copied from an earlier run (the checkpoint counts below are stale):
Processing 50000 articles
Total number of chunks: 753048
Embedding model: multilingual-e5-large
Batch size: 50
Checkpoint path: df_checkpoint.parquet
N rows in checkpoint: 8710
N unique articles in checkpoint: 60
"""

import pandas as pd

# Load the checkpoint file (the same path that was handed to WikiEmbedder).
df_checkpoint = pd.read_parquet(checkpoint_path)

# First the number of rows, then the number of distinct `title_match` values
# (NaN, if present, counts as one value, exactly as with `len(unique())`).
print(df_checkpoint.shape[0])
print(df_checkpoint["title_match"].nunique(dropna=False))

# Show the first two rows as the cell's rendered output.
df_checkpoint.head(2)

660872
28490


Unnamed: 0,title_match,chunk_idx
0,2000s,0
1,2000s,1


In [25]:
# Totals taken from the embedder's input frame.
total_chunks = wiki_embedder.df.n_chunks.sum()
total_articles = wiki_embedder.df.title_match.nunique(dropna=False)

# Fraction done = checkpoint rows / distinct checkpoint titles over those
# totals; the `.2%` format spec renders each fraction as a percentage.
done_perc_chunks = len(df_checkpoint) / total_chunks
done_perc_articles = df_checkpoint.title_match.nunique(dropna=False) / total_articles

print(f"Done {done_perc_chunks:.2%} of chunks")
print(f"Done {done_perc_articles:.2%} of articles")



Done 87.76% of chunks
Done 56.98% of articles
