In [None]:
import os
import time

import chromadb
import pandas as pd
import pandas as pds
import tensorflow as tf
import torch
import wikipediaapi
from chromadb.utils import embedding_functions
from IPython.display import clear_output
from pypdf import PdfReader
from tqdm import tqdm



In [None]:
# Show which GPU devices TensorFlow can see on this machine.
gpu_devices = tf.config.list_physical_devices("GPU")
gpu_devices


In [None]:
# Notebook-wide configuration.
COLLECTION_NAME = "WW2-Languages-Wiki-Limited"  # Chroma collection used throughout
EMBED_MODEL = "distiluse-base-multilingual-cased-v1"  # model name handed to the embedding function
CHROMA_DATA_PATH = "chroma_data/"  # directory the persistent client stores its data in

# Persistent client: the collection data lives under CHROMA_DATA_PATH.
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

In [None]:
# Choose the best available torch device instead of hard-coding "mps":
# "mps" only exists on Apple Silicon, so the hard-coded value breaks the
# notebook on any other machine. "mps" is still preferred when present,
# which keeps the original behaviour on the machine this was written on.
if torch.backends.mps.is_available():
    EMBED_DEVICE = "mps"
elif torch.cuda.is_available():
    EMBED_DEVICE = "cuda"
else:
    EMBED_DEVICE = "cpu"

# Embedding function shared by every collection call below; it embeds text
# with the sentence-transformers model named by EMBED_MODEL.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL,
    device=EMBED_DEVICE,
)

In [None]:
# Use get_or_create_collection rather than create_collection: the client is
# persistent, so on every run after the first the collection already exists
# on disk and create_collection would fail, breaking "Restart & Run All".
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
)

In [None]:
# Re-attach to the already existing collection, using the same embedding function.
collection = client.get_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
)

In [None]:
# English Wikipedia client; pages are extracted in the library's WIKI format.
wiki_settings = {
    "user_agent": "Project (ValuedCustomer)",
    "language": "en",
    "extract_format": wikipediaapi.ExtractFormat.WIKI,
}
wikipedia = wikipediaapi.Wikipedia(**wiki_settings)

In [None]:
# Article to ingest and the language editions to fetch in addition to English.
title = "World War II"
selected_languages = [
    "ar", "zh", "nl", "fr", "de", "it",
    "ko", "pl", "pt", "ru", "es", "tr",
]

page = wikipedia.page(title=title)
print(page.title, "\n")

# Language of every linked edition of the page, ordered by langlinks key.
langlinks = page.langlinks
all_languages = [langlinks[code].language for code in sorted(langlinks)]
print(all_languages)
print(len(all_languages))

In [None]:
def _chunk_records(text, lang):
    """Split ``text`` on "." and pair every piece with ``lang``.

    Returns a list of ``[chunk, lang]`` rows, one per piece, in text order.
    """
    # NOTE(review): "." is the only separator, so text in languages that use a
    # different full stop (e.g. "。") will not be split into sentences — confirm
    # whether that is intended.
    return [[chunk, lang] for chunk in text.split(".")]


# Rows are gathered in a plain list and the DataFrame is built once at the
# end. Inserting row by row (df.loc[-1] + index shift + sort_index) re-sorted
# the whole frame for every sentence, which is quadratic in the row count.
records = []

content = page.text
print("Language:", "en")
print("Title:", page.title)
records.extend(_chunk_records(content, "en"))

for lang in tqdm(selected_languages):
    clear_output(wait=True)
    page_lang = page.langlinks[lang]
    content = page_lang.text
    print("Language:", lang)
    print("Title:", page_lang.title)
    records.extend(_chunk_records(content, lang))

# The previous implementation prepended every row, so the frame ended up in
# reverse insertion order. Reversing here keeps that row order, and therefore
# the position-based ids assigned when the documents are added to the
# collection, unchanged.
df = pd.DataFrame(records[::-1], columns=["document", "language"])
    

In [None]:
# Chunks must be longer than this many characters to be kept. This drops
# empty strings as well as very short fragments.
MIN_CHUNK_CHARS = 45

chunk_lengths = df["document"].str.len()

print("Dataframe Shape:", df.shape)
print("Number of empty strings", int((chunk_lengths == 0).sum()))
print()

df = df[chunk_lengths > MIN_CHUNK_CHARS]

print("-" * 50)
# The label states what the filter really does: it removes every chunk of
# MIN_CHUNK_CHARS characters or fewer, not only the empty ones.
print(f"After removing strings of {MIN_CHUNK_CHARS} characters or fewer")
print("Dataframe Shape:", df.shape)
print("Number of empty strings", int((df["document"].str.len() == 0).sum()))


In [None]:
# Character length of every document, computed once and reused for the stats.
doc_lengths = df["document"].str.len()

print("Dataframe Shape:", df.shape)
print()
print("Average String Length:", doc_lengths.mean())
print("Shortest String Length:", doc_lengths.min())
print("Longest String Length:", doc_lengths.max())

# First five rows as the cell output.
df.head(n=5)


In [None]:
# Last five rows of the frame.
df.tail(n=5)


In [None]:
# Plain Python lists of the two columns, kept in matching row order.
df_documents = df["document"].tolist()
df_languages = df["language"].tolist()
print(len(df_documents), len(df_languages))

In [None]:
# Number of documents sent to Chroma per add() call.
ADD_BATCH_SIZE = 1000

# Add the documents in fixed-size batches instead of one giant call: Chroma
# limits how many records a single add() may carry, batching shows progress
# while the embeddings are computed, and an empty document list becomes a
# no-op instead of an error.
# Ids stay "id<position in df_documents>", exactly as a single call would assign.
for start in tqdm(range(0, len(df_documents), ADD_BATCH_SIZE)):
    stop = min(start + ADD_BATCH_SIZE, len(df_documents))
    collection.add(
        documents=df_documents[start:stop],
        metadatas=[{"language": lang_code} for lang_code in df_languages[start:stop]],
        ids=[f"id{i}" for i in range(start, stop)],
    )

In [None]:
# Number of records currently stored in the collection.
stored_count = collection.count()
stored_count

In [None]:
# Look at two stored records.
collection.peek(limit=2)

In [None]:
# Query the collection with a single (empty) query text and keep only the
# closest match.
query_texts = [""]
query_results = collection.query(query_texts=query_texts, n_results=1)
query_results


In [None]:
# Length of the embedding vector stored for record "id0".
first_record = collection.get(ids="id0", include=["embeddings"])
first_embedding = first_record["embeddings"][0]
len(first_embedding)