In [27]:
import chromadb
from chromadb.utils import embedding_functions
from pypdf import PdfReader
from tqdm import tqdm
import pandas as pds
from IPython.display import clear_output
import os
import pandas as pd
import wikipediaapi
import tensorflow as tf
import time



In [2]:
tf.config.list_physical_devices('GPU')


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
CHROMA_DATA_PATH = "chroma_data/"
EMBED_MODEL = "distiluse-base-multilingual-cased-v1"
COLLECTION_NAME = "WW2-Languages-Wiki"

client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

In [4]:
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL,
    device="mps"
)

  from tqdm.autonotebook import tqdm, trange


In [5]:
collection = client.create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func
)

In [6]:
collection = client.get_collection(COLLECTION_NAME, embedding_function=embedding_func)

In [7]:
wikipedia = wikipediaapi.Wikipedia(
    user_agent="Project (ValuedCustomer)",
    language="en",
    extract_format=wikipediaapi.ExtractFormat.WIKI,
)

In [40]:
title = "World War II"
languages = ["de", "ru", "ar", "zh"]

page = wikipedia.page(title=title)
print(page.title, "\n")


all_languages = []
langlinks = page.langlinks
for k in sorted(langlinks.keys()):
    v = langlinks[k]
    all_languages.append(v.language)
print(all_languages)
print(len(all_languages))

World War II 

['af', 'als', 'am', 'an', 'ang', 'ar', 'ary', 'arz', 'as', 'ast', 'av', 'az', 'azb', 'ba', 'ban', 'bar', 'bat-smg', 'bcl', 'be', 'be-x-old', 'bg', 'bh', 'bi', 'blk', 'bn', 'bo', 'br', 'bs', 'btm', 'bxr', 'ca', 'cbk-zam', 'cdo', 'ce', 'ceb', 'ckb', 'co', 'crh', 'cs', 'cu', 'cv', 'cy', 'da', 'de', 'din', 'diq', 'dsb', 'dty', 'dv', 'el', 'eml', 'eo', 'es', 'et', 'eu', 'ext', 'fa', 'fi', 'fiu-vro', 'fo', 'fr', 'frp', 'frr', 'fur', 'fy', 'ga', 'gan', 'gcr', 'gd', 'gl', 'gn', 'gu', 'gv', 'ha', 'hak', 'he', 'hi', 'hif', 'hr', 'hsb', 'ht', 'hu', 'hy', 'hyw', 'ia', 'id', 'ig', 'ilo', 'io', 'is', 'it', 'ja', 'jam', 'jbo', 'jv', 'ka', 'kaa', 'kab', 'kbd', 'kbp', 'kcg', 'kk', 'km', 'kn', 'ko', 'krc', 'ks', 'ksh', 'ku', 'kw', 'ky', 'la', 'lad', 'lb', 'lez', 'lfn', 'li', 'lij', 'lld', 'lmo', 'lo', 'lt', 'lv', 'mai', 'map-bms', 'mg', 'mhr', 'mi', 'min', 'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'mwl', 'my', 'mzn', 'nap', 'nb', 'nds', 'nds-nl', 'ne', 'new', 'nl', 'nn', 'nqo', 'nv', 'ny', 'oc'

In [26]:
df = pd.DataFrame(columns=["document", "language"])

for lang in tqdm(all_languages):
    clear_output(wait=True)
    page_lang = page.langlinks[lang]
    content = page_lang.text
    print("Language:", lang)
    print("Title:", page_lang.title)
    
    for chunk in content.split("\n"):
        df.loc[-1] = [chunk, lang]
        df.index = df.index + 1
        df = df.sort_index()
    

Language: zh-yue
Title: 第二次世界大戰


100%|██████████| 227/227 [00:23<00:00,  9.85it/s]


In [28]:
print("Dataframe Shape:", df.shape)
print("Number of empty strings", len(df[df["document"].str.len() == 0]))
print()

df = df[df["document"].str.len() > 0]

print("-" * 50)
print("Aftr removing empty strings")
print("Dataframe Shape:", df.shape)
print("Number of empty strings", len(df[df["document"].str.len() == 0]))


Dataframe Shape: (24140, 2)
Number of empty strings 4710

--------------------------------------------------
Aftr removing empty strings
Dataframe Shape: (19430, 2)
Number of empty strings 0


In [29]:
print("Dataframe Shape:", df.shape)

print()

print("Average String Length:", df["document"].str.len().mean())
print("Shortest String Length:", df["document"].str.len().min())
print("Longest String Length:", df["document"].str.len().max())



df.head()


Dataframe Shape: (19430, 2)

Average String Length: 338.06664951106535
Shortest String Length: 1
Longest String Length: 6862


Unnamed: 0,document,language
0,== 參考 ==,zh-yue
3,克羅地亞,zh-yue
4,南斯拉夫,zh-yue
5,保加利亞,zh-yue
6,斯洛伐克,zh-yue


In [30]:
df_documents = df['document'].to_list()
df_languages = df['language'].to_list()

In [32]:
collection.add(
    documents = df_documents,
    metadatas = [{"language": lang} for lang in df_languages],
    ids=[f"id{i}" for i in range(len(df_documents))]
)

In [39]:
collection.count()

19430

In [37]:
collection.peek(2)

{'ids': ['id0', 'id1'],
 'embeddings': [[0.027346588671207428,
   -0.049121707677841187,
   -0.06052326038479805,
   0.021187398582696915,
   -0.09249996393918991,
   -0.015620296820998192,
   0.02335466258227825,
   -0.036205362528562546,
   -0.020064063370227814,
   -0.013469664379954338,
   -0.015714524313807487,
   -0.0011286346707493067,
   -0.02886279672384262,
   0.01664876565337181,
   0.09986474364995956,
   0.03526124730706215,
   0.017452366650104523,
   -0.10059753060340881,
   -0.01447143591940403,
   0.00974336825311184,
   -0.008981304243206978,
   0.03403535112738609,
   0.055465854704380035,
   0.044259168207645416,
   0.010911666788160801,
   -0.038738563656806946,
   0.014996207319200039,
   0.05485611781477928,
   0.010830024257302284,
   -0.006254417356103659,
   0.012387121096253395,
   0.07539049535989761,
   0.00916977133601904,
   -0.008955545723438263,
   -0.024492206051945686,
   -0.048841554671525955,
   0.01036826055496931,
   0.020321711897850037,
   0.008

In [36]:
query_results = collection.query(
    query_texts=[""],
    n_results=1
)
query_results


{'ids': [['id5605']],
 'distances': [[5.829359889762686e-13]],
 'metadatas': [[{'language': 'pl'}]],
 'embeddings': None,
 'documents': [['\t\t\t']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [38]:
len(collection.get(ids='id0', include=['embeddings'])['embeddings'][0])

512