In [1]:
import os
import time

import chromadb
import pandas as pd
import pandas as pds
import tensorflow as tf
import torch
import wikipediaapi
from chromadb.utils import embedding_functions
from IPython.display import clear_output
from pypdf import PdfReader
from tqdm import tqdm



In [2]:
# List the GPU devices visible to TensorFlow.
# NOTE(review): this only reports what TensorFlow sees; the embedding function
# configured further down is given its device separately.
tf.config.list_physical_devices('GPU')


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Notebook-wide settings.
# Directory handed to the persistent Chroma client.
CHROMA_DATA_PATH = "chroma_data/"
# Model name passed to the SentenceTransformer embedding function.
EMBED_MODEL = "distiluse-base-multilingual-cased-v1"
# Name of the Chroma collection used by the cells below.
COLLECTION_NAME = "WW2-Languages-Wiki"

# Chroma client whose storage path is CHROMA_DATA_PATH.
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

In [4]:
# Choose the torch device for the embedding model at runtime instead of
# hard-coding "mps", which only exists on Apple-silicon machines and makes the
# cell fail everywhere else. On a machine where MPS is the only accelerator the
# result is still "mps".
if torch.cuda.is_available():
    EMBED_DEVICE = "cuda"
elif torch.backends.mps.is_available():
    EMBED_DEVICE = "mps"
else:
    EMBED_DEVICE = "cpu"

# Embedding function used by the collection for both stored documents and
# query texts.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL,
    device=EMBED_DEVICE,
)

  from tqdm.autonotebook import tqdm, trange


In [5]:
# Open the collection, creating it only if it does not exist yet.
# `create_collection` raises when the name is already present in the persisted
# store, so the notebook could not be re-run from a fresh kernel once
# `chroma_data/` had been populated.
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
)

In [6]:
# Look up the collection by name, attaching the same embedding function.
collection = client.get_collection(COLLECTION_NAME, embedding_function=embedding_func)

In [7]:
# Wikipedia API client for the English edition, requesting extracts in WIKI format.
# NOTE(review): the user agent looks like a generic placeholder — confirm it
# identifies the project as the API operator expects.
wikipedia = wikipediaapi.Wikipedia(
    user_agent="Project (ValuedCustomer)",
    language="en",
    extract_format=wikipediaapi.ExtractFormat.WIKI,
)

In [8]:
# Article to fetch. `languages` is not referenced by any cell visible here.
title = "World War II"
languages = ["de", "ru", "ar", "zh"]

page = wikipedia.page(title=title)
print(page.title, "\n")

# Language code of every linked edition of the article, ordered by langlinks key.
langlinks = page.langlinks
all_languages = [langlinks[code].language for code in sorted(langlinks)]
print(all_languages)
print(len(all_languages))

World War II 

['af', 'als', 'am', 'an', 'ang', 'ar', 'ary', 'arz', 'as', 'ast', 'av', 'az', 'azb', 'ba', 'ban', 'bar', 'bat-smg', 'bcl', 'be', 'be-x-old', 'bg', 'bh', 'bi', 'blk', 'bn', 'bo', 'br', 'bs', 'btm', 'bxr', 'ca', 'cbk-zam', 'cdo', 'ce', 'ceb', 'ckb', 'co', 'crh', 'cs', 'cu', 'cv', 'cy', 'da', 'de', 'din', 'diq', 'dsb', 'dty', 'dv', 'el', 'eml', 'eo', 'es', 'et', 'eu', 'ext', 'fa', 'fi', 'fiu-vro', 'fo', 'fr', 'frp', 'frr', 'fur', 'fy', 'ga', 'gan', 'gcr', 'gd', 'gl', 'gn', 'gu', 'gv', 'ha', 'hak', 'he', 'hi', 'hif', 'hr', 'hsb', 'ht', 'hu', 'hy', 'hyw', 'ia', 'id', 'ig', 'ilo', 'io', 'is', 'it', 'ja', 'jam', 'jbo', 'jv', 'ka', 'kaa', 'kab', 'kbd', 'kbp', 'kcg', 'kk', 'km', 'kn', 'ko', 'krc', 'ks', 'ksh', 'ku', 'kw', 'ky', 'la', 'lad', 'lb', 'lez', 'lfn', 'li', 'lij', 'lld', 'lmo', 'lo', 'lt', 'lv', 'mai', 'map-bms', 'mg', 'mhr', 'mi', 'min', 'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'mwl', 'my', 'mzn', 'nap', 'nb', 'nds', 'nds-nl', 'ne', 'new', 'nl', 'nn', 'nqo', 'nv', 'ny', 'oc'

In [9]:
# Split every language edition of the article into "."-delimited chunks and
# collect them as (document, language) rows.
#
# Rows are gathered in a plain list and the DataFrame is built once at the end.
# The previous per-row `df.loc[-1] = ...` followed by `sort_index()` re-sorted
# the whole frame for every chunk, which is quadratic in the number of chunks.
# NOTE(review): text that uses other sentence terminators (e.g. "。") is not
# split by this delimiter and can produce very long chunks.
SENTENCE_DELIMITER = "."


def split_into_records(text, language, delimiter=SENTENCE_DELIMITER):
    """Return one ``[chunk, language]`` row per delimiter-separated piece of ``text``."""
    return [[chunk, language] for chunk in text.split(delimiter)]


records = []

# English edition first (the page fetched above).
content = page.text
print("Language:", "en")
print("Title:", page.title)
records.extend(split_into_records(content, "en"))

# Then every linked language edition.
for lang in tqdm(all_languages):
    clear_output(wait=True)  # keep only the most recent progress lines on screen
    page_lang = page.langlinks[lang]
    content = page_lang.text
    print("Language:", lang)
    print("Title:", page_lang.title)
    records.extend(split_into_records(content, lang))

# The original code prepended each new row, so the finished frame was in reverse
# insertion order (last language first, English last). Reverse once here to keep
# that row order, because later cells derive Chroma ids from row position.
df = pd.DataFrame(records[::-1], columns=["document", "language"])
    

100%|██████████| 227/227 [01:44<00:00,  2.18it/s]

Language: zh-yue
Title: 第二次世界大戰





In [29]:
# Keep only chunks strictly longer than MIN_CHUNK_CHARS characters.
# The "after" label previously read "Aftr removing empty strings", which was
# misspelled and understated the filter: it drops every chunk of
# MIN_CHUNK_CHARS characters or fewer, not just empty ones.
MIN_CHUNK_CHARS = 45

doc_lengths = df["document"].str.len()

print("Dataframe Shape:", df.shape)
print("Number of empty strings", int((doc_lengths == 0).sum()))
print()

df = df[doc_lengths > MIN_CHUNK_CHARS]

print("-" * 50)
print(f"After removing strings of {MIN_CHUNK_CHARS} characters or fewer")
print("Dataframe Shape:", df.shape)
print("Number of empty strings", int((df["document"].str.len() == 0).sum()))


Dataframe Shape: (41019, 2)
Number of empty strings 0

--------------------------------------------------
Aftr removing empty strings
Dataframe Shape: (41019, 2)
Number of empty strings 0


In [30]:
# Character-length statistics for the chunks currently held in `df`.
chunk_lengths = df["document"].str.len()

print("Dataframe Shape:", df.shape)
print()
print("Average String Length:", chunk_lengths.mean())
print("Shortest String Length:", chunk_lengths.min())
print("Longest String Length:", chunk_lengths.max())

# Show the first rows as the cell's rich output.
df.head()


Dataframe Shape: (41019, 2)

Average String Length: 157.46088398059436
Shortest String Length: 46
Longest String Length: 44649


Unnamed: 0,document,language
0,1%，其中美國戰俘甚至有37%嘅死亡率，呢個比例甚至比美軍遭到德國同意大利俘虜者嘅死亡率重要...,zh-yue
1,7平方公里嘅中途島附近海域發生咗一場海戰，成為美國反擊日本聯合艦隊嘅轉戾點，美國以企業號、大...,zh-yue
2,Gen Percival向日軍投降（新加坡淪陷）。\n\n太平洋戰爭\n日本偷襲珍珠港\n...,zh-yue
3,6萬人嘅盟軍晌1944年6月6號（D-Day）由諾曼第呢個缺口登陸。另一方面，德國亦喺進攻蘇...,zh-yue
4,第二次世界大戰（粵拼：dai6 ji6 ci3 sai3 gaai3 daai6 zin3）...,zh-yue


In [32]:
# Show the last rows of the frame as the cell's output.
df.tail()


Unnamed: 0,document,language
49190,"Millions died in genocides, including the Hol...",en
49191,World War II was the deadliest conflict in hi...,en
49192,"Tanks and aircraft played major roles, with t...",en
49193,Nearly all of the world's countries—including...,en
49194,World War II or the Second World War (1 Septem...,en


In [35]:
# Extract both columns as plain Python lists; entries at the same position
# come from the same row.
df_documents, df_languages = (
    df[column].tolist() for column in ("document", "language")
)
print(len(df_documents), len(df_languages))

41019 41019


In [36]:
# Embed and store every chunk, tagged with its language, in fixed-size batches.
# A single `add()` with the whole corpus can exceed the maximum batch size
# Chroma accepts, shows no progress while embedding, and is called even when
# there is nothing to add. Ids stay `id<row position>`, exactly as before.
ADD_BATCH_SIZE = 5000

for start in tqdm(range(0, len(df_documents), ADD_BATCH_SIZE), desc="Adding to Chroma"):
    stop = min(start + ADD_BATCH_SIZE, len(df_documents))
    collection.add(
        documents=df_documents[start:stop],
        metadatas=[{"language": lang} for lang in df_languages[start:stop]],
        ids=[f"id{i}" for i in range(start, stop)],
    )

In [37]:
# Number of records currently stored in the collection.
collection.count()

41019

In [38]:
# Preview two stored records. The returned dict includes the full embedding
# vectors, so the rendered output is long.
collection.peek(2)

{'ids': ['id0', 'id1'],
 'embeddings': [[-0.014623771421611309,
   -0.007280128076672554,
   -0.04341854527592659,
   -0.0009863857412710786,
   0.0016464856453239918,
   -0.08099159598350525,
   0.028682362288236618,
   -0.004037132486701012,
   -0.045037440955638885,
   -0.024177655577659607,
   -0.02827092818915844,
   0.028360426425933838,
   -0.035885270684957504,
   -0.03765057772397995,
   0.018827082589268684,
   0.014624853618443012,
   0.02098051831126213,
   -0.006329440977424383,
   -0.008367746137082577,
   -0.008516192436218262,
   -0.042703431099653244,
   -0.022372029721736908,
   -0.013317338190972805,
   0.020588507875800133,
   -0.061727654188871384,
   0.05694252625107765,
   -0.1163429319858551,
   0.005707522388547659,
   -0.08098507672548294,
   0.008175515569746494,
   0.01767764985561371,
   0.022180259227752686,
   0.02368452586233616,
   -0.04931127279996872,
   -0.03783075883984566,
   0.03680940344929695,
   0.03928889334201813,
   0.08276163786649704,
   -

In [39]:
# Nearest-neighbour lookup: ask for the single closest stored chunk to the query text.
# NOTE(review): the query text is an empty string, so the returned hit is not a
# meaningful search result — presumably a placeholder; replace it with a real query.
query_results = collection.query(
    query_texts=[""],
    n_results=1
)
query_results


{'ids': [['id4277']],
 'distances': [[1.3976377248764038]],
 'metadatas': [[{'language': 'ta'}]],
 'embeddings': None,
 'documents': [[' ஆனால், பிரித்தானிய தேசிய மதிப்பு வாய்ந்த விமானப் படையின் தாக்குதலைத் தோற்கடிப்பதில் தோல்வி அடைந்தது']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [40]:
# Length of the embedding vector stored for record "id0".
first_record = collection.get(ids='id0', include=['embeddings'])
first_embedding = first_record['embeddings'][0]
len(first_embedding)

512