In [1]:
import os
import time

import chromadb
import pandas as pd
import pandas as pds
import tensorflow as tf
import torch
import wikipediaapi
from chromadb.utils import embedding_functions
from IPython.display import clear_output
from pypdf import PdfReader
from tqdm import tqdm



In [2]:
# Ask TensorFlow which physical devices of type GPU it can see.
tf.config.list_physical_devices(device_type="GPU")


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Notebook-wide configuration: collection name, embedding model and the
# on-disk location of the Chroma store.
COLLECTION_NAME = "WW2-Languages-Wiki-Limited"
EMBED_MODEL = "distiluse-base-multilingual-cased-v1"
CHROMA_DATA_PATH = "chroma_data/"

# Persistent client rooted at CHROMA_DATA_PATH.
client = chromadb.PersistentClient(
    path=CHROMA_DATA_PATH,
)

In [4]:
# Pick the best PyTorch device available instead of hard-coding "mps", which
# only exists on Apple-Silicon builds and makes the cell fail everywhere else.
# Preference order: Apple Metal (mps) -> CUDA -> CPU.
_mps_backend = getattr(torch.backends, "mps", None)  # absent on old torch builds
if _mps_backend is not None and _mps_backend.is_available():
    EMBED_DEVICE = "mps"
elif torch.cuda.is_available():
    EMBED_DEVICE = "cuda"
else:
    EMBED_DEVICE = "cpu"

# Sentence-transformer embedding function used by the Chroma collection.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL,
    device=EMBED_DEVICE,
)

  from tqdm.autonotebook import tqdm, trange


In [5]:
# The client is persistent, so the collection survives kernel restarts.
# `create_collection` raises when the collection already exists, which breaks
# "Restart & Run All"; `get_or_create_collection` makes this cell re-runnable.
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
)

In [6]:
# Re-open the stored collection by name, attaching the same embedding function.
collection = client.get_collection(
    COLLECTION_NAME,
    embedding_function=embedding_func,
)

In [7]:
# English Wikipedia client that returns page text in WIKI extract format.
wikipedia = wikipediaapi.Wikipedia(
    language="en",
    extract_format=wikipediaapi.ExtractFormat.WIKI,
    user_agent="Project (ValuedCustomer)",
)

In [10]:
# Article to index and the subset of language editions to pull in later.
title = "World War II"
selected_languages = [
    "ar", "zh", "nl", "fr", "de", "it",
    "ko", "pl", "pt", "ru", "es", "tr",
]

page = wikipedia.page(title=title)
print(page.title, "\n")

# Every language edition linked from the article, ordered by langlinks key.
langlinks = page.langlinks
all_languages = [langlinks[code].language for code in sorted(langlinks)]
print(all_languages)
print(len(all_languages))

World War II 

['af', 'als', 'am', 'an', 'ang', 'ar', 'ary', 'arz', 'as', 'ast', 'av', 'az', 'azb', 'ba', 'ban', 'bar', 'bat-smg', 'bcl', 'be', 'be-x-old', 'bg', 'bh', 'bi', 'blk', 'bn', 'bo', 'br', 'bs', 'btm', 'bxr', 'ca', 'cbk-zam', 'cdo', 'ce', 'ceb', 'ckb', 'co', 'crh', 'cs', 'cu', 'cv', 'cy', 'da', 'de', 'din', 'diq', 'dsb', 'dty', 'dv', 'el', 'eml', 'eo', 'es', 'et', 'eu', 'ext', 'fa', 'fi', 'fiu-vro', 'fo', 'fr', 'frp', 'frr', 'fur', 'fy', 'ga', 'gan', 'gcr', 'gd', 'gl', 'gn', 'gu', 'gv', 'ha', 'hak', 'he', 'hi', 'hif', 'hr', 'hsb', 'ht', 'hu', 'hy', 'hyw', 'ia', 'id', 'ig', 'ilo', 'io', 'is', 'it', 'ja', 'jam', 'jbo', 'jv', 'ka', 'kaa', 'kab', 'kbd', 'kbp', 'kcg', 'kk', 'km', 'kn', 'ko', 'krc', 'ks', 'ksh', 'ku', 'kw', 'ky', 'la', 'lad', 'lb', 'lez', 'lfn', 'li', 'lij', 'lld', 'lmo', 'lo', 'lt', 'lv', 'mai', 'map-bms', 'mg', 'mhr', 'mi', 'min', 'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'mwl', 'my', 'mzn', 'nap', 'nb', 'nds', 'nds-nl', 'ne', 'new', 'nl', 'nn', 'nqo', 'nv', 'ny', 'oc'

In [11]:
# Build the (document, language) table from the English article and each
# selected language edition, splitting the text on "." into chunks.
#
# Rows are collected in a plain list and the DataFrame is built once at the
# end. The previous approach (`df.loc[-1] = ...`, shift index, `sort_index()`
# for every chunk) re-sorted the whole frame per row, i.e. quadratic work.
content = page.text
print("Language:", "en")
print("Title:", page.title)
records = [(chunk, "en") for chunk in content.split(".")]

for lang in tqdm(selected_languages):
    clear_output(wait=True)
    page_lang = page.langlinks[lang]
    content = page_lang.text
    print("Language:", lang)
    print("Title:", page_lang.title)
    records.extend((chunk, lang) for chunk in content.split("."))

# The original prepended every new row, so the final frame was in reverse
# insertion order with a 0..N-1 index; reversing the list keeps that ordering
# (and therefore the ids assigned downstream) unchanged.
df = pd.DataFrame(records[::-1], columns=["document", "language"])
    

Language: tr
Title: II. Dünya Savaşı


100%|██████████| 12/12 [00:07<00:00,  1.57it/s]


In [12]:
# Size of the frame and number of zero-length chunks before filtering.
print("Dataframe Shape:", df.shape)
print("Number of empty strings", df.loc[df["document"].str.len().eq(0)].shape[0])
print()

# Keep only chunks longer than 45 characters.
df = df.loc[df["document"].str.len().gt(45)]

# Same report again, after the filter.
print(50 * "-")
print("Aftr removing empty strings")
print("Dataframe Shape:", df.shape)
print("Number of empty strings", df.loc[df["document"].str.len().eq(0)].shape[0])


Dataframe Shape: (10600, 2)
Number of empty strings 5

--------------------------------------------------
Aftr removing empty strings
Dataframe Shape: (9284, 2)
Number of empty strings 0


In [13]:
print("Dataframe Shape:", df.shape)
print()

# Character-length statistics of the chunks currently in the frame.
chunk_lengths = df["document"].str.len()
print("Average String Length:", chunk_lengths.mean())
print("Shortest String Length:", chunk_lengths.min())
print("Longest String Length:", chunk_lengths.max())

# Preview the first rows.
df.head()


Dataframe Shape: (9284, 2)

Average String Length: 164.90198190435157
Shortest String Length: 46
Longest String Length: 23716


Unnamed: 0,document,language
1,\nOmniatlas'ta Avrupa'daki savaşın haritaları ...,tr
2,Dünya Savaşı'nın propaganda posterleri 2 Şuba...,tr
3,\nSavaşın cephelerinin atlası (Temmuz 1943'ten...,tr
4,\nAsya-Pasifik'teki savaşın haritaları 12 Şuba...,tr
5,\nAvrupa'daki savaşın haritaları 12 Ocak 2021 ...,tr


In [14]:
# Preview the last five rows of the frame.
df.tail(n=5)


Unnamed: 0,document,language
10595,"Millions died in genocides, including the Hol...",en
10596,World War II was the deadliest conflict in hi...,en
10597,"Tanks and aircraft played major roles, with t...",en
10598,Nearly all of the world's countries—including...,en
10599,World War II or the Second World War (1 Septem...,en


In [15]:
# Pull the two columns out as plain Python lists, in row order.
df_documents, df_languages = (
    df[column].tolist() for column in ("document", "language")
)
print(len(df_documents), len(df_languages))

9284 9284


In [16]:
# Add the chunks to the collection in fixed-size batches rather than in one
# call: a single `add` with every document can exceed Chroma's maximum batch
# size and gives no progress feedback while embeddings are computed.
# Ids, documents and metadata are identical to a single-call add.
ADD_BATCH_SIZE = 1000
df_ids = [f"id{i}" for i in range(len(df_documents))]

for start in tqdm(range(0, len(df_documents), ADD_BATCH_SIZE), desc="Indexing"):
    stop = start + ADD_BATCH_SIZE
    collection.add(
        documents=df_documents[start:stop],
        metadatas=[{"language": lang} for lang in df_languages[start:stop]],
        ids=df_ids[start:stop],
    )

In [17]:
# Number of records currently stored in the collection.
collection.count()

9284

In [18]:
# Look at the first two stored records.
collection.peek(limit=2)

{'ids': ['id0', 'id1'],
 'embeddings': [[0.005061153322458267,
   0.011680714786052704,
   0.02047288790345192,
   -0.008635272271931171,
   0.03937438130378723,
   0.027949469164013863,
   0.01760045997798443,
   -0.040981680154800415,
   -0.07653786242008209,
   -0.01480383612215519,
   -0.056056611239910126,
   0.039188485592603683,
   -0.05360700562596321,
   -0.02138671837747097,
   0.016545988619327545,
   0.0013316398253664374,
   0.012149748392403126,
   0.05678606778383255,
   -0.13294090330600739,
   -0.0005020932876504958,
   -0.043349117040634155,
   -0.03231129050254822,
   0.08282667398452759,
   0.009622541256248951,
   -0.048200979828834534,
   0.05115851014852524,
   -0.05660281330347061,
   -0.05594514310359955,
   -0.0718565583229065,
   0.05636913701891899,
   0.019364740699529648,
   0.005821708124130964,
   -0.0489606074988842,
   0.005100152920931578,
   0.008797876536846161,
   0.07144660502672195,
   -0.05669818073511124,
   -0.04040353000164032,
   -0.05800760

In [19]:
# Retrieve the single closest stored chunk for the query text.
query_results = collection.query(n_results=1, query_texts=[""])
query_results


{'ids': [['id3206']],
 'distances': [[1.6856392621994019]],
 'metadatas': [[{'language': 'pl'}]],
 'embeddings': None,
 'documents': [[' Oddziały niemiecko-włoskie znalazły się w potrzasku']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [20]:
# Fetch the record "id0" with its embedding and report the vector length.
first_record = collection.get(ids='id0', include=['embeddings'])
len(first_record['embeddings'][0])

512