## Do a similar analysis as in 05_iswiki_final_filtering.ipynb for the norwegian wikipedia dump.

BUT, use the duckdb file instead of pure pandas.  
While the norwegian wiki is small enough to fit in memory,  
the english one will not be. So create a process that works for both using duckdb. 

In [1]:
import os
import duckdb
from IPython.display import display


base_datadir = "../../data/"
duckdb_file = base_datadir + "db_files/wikipedia_articles_no.duckdb"
pageviews_file = base_datadir + "wikipedia_meta/aggregated/no.wikipedia_monthly_views.parquet"


os.path.exists(duckdb_file)

con = duckdb.connect(duckdb_file)

df_tables = con.sql("SHOW TABLES").df()
display(df_tables)


Unnamed: 0,name
0,articles
1,pageviews
2,pageviews_aggregated
3,pageviews_aggregated_top_articles
4,pageviews_top500k


In [2]:
# # the urls in articles are not correct, they are of the form
# # https://en.wikipedia.org/wiki/8._mars	
# # but should be no.wikipedia....
# # e.g. https://no.wikipedia.org/wiki/8._mars	
# # fix in db

# con.execute("""
#     UPDATE articles
#     SET url = REPLACE(url, 'en.wikipedia.org', 'no.wikipedia.org')
# """)


In [3]:
df = con.sql("SELECT * FROM articles LIMIT 100").df()

df.head(3)

Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,title_match
0,25238,8. mars,https://no.wikipedia.org/wiki/8._mars,1003,267,0,,1,1,2024-05-19 20:04:57,"\n8. mars er den 67. dagen i året, den 68. i s...",8. mars
1,392195,Ideelle rettigheter,https://no.wikipedia.org/wiki/Ideelle_rettigheter,241,9,0,,2,1,2021-08-28 19:43:59,De ideelle rettighetene (fransk droit moral) e...,ideelle rettigheter
2,193074,Ideologi,https://no.wikipedia.org/wiki/Ideologi,1051,54,0,,2,9,2024-09-30 08:51:55,\nIdeologi kan forstås som et tankesett tuftet...,ideologi


### Drop cols / views for rerunnability of notebook




In [4]:
con.execute("""
    DROP TABLE IF EXISTS pageviews_aggregated_top_articles;
    DROP TABLE IF EXISTS pageviews_aggregated;
    DROP TABLE IF EXISTS articles_unique;
    DROP TABLE IF EXISTS pageviews;
            
    ALTER TABLE articles DROP COLUMN title_match;
            

    DROP VIEW IF EXISTS pageviews_top500k;
""")


<duckdb.duckdb.DuckDBPyConnection at 0x1073c0cf0>

### Add pagewies data to the duckdb database



In [5]:
import pandas as pd

# Load pageviews data using DuckDB's Parquet support
con.execute(f"""
    CREATE TABLE IF NOT EXISTS pageviews AS
    SELECT *
    FROM read_parquet('{pageviews_file}')
""")

# Verify the pageviews table
print("Pageviews table schema:")
print(con.execute("DESCRIBE pageviews").fetchdf())

Pageviews table schema:
     column_name column_type null   key default extra
0      wiki_code     VARCHAR  YES  None    None  None
1  article_title     VARCHAR  YES  None    None  None
2          views      BIGINT  YES  None    None  None
3       page_ids   VARCHAR[]  YES  None    None  None


### Now create the title_match column (standardized title)



In [6]:
con.execute("""
ALTER TABLE pageviews
ADD COLUMN title_match VARCHAR
""")

con.execute("""
ALTER TABLE articles
ADD COLUMN title_match VARCHAR
""")


con.execute("""
    UPDATE pageviews
    SET title_match = LOWER(TRIM(REPLACE(article_title, '-', '')))
""")

con.execute("""
    UPDATE articles
    SET title_match = LOWER(TRIM(REPLACE(title, '-', '')))
""")

<duckdb.duckdb.DuckDBPyConnection at 0x1073c0cf0>

In [7]:
df = con.sql("SELECT * FROM articles LIMIT 100").df()
df_pageviews = con.sql("SELECT * FROM pageviews LIMIT 100").df()

display(df.head(3))
display(df_pageviews.head(3))

Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,title_match
0,25238,8. mars,https://no.wikipedia.org/wiki/8._mars,1003,267,0,,1,1,2024-05-19 20:04:57,"\n8. mars er den 67. dagen i året, den 68. i s...",8. mars
1,392195,Ideelle rettigheter,https://no.wikipedia.org/wiki/Ideelle_rettigheter,241,9,0,,2,1,2021-08-28 19:43:59,De ideelle rettighetene (fransk droit moral) e...,ideelle rettigheter
2,193074,Ideologi,https://no.wikipedia.org/wiki/Ideologi,1051,54,0,,2,9,2024-09-30 08:51:55,\nIdeologi kan forstås som et tankesett tuftet...,ideologi


Unnamed: 0,wiki_code,article_title,views,page_ids,title_match
0,no.wikipedia,!,3,[86365],!
1,no.wikipedia,!!!,20,[321005],!!!
2,no.wikipedia,!Kwi,1,[86345],!kwi


### Create the aggregated pageviews table

In [8]:
# Aggregate pageviews by title_match
con.execute("""
    CREATE TABLE IF NOT EXISTS pageviews_aggregated AS
    SELECT 
        title_match,
        SUM(views) AS total_views,
        LISTAGG(DISTINCT page_ids) AS aggregated_page_ids
    FROM pageviews
    GROUP BY title_match
""")

# Verify the aggregated data
print("Aggregated Pageviews:")

df_grouped = con.sql("SELECT * FROM pageviews_aggregated order by total_views desc LIMIT 100").df()
df_grouped.head(3)

Aggregated Pageviews:


Unnamed: 0,title_match,total_views,aggregated_page_ids
0,portal:forside,7305901.0,"[494123, null],[null, 494123],[494123, null, 1..."
1,spesial:søk,1627539.0,"[null, 180861],[null]"
2,spesial:siste_endringer,323174.0,[null]


### Create a table of the top N articles by pageviews (top 500K?)

Choose a larger number than we will embed, since we might filter out some of these..  

I.e. if the target to embed is 100K articles, be very safe and choose 500K.   

We can then further filter down from this set.



### Note, we have to play with the LIMIT to get an acceptable number of articles.

If we set the limit to 500K we may get only about 100K articles out.

In [9]:
VIEW_LIMIT = 700_000

# Create table of top 500K articles by pageviews with their metadata
con.execute(f"""
CREATE VIEW IF NOT EXISTS pageviews_top500k AS
SELECT 
    title_match,
    SUM(views) AS total_views,
    LISTAGG(DISTINCT page_ids) AS aggregated_page_ids
FROM pageviews
GROUP BY title_match
ORDER BY SUM(views) DESC
LIMIT {str(VIEW_LIMIT).replace("_", "")};
""")

# Step 2: Join with articles to create the final table
con.execute("""
CREATE TABLE IF NOT EXISTS pageviews_aggregated_top_articles AS
WITH ranked_articles AS (
    SELECT 
        a.*,
        p.total_views,
        p.aggregated_page_ids,
        'https://no.wikipedia.org/w/index.php?curid=' || a.page_id as url_pageviews,
        PERCENT_RANK() OVER (ORDER BY p.total_views DESC) as views_percentile,
        ROW_NUMBER() OVER (PARTITION BY a.title_match ORDER BY a.word_count DESC) as rn
    FROM articles a
    INNER JOIN pageviews_top500k p ON a.title_match = p.title_match
)
SELECT 
 *
FROM ranked_articles
WHERE rn = 1;
""")


print("Top Articles with Pageviews:")
df_top_articles = con.sql("""
    SELECT *
    FROM pageviews_aggregated_top_articles 
    ORDER BY total_views DESC 
    LIMIT 100
""").df()
display(df_top_articles.head(5))

Top Articles with Pageviews:


Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,title_match,total_views,aggregated_page_ids,url_pageviews,views_percentile,rn
0,99115,Norge,https://no.wikipedia.org/wiki/Norge,15019,1003,0,,318,207,2024-09-26 11:38:44,"\n\n\n\nNorge, offisielt Kongeriket Norge, er ...",norge,266626.0,"[null, 728],[728]",https://no.wikipedia.org/w/index.php?curid=99115,0.0,1
1,190116,Facebook,https://no.wikipedia.org/wiki/Facebook,892,35,0,,32,38,2024-03-14 22:45:28,\n\n\n\nFacebook () er et sosialt nettverk som...,facebook,224285.0,"[289714],[null, 289714]",https://no.wikipedia.org/w/index.php?curid=190116,7e-06,1
2,241256,NAV,https://no.wikipedia.org/wiki/NAV,968,35,0,,22,27,2024-09-30 11:49:05,\n\n\n\n\nNAV (Arbeids- og velferdsforvaltning...,nav,196957.0,"[null, 282978],[150352],[null, 150352]",https://no.wikipedia.org/w/index.php?curid=241256,1.5e-05,1
3,30880,Gazastripen,https://no.wikipedia.org/wiki/Gazastripen,2691,136,0,,39,51,2024-06-05 15:39:06,\n\n\n\nGazastripen eller Gazastripa er en sma...,gazastripen,162757.0,"[20526],[20526, null],[null, 20526]",https://no.wikipedia.org/w/index.php?curid=30880,3e-05,1
4,394721,Israel,https://no.wikipedia.org/wiki/Israel,11276,787,0,,372,411,2024-07-02 02:36:43,"\n\n\n\n\nIsrael, offisielt Staten Israel, er ...",israel,161120.0,"[435523],[null, 435523]",https://no.wikipedia.org/w/index.php?curid=394721,3.7e-05,1


In [10]:
print("Top Articles with Pageviews:")
df_top_articles = con.sql("""
    SELECT *
    FROM pageviews_aggregated_top_articles 
    ORDER BY total_views DESC 
""").df()

df_top_articles

Top Articles with Pageviews:


Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,title_match,total_views,aggregated_page_ids,url_pageviews,views_percentile,rn
0,99115,Norge,https://no.wikipedia.org/wiki/Norge,15019,1003,0,,318,207,2024-09-26 11:38:44,"\n\n\n\nNorge, offisielt Kongeriket Norge, er ...",norge,266626.0,"[null, 728],[728]",https://no.wikipedia.org/w/index.php?curid=99115,0.000000,1
1,190116,Facebook,https://no.wikipedia.org/wiki/Facebook,892,35,0,,32,38,2024-03-14 22:45:28,\n\n\n\nFacebook () er et sosialt nettverk som...,facebook,224285.0,"[289714],[null, 289714]",https://no.wikipedia.org/w/index.php?curid=190116,0.000007,1
2,241256,NAV,https://no.wikipedia.org/wiki/NAV,968,35,0,,22,27,2024-09-30 11:49:05,\n\n\n\n\nNAV (Arbeids- og velferdsforvaltning...,nav,196957.0,"[null, 282978],[150352],[null, 150352]",https://no.wikipedia.org/w/index.php?curid=241256,0.000015,1
3,30880,Gazastripen,https://no.wikipedia.org/wiki/Gazastripen,2691,136,0,,39,51,2024-06-05 15:39:06,\n\n\n\nGazastripen eller Gazastripa er en sma...,gazastripen,162757.0,"[20526],[20526, null],[null, 20526]",https://no.wikipedia.org/w/index.php?curid=30880,0.000030,1
4,394721,Israel,https://no.wikipedia.org/wiki/Israel,11276,787,0,,372,411,2024-07-02 02:36:43,"\n\n\n\n\nIsrael, offisielt Staten Israel, er ...",israel,161120.0,"[435523],[null, 435523]",https://no.wikipedia.org/w/index.php?curid=394721,0.000037,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135293,392195,Zhouqu,https://no.wikipedia.org/wiki/Zhouqu,209,17,0,,3,4,2023-03-29 23:23:41,\n\nZhugqu eller Zhouqu (kinesisk: 舟曲县; pinyin...,zhouqu,11.0,[545164],https://no.wikipedia.org/w/index.php?curid=392195,0.985849,1
135294,16375,Ziglipton,https://no.wikipedia.org/wiki/Ziglipton,267,45,0,,7,2,2023-07-26 18:48:58,\n\n* Se systematisk inndeling\n\n\nZiglipton ...,ziglipton,11.0,[1395842],https://no.wikipedia.org/w/index.php?curid=16375,0.985849,1
135295,16375,Zimmiellini,https://no.wikipedia.org/wiki/Zimmiellini,442,45,0,,7,2,2023-10-18 06:53:44,\n\n\n\nZimmiellini er en gruppe av snutebille...,zimmiellini,11.0,[2043073],https://no.wikipedia.org/w/index.php?curid=16375,0.985849,1
135296,562820,Ærølisten,https://no.wikipedia.org/wiki/Ærølisten,102,6,0,,4,0,2023-07-23 17:35:43,\n\nÆrølisten er en lokalpolitisk borgerliste ...,ærølisten,11.0,[441871],https://no.wikipedia.org/w/index.php?curid=562820,0.985849,1


# Can now implement samf filtering strategy as for smaller wikis:



In [15]:
from_words = 100
to_words = 200

df_merged = df_top_articles

df_merged[df_merged.word_count.between(from_words, to_words)].sort_values("word_count", ascending=True).head(2)

Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,title_match,total_views,aggregated_page_ids,url_pageviews,views_percentile,rn
42344,27430,Hyperemi,https://no.wikipedia.org/wiki/Hyperemi,100,14,0,,7,0,2016-03-21 14:56:42,\n\nHyperemi (fra gresk «hyper» over og «aima»...,hyperemi,321.0,"[null, 598334],[598334]",https://no.wikipedia.org/w/index.php?curid=27430,0.313332,1
54112,1318,Baotian,https://no.wikipedia.org/wiki/Baotian,100,16,0,,2,0,2018-02-08 14:10:48,\nBaotian (fullt navn på kinesisk: 江门市中港宝田摩托车实...,baotian,195.0,[418193],https://no.wikipedia.org/w/index.php?curid=1318,0.400339,1


### Filter on max_outlinks?

In [18]:
import plotly.express as px


N_PLOT = 75

df_plot = df_merged.sort_values("outlink_count", ascending=False).head(N_PLOT).copy().reset_index()

display(df_plot.head(3))
px.scatter(df_plot, x=df_plot.index, y="outlink_count", hover_data=["title"])


Unnamed: 0,index,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,title_match,total_views,aggregated_page_ids,url_pageviews,views_percentile,rn
0,2076,496017,Postnummerkatalogen,https://no.wikipedia.org/wiki/Postnummerkatalogen,1322,4450,0,,1,0,2024-09-10 07:37:41,Postnummerkatalogen er en fortegnelse over Nor...,postnummerkatalogen,7525.0,"[null, 1282044],[1282044, null]",https://no.wikipedia.org/w/index.php?curid=496017,0.015368,1
1,243,37718,2023,https://no.wikipedia.org/wiki/2023,22793,2117,0,,35,281,2024-07-17 08:24:15,\n\n2023 (MMXXIII) i den gregorianske kalender...,2023,24465.0,"[null, 14707],[14707]",https://no.wikipedia.org/w/index.php?curid=37718,0.0018,1
2,13,216895,Bergen,https://no.wikipedia.org/wiki/Bergen,21940,2063,0,,466,140,2024-09-23 19:46:22,\n\nBergen er en by og kommune i Vestland og e...,bergen,84850.0,"[null, 95],[95]",https://no.wikipedia.org/w/index.php?curid=216895,0.000103,1


### Final filtering

In [20]:
df_merged = df_merged.sort_values(by="total_views", ascending=False)

df_merged

Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,title_match,total_views,aggregated_page_ids,url_pageviews,views_percentile,rn
0,99115,Norge,https://no.wikipedia.org/wiki/Norge,15019,1003,0,,318,207,2024-09-26 11:38:44,"\n\n\n\nNorge, offisielt Kongeriket Norge, er ...",norge,266626.0,"[null, 728],[728]",https://no.wikipedia.org/w/index.php?curid=99115,0.000000,1
1,190116,Facebook,https://no.wikipedia.org/wiki/Facebook,892,35,0,,32,38,2024-03-14 22:45:28,\n\n\n\nFacebook () er et sosialt nettverk som...,facebook,224285.0,"[289714],[null, 289714]",https://no.wikipedia.org/w/index.php?curid=190116,0.000007,1
2,241256,NAV,https://no.wikipedia.org/wiki/NAV,968,35,0,,22,27,2024-09-30 11:49:05,\n\n\n\n\nNAV (Arbeids- og velferdsforvaltning...,nav,196957.0,"[null, 282978],[150352],[null, 150352]",https://no.wikipedia.org/w/index.php?curid=241256,0.000015,1
3,30880,Gazastripen,https://no.wikipedia.org/wiki/Gazastripen,2691,136,0,,39,51,2024-06-05 15:39:06,\n\n\n\nGazastripen eller Gazastripa er en sma...,gazastripen,162757.0,"[20526],[20526, null],[null, 20526]",https://no.wikipedia.org/w/index.php?curid=30880,0.000030,1
4,394721,Israel,https://no.wikipedia.org/wiki/Israel,11276,787,0,,372,411,2024-07-02 02:36:43,"\n\n\n\n\nIsrael, offisielt Staten Israel, er ...",israel,161120.0,"[435523],[null, 435523]",https://no.wikipedia.org/w/index.php?curid=394721,0.000037,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135295,16375,Zimmiellini,https://no.wikipedia.org/wiki/Zimmiellini,442,45,0,,7,2,2023-10-18 06:53:44,\n\n\n\nZimmiellini er en gruppe av snutebille...,zimmiellini,11.0,[2043073],https://no.wikipedia.org/w/index.php?curid=16375,0.985849,1
135296,562820,Ærølisten,https://no.wikipedia.org/wiki/Ærølisten,102,6,0,,4,0,2023-07-23 17:35:43,\n\nÆrølisten er en lokalpolitisk borgerliste ...,ærølisten,11.0,[441871],https://no.wikipedia.org/w/index.php?curid=562820,0.985849,1
134338,2700,Gabriole,https://no.wikipedia.org/wiki/Gabriole,50,5,0,,4,0,2023-11-22 18:37:22,Gabriole er et musikkalbum med Fliflet/Hamre u...,gabriole,11.0,[778685],https://no.wikipedia.org/w/index.php?curid=2700,0.985849,1
135285,549755,Utvälinge,https://no.wikipedia.org/wiki/Utvälinge,206,11,0,,4,2,2022-07-31 04:33:49,\nUtvälinge er et tettsted i Helsingborg kommu...,utvälinge,11.0,[470720],https://no.wikipedia.org/w/index.php?curid=549755,0.985849,1


### In the case of the norwegian wiki, I think the best filtering is to simply use the total views... 

It seems to be a decent proxy for the quality of the article.   
The text chunk splitter then discards completely meaningless chunks.. 

Basically just choose the number of articles you want to embed, and order by total views.   

In [21]:
n_articles_to_keep = 60_000

df_filtered = df_merged.head(n_articles_to_keep)




## Now chunk to analyze how embedding goes

In [22]:
import sys
import importlib
sys.path.append("../../src")

from vectordb import chunk_utils
importlib.reload(chunk_utils)

pd.set_option("display.max_colwidth", 120)
pd.set_option("display.max_columns", 50)

df = df_filtered.copy()

df["split_text"] = df["processed_text"].apply(
    lambda x: chunk_utils.split_text(
        x,
        chunk_size=1250,
        chunk_overlap=100,
        discard_chunk_n_words_cutoff=12,  # discard chunks with less than 10 words
        clean_whitespace=True,
        clean_html=True,
        min_words_per_chunk=45,
    )
)

df["n_chunks"] = df["split_text"].apply(lambda x: len(x))
df["n_words_per_chunk"] = df["split_text"].apply(
    lambda x: [len(chunk.split()) for chunk in x]
)

df = df.sort_values("n_chunks", ascending=False)

df.head(2)


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.



Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,title_match,total_views,aggregated_page_ids,url_pageviews,views_percentile,rn,split_text,n_chunks,n_words_per_chunk
157,197991,Holocaust,https://no.wikipedia.org/wiki/Holocaust,33655,1076,0,,279,180,2024-09-14 00:03:52,"\n:Artikkelen handler om forsøket på å utrydde jødene under andre verdenskrig. For grunnbetydningen, se brennoffer. ...",holocaust,29538.0,"[5796, null]",https://no.wikipedia.org/w/index.php?curid=197991,0.001166,1,"[:Artikkelen handler om forsøket på å utrydde jødene under andre verdenskrig. For grunnbetydningen, se brennoffer. F...",244,"[92, 100, 158, 182, 150, 199, 133, 87, 152, 178, 172, 198, 83, 211, 110, 207, 129, 177, 171, 86, 218, 96, 135, 96, 1..."
13,216895,Bergen,https://no.wikipedia.org/wiki/Bergen,21940,2063,0,,466,140,2024-09-23 19:46:22,"\n\nBergen er en by og kommune i Vestland og et tidligere fylke (til 1972) på Norges vestkyst, omgitt av «De syv fje...",bergen,84850.0,"[null, 95],[95]",https://no.wikipedia.org/w/index.php?curid=216895,0.000103,1,"[Bergen er en by og kommune i Vestland og et tidligere fylke (til 1972) på Norges vestkyst, omgitt av «De syv fjell»...",154,"[182, 158, 75, 209, 162, 100, 190, 96, 160, 86, 192, 86, 202, 55, 102, 210, 97, 190, 189, 157, 151, 176, 186, 145, 1..."


In [23]:
"""
Info about the number of word per chunk
for the article with the most chunks
"""

import math
import numpy as np

c = df.iloc[0].n_words_per_chunk

# print some stats about c
print("mean: ", sum(c) / len(c))
print("max: ", max(c))
print("min: ", min(c))
print("median: ", np.median(c))
print("std: ", math.sqrt(sum((x - sum(c) / len(c)) ** 2 for x in c) / len(c)))

mean:  140.5655737704918
max:  238
min:  46
median:  144.0
std:  43.46837699341073


In [24]:
"""
Check the smallest chunks for the article with the most chunks
"""

for i, chunk in enumerate(df.split_text.iloc[0]):
    if len(chunk.strip().split()) < 63:
        print(chunk)
        print(chunk.strip().split())
        print(i)
        print("---")

        break

behov for å beskytte lokalbefolkningen mot hordene som i propaganda og i rykter ble fremstilt som farlige (mord, voldtekt og ran ble nevnt). Etter at fangene hadde forlatt leirene hadde i SS lenger full kontroll slik at vanlige soldater (fra Wehrmacht), lokalt politi, Hitlerjugend og særlig Volkssturm deltok i massakrer på de marsjerende.
['behov', 'for', 'å', 'beskytte', 'lokalbefolkningen', 'mot', 'hordene', 'som', 'i', 'propaganda', 'og', 'i', 'rykter', 'ble', 'fremstilt', 'som', 'farlige', '(mord,', 'voldtekt', 'og', 'ran', 'ble', 'nevnt).', 'Etter', 'at', 'fangene', 'hadde', 'forlatt', 'leirene', 'hadde', 'i', 'SS', 'lenger', 'full', 'kontroll', 'slik', 'at', 'vanlige', 'soldater', '(fra', 'Wehrmacht),', 'lokalt', 'politi,', 'Hitlerjugend', 'og', 'særlig', 'Volkssturm', 'deltok', 'i', 'massakrer', 'på', 'de', 'marsjerende.']
51
---


In [28]:
print(f"Total number of chunks: {df.n_chunks.sum():,}")

Total number of chunks: 215,835


In [27]:
# the n_words_per_chunk is a list of numbers

n_words_total = sum(df.n_words_per_chunk.sum())

n_tokens_estimated = n_words_total * 1.35

n_tokens_M = n_tokens_estimated / 1_000_000

openai_small_embed_price = 0.008  # per M tokens
openai_large_embed_price = 0.016  # per M tokens

print(f"total number of words: {n_words_total:,}")

print(f"Estimated n token in M tokens: {n_tokens_M:.2f}")
print(f"Estimated cost for small embeddings: {n_tokens_M * openai_small_embed_price:.2f} USD")
print(f"Estimated cost for large embeddings: {n_tokens_M * openai_large_embed_price:.2f} USD")


total number of words: 30,575,700
Estimated n token in M tokens: 41.28
Estimated cost for small embeddings: 0.33 USD
Estimated cost for large embeddings: 0.66 USD


**Basically the price to embed should be negligible, the price might be dominated by the vector db instead?**



In [30]:
"""
Find the largest chunk
"""

max_words = 0
max_i = 0
max_j = 0


for i, n_words_per_chunk in enumerate(df.n_words_per_chunk):
    for j, n_words in enumerate(n_words_per_chunk):
        if n_words > max_words:
            max_words = n_words
            max_i = i
            max_j = j

print(max_words, max_i, max_j)

print(df.iloc[max_i].title)
print(df.iloc[max_i].split_text[max_j])


392 491 30
Dagsavisen
* 1884: 300
* 1892: 1 200
* 1894: 3 000
* 1904: 6 000
* 1912: 15 000
* 1914: 23 000
* 1918: 40 000
* 1921: 85 000
* 1923: 35 000
* 1927: 27 000
* 1930: 34 000
* 1934: 48 000
* 1937: 59 359
* 1938: 58 735
* 1939: 58 681
* 1945: 80 000
* 1947: 56 877
* 1950: 62 845
* 1951: 64 228
* 1952: 65 635
* 1953: 64 524
* 1954: 65 159
* 1955: 65 201
* 1956: 70 087
* 1957: 71 299
* 1958: 68 112
* 1959: 66 271
* 1960: 67 494
* 1961: 67 684
* 1962: 67 894
* 1963: 69 182
* 1964: 67 254
* 1965: 68 278
* 1966: 67 675
* 1967: 70 714
* 1968: 71 267
* 1969: 74 091
* 1970: 73 217
* 1971: 75 372
* 1972: 69 159
* 1973: 64 155
* 1974: 61 931
* 1975: 62 211
* 1976: 60 380
* 1977: 60 152
* 1978: 60 091
* 1979: 59 211
* 1980: 55 125
* 1981: 52 596
* 1982: 52 000
* 1983: 52 500
* 1984: 56 000
* 1985: 57 000
* 1986: 58 000
* 1987: 60 737
* 1988: 57 015
* 1989: 55 707
* 1990: 51 786
* 1991: 47 016
* 1992: 44 046
* 1993: 43 528
* 1994: 42 848
* 1995: 42 870
* 1996: 42 139
* 1997: 40 771
* 1998: 4