In [1]:
import pandas as pd
from collections import defaultdict

# File paths
# NOTE(review): PAGEVIEWS_FILE is an absolute path inside one user's home directory, so the
# notebook only runs unchanged on that machine; DUMP_FILE is resolved against the working directory.
PAGEVIEWS_FILE = "/Users/einar/git/hafsteinn/together_rag/data/wikipedia_meta/aggregated/is.wikipedia_monthly_views.parquet"
DUMP_FILE = "iswiki.processed_datadump.parquet"
# Per-page URL keyed by page id; {language} is filled with the wiki language code derived below.
URL_TEMPLATE = "https://{language}.wikipedia.org/w/index.php?curid={page_id}"

def union_page_ids(x):
    """Merge an iterable of page-id collections into a single set.

    Used as a groupby aggregation: `x` holds one collection of page ids per
    row in the group, and the result contains every id seen in any of them.
    An empty `x` yields an empty set.
    """
    merged_ids = set()
    for id_collection in x:
        merged_ids.update(id_collection)
    return merged_ids

def first_not_null_page_id(x):
    """Return the first usable page id in `x`, or None when there is none.

    An id is skipped when it is None or the literal string "null". "First"
    follows the iteration order of `x`.
    """
    for candidate in x:
        if candidate is None or candidate == "null":
            continue
        return candidate
    return None

# Load the aggregated pageviews.
df_pageviews = pd.read_parquet(PAGEVIEWS_FILE)

# wiki_code values look like "is.wikipedia"; the text before the first dot of the
# first distinct value is used as the language for URL_TEMPLATE.
first_wiki_code = df_pageviews.wiki_code.unique()[0]
language = first_wiki_code.partition(".")[0]

# Join key: the article title with underscores turned into spaces, lower-cased.
titles_with_spaces = df_pageviews.article_title.str.replace("_", " ")
df_pageviews["title_match"] = titles_with_spaces.str.lower()

df_pageviews.head(2)


Unnamed: 0,wiki_code,article_title,views,page_ids,title_match
0,is.wikipedia,"""\""Weird_Al\""_Yankovic""",1,[52652],"""\""weird al\"" yankovic"""
1,is.wikipedia,$,2,[30714],$


In [2]:
# Group pageviews data: one row per normalized title, with the views summed and the
# page-id collections of every title that normalizes to the same key merged together.
df_grouped = (
    df_pageviews
    .groupby("title_match")
    .agg({"views": "sum", "page_ids": union_page_ids})
    .sort_values("views", ascending=False)
)
# Fraction of titles with the same or fewer views (1.0 = most viewed).
df_grouped["percentile"] = df_grouped.views.rank(pct=True)


def _pageview_url(page_ids):
    """Return the curid URL for the first usable page id, or None if there is none.

    A group whose ids are all None / "null" used to produce a URL ending in
    "curid=None"; it now yields None, the same as an empty id set.
    """
    page_id = first_not_null_page_id(page_ids) if page_ids else None
    if page_id is None:
        return None
    return URL_TEMPLATE.format(language=language, page_id=page_id)


df_grouped["url"] = df_grouped.page_ids.apply(_pageview_url)

print(len(df_grouped))

df_grouped.head(2)  # 202.415, 200.264  (NOTE(review): presumably row counts from earlier runs)

200264


Unnamed: 0_level_0,views,page_ids,percentile,url
title_match,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
xxx rottweilerhundar,972756,"{null, 87556}",1.0,https://is.wikipedia.org/w/index.php?curid=87556
forsíða,435739,"{null, 4}",0.999995,https://is.wikipedia.org/w/index.php?curid=4


In [4]:
# Read the processed Wikipedia dump.
df_dump = pd.read_parquet(DUMP_FILE)

# Same normalization as the pageviews side (underscores -> spaces, lower-case)
# so both frames can be joined on title_match.
dump_titles_with_spaces = df_dump.title.str.replace("_", " ")
df_dump["title_match"] = dump_titles_with_spaces.str.lower()

# Several dump rows can share one title_match. Put the longest pages first and keep
# only the first row per key; the shorter ones are likely redirects to the real page.
df_dump_longest_first = df_dump.sort_values("word_count", ascending=False)
df_dump_cleaned = df_dump_longest_first.drop_duplicates(subset="title_match", keep="first")

n_cleaned = len(df_dump_cleaned)
print(f"Number of records in df_dump_cleaned: {n_cleaned:,}")

df_dump_cleaned.head(2)

Number of records in df_dump_cleaned: 85,506


Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,raw_text,title_match
64985,147909,Listi yfir morð á Íslandi frá 1970–1999,https://is.wikipedia.org/wiki/Listi_yfir_mor%C...,14782,68,0,,8,141,2024-08-24T05:04:34Z,\nListar yfir morð á Íslandi\n\nListi yfir mor...,"<div class=""mw-collapsible"" style=""float: righ...",listi yfir morð á íslandi frá 1970–1999
11702,27958,Knattspyrnufélagið Valur,https://is.wikipedia.org/wiki/Knattspyrnuf%C3%...,14437,492,0,,259,153,2024-08-23T09:57:04Z,\n\nValur er íslenskt íþróttafélag sem hefur a...,{{Knattspyrnulið\n| Fullt nafn = Knattspyrnufé...,knattspyrnufélagið valur


In [5]:
# Preview the grouped pageviews with title_match moved from the index into a regular column.
df_grouped.head(2).reset_index()

Unnamed: 0,title_match,views,page_ids,percentile,url
0,xxx rottweilerhundar,972756,"{null, 87556}",1.0,https://is.wikipedia.org/w/index.php?curid=87556
1,forsíða,435739,"{4, null}",0.999995,https://is.wikipedia.org/w/index.php?curid=4


In [6]:
# Attach pageview statistics to every dump page.
# title_match becomes a regular column again, and the pageviews "url" is renamed so it
# does not clash with the dump's own "url" column.
pageviews_for_join = df_grouped.reset_index().rename(columns={"url": "url_pageviews"})

# Left join: every dump page is kept; pages without pageview data get NaN in the
# pageview columns.
df_merged = pd.merge(df_dump_cleaned, pageviews_for_join, how="left", on="title_match")

n_merged = len(df_merged)
print(f"Number of records in df_merged: {n_merged:,}")

df_merged.head(2)

Number of records in df_merged: 85,506


Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,raw_text,title_match,views,page_ids,percentile,url_pageviews
0,147909,Listi yfir morð á Íslandi frá 1970–1999,https://is.wikipedia.org/wiki/Listi_yfir_mor%C...,14782,68,0,,8,141,2024-08-24T05:04:34Z,\nListar yfir morð á Íslandi\n\nListi yfir mor...,"<div class=""mw-collapsible"" style=""float: righ...",listi yfir morð á íslandi frá 1970–1999,10572.0,"{147909, null}",0.999735,https://is.wikipedia.org/w/index.php?curid=147909
1,27958,Knattspyrnufélagið Valur,https://is.wikipedia.org/wiki/Knattspyrnuf%C3%...,14437,492,0,,259,153,2024-08-23T09:57:04Z,\n\nValur er íslenskt íþróttafélag sem hefur a...,{{Knattspyrnulið\n| Fullt nafn = Knattspyrnufé...,knattspyrnufélagið valur,4356.0,"{27958, null}",0.998827,https://is.wikipedia.org/w/index.php?curid=27958


In [7]:
# How many dump pages found a pageviews match in the left join?
has_views = df_merged.views.notna()
print(f"n with views data: {has_views.sum()}")
print(f"n without views data: {(~has_views).sum()}")



n with views data: 85148
n without views data: 358


**~~It is a little disappointing that after all this work, we only join almost 3K pages~~**  
~~Although I expect the pageviews data to be more useful for the English wikipedia.~~

This was before, now joining correctly.  
But another idea would be to join on page ids,  
this would probably best be done by just creating a graph with networkx and asking  
it for connected components, which should all be the same page....

# Strategy:
 
 - All pages with pageview data > k
 - High category count seems to indicate a good page, so keep those.
 - All pages with minimum word count
 - Very high template count pages are usually lists, which do not really embed well, or do not contain much information.
 - Same with outlink count
 - external_link_count is not appropriate for filtering, does not seem to correlate with the actual content of the page.

That seems to be all I have to filter on.

The algorithm will be something like:
 - Filter out pages with unwanted prefixes
 - All pages with pageview data > k are kept, no matter if they would have been filtered otherwise (except for the prefixes)
 - Keep pages with high category count
 - Filter out pages with very high template count
 - Filter out pages with very high outlink count
 - Filter out pages with very low word count 
 
The algorithm should use parameters for the filtering, which will be easy to tweak and apply to other languages.  
Basically think of the filtering as putting articles into two buckets.  

Keep a list of removed pages as well so we can check them later for quality.




In [8]:
# Filter out unwanted prefixes (matching is literal and case-sensitive on `title`).
prefixes_to_remove = [
    "Flokkur:", "Spjall:", "Snið:", "Notandaspjall:", "Notandi:", "Mynd:",
    "Wikipedia:", "Talk:", "Module:", "Wikipediaspjall:", "Sniðaspjall:",
    "Flokkaspjall:", "Melding:", "flokkur:", "Hjálp:", "User:", "wikipedia:",
    "notandi:", "Listi_yfir_CSI:", "snið:", "Gátt:", "Myndaspjall:", "Hjálparspjall:"
]
print(f"Number of records before removing prefixes: {len(df_merged)}")

# Build one boolean mask over all prefixes instead of re-filtering the frame once per
# prefix. na=False makes a missing title count as "no prefix" rather than producing a
# NaN that the `~` negation cannot handle.
has_unwanted_prefix = pd.Series(False, index=df_merged.index)
for prefix in prefixes_to_remove:
    has_unwanted_prefix |= df_merged.title.str.startswith(prefix, na=False)

# .copy() keeps df_filtered independent of df_merged.
df_filtered = df_merged[~has_unwanted_prefix].copy()

print(f"Number of records after removing prefixes: {len(df_filtered)}")

print(f"Final number of records: {len(df_filtered)}")
df_filtered.sort_values("views", ascending=False).head(2)

Number of records before removing prefixes: 85506
Number of records after removing prefixes: 85506
Final number of records: 85506


Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,raw_text,title_match,views,page_ids,percentile,url_pageviews
4542,87556,XXX Rottweilerhundar,https://is.wikipedia.org/wiki/XXX_Rottweilerhu...,734,12,0,,1,3,2024-06-03T17:07:56Z,\nXXX Rottweilerhundar er íslensk rapphljómsve...,{{heimildir}}\n'''XXX Rottweilerhundar''' er í...,xxx rottweilerhundar,972756.0,"{null, 87556}",1.0,https://is.wikipedia.org/w/index.php?curid=87556
50852,4,Forsíða,https://is.wikipedia.org/wiki/Fors%C3%AD%C3%B0a,28,5,0,,11,0,2021-01-02T00:52:18Z,\n\n\n\n\n\n\n\nGrein mánaðarins\n\nEldri grei...,"<templatestyles src=""Template:Forsíða/styles.c...",forsíða,435739.0,"{4, null}",0.999995,https://is.wikipedia.org/w/index.php?curid=4


In [9]:
# Show up to 120 characters per cell so the category strings are readable in the preview.
pd.set_option("display.max_colwidth", 120)
# The five pages with the highest category_count.
df_filtered.sort_values("category_count", ascending=False).head(5)

Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,raw_text,title_match,views,page_ids,percentile,url_pageviews
5453,139892,Ferðamálayfirvöld Katar,https://is.wikipedia.org/wiki/Fer%C3%B0am%C3%A1layfirv%C3%B6ld_Katar,642,7,4,Category:Hagkerfi Katar|Category:Umhverfi Katar|Category:Samfélag Katar|Category:Ferðamál í Katar,6,12,2022-11-13T05:18:23Z,"Ferðamálayfirvöld Katar, sem heyrir undir ríkisstjórn Katar, er sá aðili sem ber ábyrgð á gerð og framkvæmd regla, r...","'''Ferðamálayfirvöld Katar''', sem heyrir undir ríkisstjórn [[Katar]], er sá aðili sem ber ábyrgð á gerð og framkvæm...",ferðamálayfirvöld katar,27.0,{139892},0.708003,https://is.wikipedia.org/w/index.php?curid=139892
12692,145646,Bogolan,https://is.wikipedia.org/wiki/Bogolan,301,17,3,Category:African clothing|Category:Textile arts of Africa|Category:Malian culture,2,2,2021-12-26T20:31:48Z,\nBogolan eða Bògòlanfini er handspunnið baðmullarefni frá Malí sem er litað með gerjaðri mold. \n\nthumb|Bògòlanfi...,[[Mynd:Traditional mud cloth.jpg|thumb|Bògòlanfini efni]]\n'''Bogolan''' eða '''Bògòlanfini''' er handspunnið [[Baðm...,bogolan,47.0,{145646},0.7749,https://is.wikipedia.org/w/index.php?curid=145646
33127,136874,Stictinsýra,https://is.wikipedia.org/wiki/Stictins%C3%BDra,88,17,3,Category:Arómatísk efnasambönd|Category:Lífrænar sýrur|Category:Fléttuefni,5,0,2019-01-09T21:51:35Z,"\n\n\nStictinsýra er arómatísk lífræn sýra sem myndast í sumum tegundum fléttna sem fylgiumbrotsefni (enska), til dæ...",[[Mynd:Stictic acid.svg|250px|tright|thumb|Efnabygging stictinsýru.]]\n[[Mynd:Stictic acid - 3D - Ball-and-stick Mod...,stictinsýra,21.0,{136874},0.674719,https://is.wikipedia.org/w/index.php?curid=136874
20418,155921,Hnokkmosaflokkur,https://is.wikipedia.org/wiki/Hnokkmosaflokkur,191,68,2,Category:Flokkar mosa|Category:Baukmosar,36,0,2020-02-07T14:15:31Z,\nHnokkmosaflokkur (latína: Bryopsida) er stærsti flokkur mosa og inniheldur 95% allra mosategunda eða um þa bil 11....,"{{Taxobox\n| name = Hnokkmosaflokkur\n| image = Bryum argenteum (d, 144719-474801) 4161.JPG\n| image_width = 300px\n...",hnokkmosaflokkur,23.0,{155921},0.687265,https://is.wikipedia.org/w/index.php?curid=155921
13125,177590,Loki (2. þáttaröð),https://is.wikipedia.org/wiki/Loki_%282._%C3%BE%C3%A1ttar%C3%B6%C3%B0%29,292,13,2,Category:Articles with short description|Category:Short description matches Wikidata,0,0,2023-12-04T14:07:54Z,"Í annarri þáttaröð bandarísku sjónvarpsþáttanna Loki, sem er byggð á teiknimyndasögum eftir Marvel, er Loki að vinna...","Í annarri þáttaröð bandarísku sjónvarpsþáttanna ''[[Loki (Marvel Cinematic Universe)#2012 variant|Loki]]'', sem er b...",loki (2. þáttaröð),41.0,"{null, 177590}",0.759617,https://is.wikipedia.org/w/index.php?curid=177590


In [10]:
# Invariant: the merge must not have duplicated any page, i.e. title_match is still a unique key.
assert df_merged.title_match.is_unique

## Find filter values



### Find views threshold

In [11]:
# Percentile rank of each page's views among the merged pages.
view_ranks = df_merged["views"].rank(pct=True)
df_merged["views_percentile"] = view_ranks

# Lowest-ranked pages that still have more than 2 views, to see what sits just above the floor.
viewed_more_than_twice = df_merged[df_merged["views"] > 2]
viewed_more_than_twice.sort_values("views_percentile").head(10)


Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,raw_text,title_match,views,page_ids,percentile,url_pageviews,views_percentile
67411,35641,Rökfræðileg staðreyndahyggja,https://is.wikipedia.org/wiki/R%C3%B6kfr%C3%A6%C3%B0ileg_sta%C3%B0reyndahyggja,3,1,0,,0,0,2007-01-04T22:27:24Z,#tilvísun Rökfræðileg raunhyggja,#tilvísun [[Rökfræðileg raunhyggja]],rökfræðileg staðreyndahyggja,3.0,{31936},0.338745,https://is.wikipedia.org/w/index.php?curid=31936,0.060342
64952,42526,Aston Villa FC,https://is.wikipedia.org/wiki/Aston_Villa_FC,3,1,0,,0,0,2007-05-05T01:06:56Z,#tilvísun Aston Villa,#tilvísun [[Aston Villa]],aston villa fc,3.0,{42512},0.338745,https://is.wikipedia.org/w/index.php?curid=42512,0.060342
75497,152361,Calandrinia uniflora,https://is.wikipedia.org/wiki/Calandrinia_uniflora,2,1,0,,0,0,2019-06-26T23:50:19Z,#TILVÍSUNParakeelya uniflora,#TILVÍSUN[[Parakeelya uniflora]],calandrinia uniflora,3.0,{152364},0.338745,https://is.wikipedia.org/w/index.php?curid=152364,0.060342
64944,127172,Kollur Grímsson,https://is.wikipedia.org/wiki/Kollur_Gr%C3%ADmsson,3,1,0,,0,0,2021-12-26T00:16:01Z,#tilvísun Dala-Kollur Veðra-Grímsson,#tilvísun [[Dala-Kollur Veðra-Grímsson]],kollur grímsson,3.0,{84872},0.338745,https://is.wikipedia.org/w/index.php?curid=84872,0.060342
64938,127178,Indversk olíu sardína,https://is.wikipedia.org/wiki/Indversk_ol%C3%ADu_sard%C3%ADna,3,1,0,,0,0,2015-02-22T21:51:29Z,#tilvísun Indversk olíusardína,#tilvísun [[Indversk olíusardína]],indversk olíu sardína,3.0,{127157},0.338745,https://is.wikipedia.org/w/index.php?curid=127157,0.060342
64937,103271,Downing Street 10,https://is.wikipedia.org/wiki/Downing_Street_10,3,1,0,,0,0,2011-11-13T18:15:02Z,#TILVÍSUN Downingstræti 10,#TILVÍSUN [[Downingstræti 10]],downing street 10,3.0,{103267},0.338745,https://is.wikipedia.org/w/index.php?curid=103267,0.060342
60326,141331,Wellington hertogi,https://is.wikipedia.org/wiki/Wellington_hertogi,5,1,0,,0,0,2018-01-10T23:47:02Z,"#TILVÍSUNArthur Wellesley, hertogi af Wellington","#TILVÍSUN[[Arthur Wellesley, hertogi af Wellington]]",wellington hertogi,3.0,{140428},0.338745,https://is.wikipedia.org/w/index.php?curid=140428,0.060342
79156,103686,Ísis (gyðja),https://is.wikipedia.org/wiki/%C3%8Dsis_%28gy%C3%B0ja%29,2,1,0,,0,0,2011-11-30T00:11:54Z,#tilvísun Ísis,#tilvísun [[Ísis]],ísis (gyðja),3.0,{77048},0.338745,https://is.wikipedia.org/w/index.php?curid=77048,0.060342
83833,172090,Pendlar,https://is.wikipedia.org/wiki/Pendlar,1,1,0,,0,0,2023-01-23T22:33:34Z,#TILVÍSUNTannmeitlar,#TILVÍSUN[[Tannmeitlar]],pendlar,3.0,{171402},0.338745,https://is.wikipedia.org/w/index.php?curid=171402,0.060342
64934,103316,Ibn Nafis,https://is.wikipedia.org/wiki/Ibn_Nafis,3,1,0,,0,0,2011-11-15T09:25:21Z,#tilvísun: Ibn al-Nafis,#tilvísun: [[Ibn al-Nafis]],ibn nafis,3.0,{102745},0.338745,https://is.wikipedia.org/w/index.php?curid=102745,0.060342


In [12]:
# For each distinct views value: mean / median / max / min word count and the
# number of pages with that many views (first 20 view counts only).
word_count_stats = ["mean", "median", "max", "min"]
views_summary = df_merged.groupby("views").agg(
    {"word_count": word_count_stats, "title": "count"}
)
views_summary.rename(columns={"title": "count"}).head(20)


Unnamed: 0_level_0,word_count,word_count,word_count,word_count,count
Unnamed: 0_level_1,mean,median,max,min,count
views,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1.0,2.539769,2.0,13,1,1471
2.0,2.889346,2.0,972,1,2431
3.0,2.719142,2.0,209,1,2471
4.0,2.948718,2.0,201,1,2145
5.0,3.354095,2.0,212,1,1734
6.0,7.198996,2.0,4089,1,1593
7.0,5.977657,2.0,225,1,1477
8.0,10.088919,2.0,430,1,1462
9.0,28.400868,2.0,5749,1,1382
10.0,31.197194,3.0,4222,1,1354


### Find template count threshold



In [13]:
import plotly.express as px

N_PLOT = 50

# Take the N_PLOT pages with the most templates and give them a fresh 0..N-1 index
# (the old index is kept as a column), so the x axis is simply the rank.
top_by_templates = df_merged.sort_values("template_count", ascending=False).head(N_PLOT)
df_plot = top_by_templates.copy().reset_index()

# Scatter of template count against rank; hovering a point shows the page title.
px.scatter(df_plot, x=df_plot.index, y="template_count", hover_data=["title"])



### Word count threshold

In [14]:
# Inspect pages whose word count falls in [from_words, to_words] (both ends included),
# shortest first, to judge where a sensible minimum lies.
from_words = 100
to_words = 200

word_counts = df_merged.word_count
in_window = (word_counts >= from_words) & (word_counts <= to_words)
df_merged[in_window].sort_values("word_count").head(2)

Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,raw_text,title_match,views,page_ids,percentile,url_pageviews,views_percentile
31108,145432,Solar Impulse,https://is.wikipedia.org/wiki/Solar_Impulse,100,13,0,,2,0,2018-07-26T14:06:46Z,\nSolar Impulse er svissneskt verkefni sem gengur út á smíði langdrægra sólarorkuknúinna flugvéla í tilraunaskyni. U...,[[Mynd:SolarImpulse_HB-SIA_landing_Brussels_Airport_3-crop.jpg|thumb|right|''Solar Impulse 1'' lendir á [[Brussel-fl...,solar impulse,58.0,{145432},0.798301,https://is.wikipedia.org/w/index.php?curid=145432,0.551833
30996,120200,Vetrarólympíuleikarnir 1972,https://is.wikipedia.org/wiki/Vetrar%C3%B3lymp%C3%ADuleikarnir_1972,100,22,0,,2,0,2015-08-18T21:20:58Z,\nVetrarólympíuleikarnir 1972 voru vetrarólympíuleikar haldnir í Sapporo í Japan frá 3. til 13. febrúar árið 1972. Þ...,[[Mynd:1972 CPA 4101.jpg|thumb|right|Sovésk frímerki gefin út í tilefni leikanna]]\n'''Vetrarólympíuleikarnir 1972''...,vetrarólympíuleikarnir 1972,120.0,{120200},0.874171,https://is.wikipedia.org/w/index.php?curid=120200,0.713969


### Find max_outlink_count


In [15]:
N_PLOT = 75

# The N_PLOT pages with the most outlinks, re-indexed 0..N-1 so the x axis is the rank.
top_by_outlinks = df_merged.sort_values("outlink_count", ascending=False).head(N_PLOT)
df_plot = top_by_outlinks.copy().reset_index()

# Scatter of outlink count against rank; hovering a point shows the page title.
px.scatter(df_plot, x=df_plot.index, y="outlink_count", hover_data=["title"])



## Implement algorithm



In [16]:
def filter_articles(df, options):
    """
    Split articles into a kept DataFrame and a set of discarded titles.

    A row is kept when its title does not start with an unwanted prefix AND it either
    qualifies unconditionally (enough views, or enough categories) or passes all three
    quality limits (word count, template count, outlink count).

    Parameters:
    df (DataFrame): The article data to filter. Must provide the columns `title`,
        `title_match`, `views`, `category_count`, `word_count`, `template_count` and
        `outlink_count`. It is not modified.
    options (dict): Filtering options; every key is optional:
        - min_views (int): Articles with at least this many views are kept regardless of
          the quality limits. Rows with missing views never qualify. Default: disabled.
        - keep_category_count (int): Articles with at least this many categories are kept
          regardless of the quality limits. Default: disabled.
        - min_word_count (int): Minimum word count for articles that are not kept
          unconditionally. Default: 0.
        - max_template_count (int): Maximum template count for articles that are not kept
          unconditionally. Default: no limit.
        - max_outlink_count (int): Maximum outlink count for articles that are not kept
          unconditionally. Default: no limit.
        - prefixes_to_remove (list of str): Articles whose title starts with any of these
          prefixes are always removed. Matching is literal and case-sensitive.
          Default: no prefixes.

    Returns:
    tuple: A tuple containing:
        - DataFrame: A copy of the rows of `df` that were kept, in their original order.
        - set: The `title_match` values of the rows that were discarded.

    Example usage:
    options = {
        'min_views': 20,
        'min_word_count': 100,
        'max_template_count': 250,
        'max_outlink_count': 100,
        'keep_category_count': 200,
        'prefixes_to_remove': [
            "Flokkur:", "Spjall:", "Snið:", "Notandaspjall:", "Notandi:", "Mynd:",
            "Wikipedia:", "Talk:", "Module:", "Wikipediaspjall:", "Sniðaspjall:",
            "Flokkaspjall:", "Melding:", "flokkur:", "Hjálp:", "User:", "wikipedia:",
            "notandi:", "Listi_yfir_CSI:", "snið:", "Gátt:", "Myndaspjall:", "Hjálparspjall:"
        ]
    }

    df_filtered, discarded_titles = filter_articles(df_merged, options)
    """
    no_limit = float('inf')

    # Unpack options. The two "keep unconditionally" thresholds default to infinity
    # (rule disabled): a default of 0 would match every row and silently switch off
    # the word/template/outlink limits whenever the key is omitted.
    min_views = options.get('min_views', no_limit)
    keep_category_count = options.get('keep_category_count', no_limit)
    min_word_count = options.get('min_word_count', 0)
    max_template_count = options.get('max_template_count', no_limit)
    max_outlink_count = options.get('max_outlink_count', no_limit)
    prefixes_to_remove = options.get('prefixes_to_remove', [])

    # One mask for all prefixes; na=False treats a missing title as "no prefix"
    # instead of yielding NaN, which cannot be negated with `~`.
    has_unwanted_prefix = pd.Series(False, index=df.index)
    for prefix in prefixes_to_remove:
        has_unwanted_prefix |= df.title.str.startswith(prefix, na=False)

    # Unconditional keeping criteria (comparisons against NaN are False).
    keep_unconditionally = (
        (df.views >= min_views) |
        (df.category_count >= keep_category_count)
    )

    # Quality limits for everything else.
    passes_quality_limits = (
        (df.word_count >= min_word_count) &
        (df.template_count <= max_template_count) &
        (df.outlink_count <= max_outlink_count)
    )

    # The prefix rule always wins, even over the unconditional keep.
    keep_mask = ~has_unwanted_prefix & (keep_unconditionally | passes_quality_limits)

    # Copy only the kept rows so the result is independent of `df`.
    df_filtered = df[keep_mask].copy()

    # Collect discarded titles from the dropped rows themselves, so a dropped row is
    # reported even if another kept row happens to share its title_match.
    discarded_titles = set(df.loc[~keep_mask, "title_match"])

    return df_filtered, discarded_titles

# Apply the filter with the thresholds chosen in the exploration above.
unwanted_title_prefixes = [
    "Flokkur:", "Spjall:", "Snið:", "Notandaspjall:", "Notandi:", "Mynd:",
    "Wikipedia:", "Talk:", "Module:", "Wikipediaspjall:", "Sniðaspjall:",
    "Flokkaspjall:", "Melding:", "flokkur:", "Hjálp:", "User:", "wikipedia:",
    "notandi:", "Listi_yfir_CSI:", "snið:", "Gátt:", "Myndaspjall:", "Hjálparspjall:"
]
options = dict(
    min_views=60,
    min_word_count=120,
    max_template_count=250,
    max_outlink_count=100,
    keep_category_count=3,
    prefixes_to_remove=unwanted_title_prefixes,
)

# df_filtered holds the kept articles; discarded_titles lets the removed pages be reviewed later.
df_filtered, discarded_titles = filter_articles(df_merged, options)

n_kept = len(df_filtered)
print(n_kept)

45050


In [24]:
# Preview the first rows returned by filter_articles.
df_filtered.head(2)

Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,raw_text,title_match,views,page_ids,percentile,url_pageviews,views_percentile
0,147909,Listi yfir morð á Íslandi frá 1970–1999,https://is.wikipedia.org/wiki/Listi_yfir_mor%C3%B0_%C3%A1_%C3%8Dslandi_fr%C3%A1_1970%E2%80%931999,14782,68,0,,8,141,2024-08-24T05:04:34Z,\nListar yfir morð á Íslandi\n\nListi yfir morð á Íslandi frá 1874–1969\nListi yfir morð á Íslandi frá 1970–1999\nLi...,"<div class=""mw-collapsible"" style=""float: right"">\n<div style=""text-align: center; "">Listar yfir [[morð á Íslandi]]...",listi yfir morð á íslandi frá 1970–1999,10572.0,"{147909, null}",0.999735,https://is.wikipedia.org/w/index.php?curid=147909,0.999483
1,27958,Knattspyrnufélagið Valur,https://is.wikipedia.org/wiki/Knattspyrnuf%C3%A9lagi%C3%B0_Valur,14437,492,0,,259,153,2024-08-23T09:57:04Z,"\n\nValur er íslenskt íþróttafélag sem hefur aðstöðu að Hlíðarenda. Valur teflir fram liðum í knattspyrnu, handknatt...",{{Knattspyrnulið\n| Fullt nafn = Knattspyrnufélagið Valur\n| Mynd = [[Mynd:Valur.svg|250x250dp]]\n| Gælunafn = Valsa...,knattspyrnufélagið valur,4356.0,"{27958, null}",0.998827,https://is.wikipedia.org/w/index.php?curid=27958,0.997463


# Now prepare for embedding (split into chunks)

In [37]:
import sys
import importlib

# Make the project's src/ directory importable, and reload chunk_utils so edits to it
# are picked up without restarting the kernel.
sys.path.append("../../src")

from vectordb import chunk_utils
importlib.reload(chunk_utils)

pd.set_option("display.max_colwidth", 120)
pd.set_option("display.max_columns", 50)

# Settings handed to chunk_utils.split_text for every article.
# NOTE(review): exact semantics of each setting live in chunk_utils; the word cutoff
# for discarding chunks is 12 here.
split_settings = dict(
    chunk_size=1250,
    chunk_overlap=100,
    discard_chunk_n_words_cutoff=12,
    clean_whitespace=True,
    clean_html=True,
    min_words_per_chunk=45,
)


def _split_article(text):
    """Split one article's processed text into chunks using split_settings."""
    return chunk_utils.split_text(text, **split_settings)


def _words_per_chunk(chunks_of_article):
    """Whitespace-separated word count of each chunk."""
    return [len(piece.split()) for piece in chunks_of_article]


# Work on a copy so df_filtered stays as returned by filter_articles.
df = df_filtered.copy()

df["split_text"] = df["processed_text"].apply(_split_article)
df["n_chunks"] = df["split_text"].apply(len)
df["n_words_per_chunk"] = df["split_text"].apply(_words_per_chunk)

# Articles with the most chunks first.
df = df.sort_values("n_chunks", ascending=False)

df.head(2)


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.



Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,raw_text,title_match,views,page_ids,percentile,url_pageviews,views_percentile,split_text,n_chunks,n_words_per_chunk
1,27958,Knattspyrnufélagið Valur,https://is.wikipedia.org/wiki/Knattspyrnuf%C3%A9lagi%C3%B0_Valur,14437,492,0,,259,153,2024-08-23T09:57:04Z,"\n\nValur er íslenskt íþróttafélag sem hefur aðstöðu að Hlíðarenda. Valur teflir fram liðum í knattspyrnu, handknatt...",{{Knattspyrnulið\n| Fullt nafn = Knattspyrnufélagið Valur\n| Mynd = [[Mynd:Valur.svg|250x250dp]]\n| Gælunafn = Valsa...,knattspyrnufélagið valur,4356.0,"{27958, null}",0.998827,https://is.wikipedia.org/w/index.php?curid=27958,0.997463,"[Valur er íslenskt íþróttafélag sem hefur aðstöðu að Hlíðarenda. Valur teflir fram liðum í knattspyrnu, handknattlei...",100,"[213, 161, 80, 209, 211, 127, 126, 205, 150, 115, 113, 152, 183, 243, 189, 240, 100, 144, 94, 251, 173, 95, 202, 63,..."
2,73089,Úrslit Gettu betur,https://is.wikipedia.org/wiki/%C3%9Arslit_Gettu_betur,12644,1395,0,,1,2,2024-03-21T22:06:54Z,\n\nÞetta er listi yfir úrslit í keppninni Gettu betur.\n\n== 2021-2030 ==\n=== 2024 ===\n25 lið skráðu sig til kepp...,{{DISPLAYTITLE:Úrslit <i>Gettu betur</i>}}\n\nÞetta er listi yfir úrslit í keppninni ''[[Gettu betur]]''.\n\n== 2021...,úrslit gettu betur,2728.0,"{73089, null}",0.997463,https://is.wikipedia.org/w/index.php?curid=73089,0.994363,[Þetta er listi yfir úrslit í keppninni Gettu betur.\n\n== 2021-2030 ==\n=== 2024 ===\n25 lið skráðu sig til keppni....,90,"[124, 85, 168, 110, 84, 172, 175, 149, 138, 83, 171, 82, 106, 177, 174, 185, 121, 87, 163, 115, 80, 166, 189, 137, 1..."


In [38]:
# Info about the number of words per chunk
# for the article with the most chunks (df is sorted by n_chunks, descending).

import math
import numpy as np

c = df.iloc[0].n_words_per_chunk

# Compute the mean once; it is reused for the (population) standard deviation
# instead of being recomputed for every element.
mean_words = sum(c) / len(c)
variance = sum((x - mean_words) ** 2 for x in c) / len(c)

# print some stats about c
print("mean: ", mean_words)
print("max: ", max(c))
print("min: ", min(c))
print("median: ", np.median(c))
print("std: ", math.sqrt(variance))

mean:  147.75
max:  251
min:  50
median:  151.5
std:  51.14496553914178


In [41]:
# Check the smallest chunks for the article with the most chunks:
# show the first chunk that has fewer than 63 whitespace-separated words.
first_small_chunk = next(
    (
        (position, text)
        for position, text in enumerate(df.split_text.iloc[0])
        if len(text.strip().split()) < 63
    ),
    None,
)

if first_small_chunk is not None:
    i, chunk = first_small_chunk
    print(chunk)
    print(chunk.strip().split())
    print(i)
    print("---")

28 ára hrakningasögu félagsins. Forystumenn félagsins höfðu háleitar hugsjónir um uppbyggingu að Hlíðarenda og horfðu til framtíðar, sáu fyrir sér draumsýnir um íþróttasvæði og íþróttamiðstöð en gerðu sér jafnframt grein fyrir því að langt yrði í land að draumur þeirra um framtíðarsvæði Vals rættist, en þeir höfðu tekið fyrsta skrefið.
['28', 'ára', 'hrakningasögu', 'félagsins.', 'Forystumenn', 'félagsins', 'höfðu', 'háleitar', 'hugsjónir', 'um', 'uppbyggingu', 'að', 'Hlíðarenda', 'og', 'horfðu', 'til', 'framtíðar,', 'sáu', 'fyrir', 'sér', 'draumsýnir', 'um', 'íþróttasvæði', 'og', 'íþróttamiðstöð', 'en', 'gerðu', 'sér', 'jafnframt', 'grein', 'fyrir', 'því', 'að', 'langt', 'yrði', 'í', 'land', 'að', 'draumur', 'þeirra', 'um', 'framtíðarsvæði', 'Vals', 'rættist,', 'en', 'þeir', 'höfðu', 'tekið', 'fyrsta', 'skrefið.']
32
---


## Analyze how the embedding would go

In [42]:
# Total number of chunks across all kept articles (sum of the per-article n_chunks column).
print(f"Total number of chunks: {df.n_chunks.sum():,}")

Total number of chunks: 108,931


In [45]:
# Each n_words_per_chunk cell is a list of per-chunk word counts. Sum every list and
# add the results up, rather than letting Series.sum() concatenate all lists into one
# giant list first (slow, and it breaks on an empty frame).
n_words_total = sum(sum(word_counts) for word_counts in df.n_words_per_chunk)
n_words_total

14286264

In [47]:
# Rough token estimate: 1.35 tokens per whitespace-separated word.
n_tokens_estimated = n_words_total * 1.35

# Prices are quoted per million tokens, so convert the estimate to millions.
n_tokens_M = n_tokens_estimated / 1_000_000

openai_small_embed_price = 0.008  # per M tokens
openai_large_embed_price = 0.016  # per M tokens

small_embed_cost = n_tokens_M * openai_small_embed_price
large_embed_cost = n_tokens_M * openai_large_embed_price

print(f"Estimated n token in M tokens: {n_tokens_M:.2f}")
print(f"Estimated cost for small embeddings: {small_embed_cost:.2f} USD")
print(f"Estimated cost for large embeddings: {large_embed_cost:.2f} USD")


Estimated n token in M tokens: 19.29
Estimated cost for small embeddings: 0.15 USD
Estimated cost for large embeddings: 0.31 USD


openai_small_embed_price = 0.008  # per M tokens
openai_large_embed_price = 0.016  # per M tokens

##### Find the longest chunk of all

In [58]:
# Locate the chunk with the most words: max_i is the row position in df,
# max_j the chunk position inside that row. Ties keep the earliest chunk.
max_words, max_i, max_j = 0, 0, 0

for row_pos, word_counts in enumerate(df.n_words_per_chunk):
    if len(word_counts) == 0:
        continue
    # Position of the first chunk that reaches this row's maximum.
    best_pos = max(range(len(word_counts)), key=lambda pos: word_counts[pos])
    if word_counts[best_pos] > max_words:
        max_words, max_i, max_j = word_counts[best_pos], row_pos, best_pos

max_words, max_i, max_j


(398, 2852, 4)

In [60]:
# Show which article the longest chunk belongs to, then the chunk itself.
longest_chunk_row = df.iloc[max_i]
print(longest_chunk_row.title)
print(longest_chunk_row.split_text[max_j])

Afbrigði latneska stafrófsins
== Pólska stafrófið ==
Pólska stafrófið hefur 32 bókstafi.

A	Ą	B	C	Ć	D	E	Ę	F	G	H	I	J	K	L	Ł	M	N	Ń	O	Ó	P	R	S	Ś	T	U	W	Y	Z	Ź	Ż
a	ą	b	c	ć	d	e	ę	f	g	h	i	j	k	l	ł	m	n	ń	o	ó	p	r	s	ś	t	u	w	y	z	ź	ż


Latnesku bókstafirnir Q, V og X eru ekki notaðir í pólsku. Pólska hefur 9 bókstafi sem ekki eru notaðir í latneska stafrófinu: Ą, Ć, Ę, Ł, Ń, Ó, Ś, Ź og Ż.

== Tékkneska stafrófið ==
Tékkneska stafrófið hefur 42 bókstafi:

A	Á	B	C	Č	D	Ď	E	É	Ě	F	G	H	CH	I	Í	J	K	L	M	N	Ň	O	Ó	P	Q	R	Ř	S	Š	T	Ť	U	Ú	Ů	V	W	X	Y	Ý	Z	Ž
a	á	b	c	č	d	ď	e	é	ě	f	g	h	ch	i	í	j	k	l	m	n	ň	o	ó	p	q	r	ř	s	š	t	ť	u	ú	ů	v	w	x	y	ý	z	ž

Athugið að farið er með "CH" sem sérstakan staf. Titilmálið er Ch.

== Slóvakíska stafrófið ==
Slóvakíska stafrófið hefur 46 bókstafi, sá lengsti af hverju evrópsku stafrófi.

A	Á	Ä	B	C	Č	D	Ď	Ǳ	Ǆ	E	É	F	G	H	CH	I	Í	J	K	L	Ĺ	Ľ	M	N	Ň	O	Ó	Ô	P	Q	R	Ŕ	S	Š	T	Ť	U	Ú	V	W	X	Y	Ý	Z	Ž
a	á	ä	b	c	č	d	ď	ǳ	ǆ	e	é	f	g	h	ch	i	í	j	k	l	ĺ	ľ	m	n	ň	o	ó	ô	p	q	r	ŕ	s	š	t	ť	u	ú	v	w	x	y	ý	z	ž

Ǳ, Ǆ og CH teljast bóks

## Now embed a subset of the data:

Just for test purposes, the next step will then be to embed the whole dataset.


In [51]:
# Example usage of the embeddings endpoint, kept for reference:
#
#   import os
#   from together import Together
#
#   client = Together()
#
#   response = client.embeddings.create(
#     model = "togethercomputer/m2-bert-80M-8k-retrieval",
#     input = [
#       "Our solar system orbits the Milky Way galaxy at about 515,000 mph",
#       "Jupiter's Great Red Spot is a storm that has been raging for at least 350 years."
#     ]
#   )
import os
from together import Together
from dotenv import load_dotenv

# Load variables from a .env file, then read the API key from the environment
# so it never appears in the notebook.
load_dotenv()
together_api_key = os.getenv("TOGETHER_API_KEY")

client = Together(api_key=together_api_key)
client

<together.client.Together at 0x3f5a1ef80>

In [53]:
# Test batch: up to the first 100 chunks of the article with the most chunks
# (that article has exactly 100).
first_article_chunks = df.split_text.iloc[0]
chunks = first_article_chunks[:100]
chunks[0]


'Valur er íslenskt íþróttafélag sem hefur aðstöðu að Hlíðarenda. Valur teflir fram liðum í knattspyrnu, handknattleik og körfuknattleik og leika allir meistaraflokkar Vals í efstu deild bæði í karla- og kvennaflokki. Félagið var stofnað þann 11. maí árið 1911 af drengjum í K.F.U.M., að hluta til fyrir tilstilli séra Friðriks Friðrikssonar. Í fyrstu var Valur aðeins deild innan K.F.U.M. en síðar rofnuðu tengslin við K.F.U.M. Þrátt fyrir það minnast Valsmenn ávallt tengslanna við K.F.U.M. en einkunnarorð félagsins „Látið aldrei kappið bera fegurðina ofurliði“ eru fengin úr ræðu séra Friðriks sem hann hélt við vígslu fyrsta knattspyrnuvallar félagsins. Valur tók þátt í Íslandsmótinu í knattspyrnu karla í fyrsta sinn árið 1915 og varð Íslandsmeistari í fyrsta sinn árið 1930. Alls hefur meistaraflokkur karla unnið Íslandsmótið í knattspyrnu karla 23 sinnum, síðast árið 2020. Kvennalið Vals vann Íslandmeistaratitil í fyrsta sinn árið 1978 en alls hefur meistaraflokkur kvenna unnið Íslandsmót

In [61]:
# Model choice: the "2k" variant. Chunks are produced with chunk_size=1250, which
# should be a character limit, while the model's 2k limit is in tokens, so the
# chunks should fit. NOTE(review): not verified against the tokenizer.
embedding_model = "togethercomputer/m2-bert-80M-2k-retrieval"

# Embed the whole test batch in a single request.
embedding_request = {"model": embedding_model, "input": chunks}
response = client.embeddings.create(**embedding_request)

response

EmbeddingResponse(id=None, model='togethercomputer/m2-bert-80M-2k-retrieval', object='list', data=[EmbeddingChoicesData(index=0, object=<ObjectType.Embedding: 'embedding'>, embedding=[-0.08451458, -0.014248785, -0.14626709, 0.076986425, 0.09913313, 0.24677683, -0.14302057, -0.05730217, -0.3053956, 0.16546549, -0.047718596, -0.4487139, 0.04270028, 0.0008288744, -0.10302171, 0.044810895, -0.37244594, -0.016276961, -0.26099703, -0.36959848, -0.11205893, -0.1555698, -0.1808031, 0.33073992, 0.06540301, -0.33176064, 0.4383163, 0.20263563, -0.1355899, -0.05916564, 0.11197033, -0.10887746, 0.26476175, -0.18410397, -0.21836288, 0.024222879, -0.194681, -0.3071166, 0.16951807, -0.19251555, -0.18918316, 0.069037035, 0.11512586, -0.047527496, -0.03645245, 0.16698909, -0.035255186, -0.28485394, 0.19098976, 0.040487915, -0.07627034, -0.048105195, 0.13759343, 0.029534834, 0.103885375, 0.04048426, 0.036174804, 0.078228824, -0.0044137556, 0.3171441, -0.16160066, -0.009427827, -0.16548172, -0.0010829723,

'list'

In [69]:
# Extrapolate the full embedding time from the 100-chunk test request.
total_n_chunks = df.n_chunks.sum()

time_100_chunks_secs = 7.1  # measured duration of the 100-chunk request

time_total_secs = time_100_chunks_secs * total_n_chunks / 100
time_total_minutes = time_total_secs / 60

print(f"Estimated time to embed all chunks: {time_total_minutes:.2f} minutes")


Estimated time to embed all chunks: 128.90 minutes


In [64]:
# Re-inspect the first row of df, i.e. the article with the most chunks (df is sorted by n_chunks, descending).
df.head(1)

Unnamed: 0,page_id,title,url,word_count,outlink_count,category_count,categories,template_count,external_link_count,last_modified,processed_text,raw_text,title_match,views,page_ids,percentile,url_pageviews,views_percentile,split_text,n_chunks,n_words_per_chunk
1,27958,Knattspyrnufélagið Valur,https://is.wikipedia.org/wiki/Knattspyrnuf%C3%A9lagi%C3%B0_Valur,14437,492,0,,259,153,2024-08-23T09:57:04Z,"\n\nValur er íslenskt íþróttafélag sem hefur aðstöðu að Hlíðarenda. Valur teflir fram liðum í knattspyrnu, handknatt...",{{Knattspyrnulið\n| Fullt nafn = Knattspyrnufélagið Valur\n| Mynd = [[Mynd:Valur.svg|250x250dp]]\n| Gælunafn = Valsa...,knattspyrnufélagið valur,4356.0,"{27958, null}",0.998827,https://is.wikipedia.org/w/index.php?curid=27958,0.997463,"[Valur er íslenskt íþróttafélag sem hefur aðstöðu að Hlíðarenda. Valur teflir fram liðum í knattspyrnu, handknattlei...",100,"[213, 161, 80, 209, 211, 127, 126, 205, 150, 115, 113, 152, 183, 243, 189, 240, 100, 144, 94, 251, 173, 95, 202, 63,..."


**works nicely**

#### Next steps:

 - Use the create_chunks() function in chunk_utils instead of split_text()
    - This will also add metadata to the chunks
 - Insert vectors to pinecone



