In [11]:
%reload_ext autoreload

In [1]:
import pandas as pd
import numpy as np
import json
from dutchanalyzer.config import *
from dutchanalyzer.utils import *
from pathlib import Path
from dotenv import load_dotenv
from io import StringIO
import datetime
import re

In [16]:
# Input files, all located under RAW_KAIKKI_DIR.
# "pp" = post-processed file provided by kaikki; "raw" = file without post-processing.

# Files stored in the 'en' subfolder.
enwikt_en_pp_file = Path(RAW_KAIKKI_DIR) / 'en' / 'kaikki.org-dictionary-English.jsonl'
enwikt_nl_pp_file = Path(RAW_KAIKKI_DIR) / 'en' / 'kaikki.org-dictionary-Dutch.jsonl'
en_raw_file = Path(RAW_KAIKKI_DIR) / 'en' / 'kaikki_en-raw-wiktextract-data.jsonl'
# NOTE(review): this NL raw extract is read from the 'en' subfolder, while the
# other nl-prefixed files below live in 'nl' -- confirm the location is intended.
nl_raw_file = Path(RAW_KAIKKI_DIR) / 'en' / 'kaikki_nl-raw-extract.jsonl'

# Files stored in the 'nl' subfolder.
nlwikt_en_pp_file = Path(RAW_KAIKKI_DIR) / 'nl' / 'kaikki.org-dictionary-Engels.jsonl'
nlwikt_nl_pp_file = Path(RAW_KAIKKI_DIR) / 'nl' / 'kaikki.org-dictionary-Nederlands.jsonl'

In [8]:
# Output location for this preprocessing run; the last path component is a
# hard-coded run label.
current_save_folder = Path(INTERIM_DATA_DIR) / 'preprocessing' / 'wikt' / '31-10-25'

# One sub-folder per input file; each is handed to make_raw_pages_df in the
# loading cells below.
en_en_concat_folder = current_save_folder / 'concat_dfs_en_en'
en_nl_concat_folder = current_save_folder / 'concat_dfs_en_nl'
nl_en_concat_folder = current_save_folder / 'concat_dfs_nl_en'
nl_nl_concat_folder = current_save_folder / 'concat_dfs_nl_nl'

In [4]:
# Count the lines of the two post-processed files in the 'en' folder up front;
# the totals are passed to make_raw_pages_df as `total_lines` further down.
total_lines_en_en = count_lines_with_progress(enwikt_en_pp_file)
print(total_lines_en_en)
total_lines_en_nl = count_lines_with_progress(enwikt_nl_pp_file)
print(total_lines_en_nl)

Counting Lines: 100%|██████████| 2.80G/2.80G [00:02<00:00, 1.39GB/s]


1423954


Counting Lines: 100%|██████████| 237M/237M [00:00<00:00, 1.36GB/s]

140922





In [None]:
# Build the DataFrame for the post-processed English file from the 'en' folder.
# The second return value is kept as `en_error`.
enwikt_en_df, en_error = make_raw_pages_df(
    enwikt_en_pp_file,
    en_en_concat_folder,
    'csv',   # NOTE(review): presumably the format of the intermediate saves -- confirm
    100000,  # NOTE(review): presumably the batch size between saves -- confirm
    lang_prefix='en_en',
    lang_codes=['en'],
    total_lines=total_lines_en_en,
)

Loading JSON objects:  50%|█████     | 711977/1423954 [01:15<01:15, 9390.09it/s] 


In [7]:
# Build the DataFrame for the post-processed Dutch file from the 'en' folder.
# The second return value is kept as `en_nl_error`.
enwikt_nl_df, en_nl_error = make_raw_pages_df(
    enwikt_nl_pp_file,
    en_nl_concat_folder,
    'csv',   # NOTE(review): presumably the format of the intermediate saves -- confirm
    100000,  # NOTE(review): presumably the batch size between saves -- confirm
    lang_prefix='en_nl',
    lang_codes=['nl'],
    total_lines=total_lines_en_nl,
)

Loading JSON objects:  50%|█████     | 70461/140922 [00:01<00:01, 35638.35it/s]


In [9]:
# Count the lines of the two post-processed files in the 'nl' folder; the
# totals are passed to make_raw_pages_df as `total_lines` further down.
total_lines_nl_en = count_lines_with_progress(nlwikt_en_pp_file)
print(total_lines_nl_en)
total_lines_nl_nl = count_lines_with_progress(nlwikt_nl_pp_file)
print(total_lines_nl_nl)

Counting Lines: 100%|██████████| 17.5M/17.5M [00:00<00:00, 1.20GB/s]


17441


Counting Lines: 100%|██████████| 1.20G/1.20G [00:00<00:00, 1.50GB/s]

611444





In [14]:
# Build the DataFrame for the post-processed "Engels" file from the 'nl' folder.
# The second return value is kept as `nl_en_error`.
nlwikt_en_df, nl_en_error = make_raw_pages_df(
    nlwikt_en_pp_file,
    nl_en_concat_folder,
    'csv',   # NOTE(review): presumably the format of the intermediate saves -- confirm
    100000,  # NOTE(review): presumably the batch size between saves -- confirm
    lang_prefix='nl_en',
    lang_codes=['en'],
    total_lines=total_lines_nl_en,
)

Loading JSON objects:  50%|█████     | 8721/17441 [00:00<00:00, 38455.14it/s]


In [10]:
# Build the DataFrame for the post-processed "Nederlands" file from the 'nl' folder.
# The second return value is kept as `nl_nl_error`.
nlwikt_nl_df, nl_nl_error = make_raw_pages_df(
    nlwikt_nl_pp_file,
    nl_nl_concat_folder,
    'csv',   # NOTE(review): presumably the format of the intermediate saves -- confirm
    100000,  # NOTE(review): presumably the batch size between saves -- confirm
    lang_prefix='nl_nl',
    lang_codes=['nl'],
    total_lines=total_lines_nl_nl,
)

Loading JSON objects:  50%|█████     | 305722/611444 [00:59<00:59, 5161.00it/s] 


In [None]:
# Write the full English frame as a single CSV into the 'en' raw folder.
enwikt_en_df.to_csv(Path(RAW_KAIKKI_DIR) / 'en' / 'enwikt_en_pp_df_raw.csv')

In [None]:
# Write the full Dutch frame as a single CSV into the 'en' raw folder.
enwikt_nl_df.to_csv(Path(RAW_KAIKKI_DIR) / 'en' / 'enwikt_nl_pp_df_raw.csv')

In [None]:
# Write both frames built from the 'nl' folder files as single CSVs next to
# their sources.
nlwikt_en_df.to_csv(Path(RAW_KAIKKI_DIR) / 'nl' / 'nlwikt_en_pp_df_raw.csv')
nlwikt_nl_df.to_csv(Path(RAW_KAIKKI_DIR) / 'nl' / 'nlwikt_nl_pp_df_raw.csv')

## Raw Wiktextract Files (No External Post-Processing)

In [20]:
# Output sub-folders for the two raw (non-post-processed) files.
raw_en_concat_folder = current_save_folder / 'concat_dfs_raw_en'
raw_nl_concat_folder = current_save_folder / 'concat_dfs_raw_nl'

In [17]:
# Count the lines of the raw EN file; the total is passed to
# make_raw_pages_df as `total_lines` when the file is loaded.
total_lines_raw_en = count_lines_with_progress(en_raw_file)
print(total_lines_raw_en)

Counting Lines: 100%|██████████| 21.3G/21.3G [00:15<00:00, 1.42GB/s]

10329308





In [18]:
# Count the lines of the raw NL file; the total is passed to
# make_raw_pages_df as `total_lines` when the file is loaded.
total_lines_raw_nl = count_lines_with_progress(nl_raw_file)
print(total_lines_raw_nl)

Counting Lines: 100%|██████████| 1.15G/1.15G [00:00<00:00, 1.47GB/s]

1050145





In [None]:
# Build the DataFrame for the raw EN file, requesting both 'en' and 'nl'
# language codes.
# The second return value is stored under its own name: the raw NL loading
# cell rebinds `error_list`, which would otherwise discard this run's result.
RAW_ENWIKT_DF, raw_en_error_list = make_raw_pages_df(
    en_raw_file,
    raw_en_concat_folder,
    'csv',   # NOTE(review): presumably the format of the intermediate saves -- confirm
    100000,  # NOTE(review): presumably the batch size between saves -- confirm
    lang_prefix='en',
    lang_codes=['en', 'nl'],
    total_lines=total_lines_raw_en,
)
# Backward-compatible alias for code that still reads `error_list`.
error_list = raw_en_error_list

Loading JSON objects:  50%|█████     | 5164654/10329308 [08:04<08:04, 10650.47it/s] 


In [None]:
# Build the DataFrame for the raw NL file, requesting both 'nl' and 'en'
# language codes.
# NOTE: this rebinds `error_list`; whatever an earlier cell stored under that
# name is replaced by this call's second return value.
RAW_NLWIKT_DF, error_list = make_raw_pages_df(
    nl_raw_file,
    raw_nl_concat_folder,
    'csv',   # NOTE(review): presumably the format of the intermediate saves -- confirm
    100000,  # NOTE(review): presumably the batch size between saves -- confirm
    lang_prefix='nl',
    lang_codes=['nl', 'en'],
    total_lines=total_lines_raw_nl,
)

Loading JSON objects:  50%|█████     | 525073/1050145 [00:19<00:19, 26810.06it/s]


In [None]:
# Write the complete raw EN frame as a single CSV into the 'en' raw folder.
RAW_ENWIKT_DF.to_csv(Path(RAW_KAIKKI_DIR) / 'en' / 'enwikt_raw_df.csv')

In [None]:
# Write the complete raw NL frame as a single CSV into the 'nl' raw folder.
RAW_NLWIKT_DF.to_csv(Path(RAW_KAIKKI_DIR) / 'nl' / 'nlwikt_raw_df.csv')

## Split Raw DataFrames by Language Code and Begin Processing

In [30]:
# Partition the raw EN frame by the value of its `lang_code` column.
r_enwikt_en_df = RAW_ENWIKT_DF.loc[RAW_ENWIKT_DF['lang_code'].eq('en')]
r_enwikt_nl_df = RAW_ENWIKT_DF.loc[RAW_ENWIKT_DF['lang_code'].eq('nl')]

In [29]:
# Frames derived from the RAW files carry an `r_` prefix to mark their origin.
# Partition the raw NL frame by the value of its `lang_code` column.
r_nlwikt_en_df = RAW_NLWIKT_DF.loc[RAW_NLWIKT_DF['lang_code'].eq('en')]
r_nlwikt_nl_df = RAW_NLWIKT_DF.loc[RAW_NLWIKT_DF['lang_code'].eq('nl')]

In [33]:
# Save the 'en' slice of the raw EN frame into the raw EN concat folder.
r_enwikt_en_df.to_csv(raw_en_concat_folder / 'r_enwikt_en_df.csv')

In [32]:
# Save the 'nl' slice of the raw EN frame into the raw EN concat folder.
r_enwikt_nl_df.to_csv(raw_en_concat_folder / 'r_enwikt_nl_df.csv')

In [34]:
# Save both language slices of the raw NL frame into the raw NL concat folder.
r_nlwikt_en_df.to_csv(raw_nl_concat_folder / 'r_nlwikt_en_df.csv')
r_nlwikt_nl_df.to_csv(raw_nl_concat_folder / 'r_nlwikt_nl_df.csv')