### Motivation
For the library, we kept both the Results and Conclusion sections, that we joined before chunking.
But the policy analysis pipeline is more sensitive to noise and mostly interested in the content of the conclusion.
Thus, we keep only the conclusions.

In [1]:
import pandas as pd
from pyarrow.parquet import ParquetFile, ParquetWriter, read_table, write_table
from pyarrow import Table

### Filter for discussions

In [6]:
pqf = ParquetFile('data/results_conclusions_585k_2025-01-02.parquet')

In [7]:
pqf.metadata.num_rows

585775

In [8]:
iter = pqf.iter_batches(batch_size=50000, columns=['openalex_id', 'discussion'])
batch = next(iter).to_pandas()

In [9]:
batch

Unnamed: 0_level_0,discussion
openalex_id,Unnamed: 1_level_1
W3003650968,"(619), 10515 (Siregi), 10551 (Logawa), dan 105..."
W4394796190,The study's conclusions indicate that a heavy ...
W2052294550,"The analysis using daily precipitation data, h..."
W3139482061,As one of the major rice cultivators and coal ...
W1638538521,The algal coverage data showed that the green ...
...,...
W3090877303,"The performance of 32 simple ET0 alternatives,..."
W4229064146,The results of our study show that inflammatio...
W4390186956,"In recent decades, changes in the structure of..."
W4385603623,This study explored how Chinese adolescents co...


In [10]:
(batch == '').sum()

discussion    2766
dtype: int64

In [None]:
BS = 50_000
writer = None
output_file = 'data/conclusions_only.parquet'
total = 0
try:
    for batch in pqf.iter_batches(batch_size=BS, columns=['openalex_id', 'discussion']):
        df = batch.to_pandas()
        df = df[df['discussion'].str.strip() !=  '']

        # save progressively to results file
        table = Table.from_pandas(df)
        if writer is None:
            writer = ParquetWriter(output_file, table.schema)
        writer.write_table(table)
        total += len(df)
        print(f'Wrote {total} rows', end='\r')
finally:
    writer.close()

In [None]:
opf = ParquetFile(output_file)

In [None]:
opf.metadata.num_rows

557423

In [None]:
iter = opf.iter_batches(batch_size=50000)
batch = next(iter).to_pandas()

In [None]:
batch

Unnamed: 0_level_0,discussion
openalex_id,Unnamed: 1_level_1
W3003650968,"(619), 10515 (Siregi), 10551 (Logawa), dan 105..."
W4394796190,The study's conclusions indicate that a heavy ...
W2052294550,"The analysis using daily precipitation data, h..."
W3139482061,As one of the major rice cultivators and coal ...
W1638538521,The algal coverage data showed that the green ...
...,...
W1963569670,"In the present sample, the prevalence of abnor..."
W3145352840,The results obtained in experiments with air h...
W3122354699,Many countries are liberalising their trade po...
W4391022307,HCAT\nDespite the necessity of more quantitati...


In [None]:
(batch == '').sum()

discussion    0
dtype: int64

### Chunk discussions

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_text_splitters import SentenceTransformersTokenTextSplitter
from tqdm.notebook import tqdm
import pandas as pd

In [3]:
splitter = SentenceTransformersTokenTextSplitter(model_name='Qwen/Qwen3-Embedding-0.6B', tokens_per_chunk=1024)

In [4]:
pf = ParquetFile('data/conclusions_557k_2026-01-26.parquet')

In [31]:
iter = pf.iter_batches(batch_size=10000)
batch = next(iter).to_pandas()

In [32]:
splitter.count_tokens(text=batch.discussion.iloc[0])

68

In [8]:
def count_tokens(text: str) -> int:
    return splitter.count_tokens(text=text)

def process_row(idx, row):
    text = row.discussion
    chunks = splitter.split_text(text)
    return [(idx,i,c) for i, c in enumerate(chunks)]

#print(count_tokens(text=batch.discussion.iloc[0]))
#process_row(0, batch.iloc[0])

In [35]:
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_row, idx, row) for idx, row in batch.iterrows()]
    print("Submitted all tasks")
    results = []
    for future in tqdm(as_completed(futures), total=len(batch), desc="Processing rows"):
        results += future.result()

Submitted all tasks


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

In [None]:
pd.DataFrame(results, columns=['original_idx', 'chunk_idx', 'chunk_text'])

In [9]:
BS = 10_000
writer = None
output_file = 'data/chunked_conclusions.parquet'
total = 0
try:
    for batch in pf.iter_batches(batch_size=BS, columns=['openalex_id', 'discussion']):
        df = batch.to_pandas()
        
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(process_row, idx, row) for idx, row in df.iterrows()]
            results = []
            for future in as_completed(futures):
                results += future.result()

        table = Table.from_pandas(pd.DataFrame(results, columns=['original_idx', 'chunk_idx', 'chunk_text']))
        if writer is None:
            writer = ParquetWriter(output_file, table.schema)
        writer.write_table(table)
        total += len(df)
        print(f'Wrote {total} rows')
finally:
    writer.close()

Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 10000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 20000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 30000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 40000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 50000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 60000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 70000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 80000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 90000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 100000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 110000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 120000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 130000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 140000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 150000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 160000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 170000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 180000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 190000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 200000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 210000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 220000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 230000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 240000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 250000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 260000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 270000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 280000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 290000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 300000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 310000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 320000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 330000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 340000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 350000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 360000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 370000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 380000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 390000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 400000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 410000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 420000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 430000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 440000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 450000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 460000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 470000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 480000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 490000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 500000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 510000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 520000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 530000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 540000 rows


Processing rows:   0%|          | 0/10000 [00:00<?, ?it/s]

Wrote 550000 rows


Processing rows:   0%|          | 0/7423 [00:00<?, ?it/s]

Wrote 557423 rows


In [10]:
cpf = ParquetFile('data/chunked_conclusions.parquet')
cpf.metadata.num_rows

1254377

In [14]:
iter = cpf.iter_batches(batch_size=1000)
batch = next(iter).to_pandas()

In [19]:
table = read_table('data/chunked_conclusions.parquet')

In [22]:
renamed_table = table.rename_columns(['openalex_id', 'chunk_idx', 'text'])

In [24]:
write_table(renamed_table, 'data/chunked_conclusions_renamed.parquet')

In [6]:
df = pd.read_parquet('data/chunked_conclusions.parquet', columns=['openalex_id', 'chunk_idx'])

In [14]:
counts = df.groupby('openalex_id').chunk_idx.max().value_counts()
counts[:20]

chunk_idx
0     278317
1     118618
2      81004
3      37959
4      16965
5       8269
6       4594
7       2802
8       1845
9       1236
10       925
11       736
12       564
13       430
14       354
15       322
16       235
17       211
18       195
19       156
Name: count, dtype: int64

In [22]:
counts.index.max()

np.int64(520)

In [23]:
counts.cumsum()[:20]

chunk_idx
0     278317
1     396935
2     477939
3     515898
4     532863
5     541132
6     545726
7     548528
8     550373
9     551609
10    552534
11    553270
12    553834
13    554264
14    554618
15    554940
16    555175
17    555386
18    555581
19    555737
Name: count, dtype: int64

### Sample 10k texts
To avoid incomplete textes for this sample, we won't work with chunks but with a sample of the texts short enough to fit comfortaby in the model's context.
Practically, we'll take those that fit in 3 chunks or less.

In [3]:
conc = pd.read_parquet('data/chunked_conclusions_557k_2026-01-26.parquet', columns=['openalex_id', 'chunk_idx'])

In [6]:
chunk_counts = conc.groupby('openalex_id').chunk_idx.max()
chunk_counts

openalex_id
W1000416519    1
W1000499890    1
W1000757617    0
W1000792626    0
W1000905369    2
              ..
W994598208     0
W994607650     2
W997083526     0
W999532798     0
W999656656     1
Name: chunk_idx, Length: 557404, dtype: int64

In [7]:
chunk_counts.value_counts()

chunk_idx
0      278317
1      118618
2       81004
3       37959
4       16965
        ...  
126         1
163         1
106         1
120         1
136         1
Name: count, Length: 150, dtype: int64

In [17]:
# 0 = 1 chunk
# 1 = 2 chunks
# 2 = 3 chunks
SAMPLE_SIZE = 10_000
eligible = chunk_counts[chunk_counts <= 2].sample(SAMPLE_SIZE).index.values.tolist()
eligible

['W2469147023',
 'W4224242952',
 'W4388617202',
 'W4393087284',
 'W4312210460',
 'W2742097602',
 'W2092324318',
 'W2009309746',
 'W3119421705',
 'W2911270562',
 'W4398141644',
 'W4391185440',
 'W4323314204',
 'W4210505916',
 'W4391296666',
 'W4323075131',
 'W4385567849',
 'W4200565109',
 'W4366596274',
 'W4389223257',
 'W4380147842',
 'W4390805466',
 'W2975382526',
 'W2890090460',
 'W4399288307',
 'W2154538018',
 'W4288084584',
 'W2605499706',
 'W2893533321',
 'W3023005963',
 'W3217541616',
 'W4390202842',
 'W4224282304',
 'W2601539792',
 'W3134106831',
 'W1965286517',
 'W4390737420',
 'W3176395198',
 'W3032612633',
 'W3003266019',
 'W4410189033',
 'W2762529737',
 'W2922720930',
 'W4365147651',
 'W4322743891',
 'W4390600098',
 'W2005356531',
 'W4406696164',
 'W2766982489',
 'W4385364852',
 'W4385724317',
 'W3124610089',
 'W4307164518',
 'W4414282619',
 'W2780262699',
 'W2057208463',
 'W3095620834',
 'W4414155476',
 'W4378194620',
 'W4395958099',
 'W3207490359',
 'W2029962400',
 'W44010

In [19]:
pf = ParquetFile('data/conclusions_557k_2026-01-26.parquet')

In [21]:
# keep only rows with openalex_id in eligible
sample_df = pd.read_parquet('data/conclusions_557k_2026-01-26.parquet', filters=[('openalex_id', 'in', eligible)])

In [23]:
len(eligible)

10000

In [24]:
sample_df

Unnamed: 0_level_0,discussion
openalex_id,Unnamed: 1_level_1
W4387168299,This is the first study to assess the associat...
W3014778543,The main conclusions extracted from this study...
W3048162401,"LPG, CNG, ethanol and biodiesel are good candi..."
W1005805659,The results of the study revealed that growth ...
W4319601041,"In our use of participatory video, following R..."
...,...
W4390273040,Erosion is a significant issue impacting upon ...
W4298143484,"In this article, the individual and collective..."
W4387449772,Research findings\nAccording to the context of...
W2623732815,A FDS Input Files\nList of Figures\nPredicted ...


In [27]:
set(sample_df.index) - set(eligible)

set()

In [31]:
sample_df[sample_df.duplicated()]

Unnamed: 0_level_0,discussion
openalex_id,Unnamed: 1_level_1
W4383536051,"This study aimed to analyze 88 types of VOCs, ..."


In [34]:
sample_df = sample_df.drop_duplicates()
sample_df

Unnamed: 0_level_0,discussion
openalex_id,Unnamed: 1_level_1
W4387168299,This is the first study to assess the associat...
W3014778543,The main conclusions extracted from this study...
W3048162401,"LPG, CNG, ethanol and biodiesel are good candi..."
W1005805659,The results of the study revealed that growth ...
W4319601041,"In our use of participatory video, following R..."
...,...
W4390273040,Erosion is a significant issue impacting upon ...
W4298143484,"In this article, the individual and collective..."
W4387449772,Research findings\nAccording to the context of...
W2623732815,A FDS Input Files\nList of Figures\nPredicted ...


In [35]:
sample_df.discussion.str.len().describe()

count    10000.000000
mean      4947.200600
std       3755.628966
min          3.000000
25%       1723.750000
50%       3788.000000
75%       7731.000000
max      17705.000000
Name: discussion, dtype: float64

In [36]:
sample_df.to_parquet('data/conclusions_sample_10k_2026-01-27.parquet')