In [1]:
import os

# Reset the working directory to the repository root (the parent of the
# directory the kernel started in) so the relative `data/` and `utils` paths
# used in later cells resolve.
# The start directory is remembered the first time this cell runs, so running
# the cell again keeps pointing at the same parent instead of climbing one
# more level on every execution.
_NOTEBOOK_START_DIR = globals().get("_NOTEBOOK_START_DIR", os.getcwd())
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))
import polars as pl
import pandas as pd
import pyarrow.parquet as pypq
import textwrap
from pathlib import Path
import time

from tqdm.auto import tqdm
# Registers the `progress_apply` family on pandas objects.
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys

# Put <current working directory>/utils at the front of the module search
# path so the helper modules stored there can be imported by bare name.
_utils_path = os.path.join(os.getcwd(), 'utils')
sys.path.insert(0, _utils_path)
# NOTE(review): star import - the names it adds to the notebook namespace are
# not visible from this cell; prefer explicit imports once they are known.
from read_parquet import *

Using PyArrow strings!


In [3]:
# Input folders: both are read with one `<topic>.parquet` file per topic.
works2topic_by_topic_parquet_folder, author2work_by_topic_parquet_folder = (
    "data/works2topic_by_topic_parquet/",
    "data/author2work_by_topic_parquet/",
)

# Output folder: one `<topic>.parquet` file per topic is written here.
# It is created up front; `exist_ok=True` makes re-running the cell harmless.
filtered_works_data_by_topic_parquet_folder = "data/filtered_works_data_by_topic_parquet/"
os.makedirs(filtered_works_data_by_topic_parquet_folder, exist_ok=True)

In [4]:
# Topic ids are the file stems of the per-topic parquet files.
# Only `*.parquet` entries are considered, so stray directory entries (hidden
# files, checkpoint folders, ...) cannot turn into bogus topic names, and the
# list is sorted because `os.listdir` returns entries in arbitrary order.
topics = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(works2topic_by_topic_parquet_folder)
    if file_name.endswith('.parquet')
)

In [None]:
# For every topic, build one table with a row per work holding
# (work_id, date, topics, authors) and write it to
# `filtered_works_data_by_topic_parquet_folder/<topic>.parquet`.
max_topics = 3          # number of (score, topic) column pairs per work: (s0, t0) ... (s2, t2)
min_topic_score = 0.9   # a topic is kept for a work only if its score is >= this value
score_cols = [f's{i}' for i in range(max_topics)]
topic_cols = [f't{i}' for i in range(max_topics)]

for topic in tqdm(topics):
    works2topics_topic_df = pd.read_parquet(os.path.join(works2topic_by_topic_parquet_folder, f'{topic}.parquet'))

    # Filter at score 0.9: blank out every topic id whose score is below the
    # threshold. The test is written as `score >= threshold` (rather than
    # dropping `score < threshold`) so that a missing score also fails the
    # filter instead of slipping through.
    confident_topic_cols = [
        works2topics_topic_df[t_col].where(works2topics_topic_df[s_col].astype(float) >= min_topic_score)
        for s_col, t_col in zip(score_cols, topic_cols)
    ]

    # get list of topics per work
    # Iterating the columns in lockstep avoids a row-wise DataFrame.apply,
    # which builds a Series for every row, and also behaves for an empty frame.
    topics_per_work = [
        [t for t in row_topics if pd.notna(t)]
        for row_topics in zip(*confident_topic_cols)
    ]
    has_topics = pd.Series(
        [len(work_topics) > 0 for work_topics in topics_per_work],
        index=works2topics_topic_df.index,
        dtype=bool,
    )
    works2topics_topic_df = (
        works2topics_topic_df
        .rename(columns={'id': 'work_id'})[['work_id', 'date']]
        .assign(topics=pd.Series(topics_per_work, index=works2topics_topic_df.index, dtype=object))
    )

    # GDB TODO ADD DOI

    works2author_topic_df = pd.read_parquet(os.path.join(author2work_by_topic_parquet_folder, f'{topic}.parquet'), columns=['author_id', 'work_id', 'date'])
    # Collapse the one-row-per-(author, work) table into one row per work
    # carrying the list of its author ids.
    authors_by_work_df = (
        works2author_topic_df
        .groupby(['work_id', 'date'])['author_id']
        .agg(list)
        .reset_index()
        .rename(columns={'author_id': 'authors'})
    )

    # Filter out rows where len(topics) == 0 (done before the merge so those
    # rows are never joined), then keep only works present in both tables.
    filtered_df = pd.merge(works2topics_topic_df[has_topics], authors_by_work_df, on=['work_id', 'date'], how='inner')
    # Sort by date in ascending order. Dates that cannot be parsed become NaT
    # (errors='coerce'). `assign` returns a new frame, so no value is written
    # into a slice of another frame.
    filtered_df = (
        filtered_df
        .assign(date=pd.to_datetime(filtered_df['date'], errors='coerce'))
        .sort_values(by=['date', 'work_id'], ascending=True)
        .reset_index(drop=True)
    )
    filtered_df.to_parquet(os.path.join(filtered_works_data_by_topic_parquet_folder, f'{topic}.parquet'), compression = "brotli")

  1%|▏         | 58/4516 [08:10<6:46:48,  5.48s/it] 

In [6]:
# Display the table built by the most recent iteration of the per-topic loop
# above; `filtered_df` is only defined once that loop has run at least one
# iteration, and it reflects whichever topic was processed last.
filtered_df

Unnamed: 0,work_id,date,topics,authors
0,W3195138540,1970-01-01,"[T14064, T14470]",[A5090862154]
1,W3195333390,1970-01-01,[T14064],"[A5000351189, A5062658976]"
2,W2577896278,1974-01-01,[T14064],[A5080382136]
3,W655054654,1975-01-01,[T14064],[A5086115443]
4,W565300031,1982-01-01,[T14064],[A5084437035]
...,...,...,...,...
10429,W4410774481,2025-05-26,"[T14064, T13560, T10006]","[A5018920724, A5117712517]"
10430,W4410760926,2025-05-27,"[T14064, T10055, T12651]","[A5111042122, A5117707616, A5117707617, A51177..."
10431,W4410778805,2025-05-27,[T14064],"[A5112681361, A5109320604]"
10432,W4410786447,2025-05-27,[T14064],[A5117716867]
