In [16]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from pathlib import Path
import os
from data_io import get_book

In [17]:
def author_to_onehop(df):
    """Append one indicator (one-hot) column per distinct author to ``df``.

    The new columns are named after the author values themselves (no prefix)
    and are placed to the right of the existing columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``author`` column. Any index is accepted,
        including the non-contiguous index left behind by boolean filtering.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df`` plus the indicator
        columns. The input frame is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``author`` column.
    """
    # Encode the column itself so the dummies keep df's index. Rebuilding the
    # series from a plain list would give it a fresh 0..n-1 index, and the
    # index-aligned concat below would then pair rows with the wrong labels
    # (and add all-NaN rows) for any frame that is not indexed 0..n-1.
    one_hot = pd.get_dummies(df['author'])
    return pd.concat([df, one_hot], axis=1)

In [18]:
metadata_filename = 'metadata.csv'
counts_dirname = 'counts'
tokens_dirname = 'tokens'

# An author is kept only if at least this many of their works survive the
# filtering below.
MIN_WORKS_PER_AUTHOR = 10

metadata_df = pd.read_csv(metadata_filename)

# Keep rows whose language field is exactly the string "['en']" and whose
# type is 'Text'.
filtered_df = metadata_df[(metadata_df.language == "['en']") & (metadata_df.type == 'Text')]

# Drop rows missing any of the fields used downstream, then keep only those fields.
SELECTED_COLUMNS = ['id', 'title', 'author', 'authoryearofbirth', 'authoryearofdeath']
filtered_df = filtered_df.dropna(subset=SELECTED_COLUMNS)
filtered_df = filtered_df[SELECTED_COLUMNS]
filtered_df = filtered_df.reset_index(drop=True)

author_count = filtered_df['author'].value_counts()
many_works_author = author_count[author_count >= MIN_WORKS_PER_AUTHOR]
# reset_index() is intentionally called without drop=True: it stores the
# previous row labels in an 'index' column, so the frame's column set stays
# the same as before for any later cell that refers to that column by name.
filtered_df = filtered_df[filtered_df.author.isin(many_works_author.index.to_numpy())].reset_index()

# Sanity check that a specific book id survived the filtering.
# `value in series` tests the index labels, not the stored values, so the
# membership test has to be made against the underlying values.
'PG8700' in filtered_df.id.to_numpy()

False

In [19]:
# Number of distinct authors to draw and number of tokens per text chunk.
N_AUTHORS = 50
chunk_size = 512

# Sample from the distinct author names. Sampling the raw column would draw
# rows, so prolific authors could be picked several times and fewer than
# N_AUTHORS different authors would end up in the splits.
sampled_authors = filtered_df.author.drop_duplicates().sample(n=N_AUTHORS, random_state=1)

train_ids = []
test_ids = []
val_ids = []

for author in sampled_authors:
    # Three distinct works per author: one each for train, test and validation.
    works = filtered_df[filtered_df.author == author].sample(n=3, random_state=1)
    train_id, test_id, val_id = works.id

    # NOTE: nothing here checks that the token file for a work exists and is
    # valid; works that fail to load are dropped later, per split.
    train_ids.append(train_id)
    test_ids.append(test_id)
    val_ids.append(val_id)

train_df = filtered_df[filtered_df.id.isin(train_ids)]
test_df = filtered_df[filtered_df.id.isin(test_ids)]
val_df = filtered_df[filtered_df.id.isin(val_ids)]


def _split_to_chunks(split_df):
    """Turn one split of the metadata into a frame of labelled text chunks.

    Parameters
    ----------
    split_df : pandas.DataFrame
        Rows of ``filtered_df`` belonging to one split (needs ``id`` and
        ``author`` columns).

    Returns
    -------
    (pandas.DataFrame, list)
        The chunk frame (a ``text`` column followed by the one-hot author
        columns, one row per chunk of at most ``chunk_size`` tokens) and the
        list of ids whose tokens could not be loaded.
    """
    # Give the split a fresh 0..n-1 index before encoding so the one-hot
    # columns are attached to the right rows even though the split was
    # produced by boolean filtering.
    labelled = author_to_onehop(split_df.reset_index(drop=True))
    # Whatever the encoder added on top of the metadata columns is the label set.
    label_columns = [col for col in labelled.columns if col not in split_df.columns]

    docs = []          # one token list per successfully loaded work
    kept_rows = []     # row positions in `labelled` matching `docs`, in order
    missing_ids = []
    for row_pos, pg_id in enumerate(labelled.id):
        try:
            tokens = get_book(pg_id, tokens_dirname, level='tokens')
        except Exception:
            # Best effort: skip works whose tokens cannot be loaded, but let
            # KeyboardInterrupt/SystemExit through.
            # TODO(review): narrow this to the exceptions get_book raises.
            missing_ids.append(pg_id)
        else:
            docs.append(tokens)
            kept_rows.append(row_pos)

    labels = labelled.iloc[kept_rows][label_columns]

    # Build plain records and create the frame once instead of concatenating
    # a Series per chunk.
    chunk_records = []
    for doc, label_row in zip(docs, labels.to_dict('records')):
        for start in range(0, len(doc), chunk_size):
            chunk_text = ' '.join(doc[start:start + chunk_size])
            chunk_records.append({'text': chunk_text, **label_row})
    return pd.DataFrame(chunk_records), missing_ids


df_arrs = []
# Ids (across all three splits) whose token files could not be loaded.
docs_unavail_pg_ids = []
for split_df in [train_df, test_df, val_df]:
    chunk_df, missing_ids = _split_to_chunks(split_df)
    df_arrs.append(chunk_df)
    docs_unavail_pg_ids.extend(missing_ids)

train_df, test_df, val_df = df_arrs

48
48
48


In [20]:
# Write each chunked split to its own CSV file, leaving out the row index.
for split_frame, csv_path in (
    (train_df, 'train.csv'),
    (test_df, 'test.csv'),
    (val_df, 'val.csv'),
):
    split_frame.to_csv(csv_path, index=False)