In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer

from pathlib import Path
import os
from data_io import get_book

In [2]:
# Locations of the metadata table and of the per-book count/token directories.
metadata_filename = 'metadata.csv'
counts_dirname = 'counts'
tokens_dirname = 'tokens'

# Minimum number of works an author must have to be kept.
MIN_WORKS_PER_AUTHOR = 10

metadata_df = pd.read_csv(metadata_filename)

# Keep rows whose `language` field is exactly the string "['en']" and whose
# `type` is 'Text'.
filtered_df = metadata_df[(metadata_df.language == "['en']") & (metadata_df.type == 'Text')]

# Keep only the columns used downstream and drop rows missing any of them.
SELECTED_COLUMNS = ['id', 'title', 'author', 'authoryearofbirth', 'authoryearofdeath']
filtered_df = filtered_df.dropna(subset=SELECTED_COLUMNS)
filtered_df = filtered_df[SELECTED_COLUMNS]
filtered_df = filtered_df.reset_index(drop=True)

# Restrict to authors that appear at least MIN_WORKS_PER_AUTHOR times.
author_count = filtered_df['author'].value_counts()
many_works_author = author_count[author_count >= MIN_WORKS_PER_AUTHOR]
# drop=True keeps the old row labels from leaking in as a stray 'index' column,
# consistent with the reset_index call above.
filtered_df = filtered_df[filtered_df.author.isin(many_works_author.index)].reset_index(drop=True)

# `in` applied to a Series tests the index labels, not the stored values, so
# the membership test has to run against the underlying values.
'PG8700' in filtered_df.id.to_numpy()

False

In [3]:
# Draw a reproducible sample of 50 books. drop=True discards the pre-sampling
# row labels instead of adding them as an extra column, which also keeps this
# cell re-runnable (a second plain reset_index() would collide on column names).
filtered_df = filtered_df.sample(n=50, random_state=2).reset_index(drop=True)

# Load each sampled book as one space-joined string; ids whose text cannot be
# loaded are collected so they can be removed from the frame afterwards.
docs = []
docs_unavail_pg_ids = []
for pg_id in filtered_df.id:
    try:
        doc = ' '.join(get_book(pg_id, tokens_dirname, level='tokens'))
    except Exception:
        # Best effort: record the failing id and continue. Catching Exception
        # rather than using a bare `except` lets KeyboardInterrupt/SystemExit
        # propagate, so the loop can still be interrupted.
        docs_unavail_pg_ids.append(pg_id)
    else:
        docs.append(doc)

docs_unavail_pg_ids

['PG6248']

In [4]:
# Remove the books whose text could not be loaded so the frame's rows line up
# with the entries collected in `docs`.
filtered_df = filtered_df[~filtered_df.id.isin(docs_unavail_pg_ids)]
# Sanity check against the values: `in` on a Series would only look at the
# index labels and could never find a string id there.
'PG6248' in filtered_df.id.to_numpy()

False

In [5]:
# Reduce the frame to the book id and author, then append the loaded documents
# as a third column named 'text' (one entry of `docs` per remaining row).
filtered_df = filtered_df.loc[:, ['id', 'author']].assign(text=docs)

In [6]:
# Word-level bag-of-words counts over unigrams and bigrams, with scikit-learn's
# built-in English stop-word list applied.
vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    analyzer='word',
)
# Learn the vocabulary and build the document-term matrix in one pass.
X = vectorizer.fit_transform(filtered_df['text'])
# Display the learned vocabulary terms.
vectorizer.get_feature_names_out()

array(['aa', 'aa coom', 'aa doa', ..., 'νοῡς like', 'στοά',
       'στοά translator'], dtype=object)

In [7]:
# Show the document-term counts as a dense array.
# NOTE(review): this converts all of X to a dense array just for display; with
# the shape reported further down (49 x 1,668,521) that is a large allocation,
# so consider displaying a small slice instead if memory is tight.
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
# The matrix exposes its shape directly, so there is no need to build a dense
# copy of every cell just to read the dimensions.
X.shape

(49, 1668521)