# Part 1: Books3 Dataset
datasource: https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home

For this dataset, we'll be scraping copyright tags and filenames for each of the 193,383 books in the dataset.
In part 2, we'll use string matching to match the scraped filenames to the Goodreads dataset.

In [8]:
import os
import pandas as pd
import re
import pickle

# create list of all characters 0-9, A-Z
characters = [chr(i) for i in range(48, 58)] + [chr(i) for i in range(65, 91)]

characters.append('0_Other')

In [16]:
alltitles = []
for character in characters:
    directorypath = './Bibliotik/{}'.format(character)
    filenames = os.listdir(directorypath.format(character))
    for filename in filenames:
        alltitles.append(character+ '/' + filename)

In [18]:
df = pd.DataFrame(alltitles, columns=['filepath'])

df['title'] = df['filepath'].apply(lambda x: x.replace('.epub.txt', ''))
df['title'] = df['title'].apply(lambda x: x.split('/', 1)[1])
df.head()

Unnamed: 0,filepath,title
0,0/059600298X_CPP.epub.txt,059600298X_CPP
1,0/01 - Alec Dunn.epub.txt,01 - Alec Dunn
2,0/03 Cole_ Ninja Of Earth (Scholastic) - Greg ...,03 Cole_ Ninja Of Earth (Scholastic) - Greg Fa...
3,0/0596006977_CNutshell.epub.txt,0596006977_CNutshell
4,0/091 - The Berenstain Bears and the In-Crowd ...,091 - The Berenstain Bears and the In-Crowd - ...


In [88]:
copyrighttags = ['COPYRIGHT', '©']
# find copyright in each file and print surrounding lines
def find_cpright(filepath):
    with open(filepath, 'r') as f:
        text = f.read()
        for tag in copyrighttags:
            matches = re.findall(tag, text, re.IGNORECASE)
            if matches:
                matchtexts = []
                for match in matches:
                    start = text.index(match)
                    end = min(start + 100, len(text))
                    matchtexts.append(text[start:end])
                return [filepath, matchtexts]
            else:
                continue

In [89]:
directorypath = './Bibliotik/{}'
pathlist = df['filepath'].to_list()

cprightlist = []

for path in pathlist:
    filepath = find_cpright(directorypath.format(path))
    cprightlist.append(filepath)

In [155]:
with open('cprightlist.pkl', 'wb') as f:
    pickle.dump(cprightlist, f)

In [1]:
'''
It's important to note that while Spacy is fairly accurate, it's not perfect. 
In their documentation, their largest language model 'en_core_web_trf' has a 90% accuracy rate on Named Entity Recognition tasks.
This accuracy rate could introduce bias into our analysis. For example, depending on how the model is trained, it may be more likely to identify common English names as people.
This would bias our data towards English names with non-English names less likely to be recognized as people.
However, for the scope of this project, we will work with the assumption that the model is accurate enough for our purposes.

'''

import spacy
nlp = spacy.load('en_core_web_trf')

In [153]:
def find_entities(item):
    entitydict = {
        'author': '',
        'title': '',
        'copyright': [],
        'copyrightYear': '',
        'filepath': '',
    }

    fp, texts = item

    fp_stripped = fp.replace('.epub.txt', '').replace('./Bibliotik/', '').split('/', )[1]
    fp_stripped = fp_stripped.replace('_', ' ').replace('[', '').replace(']', '')
    title, author = fp_stripped.rsplit(' - ', 1) if ' - ' in fp_stripped else (fp_stripped, 'Unknown')

    if nlp(author).ents and nlp(author).ents[0].label_ in {'PERSON', 'ORG'}:
        entitydict['author'] = nlp(author).ents[0].text
        entitydict['title'] = title
    elif nlp(title).ents and nlp(title).ents[0].label_ in {'PERSON', 'ORG'}:
        entitydict['author'] = nlp(title).ents[0].text
        entitydict['title'] = author
    else:
        entitydict['author'] = author
        entitydict['title'] = title

    if isinstance(texts, str):
        texts = [texts]

    for text in texts:
        text = text.replace('\n', ' ')
        textdoc = nlp(text)
        for ent in textdoc.ents:
            if ent.label_ in {'PERSON', 'ORG'} and any(tag in text[ent.start_char-12:ent.start_char] for tag in {'COPYRIGHT', '©', 'COPYRIGHT'}):
                entitydict['copyright'].append(ent.text)
            elif ent.label_ == 'DATE' and any(tag in text[ent.start_char-12:ent.start_char] for tag in {'COPYRIGHT', '©', 'COPYRIGHT'}):
                entitydict['copyrightYear'] = ent.text

    entitydict['filepath'] = fp
    return entitydict

In [17]:
entitylist = []
with open('entitylist.txt') as f:
    for line in f:
        entitylist.append(eval(line))

entity_df = pd.DataFrame(entitylist)
entity_df.to_parquet('books3copyright.parquet')