In [None]:
%matplotlib inline
import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pdb

### Retrieve, parse and combine db information

In [None]:
# Load the Zotero library from its CSV export into a DataFrame.
# The path is relative to the directory the notebook is run from.
zotero_csv = pd.read_csv('zotero.csv')  # library exported as csv

#### Zotero RDF
Use Zotero RDF export to get information about which items are in which collections.

In [None]:
# Parse the Zotero RDF export of the same library.
# It is used below to find out which items belong to which collections.
tree = ET.parse('zotero.rdf')  # library exported as rdf
root = tree.getroot()  # top-level element; its direct children are iterated below

In [None]:
# Map each top-level RDF element's rdf:about identifier to its Zotero key.
keys = {}
tags = []  # not used in this cell
for child in root:
    # attachments carry no relevant data and are not included in the csv
    if child.tag == '{http://www.zotero.org/namespaces/export#}Attachment':
        continue

    key = child.find('{http://www.zotero.org/namespaces/export#}key')
    if key is None:
        # elements without a Zotero key are skipped
        continue

    about = child.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about')
    keys[about] = key.text

In [None]:
# For every collection in the RDF export, record its title, the ids of its
# subcollections and the ids of the items it contains directly.
collections = {}
for collection in root.iter('{http://www.zotero.org/namespaces/export#}Collection'):
    # every dcterms:hasPart points either to a subcollection or to an item
    parts = [
        part.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource')
        for part in collection.findall('{http://purl.org/dc/terms/}hasPart')
    ]
    collections[collection.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about')] = {
        'title': collection.find('{http://purl.org/dc/elements/1.1/}title').text,
        'subcollections': {part for part in parts if part.startswith('#collection_')},
        'children': {part for part in parts if not part.startswith('#collection_')},
    }

In [None]:
# Immediate parent of every collection; None marks a collection without a parent.
collection_parents = {}
for parent_id, parent_info in collections.items():
    # each listed subcollection points back to the collection that lists it
    collection_parents.update(dict.fromkeys(parent_info['subcollections'], parent_id))
    # only fall back to None when no parent has been recorded for it so far
    collection_parents.setdefault(parent_id, None)

In [None]:
# Full chain of ancestors for every collection, ordered from the outermost
# ancestor down to the immediate parent (empty list for a root collection).
collection_ancestries = {}
for collection in collection_parents:
    lineage = []
    parent = collection_parents[collection]
    while parent is not None:
        # climb one level at a time: parent, grandparent, ...
        lineage.append(parent)
        parent = collection_parents[parent]
    lineage.reverse()  # collected bottom-up, stored top-down
    collection_ancestries[collection] = lineage

In [None]:
# Full display name of every collection: '... > grandparent > parent > child'
collection_fullnames = {}
for collection_id, collection in collections.items():
    # ancestors first (outermost to innermost), then the collection itself
    lineage = collection_ancestries[collection_id] + [collection_id]
    collection_fullnames[collection_id] = ' > '.join(
        collections[member]['title'] for member in lineage
    )

In [None]:
# For every item (by Zotero key), list the full names of the collections that contain it.
item_collections = {}
for collection_id, collection in collections.items():
    fullname = collection_fullnames[collection_id]
    for child in collection['children']:
        if child not in keys:
            # ignore references to items not in the keys dict
            continue
        item_collections.setdefault(keys[child], []).append(fullname)

In [None]:
# One row per item: its Zotero key and the list of collection full names it belongs to.
# The columns are named explicitly so the frame still has a 'Key' column
# (needed for merging on 'Key') when no item belongs to any collection.
collections_df = pd.DataFrame(
    [
        {'Key': item_key, 'collections': collection_names}
        for item_key, collection_names in item_collections.items()
    ],
    columns=['Key', 'collections'],
)

#### Combine data
Merge information about sources coming from the csv export file with information about collections coming from the rdf export file.

In [None]:
# Outer merge on 'Key': keeps every row of the csv export and every item that
# has collection information, whether or not it appears on the other side.
zotero_df = zotero_csv.merge(collections_df, how='outer', on='Key')

# Items that belong to no collection come out of the merge with a missing value.
# Anything that is not a list is replaced by an empty list; checking the type is
# more reliable than an identity test against np.nan.
zotero_df['collections'] = zotero_df['collections'].apply(
    lambda value: value if isinstance(value, list) else []
)

### What types of sources are there in the database?

There are different types of sources (journal articles, books, book sections, etc). As long as the author name is available, estimating the gender is relatively straightforward regardless of the source type.  
However, depending on the type of source, how to determine the contact information and the author's location might change.  
Therefore, a first step is to find out the proportion of each source type in the database. Note that the database has different collections, so both a general view and a per-collection view are provided.

In [None]:
# slim working table for this section: item key, item type and collection names
df = zotero_df[['Key', 'Item Type', 'collections']].copy()

# collapse collections into their root collections
# for example: a > b > c --> a
def only_root_collections(collections):
    """Return the distinct root collections of the given collection full names.

    Each full name has the form 'root > ... > parent > child'; only the part
    before the first ' > ' is kept. The order of the returned list is not
    specified.
    """
    return list({fullname.split(' > ')[0] for fullname in collections})

df['root_collections'] = df['collections'].apply(only_root_collections)

# unique root collections, sorted so that the bars appear in the same order on
# every run (iterating a set of strings is not stable between sessions);
# the comprehension also works when there are no items at all
root_collections = sorted({
    name
    for item_roots in df['root_collections']
    for name in item_roots
})

# long format: one row per (item, root collection) membership, built as a list
# of frames and concatenated once; 'any' holds every item, for the overall view
membership_frames = [df[['Key', 'Item Type']].assign(root_collection='any')]
for root_collection in root_collections:
    is_member = df['root_collections'].apply(
        lambda item_roots: root_collection in item_roots
    )
    membership_frames.append(
        df.loc[is_member, ['Key', 'Item Type']].assign(root_collection=root_collection)
    )
items_long = pd.concat(membership_frames, ignore_index=True)

# 'any' goes first in the plot
root_collections.insert(0, 'any')

# number of items per root collection and item type
type_counts = (
    items_long
        .groupby(['root_collection', 'Item Type'])
        .agg({'Key': 'count'})
        .reset_index()
        .rename(columns={'Key': 'count'})
)

# item types sorted from most to least frequent overall
item_types = (
    type_counts
        .groupby('Item Type')['count']
        .sum()
        .sort_values(ascending=False)
        .index
        .tolist()
)

# plot
sns.catplot(
    x='root_collection',
    y='count',
    hue='Item Type',
    data=type_counts,
    order=root_collections,
    hue_order=item_types,
    kind='bar'
)
plt.gcf().set_size_inches(15, 10)
plt.show()

# drop this section's working tables
del df, items_long, type_counts

## Author information
### Make an author database
Gender and provenance (where an author is based) are properties of authors, not of sources.

Therefore, it will be useful to expand the sources database (one row per source) into an author database (one row per author per source).

Some sources have 'Authors' (e.g., journal articles). Others have 'Editors' too (e.g., books with multiple authors). There may be other "person" categories. These will be obtained from the rdf export file. Note that some of these categories in the RDF file seem to be collapsed into the "Author" category in the CSV file (e.g., Podcaster -> Author).

- A 'Creator Role' column will be added to the author database indicating whether the person is an author, editor, etc., for the corresponding source.
- A 'Creator Order' column will be added indicating the position of the author/editor/etc. among the people with the same role for that source (zero-based: 0 is the 1st, 1 is the 2nd, ...).



In [None]:
# Kinds of agents (e.g., Author, Editor) that occur in the RDF export.
# Starting from every foaf:Person, the element three levels up is selected and
# its tag is turned into a label: namespace removed, capitalized, and the last
# character dropped (presumably the plural 's' - verify against the export).
# NOTE(review): capitalize() lower-cases everything after the first letter, so a
# camelCase tag loses its inner capitals - confirm this matches the csv columns.
agent_types = list({
    role_element.tag.rpartition('}')[2].capitalize()[:-1]
    for role_element in root.findall('.//{http://xmlns.com/foaf/0.1/}Person/../../..')
})

In [None]:
# which columns of the sources table hold agents/creators?
agent_columns = [column for column in zotero_df.columns if column in agent_types]

# expand the sources table: one record per creator per source
creator_columns = ['Key', 'Creator Order', 'Creator', 'Creator Role']
creator_records = []
for agent_column in agent_columns:
    for source_key, cell in zip(zotero_df['Key'], zotero_df[agent_column]):
        if not isinstance(cell, str):
            # empty cell (missing value): this source has nobody in this role
            continue
        # creators in a cell are separated by ';'
        for position, creator in enumerate(cell.split(';')):
            creator = creator.strip()  # drop the whitespace left around the separator
            if not creator:
                continue  # nothing between two separators
            creator_records.append({
                'Key': source_key,
                'Creator Order': position,  # zero-based position within the cell
                'Creator': creator,
                'Creator Role': agent_column,
            })

# explicit columns keep 'Key' available for the merge even without any creator
creators_df = pd.DataFrame(creator_records, columns=creator_columns)

# bring the remaining information about each source into the expanded table;
# the outer merge keeps sources without any creator (their creator fields are missing)
source_columns = [column for column in zotero_df.columns if column not in agent_columns]
authors_df = creators_df.merge(
    zotero_df[source_columns],
    how='outer',
    on='Key',
)

# TODO: decide how sources without authors should be handled downstream
    

In [None]:
# preview the first rows only instead of rendering the whole table
authors_df.head()

In [None]:
# NOTE(review): authors_df is already displayed earlier in the notebook; consider removing this cell
authors_df

### Gender

The Worldwide Gender-Name Dictionary is used to infer gender from given names:
> Julio Raffo, 2016.  
> "Worldwide Gender-Name Dictionary," WIPO Economics & Statistics Related Resources
> 10, World Intellectual Property Organization - Economics and Statistics Division.
> <https://ideas.repec.org/c/wip/eccode/10.html>

...

Working with the CSV export from Zotero, agents (authors, editors, etc) are separated with ';', and are displayed in the form  "\<surname\>, \<givenName\>".  
It will be assumed that if no comma is present, the agent is not a person but an institution instead, and gender will not be inferred.

In [None]:
# import Worldwide gender-name dictionary
# gender_name_dict = pd.