In [None]:
%matplotlib inline
import os
import pdb
import unicodedata
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Retrieve, parse and combine db information

In [None]:
# import zotero database
# 'zotero.csv' is read from the current working directory; later cells rely
# on its 'Key' and 'Item Type' columns
zotero_csv = pd.read_csv('zotero.csv')  # library exported as csv

#### Zotero RDF
Use Zotero RDF export to get information about which items are in which collections.

In [None]:
# import zotero rdf database
# it will be used to get information about collections
# 'zotero.rdf' is read from the current working directory
tree = ET.parse('zotero.rdf')  # library exported as rdf
# root element of the parsed document; the cells below search it for items,
# collections and persons
root = tree.getroot()

In [None]:
# Map each rdf 'about' identifier to the matching zotero 'key'.
# Only direct children of the rdf root that carry a zotero key are kept.
# Attachments are skipped: they have no relevant data and are not included
# in the csv export.
tags = []
keys = {}
for element in root:
    if element.tag == '{http://www.zotero.org/namespaces/export#}Attachment':
        continue
    key_element = element.find('{http://www.zotero.org/namespaces/export#}key')
    if key_element is None:
        continue
    keys[element.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about')] = key_element.text

In [None]:
# Describe every collection found in the rdf export:
#   collection id -> {'title', 'subcollections' (ids), 'children' (item ids)}
# A 'hasPart' resource starting with '#collection_' is a nested collection;
# any other resource is treated as an item of the collection.
collections = {}
for node in root.iter('{http://www.zotero.org/namespaces/export#}Collection'):
    parts = [
        part.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource')
        for part in node.findall('{http://purl.org/dc/terms/}hasPart')
    ]
    collections[node.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about')] = {
        'title': node.find('{http://purl.org/dc/elements/1.1/}title').text,
        'subcollections': {ref for ref in parts if ref.startswith('#collection_')},
        'children': {ref for ref in parts if not ref.startswith('#collection_')},
    }

In [None]:
# Immediate parent of each collection (None for collections nobody lists as
# a subcollection).
collection_parents = {}
for parent_id, info in collections.items():
    # every listed subcollection points back to the collection listing it
    collection_parents.update(dict.fromkeys(info['subcollections'], parent_id))
    # register the collection itself; the entry is overwritten later if
    # another collection turns out to be its parent
    collection_parents.setdefault(parent_id, None)

In [None]:
# Full ancestry of each collection, ordered from the outermost ancestor down
# to the immediate parent (empty list for top-level collections).
collection_ancestries = {}
for start_id in collection_parents:
    lineage = []
    ancestor = collection_parents[start_id]
    # climb parent by parent until a collection without parent is reached
    while ancestor is not None:
        lineage.append(ancestor)
        ancestor = collection_parents[ancestor]
    # collected from the parent upwards; flip to get the outermost first
    lineage.reverse()
    collection_ancestries[start_id] = lineage

In [None]:
# get fullname for each collection: '... > grandparent > parent > child'
# The titles of the ancestors (outermost first) are followed by the
# collection's own title and joined with ' > '.
collection_fullnames = {}
for collection_id, collection in collections.items():
    name_parts = [
        collections[ancestor]['title']
        for ancestor in collection_ancestries[collection_id]
    ]
    # list.append returns None, so the list is extended in its own statement
    name_parts.append(collection['title'])
    collection_fullnames[collection_id] = ' > '.join(name_parts)

In [None]:
# Collections per item: zotero key -> list of collection full names.
item_collections = {}
for collection_id, info in collections.items():
    fullname = collection_fullnames[collection_id]
    for resource in info['children']:
        if resource not in keys:
            # reference to something without an entry in the keys dict
            continue
        item_collections.setdefault(keys[resource], []).append(fullname)

In [None]:
# One row per item: its zotero key and the list of collection full names.
# The columns are stated explicitly so the frame still has a 'Key' column
# (needed to merge on 'Key') when no item belongs to any collection.
collections_df = pd.DataFrame(
    [
        {'Key': item_key, 'collections': fullnames}
        for item_key, fullnames in item_collections.items()
    ],
    columns=['Key', 'collections'],
)

#### Combine data
Merge information about sources coming from the csv export file with information about collections coming from the rdf export file.

In [None]:
# combine the csv export with the per-item collections, keeping items that
# appear in only one of the two sources
zotero_df = zotero_csv.merge(collections_df, how='outer', on='Key')

# if item belongs to no collection, replace nan with empty list
# (checking for a list instead of `x is np.nan`: the identity test only
# recognises the np.nan singleton, not every NaN value)
zotero_df['collections'] = zotero_df['collections'].apply(
    lambda x: x if isinstance(x, list) else []
)

# collapse collections into their root collections
# for example: a > b > c --> a
def only_root_collections(collections):
    """Return the distinct root collections of the given collection full names.

    Parameters
    ----------
    collections : iterable of str
        Full names of the form 'root > ... > parent > child'.

    Returns
    -------
    list of str
        The text before the first ' > ' of every full name, without
        duplicates. The order of the list is not defined.
    """
    return list({fullname.split(' > ')[0] for fullname in collections})

# root collections of every item
zotero_df['root_collections'] = zotero_df['collections'].apply(only_root_collections)

# unique root collections across all items
# (a set comprehension avoids concatenating every list with Series.sum(),
# works on an empty frame, and sorting gives the same column order on every run)
root_collections = sorted({
    root_collection
    for item_roots in zotero_df['root_collections']
    for root_collection in item_roots
})

# add one column per root collection, and show true/false if item belongs to it
for root_collection in root_collections:
    zotero_df['RC: ' + root_collection] = zotero_df['root_collections'].apply(
        lambda item_roots: root_collection in item_roots
    )

### What type of sources are there in the database?

There are different types of sources (journal articles, books, book sections, etc). As long as the author name is available, estimating the gender is relatively straightforward regardless of the source type.  
However, depending on the type of source, how to determine the contact information and the author's location might change.  
Therefore, a first step would be to find out what is the proportion of each source type in the database. Note that the database has different collections, so a general view and a per-collection view will be provided.

In [None]:
# boolean membership columns, one per root collection ('RC: <name>')
root_collections = [column for column in zotero_df.columns if column.startswith('RC: ')]

# wide table: one row per item. 'any' is a pseudo root collection that holds
# every item, so the plot also shows the overall distribution.
# .assign returns a new frame instead of writing into a slice of zotero_df.
items_wide = zotero_df.loc[:, ['Key', 'Item Type'] + root_collections].assign(any=True)
root_collections.insert(0, 'any')

# transform wide table into long table: one row per (item, root collection)
# the pieces are concatenated once instead of growing a frame inside a loop
items_long = pd.concat(
    [
        items_wide
        .loc[items_wide[root_collection], ['Key', 'Item Type']]
        .assign(root_collection=root_collection)
        for root_collection in root_collections
    ],
    ignore_index=True,
)

# number of items per root collection and item type
type_counts = (
    items_long
    .groupby(['root_collection', 'Item Type'])
    .agg({'Key': 'count'})
    .reset_index()
    .rename(columns={'Key': 'count'})
)

# item types sorted from most to least frequent overall (used as hue order)
item_types = type_counts.groupby('Item Type')['count'].sum().sort_values(ascending=False).index

# plot
sns.catplot(
    x='root_collection',
    y='count',
    hue='Item Type',
    data=type_counts,
    order=root_collections,
    hue_order=item_types,
    kind='bar'
)
plt.gcf().set_size_inches(15, 10)
plt.show()

# drop the intermediates of this cell from the notebook namespace
del items_wide, items_long, type_counts, root_collections

## Author information
### Make an author database
Gender and place of origin are properties of the authors, not of the sources.

Therefore, it will be useful to expand the sources database (one row per source) into an author database (one row per author per source).

Some sources have 'Authors' (e.g., journal articles). Others have 'Editors' too (e.g., books with multiple authors). There may be other "person" categories. These will be obtained from the rdf export file. Note that some of these categories in the RDF file seem to be collapsed into the "Author" category in the CSV file (e.g., Podcaster -> Author).

- A 'Creator Role' column will be added to the author database indicating whether the person is an author, editor, etc., for the corresponding source.
- A 'Creator Order' column will be added indicating the position of the author/editor/etc. within that role for the source (0 for the first, 1 for the second, and so on).



In [None]:
# types of agents (e.g., Author, Editor)
# Each Person element is looked up three levels (its great-grandparent
# element); that element's tag, without namespace, names the agent type.
# The tag is capitalized and its last character dropped
# (presumably a plural 's', e.g. 'authors' -> 'Author'; TODO confirm).
agent_types = list({
    element.tag.split('}')[-1].capitalize()[:-1]
    for element in root.findall('.//{http://xmlns.com/foaf/0.1/}Person/../../..')
})

In [None]:
# what columns in the original df correspond to agent/creators?
agent_columns = [column for column in zotero_df.columns if column in agent_types]

# expand the original df, one row per creator per source
#   'Creator Order': 0-based position of the creator within its role
#   'Creator':       name as exported, with surrounding whitespace removed
#                    (splitting on ';' alone would keep the blank that follows
#                    each separator)
#   'Creator Role':  name of the column the creator was read from
# Sources whose agent column is empty (not a string) contribute no rows here;
# the outer merge below brings them back with missing creator fields.
creator_records = [
    {
        'Key': key,
        'Creator Order': order,
        'Creator': creator.strip(),
        'Creator Role': agent_column,
    }
    for agent_column in agent_columns
    for key, creators in zip(zotero_df['Key'], zotero_df[agent_column])
    if isinstance(creators, str)
    for order, creator in enumerate(creators.split(';'))
]

# explicit columns keep the frame mergeable even when no creator was found
authors_df = pd.DataFrame(
    creator_records,
    columns=['Key', 'Creator Order', 'Creator', 'Creator Role'],
)

# bring information of complete df into the expanded df
authors_df = authors_df.merge(
    zotero_df[[column for column in zotero_df.columns if column not in agent_columns]],
    how='outer',
    on='Key',
)
authors_df.head()

### Gender

Using Worldwide gender-name dictionary to infer gender from name
> Julio Raffo, 2016.  
> "Worldwide Gender-Name Dictionary," WIPO Economics & Statistics Related Resources
> 10, World Intellectual Property Organization - Economics and Statistics Division.
> <https://ideas.repec.org/c/wip/eccode/10.html>

They have information from 14 different sources. Each source may have information from more than one country. Hence, the units of information are name-country pairs (and their reported gender).  
They report that there is 10% overlap between sources (i.e., names provided by more than one source), but that only in some cases (0.7%) there are conflicts between sources (i.e., name-country pairs which are reported with different genders among sources).

This source provides four different databases:
- WGND_source: this is the original data. A list of name-country pairs, and the gender reported by each of the 14 information sources.
- WGND_country: a list of name-country pairs, with the consensus gender among sources (given that the rate of conflict is low).
- WGND_noctry: including only names without conflict among countries (hence, the shortest database).
- WGND_langctry: an expanded database, where each name-country pair was expanded to include other countries which speak the same language.
For more information, see the original paper: https://www.wipo.int/edocs/pubdocs/en/wipo_pub_econstat_wp_33.pdf

We will use the WGND_country database here.
...

Working with the CSV export from Zotero, agents (authors, editors, etc) are separated with ';', and are displayed in the form  "\<surname\>, \<givenName\>".  
It will be assumed that if no comma is present, the agent is not a person but an institution instead, and gender will not be inferred.

In [None]:
# import Worldwide gender-name dictionary
# The default missing-value markers are switched off and only the empty
# string '' is read as 'nan': otherwise 'na' would be taken as 'nan', and
# there are names and codes 'na' in the db.
names = pd.read_csv(
    'wgnd_ctry.csv',
    keep_default_na=False,
    na_values=[''],
)
names.tail()

In [None]:
def _normalize_given_names(given_names):
    """Return given names without accents, upper-cased, with dashes as spaces."""
    # drop accents: decompose the characters and discard the combining marks
    decomposed = unicodedata.normalize('NFKD', given_names)
    plain = ''.join(c for c in decomposed if not unicodedata.combining(c)).upper()

    # replace every run of unicode dash characters (category 'Pd') with one space
    pieces = []
    previous_was_dash = False
    for char in plain:
        is_dash = unicodedata.category(char) == 'Pd'
        if not is_dash:
            pieces.append(char)
        elif not previous_was_dash:
            pieces.append(' ')
        previous_was_dash = is_dash

    # remove extra spaces at beginning and end
    return ''.join(pieces).strip()


def _find_name_matches(name_string):
    """Return (rows of `names` matching the name, whether it had to be split)."""
    matches = names.loc[names['name'] == name_string, :]
    if len(matches) > 0:
        # the name was found "as-is"
        return matches, False

    # the name was not found. Trying its whitespace-separated parts separately
    part_matches = [
        names.loc[names['name'] == part, :] for part in name_string.split()
    ]
    part_matches = [part for part in part_matches if len(part) > 0]
    if part_matches:
        return pd.concat(part_matches), True
    return matches, True


def get_gender(fullname):
    """Infer the gender of a creator from the given names.

    The lookup uses the module-level `names` table (columns 'name', 'code'
    and 'gender'). Only the given names are used, never the surname.

    Parameters
    ----------
    fullname : str
        Creator in the form '<surname>, <given names>'. Any value that is
        not a string (e.g. nan or None) is reported as 'No name'.

    Returns
    -------
    pandas.Series
        With the entries:
        - 'gender': consensus gender, i.e. the most frequent gender among the
          matching rows; on a tie all tied genders are joined in sorted order
          (e.g. 'FM'). When nothing can be inferred it holds 'No name',
          'Unexpected name format' (not exactly one ', ' separator) or
          'Name(s) not found', and every other entry is None.
        - 'agreement': fraction of the matching rows that carry the consensus
          gender.
        - 'split': False if the given names were found "as-is", True if they
          had to be split into parts to be found.
        - 'male' / 'female' / 'ambiguous': details of the matches, as a dict
          {name: [codes]} for the rows with gender 'M', 'F' and '?'.
    """
    result = {
        'gender': None,
        'agreement': None,
        'split': None,
        'male': None,
        'female': None,
        'ambiguous': None
    }

    if not isinstance(fullname, str):
        # missing creator
        result['gender'] = 'No name'
        return pd.Series(result)

    name_parts = fullname.split(', ')
    if len(name_parts) != 2:
        # without exactly one ', ' there is no surname/given-names pair
        result['gender'] = 'Unexpected name format'
        return pd.Series(result)

    # keep given names only (do not use surname)
    name_string = _normalize_given_names(name_parts[-1])
    matching_names, did_split = _find_name_matches(name_string)
    if len(matching_names) == 0:
        result['gender'] = 'Name(s) not found'
        return pd.Series(result)

    result['split'] = did_split

    # get consensus gender and agreement index
    # in case of match (equal counts) all most frequent genders are returned
    # e.g., 'FM' if 'F' and 'M' are equally frequent
    gender_counts = matching_names['gender'].value_counts(normalize=True)
    max_gender_freq = gender_counts.max()
    result['gender'] = ''.join(
        gender_counts[gender_counts == max_gender_freq]
        .index
        .sort_values()
        .tolist()
    )
    result['agreement'] = max_gender_freq

    # return details per gender
    for gender_code, label in (('F', 'female'), ('M', 'male'), ('?', 'ambiguous')):
        gender_rows = matching_names.loc[matching_names['gender'] == gender_code, :]
        result[label] = {
            name: group['code'].tolist()
            for name, group in gender_rows.groupby('name')
        }

    return pd.Series(result)

In [None]:
# gender.csv caches the (slow) gender inference
gender_path = 'gender.csv'

# The cache is only reused when it still lines up row by row with authors_df:
# the cells below join both frames by position, so a cache computed from a
# different version of the library would silently attach genders to the
# wrong creators.
gender_df = None
if os.path.exists(gender_path):
    cached_gender_df = pd.read_csv(gender_path)
    if (
        'Key' in cached_gender_df.columns
        and cached_gender_df['Key'].tolist() == authors_df['Key'].tolist()
    ):
        gender_df = cached_gender_df
    else:
        print(gender_path + ' does not match the current authors table; recomputing it')

if gender_df is None:
    gender_df = authors_df.loc[:, 'Creator'].apply(get_gender)
    gender_df = pd.concat([authors_df.loc[:, ['Key', 'Creator', 'Creator Role', 'Creator Order']], gender_df], axis=1)
    gender_df.to_csv(gender_path, index=False)

In [None]:
# output authors + gender database
# The identifying columns already present in authors_df are left out of the
# gender table; both tables are then placed side by side.
df = pd.concat(
    [
        authors_df,
        gender_df.drop(
            columns=['Key', 'Creator', 'Creator Role', 'Creator Order'],
            errors='ignore',
        ),
    ],
    axis=1,
)
df.to_csv('authors.csv', index=False)
del df

In [None]:
# quick gender overview: number of first authors per inferred gender
# only sources in at least one of these root collections are counted
collections_include = ['**CITED', '*WORK IN']

df = (
    pd.concat([authors_df, gender_df[['gender', 'agreement']]], axis=1)
    .assign(root_collections=lambda frame: frame['collections'].apply(only_root_collections))
    .loc[
        lambda frame: (
            (frame['Creator Role'] == 'Author')      # authors only
            & (frame['Creator Order'] == 0)          # first author of the source
            & (frame['agreement'] > .75)             # confident gender inference
            & frame['root_collections'].apply(
                lambda roots: not set(collections_include).isdisjoint(roots)
            )
        ),
        :
    ]
    .groupby('gender')['Key']
    .count()
)
df

#### Possible ambiguous bias from 'CA' name-country pairs?
I was also noticing some ambiguous-gender bias coming from 'CA' name-country pairs. I.e., it looked as if in many cases all name-country pairs agreed on either 'male' or 'female', but the 'CA' pair disagreed with '?' (ambiguous). Hence, I'm plotting the name count per gender and code to see if 'CA' has a higher proportion of 'ambiguous' names.

In [None]:
# plot name count per country and gender
k = 5  # base number of codes to keep

# NOTE(review): the table is still wide (one row per code) when it is cut,
# so [:k*3] keeps the 3*k codes with most names rather than k - confirm intent.
df = (
    pd.crosstab(names.code, names.gender, margins=True)
    .reset_index()
    # exclude the margin row (totals per gender)
    .loc[lambda table: table.code != 'All', :]
    # sort by the margin column (i.e., total name count per code) ...
    .sort_values('All', ascending=False)
    # ... and then drop it
    .drop(columns='All')
    # keep the codes with most names
    .iloc[:k*3]
    # make wide df into long df
    .melt(id_vars='code')
)

sns.catplot(
    x='code',
    y='value',
    hue='gender',
    data=df,
    kind='bar'
)

#### Gender - known issues
- **Institutions may be mistaken for people**: All authors with a format (*string*, *string*) are considered humans and looked up in the gender database, even if they are institutions. For example, “Science, London School of Economics and Political”. A possible solution could be to use the RDF export file instead of the CSV, and use the “givenName” property (instead of looking for "*string*, *string*" and splitting at the *comma*).
- **Some names are not found in WGND**: However, some of these names are found in other sources. For example, the name "Laurajane" is not found in WGND, but it is found in https://genderize.io/, or in https://genderapi.io/. A solution would be to look for these names in these services (they have APIs). In this case, build a list of the names and genders retrieved, to minimize API calls, and add a column to the output indicating which source the gender was inferred from.
- **Some sources have no authors declared**: Some of these cases could be solved when the Crossref API is called to get author affiliation information. This may also help with sources for which only the author's initial was included (these initials are not found in the gender database).
- **Consensus gender may be biased in compound names**: Right now the consensus gender is calculated by pooling all name-country pairs found in the database. This may bias the gender toward the name part (in a multiple-part name) for which more entries were found. For example, if we are looking for “María José” and there are more results for José than for María, then the consensus gender could be biased toward “José” (male). Alternatively, one consensus gender with its agreement index could be obtained for each name part, and only then averaged across name parts.

It doesn't seem that 'CA' name-country pairs are heavily biased toward ambiguous gender. Hence, these name-country pairs are kept.