# Collaborations

This page focuses on aspects of collaboration, like co-authorship or co-editorship, in the *Bibliographie sur le XVIIIe siècle*. 

In [5]:
# === Imports === 

import re 
import seaborn as sns
from matplotlib import pyplot as plt
from os.path import join, abspath, normpath, realpath
import os
from lxml import etree
from io import StringIO, BytesIO
from collections import Counter
import pandas as pd

# === Namespaces === 

namespaces = {
    "foaf" : "http://xmlns.com/foaf/0.1/",
    "bib" : "http://purl.org/net/biblio#",
    "dc" : "http://purl.org/dc/elements/1.1/",
    "z" : "http://www.zotero.org/namespaces/export#",
    "rdf" : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    }

# === Files and parameters === 

wdir = join("/", "media", "christof", "Data", "Github", "christofs", "BIB18")
print(wdir)
bibdatafile = join(wdir, "data", "BIB18_Zotero-RDF.rdf") 
print(bibdatafile)
#bibdatafile = join(wdir, "data", "BIB18_Zotero-RDF_TEST.rdf") 

/media/christof/Data/Github/christofs/BIB18
/media/christof/Data/Github/christofs/BIB18/data/BIB18_Zotero-RDF.rdf


In [6]:
def read_xml(bibdatafile): 
    bibdata = etree.parse(bibdatafile)
    return bibdata

bibdata = read_xml(bibdatafile)

## Number of collaborators

This part determines how frequent collaborations (co-authorship, co-editorship) are. This is based on the number of "Person" elements within the "authors" or "editors" element.

In [7]:
def get_number_collaborators(bibdata): 
    """
    Finds out how frequent collaborations (co-authorship, co-editorship) are.
    Number of "Person" elements within "authors" or "editors" element. 
    """
    print("\nNumber of collaborations with specific number of collaborators.")

    # Find all instances of authors
    num_coauthors = []
    xpath = "//bib:authors"
    authors = bibdata.xpath(xpath, namespaces=namespaces)
    print(len(authors), "instances of Element 'authors'")
    num_coauthors = []
    for item in authors:
        #print(item)
        xpath = "rdf:Seq/rdf:li/foaf:Person"
        coauthors = item.xpath(xpath, namespaces=namespaces)
        num_coauthors.append(len(coauthors))
    num_coauthors_counts = Counter(num_coauthors)
    print(num_coauthors_counts)

    # Calculate percentages
    num_coauthors_perc = {}
    total = sum(num_coauthors_counts.values())
    for key,val in num_coauthors_counts.items():
        num_coauthors_perc[key] = str(round(val/total * 100, 3)) + '%'
    print(num_coauthors_perc)

    # Find all instances of editors
    num_coeditors = []
    xpath = "//bib:editors"
    editors = bibdata.xpath(xpath, namespaces=namespaces)
    print(len(editors), "instances of Element 'editors'")
    num_coeditors = []
    for item in editors:
        xpath = "rdf:Seq/rdf:li/foaf:Person"
        coeditors = item.xpath(xpath, namespaces=namespaces)
        num_coeditors.append(len(coeditors))
    num_coeditors_counts = Counter(num_coeditors)
    print(dict(num_coeditors_counts))

    # Calculate percentages
    num_coeditors_perc = {}
    total = sum(num_coeditors_counts.values())
    for key,val in num_coeditors_counts.items():
        num_coeditors_perc[key] = str(round(val/total * 100, 2)) + '%'
    print(num_coeditors_perc)

get_number_collaborators(bibdata)


Number of collaborations with specific number of collaborators.
56860 instances of Element 'authors'
Counter({1: 53488, 2: 3361, 3: 9, 4: 2})
{1: '94.07%', 2: '5.911%', 3: '0.016%', 4: '0.004%'}
17135 instances of Element 'editors'
{1: 5558, 4: 904, 3: 2638, 2: 7743, 5: 256, 6: 35, 11: 1}
{1: '32.44%', 4: '5.28%', 3: '15.4%', 2: '45.19%', 5: '1.49%', 6: '0.2%', 11: '0.01%'}


## Collaborator networks

Builds the data for a network of people having collaborated as editors.

In [20]:
coeditorcounts_top_file = join(wdir, "results", "coeditor-counts_top.csv")
coeditorcounts_full_file = join(wdir, "results", "coeditor-counts_full.csv")


def network_coeditors(bibdata): 
    """
    Builds the data for a network of people having collaborated as editors. 
    """
    # Find all instances of editors
    xpath = "//bib:editors"
    editors = bibdata.xpath(xpath, namespaces=namespaces)
    print("There are " + str(len(editors)) + " editors (i.e., instances of Element 'editors').")

    # Collect the names of each person within each editors element
    coeditors = []
    for item in editors:
        xpath = "rdf:Seq/rdf:li/foaf:Person"
        coeditors_elements = item.xpath(xpath, namespaces=namespaces)
        coeditors_names = []
        # Get the names (full name or first name, last name) from each person
        for item in coeditors_elements: 
            if len(item) == 2: 
                coeditors_names.append(item[0].text + ", " + item[1].text)
        coeditors.append(coeditors_names)

    # Establish the count of each collaboration between editors
    import itertools 
    all_coeditor_combinations = []
    for item in coeditors: 
        coeditor_combinations = list(itertools.combinations(item, 2))
        coeditor_combinations = [tuple(sorted(item)) for item in coeditor_combinations]
        for coedcomb in coeditor_combinations: 
            all_coeditor_combinations.append(coedcomb)
    ccc = dict(Counter(all_coeditor_combinations)) # ccc = coeditor_combinations_count

    # Transform to a DataFrame
    ccc = pd.DataFrame.from_dict(ccc, orient="index", columns=["count"])
    ccc = ccc.reset_index()
    ccc_split = pd.DataFrame(ccc["index"].tolist())
    ccc_merged = ccc_split.merge(ccc, left_index=True, right_index=True)
    ccc = ccc_merged.drop(["index"], axis=1)
    ccc = ccc.rename({0 : "coeditor1", 1 : "coeditor2"}, axis=1)
    ccc = ccc.sort_values(by="count", ascending=False)
    #print(ccc.head())
    #print(ccc.shape, "shape of dataframe")
    with open(join(coeditorcounts_full_file), "w", encoding="utf8") as outfile: 
        ccc.to_csv(outfile, sep=";")

    # Filter the DataFrame to make it manageable for visualization
    # Determine the top N most frequent co-editors
    coeditors_top = list(set(list(ccc.head(20).loc[:,"coeditor1"]) +\
        list(ccc.head(20).loc[:,"coeditor2"])))
    #print(coeditors_top)
    print("Among all editors, " + str(len(coeditors_top)) + " have been selected as the most active co-editors.")
    # Filter the DataFrame to include just the collaborations involving at least one of the top co-editors. 
    # The resulting DataFrame will have all collaborations between the top co-editors and their co-editors. 
    ccc_filtered = ccc[(ccc["coeditor1"].isin(coeditors_top)) |\
                       (ccc["coeditor2"].isin(coeditors_top))]
    #print(ccc_filtered.shape, "shape of dataframe of top co-editors and their co-editors.")
    # Simplify the labels 
    #ccc_filtered = ccc_filtered.replace(' .*?]', '',regex=True).astype(str)
    ccc_filtered.loc[:,'coeditor1'] =  [re.sub(r', .*','', str(x)) for x in ccc_filtered.loc[:,'coeditor1']]
    ccc_filtered.loc[:,'coeditor2'] =  [re.sub(r', .*','', str(x)) for x in ccc_filtered.loc[:,'coeditor2']]
    print("The following table shows the 5 most active pairs of editors.\n")
    print(ccc_filtered.head())
    with open(join(coeditorcounts_top_file), "w", encoding="utf8") as outfile: 
        ccc_filtered.to_csv(outfile, sep=";")

network_coeditors(bibdata)

There are 17135 editors (i.e., instances of Element 'editors').
Among all editors, 20 have been selected as the most active co-editors.
The following table shows the 5 most active pairs of editors.

         coeditor1  coeditor2  count
1848        Herman  Pelckmans    221
3896       Hasquin    Mortier    103
2546  Ferreyrolles    Versini     84
184          Biard    Leuwers     78
183          Biard    Bourdin     64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ccc_filtered.loc[:,'coeditor1'] =  [re.sub(r', .*','', str(x)) for x in ccc_filtered.loc[:,'coeditor1']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ccc_filtered.loc[:,'coeditor2'] =  [re.sub(r', .*','', str(x)) for x in ccc_filtered.loc[:,'coeditor2']]


## Co-editor networks 

As shown above, editorship is an area of particularly intense collaboration in the community of Dix-huitiémistes, based on the data in the bibliography. The following is an initial, experimental attempt to elucidate the data using network visualization. 

The following shows a network of the top 20 co-editors and all of their co-editors, resulting in 148 different co-editor pairs. Each node is one editor, and each time two people have co-edited a publication, a link between them is created. The more co-editorships a person accumulates, the larger the node. The more co-editorships two people share, the thicker the edge connecting them. For this visualization, the full data of collaborations for edited volumes and editions has been massively reduced. Different parameters may strongly affect the results. See the full coeditor data in the `data` folder for more details.  

![Network showing the top 20 co-editors and all of their co-editors, created using Gephi.](figures/coeditors_top2.svg)

This figure is available also as a [zoom-able image file](/figures/coeditors_top2.svg) and [with somewhat friendlier colors but no community detection](/figures/coeditors_top1.svg).  

We basically see three key co-editor networks (the different colors are based on an algorithmic community or cluster detection): 

- Porret, Rosset, Majeur, Farkas, Baczko et al. 
- Sermain, Herman, Pelckmans, Escola, Omacini, Peeters, Paschoud, Berchtold et al. 
- Leuwers, Bourdin, Biard, Simien, Serna, Antoine et al. 
- Smaller clusters with Didier and Neefs as well around Kolving and Mortier. 

Some initial observations: While Porret appears to be the most productive co-editor overall, this is achived with some frequent, but also with a large number of less frequent coeditors. Inversely, the most intense collaboration appears to be between Herman and Pelckmans, who seem to avoid one-off collaborations. Finally, Rosset also functions as a bridge linking Porret and Baczko on the one hand, and Herman and Pelckmans on the other hand, and their respective coeditor networks. No similar bridge exists towards the Leuwers et al. network. The smaller Didier cluster is also connected to Sermain.  
