# Bibliometric analysis on DAOs
In order to perform this bibliometric analysis we will look at documents from Scopus, WebOfScience and Science Direct, and we will analyze co-citation, bibliographic coupling and co-authoring networks regarding papers related to DAOs and decentralized organizations from the fields of business and computer science.

## Scopus and ScienceDirect api initialization
We will start by initializing the API for Scopus and ScienceDirect, and set up a search strategy to recall articles with given keywords in their title or abstracts from the fields of interests.<br>

In [None]:
from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json
import pandas as pd

    
## Load configuration
con_file = open("config.json")
config = json.load(con_file)
con_file.close()

## Initialize client
client = ElsClient(config['apikey'])
#client.inst_token = config['insttoken']

Our study began by performing a set of searches on Semantic Scholar and Google Scholar, employing the following "climate AND dao*", "decentralized AND autonomous AND organiz*", "dao* AND governance", "dao* AND web*", "web* AND decentraliz*.<br>
We then widened our research scope in two ways: the first was to include articles from Google Scholar and Semantic Scholar retrieved through the keywords "p2p AND dao*", "peer-to-peer AND dao*", "p2p AND blockchain AND organiz*", and "peer-to-peer AND blockchain AND organiz*".; 

### First we initialize our series of keywords
In order to perform our research we must have a list of keywords we want to search using the apis.

In [None]:
keyword_list=["climate AND dao*", "decentralized AND autonomous AND organiz*", "dao* AND governance", "dao* AND web*", "web* AND decentraliz*", "p2p AND dao*", "peer-to-peer AND dao*", "p2p AND blockchain AND organiz*", "peer-to-peer AND blockchain AND organiz*"]

#### We aggregate papers published in the business domain
First of all we create a Pandas dataframe aggregating papers retrieved with the aforementioned keywords in the business domain. In the DF we memorize the title, author(s), year of publication and SCopusID of each paper.

In [None]:
business_df=pd.DataFrame(columns=['Author(s)', 'Title', 'YearPublished', 'DOI', 'ScopusID'])

for key in keyword_list:
    #we print the keyword as an indicator of the progress
    #print(key)
    doc_srch = ElsSearch("TITLE-ABS-KEY("+key+") AND SUBJAREA(BUSI OR DECI OR ECON OR SOCI)",'scopus')
    doc_srch.execute(client, get_all = True)

    for el in doc_srch.results:
        try:
            data=[el["dc:creator"],el["dc:title"],el["prism:coverDate"], el["prism:doi"], el["dc:identifier"].split(":")[1]]
        except Exception as e:
            data=[]
            input_elements=["dc:creator", "dc:title", "prism:coverDate", "prism:doi", "dc:identifier"]
            for field in input_elements:
                try:
                    data.append(el[field])
                except Exception as f:
                    data.append("NaN")

        business_df.loc[len(business_df), :] = data

business_df=business_df.drop_duplicates()
#business_df.to_csv('business.csv', index=False)

#### Then we look in the computer science domain
We perform the same operation in the computer science domain, and we create a second df, with the same structure.

In [None]:
## Initialize doc search object using Scopus and execute search, retrieving 
#   all results
comp_df=pd.DataFrame(columns=['Author(s)', 'Title', 'YearPublished', 'DOI','ScopusID'])

for key in keyword_list:
    #we print the keyword as an indicator of the progress
    #print(key)
    doc_srch = ElsSearch("TITLE-ABS-KEY("+key+") AND SUBJAREA(COMP)",'scopus')
    doc_srch.execute(client, get_all = True)

    for el in doc_srch.results:
        try:
            data=[el["dc:creator"],el["dc:title"],el["prism:coverDate"], el["prism:doi"], el["dc:identifier"].split(":")[1]]
        except Exception as e:
            data=[]
            input_elements=["dc:creator", "dc:title", "prism:coverDate", "prism:doi", "dc:identifier"]
            for field in input_elements:
                try:
                    data.append(el[field])
                except Exception as f:
                    data.append("NaN")

        comp_df.loc[len(comp_df), :] = data

comp_df=comp_df.drop_duplicates()
#comp_df.to_csv('computerScience.csv', index=False)

#### We now look for papers found in both categories
We will start by reviewing them, and then we will proceed with single domains and eliminate papers whose titles clearly do not fit with our research questions. <br>
We will then <b>scrape abstracts</b> for the remaining papers and further refine our sample.

In [None]:
business_df=pd.read_csv('business.csv') 
comp_df=pd.read_csv('computerScience.csv') 
common_papers = pd.merge(comp_df, business_df, on=["Author(s)", "Title", "YearPublished", "DOI", "ScopusID"])
common_papers=common_papers.drop_duplicates()
all_papers = pd.concat([business_df, comp_df])
all_papers = all_papers.drop_duplicates()

#all_papers.to_csv('allPapers.csv', index=False)
#common_papers.to_csv('commonPapers.csv', index=False)

#### After retrieving papers we proceeded to manually clean them
We looked at titles and checked for their relevance given the topic of our review. the next step is to create single 'clean' files for each of our subjects (i.e., Busness and Computer Science). To do so, we will simply compare the single documents with the cleaned one and remove papers that do not belong to their intersection.


In [None]:
import pandas as pd

#we read all the datasets
business_df=pd.read_csv('business.csv') 
comp_df=pd.read_csv('computerScience.csv') 
common_papers = pd.read_csv('commonPapers.csv') 
all_papers = pd.read_csv('clean/allPapersClean.csv')

business_df_clean=pd.merge(all_papers, business_df["DOI"], how ='inner', on=["DOI"])
comp_df_clean=pd.merge(all_papers, comp_df["DOI"], how ='inner', on=["DOI"])
common_papers_clean=pd.merge(all_papers, common_papers["DOI"], how ='inner', on=["DOI"])

#business_df_clean.to_csv("clean/businessClean.csv", index=False)
#comp_df_clean.to_csv("clean/computerScienceClean.csv", index=False)
#common_papers_clean.to_csv("clean/commonPapersClean.csv", index=False)

## We can now look at citations of each of the papers we retrieved
To do so we will use Crossref API through the DOIs.

In [None]:
import pandas as pd 
from habanero import Crossref
from habanero import counts
from habanero import cn
cr = Crossref()

In [None]:
notFound=[]
references=[]
citations_dict={}

business_df_clean=pd.read_csv('clean/businessClean.csv')
start_year=business_df_clean["YearPublished"].min().split("/")[2]
end_year=business_df_clean["YearPublished"].max().split("/")[2]

business_citations_edges=pd.DataFrame(columns=['Source', 'Target'])
business_citations_nodes=pd.DataFrame(columns=['Key', 'Title', 'PubYear'])

for row in business_df_clean.iterrows():
    id=str(row[1]["DOI"])
    try:
        references=cr.works(ids = id)["message"]["reference"]
        business_citations_nodes.loc[len(business_citations_nodes), :] = [id,cr.works(ids = id)["message"]["title"][0], cr.works(ids = id)["message"]["created"]["date-parts"][0][0]]
        reference_list=[]
        for item in references:
            data=[item["key"]]
            business_citations_edges.loc[len(business_citations_edges), :] = [id,item["key"]]
            if "article-title" in item.keys():
                data.append(item["article-title"])
            else:
                data.append("NoTitle")
            if "year" in item.keys():
                data.append(item["year"])
            else:
                data.append("NoDate")
            business_citations_nodes.loc[len(business_citations_nodes), :] = data
    except Exception as e:
        notFound.append(str(e).split("/")[-1])                

business_citations_edges=business_citations_edges.drop_duplicates()
business_citations_nodes=business_citations_nodes.drop_duplicates()

#business_citations_nodes.to_csv("Networks/businessCitationsNodes.csv", index=False)
#business_citations_edges.to_csv("Networks/businessCitationsEdges.csv", index=False)

We now apply the same procedure for <b>Computer Science</b> papers.

In [None]:
notFound=[]
references=[]
citations_dict={}

comp_df_clean=pd.read_csv('clean/computerScienceClean.csv')
start_year=comp_df_clean["YearPublished"].min().split("/")[2]
end_year=comp_df_clean["YearPublished"].max().split("/")[2]

comp_citations_edges=pd.DataFrame(columns=['Source', 'Target'])
comp_citations_nodes=pd.DataFrame(columns=['Key', 'Title', 'PubYear'])

for row in comp_df_clean.iterrows():
    id=str(row[1]["DOI"])
    try:
        references=cr.works(ids = id)["message"]["reference"]
        comp_citations_nodes.loc[len(comp_citations_nodes), :] = [id,cr.works(ids = id)["message"]["title"][0], cr.works(ids = id)["message"]["created"]["date-parts"][0][0]]
        reference_list=[]
        for item in references:
            data=[item["key"]]
            comp_citations_edges.loc[len(comp_citations_edges), :] = [id,item["key"]]
            if "article-title" in item.keys():
                data.append(item["article-title"])
            else:
                data.append("NoTitle")
            if "year" in item.keys():
                data.append(item["year"])
            else:
                data.append("NoDate")
            comp_citations_nodes.loc[len(comp_citations_nodes), :] = data
    except Exception as e:
        notFound.append(str(e).split("/")[-1])               

comp_citations_edges=comp_citations_edges.drop_duplicates()
comp_citations_nodes=comp_citations_nodes.drop_duplicates()

#comp_citations_nodes.to_csv("Networks/compCitationsNodes.csv", index=False)
#comp_citations_edges.to_csv("Networks/compCitationsEdges.csv", index=False)

#### Co-citations...
From the citation network we just obtained we can deduce co-citations and bibliographic coupling networks.<br>
We will start with the <b>co-citation</b> network for <b>Business</b>.

In [None]:
import pandas as pd
import csv

business_citations_edges=pd.read_csv('Networks/businessCitationsEdges.csv')
business_citations_nodes=pd.read_csv('Networks/businessCitationsNodes.csv')

edges=[]
business_co_citations_nodes=pd.DataFrame(columns=['Key', 'Title', 'PubYear'])
done_ids=[]

#first we find co-citations
for row in business_citations_edges.iterrows():
    if row[1]["Source"] not in done_ids:
        paper_citations=business_citations_edges.loc[business_citations_edges['Source'].isin([row[1]["Source"]])]
        idx=paper_citations.index
        c=idx[0]
        while c<=idx[-1]:
            business_co_citations_nodes=pd.concat([business_co_citations_nodes, (business_citations_nodes[(business_citations_nodes['Key'] == paper_citations.loc[c]["Target"])])])
            g=c+1
            while g<=idx[-1]:
                edges.append((paper_citations.loc[c]["Target"],paper_citations.loc[g]["Target"]))
                                                
                business_co_citations_nodes=pd.concat([business_co_citations_nodes, (business_citations_nodes[(business_citations_nodes['Key'] == paper_citations.loc[g]["Target"])])])

                g+=1

            c+=1
        done_ids.append(row[1]["Source"])

'''
business_co_citations_nodes=business_co_citations_nodes.drop_duplicates()
business_co_citations_nodes.to_csv("Networks/businessCoCitationsNodes.csv", index=False)

# Configures output path
output_path = "Networks/businessCoCitationsEdges.csv"

# Writes synonyms to specified output
with open(output_path, mode="w", encoding="UTF-8") as f:
    writer = csv.writer(f, delimiter="\t", lineterminator="\n")
    writer.writerow(["Source", "Target"])
    for row in edges:
        buffer = [[source, target] for source in row for target in row if source != target]
        writer.writerows(buffer)
'''

We also extract our <b>co-citation</b> network for <b>Computer science</b>.

In [None]:
import pandas as pd
import csv

comp_citations_edges=pd.read_csv('Networks/compCitationsEdges.csv')
comp_citations_nodes=pd.read_csv('Networks/compCitationsNodes.csv')

edges=[]
comp_co_citations_nodes=pd.DataFrame(columns=['Key', 'Title', 'PubYear'])
done_ids=[]

#first we find co-citations
for row in comp_citations_edges.iterrows():
    if row[1]["Source"] not in done_ids:
        paper_citations=comp_citations_edges.loc[comp_citations_edges['Source'].isin([row[1]["Source"]])]
        idx=paper_citations.index
        c=idx[0]
        while c<=idx[-1]:
            comp_co_citations_nodes=pd.concat([comp_co_citations_nodes, (comp_citations_nodes[(comp_citations_nodes['Key'] == paper_citations.loc[c]["Target"])])])
            g=c+1
            while g<=idx[-1]:
                edges.append((paper_citations.loc[c]["Target"],paper_citations.loc[g]["Target"]))
                                                
                comp_co_citations_nodes=pd.concat([comp_co_citations_nodes, (comp_citations_nodes[(comp_citations_nodes['Key'] == paper_citations.loc[g]["Target"])])])

                g+=1

            c+=1
        done_ids.append(row[1]["Source"])


'''
comp_co_citations_nodes=comp_co_citations_nodes.drop_duplicates()
comp_co_citations_nodes.to_csv("Networks/compCoCitationsNodes.csv", index=False)

# Configures output path
output_path = "Networks/compCoCitationsEdges.csv"

# Writes synonyms to specified output
with open(output_path, mode="w", encoding="UTF-8") as f:
    writer = csv.writer(f, delimiter="\t", lineterminator="\n")
    writer.writerow(["Source", "Target"])
    for row in edges:
        buffer = [[source, target] for source in row for target in row if source != target]
        writer.writerows(buffer)
'''

#### ...and bibliographic coupling
We then proceed to check for <b>Bibliographic coupling</b> for <b>Business</b>.

In [None]:
import pandas as pd
import csv

business_citations_edges=pd.read_csv('Networks/businessCitationsEdges.csv')
business_citations_edges=business_citations_edges.sort_values(by=['Target'], ignore_index=True)
business_citations_nodes=pd.read_csv('Networks/businessCitationsNodes.csv')

edges=[]
business_bib_coupling_nodes=pd.DataFrame(columns=['Key', 'Title', 'PubYear'])
done_ids=[]

#then we check for bibliographic coupling
for row in business_citations_edges.iterrows():
    if row[1]["Target"] not in done_ids:
        paper_citations=business_citations_edges.loc[business_citations_edges['Target'].isin([row[1]["Target"]])]
        idx=paper_citations.index
        c=idx[0]
        while c<=idx[-1]:
            business_bib_coupling_nodes=pd.concat([business_bib_coupling_nodes, (business_citations_nodes[(business_citations_nodes['Key'] == paper_citations.loc[c]["Source"])])])
            g=c+1
            while g<=idx[-1]:
                edges.append((paper_citations.loc[c]["Source"],paper_citations.loc[g]["Source"]))
                                                
                business_bib_coupling_nodes=pd.concat([business_bib_coupling_nodes, (business_citations_nodes[(business_citations_nodes['Key'] == paper_citations.loc[g]["Source"])])])

                g+=1

            c+=1
        done_ids.append(row[1]["Target"])


'''
business_bib_coupling_nodes=business_bib_coupling_nodes.drop_duplicates()
business_bib_coupling_nodes.to_csv("Networks/businessBibCouplingNodes.csv", index=False)

# Configures output path
output_path = "Networks/businessBibCouplingEdges.csv"

# Writes synonyms to specified output
with open(output_path, mode="w", encoding="UTF-8") as f:
    writer = csv.writer(f, delimiter="\t", lineterminator="\n")
    writer.writerow(["Source", "Target"])
    for row in edges:
        buffer = [[source, target] for source in row for target in row if source != target]
        writer.writerows(buffer)
'''

And our <b>Bibliographic coupling</b> network for <b>Computer Science</b>.

In [None]:
import pandas as pd
import csv

comp_citations_edges=pd.read_csv('Networks/compCitationsEdges.csv')
comp_citations_edges=comp_citations_edges.sort_values(by=['Target'], ignore_index=True)
comp_citations_nodes=pd.read_csv('Networks/compCitationsNodes.csv')

edges=[]
comp_bib_coupling_nodes=pd.DataFrame(columns=['Key', 'Title', 'PubYear'])
done_ids=[]

#then we check for bibliographic coupling
for row in comp_citations_edges.iterrows():
    if row[1]["Target"] not in done_ids:
        paper_citations=comp_citations_edges.loc[comp_citations_edges['Target'].isin([row[1]["Target"]])]
        idx=paper_citations.index
        c=idx[0]
        while c<=idx[-1]:
            comp_bib_coupling_nodes=pd.concat([comp_bib_coupling_nodes, (comp_citations_nodes[(comp_citations_nodes['Key'] == paper_citations.loc[c]["Source"])])])
            g=c+1
            while g<=idx[-1]:
                edges.append((paper_citations.loc[c]["Source"],paper_citations.loc[g]["Source"]))
                                                
                comp_bib_coupling_nodes=pd.concat([comp_bib_coupling_nodes, (comp_citations_nodes[(comp_citations_nodes['Key'] == paper_citations.loc[g]["Source"])])])

                g+=1

            c+=1
        done_ids.append(row[1]["Target"])

'''
comp_bib_coupling_nodes=comp_bib_coupling_nodes.drop_duplicates()
comp_bib_coupling_nodes.to_csv("Networks/compBibCouplingNodes.csv", index=False)

# Configures output path
output_path = "Networks/compBibCouplingEdges.csv"

# Writes synonyms to specified output
with open(output_path, mode="w", encoding="UTF-8") as f:
    writer = csv.writer(f, delimiter="\t", lineterminator="\n")
    writer.writerow(["Source", "Target"])
    for row in edges:
        buffer = [[source, target] for source in row for target in row if source != target]
        writer.writerows(buffer)
'''

## Let's now merge the graphs and compute our measures regarding boundary papers

In [6]:
import pandas as pd
#business citations
business_citations_edges=pd.read_csv('Networks/businessCitationsEdges.csv')
business_citations_nodes=pd.read_csv('Networks/businessCitationsNodes.csv')
#computer science citations
comp_citations_edges=pd.read_csv('Networks/compCitationsEdges.csv')
comp_citations_nodes=pd.read_csv('Networks/compCitationsNodes.csv')
#intersection citations
intersection_citations=pd.merge(comp_citations_edges, business_citations_edges, how ='inner', on=["Source", "Target"])
intersection_nodes=set()
for row in intersection_citations.iterrows():
    intersection_nodes.add(row[1]["Source"])
    intersection_nodes.add(row[1]["Target"])
#merge citations
merge_citations=pd.concat([business_citations_edges, comp_citations_edges]).drop_duplicates()
merge_nodes=set()
for row in merge_citations.iterrows():
    merge_nodes.add(row[1]["Source"])
    merge_nodes.add(row[1]["Target"])

cross_disc_cits=set()
for row in business_citations_edges.iterrows():
    if row[1]["Source"] in comp_citations_nodes['Id'].values or row[1]["Target"] in comp_citations_nodes['Id'].values:
        for row_2 in intersection_citations.iterrows():
            if (row[1]["Source"] == row_2[1]["Source"] and row[1]["Target"] == row_2[1]["Target"]) or (row[1]["Target"] == row_2[1]["Source"] and row[1]["Source"] == row_2[1]["Target"]):
                continue
            else:
               cross_disc_cits.add((row[1]["Source"], row[1]["Target"])) 

for row in comp_citations_edges.iterrows():
    if row[1]["Source"] in business_citations_nodes['Id'].values or row[1]["Target"] in business_citations_nodes['Id'].values:
        for row_2 in intersection_citations.iterrows():
            if (row[1]["Source"] == row_2[1]["Source"] and row[1]["Target"] == row_2[1]["Target"]) or (row[1]["Target"] == row_2[1]["Source"] and row[1]["Source"] == row_2[1]["Target"]):
                continue
            else:
               cross_disc_cits.add((row[1]["Source"], row[1]["Target"])) 

In [7]:
boundary_papers=len(intersection_nodes)/len(merge_nodes)
citation_connections=len(comp_citations_edges.index)+len(business_citations_edges.index)-len(merge_citations.index)+len(cross_disc_cits)
cross_disc=(1-(len(comp_citations_edges.index)+len(business_citations_edges.index)-len(merge_citations.index))/citation_connections)
dominance_level=boundary_papers/citation_connections
print(boundary_papers, cross_disc, dominance_level)

0.2617953060730704 0.6169976289270895 1.9397992447619322e-05
