In [147]:
import pandas as pd
import ast
import networkx as nx
import numpy as np
from tqdm import tqdm

In [148]:
# Load the two input tables from CSV files under my_data/.
# Later cells read the 'authors', 'citationCount' and 'year' columns of the
# paper table and the 'id', 'name', 'aliases' and 'field' columns of the
# author table.
df_CSS_papers = pd.read_csv('my_data/df_CSS_paper.csv')
df_authors = pd.read_csv('my_data/df_author.csv')

In [149]:
def fix_authors(authors):
    """Parse the string form of a paper's author list into a list of int ids.

    Parameters
    ----------
    authors : str
        Python-literal text of a list of author ids, e.g. "['12', None, '7']".

    Returns
    -------
    list of int
        The ids in their original order, with ``None`` entries removed and
        every remaining entry converted with ``int()``.

    Raises
    ------
    ValueError, SyntaxError
        If ``authors`` is not a valid Python literal.
    """
    parsed = ast.literal_eval(authors)
    # Filter the list that was just parsed. The earlier version filtered the
    # notebook-global ``author_list`` instead, which either raised NameError
    # on a fresh kernel or returned every known author for every paper.
    return [int(author_id) for author_id in parsed if author_id is not None]

In [98]:
# Collect every author id that appears on at least one paper
authors = set()
for raw_author_list in df_CSS_papers["authors"]:
    authors |= set(ast.literal_eval(raw_author_list))

# Drop the None placeholder if any paper's list contained one
authors.discard(None)

# Cast ids to int on both sides so the set intersection compares like with like
authors = {int(author_id) for author_id in authors}
total_CSS_authors = {int(author_id) for author_id in df_authors["id"]}

# Keep only the paper authors that also appear in the author table
CSS_authors = authors & total_CSS_authors

In [99]:
# Build every unordered pair of distinct authors exactly once
author_list = [author_id for author_id in CSS_authors if author_id is not None]

# Pair each author only with the authors that come after it in author_list,
# so (a, b) is produced but (b, a) and (a, a) are not
author_pairs = [
    (author, colab)
    for position, author in enumerate(author_list)
    for colab in author_list[position + 1:]
]


In [100]:
# Count the number of papers each pair of authors has written together.
#
# author_dict[a][b] is incremented only for a < b (authors are sorted before
# pairing), so the count for a pair lives under [smaller id][larger id]; the
# mirrored entry [larger][smaller] exists but stays 0.

# Set up a zero-filled table with an entry for every ordered pair, so lookups
# in either order never raise KeyError
author_list_sorted = sorted(author_list)
pair_keys = [int(author) for author in author_list_sorted]
author_dict = {author: dict.fromkeys(pair_keys, 0) for author in pair_keys}

# Counting
for raw_author_list in df_CSS_papers['authors']:
    paper_authors = ast.literal_eval(raw_author_list)

    if len(paper_authors) > 1:
        # Drop None placeholders from THIS paper's list. The earlier version
        # filtered the global author_list here, which replaced the paper's
        # authors with every CSS author and inflated every pair's count.
        paper_authors = {
            int(author_id) for author_id in paper_authors if author_id is not None
        }
        # Only authors present in CSS_authors have rows in author_dict
        paper_authors &= CSS_authors

        sorted_authors = sorted(paper_authors)
        for i, author in enumerate(sorted_authors):
            for coauthor in sorted_authors[i + 1:]:
                author_dict[author][coauthor] += 1

In [101]:
# Create the weighted edges used to build the network
weighted_edges = []
for author, colab in author_pairs:
    # Collaboration counts are stored under [smaller id][larger id] because the
    # counting cell sorts each paper's authors before pairing them.
    # author_pairs comes from unsorted set order, so normalise the lookup;
    # indexing [author][colab] directly reads the always-zero mirror entry
    # whenever author > colab and silently loses that pair's weight.
    low, high = (author, colab) if author <= colab else (colab, author)
    weighted_edges.append((author, colab, author_dict[low][high]))

# NOTE(review): pairs that never collaborated are still added as weight-0
# edges, so the graph is complete over author_list — confirm this is intended.
CSS_graph = nx.Graph()
CSS_graph.add_weighted_edges_from(weighted_edges)

In [116]:
# Node attributes, part 1: display name (att1) and field (att2)
author_atts = dict()

for author in list(CSS_authors):
    author_atts[author] = dict()

    # Select this author's row(s) once and reuse the mask for every column
    is_author = df_authors['id'] == author

    # Getting the name of each author.
    # Values read from CSV arrive as text (or NaN), never as a list, so parse
    # the literal first. The earlier check `type(aliases) != list()` compared a
    # type with an empty list, was always True, and made the alias branch dead.
    aliases = df_authors.loc[is_author, 'aliases'].values[0]
    if isinstance(aliases, str):
        try:
            aliases = ast.literal_eval(aliases)
        except (ValueError, SyntaxError):
            aliases = None  # not a Python literal; fall back to 'name' below

    if isinstance(aliases, (list, tuple)):
        alias_names = [alias for alias in aliases if isinstance(alias, str)]
    else:
        alias_names = []

    if alias_names:
        name = max(alias_names, key=len)  # Assume the true name is the longest alias
    else:
        # No usable aliases (missing, empty or unparsable): use the name column
        name = df_authors.loc[is_author, 'name'].values[0]
    author_atts[author]['att1'] = name

    field = df_authors.loc[is_author, 'field'].values[0]
    author_atts[author]['att2'] = field

In [145]:
# Node attributes, part 2: gather the per-author data behind att3-att5

# Start every author at "no papers seen": no citations, a paper count of zero,
# and a first year of +inf so that any real year compares as earlier
citation_dict = {author: [] for author in CSS_authors}
amount_of_css_papers = dict.fromkeys(CSS_authors, 0)
first_paper_year = dict.fromkeys(CSS_authors, np.inf)

# Single pass over the papers, updating all three dicts for each CSS author
for row_label, paper in df_CSS_papers.iterrows():
    css_authors_on_paper = set(fix_authors(paper['authors'])) & CSS_authors

    for author in css_authors_on_paper:
        # Number of papers the author has contributed to
        amount_of_css_papers[author] += 1

        # Citation counts of the author's papers
        citation_dict[author].append(paper['citationCount'])

        # Earliest publication year seen so far
        if paper['year'] < first_paper_year[author]:
            first_paper_year[author] = paper['year']

In [146]:
# Attach the three paper-derived attributes to each author's attribute dict:
# att3 = median citation count, att4 = number of papers, att5 = first paper year
for author_id in CSS_authors:
    author_atts[author_id].update({
        'att3': np.median(citation_dict[author_id]),
        'att4': amount_of_css_papers[author_id],
        'att5': first_paper_year[author_id],
    })

In [None]:
# Set node attributes
# author_atts maps each author id to a dict of attribute name -> value
# (keys 'att1' ... 'att5' are filled in by the earlier cells); passing this
# dict-of-dicts hands the per-author attributes to the matching graph nodes.
nx.set_node_attributes(CSS_graph, author_atts)