In [None]:
import ast
import csv
from functools import reduce

import pandas as pd
from genderperformr import GenderPerformr


Applies Filters to the Dataset
1. Cuts all publications before 1980
2. [Maybe] Only includes authors who have had their first publication past 1980 (does it matter?)
3. Restricts authors to those who have published at least once every 5 years 
4. Removes all publications with an empty author list

In [None]:
# Publication table; the cells below read its 'Date', 'Authors' and 'ID' columns.
df = pd.read_csv('data/aps_raw_data.csv')
# Derive the publication year once so later cells can filter on it.
publication_dates = pd.to_datetime(df['Date'])
df['Year'] = publication_dates.dt.year

# Author -> publications table.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is replaced by a path relative to the project data directory.
# NOTE(review): read_csv consumes the first line as a header by default and the
# columns are renamed right after -- confirm the file really starts with a header
# row, otherwise the first author record is silently lost.
author_df = pd.read_csv(
    '/Users/christopherarcher/Desktop/Research/Papers/supplementarydata/APS_author2DOI.dat',
    on_bad_lines='skip',
)
author_df.columns = ['ID', 'Author', 'Publications']
# Each 'Publications' cell is one tab-separated string; turn it into a list of entries.
author_df['Publications'] = author_df['Publications'].map(lambda joined: joined.split('\t'))

check if network is growing over time 
check for active after 1st publication 
sum total weight of red, sum_total weight of blue
parameters for network:
- estimate $\lambda$: curve fitting
- estimating q: strategy
- estimating p: strategy
- weights: citations/years - total citations after 4 years
- estimate weight parameters: average across all red-red, red-blue
- estimating preferential attachment in the network
- 

DATAFRAME FILTERING
- estimate gender using GenderPerformr classifier and only accept if confident in gender
- for publications dataframe, remove all publications before 2000, and re

In [None]:

# Probability cut-offs for accepting a classifier prediction; anything strictly
# between the two is treated as "not confident" and labelled 'N'.
# NOTE(review): this reads low scores as male and high scores as female --
# confirm against GenderPerformr's output convention.
MALE_MAX_PROB = 0.1
FEMALE_MIN_PROB = 0.9

gp = GenderPerformr()
probs, labels = gp.predict(list(author_df['Author']))
# Assign a plain list (positional) instead of a pd.Series: a Series is aligned on
# its index, which would silently produce NaN genders if author_df ever carried a
# non-default index. A list also raises immediately on a length mismatch.
author_df['Gender'] = [
    'M' if p <= MALE_MAX_PROB else 'F' if p >= FEMALE_MIN_PROB else 'N'
    for p in probs
]
# Keep only authors with a confident label, then emit one row per (author, publication).
tmp_df = author_df[author_df['Gender'] != 'N']
filtered_author_df = tmp_df.explode('Publications')
filtered_author_df

In [None]:
# How many authors survived the gender-confidence filter.
num_identified = len(tmp_df)
num_authors = len(author_df)
print(f"Num Identified: {num_identified}")
print(f"Total number of authors: {num_authors}")
print(f"Proportion Kept: {num_identified/num_authors}")

In [None]:
#PUBLICATION DATAFRAME FILTERING
FIRST_YEAR = 1980  # publications before this year are discarded


def _has_authors(raw_authors):
    """
    Return True when a raw 'Authors' cell encodes a non-empty author list.

    The column is read from CSV, so a cell is the *string* form of a list
    (it is parsed with ast.literal_eval further down). Taking len() of that
    string is > 0 even for "[]", so the cell has to be parsed before testing
    for emptiness. Missing values count as "no authors"; strings that are not
    Python literals fall back to a non-blank check.
    """
    if not isinstance(raw_authors, str):
        try:
            return len(raw_authors) > 0
        except TypeError:  # NaN / None
            return False
    try:
        return len(ast.literal_eval(raw_authors)) > 0
    except (ValueError, SyntaxError, TypeError):
        return len(raw_authors.strip()) > 0


# Built in one expression so the `tmp_df` used by the gender statistics cell is not overwritten.
publication_df = df[df['Authors'].map(_has_authors).astype(bool) & (df['Year'] >= FIRST_YEAR)]
publication_df


In [None]:
#MERGE DATAFRAMES
# Inner join: keep only (author, publication) rows whose publication entry matches
# the 'ID' of a publication that survived the filtering above.
merged_publication_df = filtered_author_df.merge(
    publication_df,
    how='inner',
    left_on='Publications',
    right_on='ID',
)
merged_publication_df

In [None]:
import ast
# 'Authors' holds the string form of a list; parse it into a real Python list.
# NOTE(review): this cell is not re-runnable -- the column selection below drops
# 'Authors', so a second execution raises KeyError.
merged_publication_df['Coauthors'] = merged_publication_df['Authors'].apply(ast.literal_eval)
merged_publication_df['Num Coauthors'] = merged_publication_df['Coauthors'].map(len)
kept_columns = ['Author', 'Publications', 'Gender', 'Date', 'Year', 'Num Coauthors', 'Coauthors']
merged_publication_df = merged_publication_df[kept_columns]

In [None]:
#AUTHOR FILTERING: An active author is one who never goes more than five years between consecutive publications (nor between their last publication and last_year)

def filter_active_authors(df, last_year=2009):
    """
    Keep only the rows of authors who never go more than five years without publishing.

    For each author, the values of the 'Year' column are sorted together with
    ``last_year``; the author counts as active when every gap between
    consecutive entries of that timeline is at most 5 years.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Author' and 'Year'.
    last_year : int, default 2009
        Year appended to every author's timeline, so that a long silence
        between the last publication and ``last_year`` also marks the author
        as inactive.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (original index preserved) whose author is active.
        Rows with a missing author are dropped.
    """
    def _is_active(years):
        timeline = sorted(list(years) + [last_year])
        return all(later - earlier <= 5 for earlier, later in zip(timeline, timeline[1:]))

    # One pass over the groups instead of re-filtering the whole frame per author.
    active_authors = {
        author
        for author, years in df.groupby('Author')['Year']
        if _is_active(years)
    }
    return df[df['Author'].isin(active_authors)]


In [None]:

# Drop every row belonging to an author with a publication gap longer than five years.
penultimate_df = filter_active_authors(df=merged_publication_df, last_year=2009)
penultimate_df


In [None]:
def author_rank(full_name, author_list):
    """
    Return the 0-based position of ``full_name`` within ``author_list``.

    Two names match when the first character of their first token and their
    whole last token agree (case-insensitive). Commas and the token 'jr.' are
    removed from the entries of ``author_list`` before comparing.

    Parameters
    ----------
    full_name : str
        Name to look for, e.g. "John Smith".
    author_list : list of str
        Author names of one publication, in publication order.

    Returns
    -------
    int or None
        0 when ``author_list`` is empty, the index of the first matching
        entry otherwise, or None when no entry matches (or ``full_name`` is blank).
    """
    if not author_list:
        return 0
    name_tokens = full_name.strip().lower().split()
    if not name_tokens:
        return None
    first_initial, last_name = name_tokens[0][0], name_tokens[-1]
    for position, candidate in enumerate(author_list):
        tokens = [
            token
            for token in candidate.strip().lower().replace(',', '').split()
            if token != 'jr.'  # generational suffix would otherwise be taken as the surname
        ]
        if not tokens:
            # Blank or suffix-only entry: nothing to compare against.
            continue
        if tokens[0][0] == first_initial and tokens[-1] == last_name:
            return position
    return None

In [None]:
#we don't use this explicitly but checking if an author is first or last is a good to have
def first_or_last_author(full_name, author_list):
    """
    Tell whether ``full_name`` is the first or the last entry of ``author_list``.

    A name matches an entry when the first character of the first token AND the
    last token both agree with that same entry (case-insensitive; commas and the
    token 'jr.' are removed from the entry, as in ``author_rank``). The initial
    and the surname are compared against one author at a time, so the initial of
    the first author combined with the surname of the last author does not count
    as a match.

    Parameters
    ----------
    full_name : str
        Name to test.
    author_list : list of str
        Author names of one publication, in publication order.

    Returns
    -------
    bool
        True when ``author_list`` has at most two entries (every author is then
        first or last; the name itself is not checked), or when ``full_name``
        matches the first or last entry. False otherwise, including when
        ``full_name`` is blank.
    """
    if len(author_list) <= 2:
        return True
    name_tokens = full_name.strip().lower().split()
    if not name_tokens:
        return False
    target = (name_tokens[0][0], name_tokens[-1])

    def _matches(candidate):
        tokens = [
            token
            for token in candidate.strip().lower().replace(',', '').split()
            if token != 'jr.'
        ]
        return bool(tokens) and (tokens[0][0], tokens[-1]) == target

    return _matches(author_list[0]) or _matches(author_list[-1])

In [None]:
# Position of each row's author inside the publication's own author list.
author_ranks = [
    author_rank(author, coauthors)
    for author, coauthors in zip(penultimate_df['Author'], penultimate_df['Coauthors'])
]
# assign() builds a new frame instead of writing into the filtered slice
# (which triggers SettingWithCopyWarning); the list is attached positionally.
penultimate_df = (
    penultimate_df
    .assign(**{'Author Rank': author_ranks})
    .reset_index(drop=True)
)
penultimate_df

In [None]:
#FIND THE TWO AUTHORS NEAREST TO THE FIRST AND LAST: This is the method we use in the paper
def soft_restrict_coauthors2(df):
    """
    Keep, for every publication, only the lowest- and highest-ranked author rows present in ``df``.

    Rows are grouped by 'Publications'; inside each group the rows are ordered
    by 'Author Rank' and the first and last of them are kept. When both are the
    same author (e.g. only one row for that publication) a single row is kept.
    Because ``df`` is expected to contain only the authors that survived
    earlier filtering, this pairs the authors nearest to the first and last
    position, so that each publication contributes at most one pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Publications', 'Author' and 'Author Rank'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The kept rows with a fresh 0..n-1 index and an extra column 'index1'
        holding each row's index label in ``df``. Publications appear in order
        of first occurrence in ``df``; within a publication the lower rank comes first.

    Notes
    -----
    Rows with a missing 'Author Rank' sort last (pandas default), so such a row
    is selected as the highest-ranked one when present.
    """
    work = df.copy()  # do not add 'index1' to the caller's frame
    work['index1'] = work.index
    kept_rows = []
    for _, pub_rows in work.groupby('Publications', sort=False):
        # sort_values returns a new frame; its result must be used.
        ranked = pub_rows.sort_values(by=['Author Rank'], kind='stable')
        rmin, rmax = ranked.iloc[0], ranked.iloc[-1]
        kept_rows.append(rmin)
        if rmin['Author'] != rmax['Author']:
            kept_rows.append(rmax)
    # Build the result once instead of growing a DataFrame row by row.
    return pd.DataFrame(kept_rows, columns=work.columns).reset_index(drop=True)

In [None]:
#FIND THE TWO AUTHORS NEAREST TO THE FIRST AND LAST
def soft_restrict_coauthors(df):
    """
    Keep, for every publication, the rows with the smallest and the largest 'Author Rank'.

    A single pass records, per publication, the index labels of the rows with
    the minimum and maximum rank seen so far; only those rows are returned.
    Since ``df`` is expected to hold only the authors that survived earlier
    filtering, this pairs the authors nearest to the first and last position.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Publications' and 'Author Rank'.

    Returns
    -------
    pandas.DataFrame
        The kept rows in their original order, with a fresh 0..n-1 index.

    Notes
    -----
    Comparisons with a missing (NaN) rank are always False, so a row with a
    missing rank never replaces an extreme that was already recorded.
    """
    Pub_IDs = {}  # publication -> ((min_index, max_index), (min_rank, max_rank))
    for index, pub, rank in zip(df.index, df['Publications'], df['Author Rank']):
        if pub not in Pub_IDs:
            Pub_IDs[pub] = ((index, index), (rank, rank))
            continue
        (min_index, max_index), (min_rank, max_rank) = Pub_IDs[pub]
        if rank <= min_rank:
            min_index, min_rank = index, rank
        elif rank >= max_rank:
            max_index, max_rank = index, rank
        Pub_IDs[pub] = ((min_index, max_index), (min_rank, max_rank))

    indices_to_keep = set()
    for (min_index, max_index), _ranks in Pub_IDs.values():
        indices_to_keep.update((min_index, max_index))
    # Select the recorded rows (the mask must not be negated, or exactly the
    # rows meant to be kept are thrown away).
    return df.loc[df.index.isin(indices_to_keep)].reset_index(drop=True)

In [None]:
#ALTERNATIVE FILTERING METHOD: ONLY ACCEPT FIRST AND LAST AUTHORS
def hard_restrict_coauthors(df):
    """
    Keep only first/last-author rows of publications whose first AND last author are both present.

    Every row for which ``first_or_last_author`` is true adds a weight to its
    publication: 1 when the publication lists at most one coauthor, 1/2
    otherwise. A publication is admissible when its weights sum to exactly 1,
    i.e. the single author is present, or exactly two first/last rows are.
    Rows that are not first/last authors are dropped, as are all rows of
    non-admissible publications.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Author', 'Coauthors' (a list per row) and 'Publications'.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index.
    """
    Pub_IDs = {}  # publication -> accumulated weight
    indices_to_drop = []
    for index, row in df.iterrows():
        if first_or_last_author(row['Author'], row['Coauthors']):
            weight = 1 if len(row['Coauthors']) <= 1 else 1 / 2
            pub = row['Publications']
            # dict.get replaces the former bare `except:` used to initialise the
            # entry, which would also have hidden unrelated errors.
            Pub_IDs[pub] = Pub_IDs.get(pub, 0) + weight
        else:
            indices_to_drop.append(index)
    #find publications where both first and last author are active
    admissible_publications = {pub for pub, total in Pub_IDs.items() if total == 1}
    new_df = df.drop(indices_to_drop)
    return new_df[new_df['Publications'].isin(admissible_publications)].reset_index(drop=True)

In [None]:
# Reduce every publication to its two extreme-ranked authors, then renumber the rows.
final_df = soft_restrict_coauthors2(penultimate_df).reset_index(drop=True)
#final_df = penultimate_df
# Show the rows labelled female.
is_female = final_df['Gender'] == 'F'
final_df[is_female]

In [None]:
# Rows without an author rank (author_rank found no matching entry in the author list).
missing_rank_mask = final_df['Author Rank'].isna()
nan_df = final_df.loc[missing_rank_mask]
nan_df

In [None]:
#Test case
# Spot-check the min/max-rank selection on a single publication.
pub = '10.1103/PhysRevB.53.7890'
# Work on a separate frame: rebinding `df` here would clobber the raw publication
# table loaded at the top, and writing 'index1' in place would mutate penultimate_df.
test_df = penultimate_df.assign(index1=penultimate_df.index)
# sort_values returns a new frame, so its result has to be kept.
x_df = test_df[test_df['Publications'] == pub].sort_values(by=['Author Rank'])

imin, imax = x_df.iloc[0]['index1'], x_df.iloc[-1]['index1']
print(imin)
print(imax)
# 'index1' stores index labels, so look the row up by label.
print(test_df.loc[imin])

In [None]:
# Persist the filtered rows; the row index is written as the first CSV column.
final_df.to_csv(path_or_buf='data/aps_filtered_data.csv', index=True)