In [2]:
import pandas as pd
import ast
import networkx as nx
import numpy as np


In [3]:
# Function that takes in list of authorIds and outputs the corresponding top fields in the same order 
def author_field(ids, df_author):
    """
    Compute the top field of the given authors 

    Args:
        ids (list): containing the ids of the authors in question
        df_author (pandas df): data frame of type author that contains the given ids 
    
    return: 
        list of top fields of the given authors 
    """
    # Get boolean array that indicates where the authors are 
    mask = df_author["id"].isin(ids)
    
    # Get the fields 
    fields = list(df_author["field"][mask])
    
    return fields 

# # Debugging 
# author_df = pd.read_csv("df_author0.csv")
# fields = author_field(author_df["id"][:6], author_df=author_df) 
# print(author_df["field"][:6])
# print(fields) 


In [76]:
def argument_paper_dataframe(df_papers, df_author):
    """
    Takes in a paper data frame, arguments it with a new column and puts the fields of the author in this column.

    Args:
        df_papers (pandas dataframe): paper dataframe (like before)
        df_author (pandas dataframe): author dataframe (like before)

    Returns:
        df_papers (pandas dataframe): the dataframe from before, argumentet with a new column
    """
    df_papers["author_field"] = None 

    for index, row in df_papers.iterrows():
        # Currently authors are stored as a string representation of the list so we make it a list again 
        authors = ast.literal_eval(row["authors"]) 
        # Now we need to turn the authors into a list of integers, because the df_paper dataframe stores them as such 
        authors = [eval(id) for id in authors if id is not None]
        #authors = [eval(id) for id in authors]
        # Find the fields of the authors 
        authors_fields = author_field(authors, df_author=df_author)
        df_papers["author_field"][index] = authors_fields
        # print(authors_fields) # debugging 
    # print(df_papers.head()) # debugging 
        
    return df_papers 


In [79]:
# Clean the paper datafram to a social science dataframe
def generate_CCS_papers(df_papers, social_science_fields, quantitative_fields, verbose=False):
    
    rows_to_drop = []
    
    # Drop rows that are before 2008
    for index, row in df_papers.iterrows():
        if row["year"] > 2008: 
            rows_to_drop.append(index)
    if verbose: print(f"{len(rows_to_drop)} papers removed because they were to old. ({len(rows_to_drop)/len(df_papers)*100:.0f}%)")
    df_papers.drop(index=rows_to_drop, inplace=True) 
    rows_to_drop = []
    
    # Drop the papers with more than 9 Computational Social Science authors? TODO what does she mean! 
    for index, row in df_papers.iterrows():
        if len(ast.literal_eval(row["authors"])) > 9: # Count authors, should I check if they are in the author data frame? That's gonna take a while
            rows_to_drop.append(index)
    if verbose: print(f"{len(rows_to_drop)} papers removed because there are more than 9 authors. ({len(rows_to_drop)/len(df_papers)*100:.0f}%)")
    df_papers.drop(index=rows_to_drop, inplace=True) 
    rows_to_drop = []
        
    # Drop rows that do not contain a DOI
    for index, row in df_papers.iterrows():
        if row["doi"] == None: 
            rows_to_drop.append(index)
        else: 
            try:
                ast.literal_eval(row["doi"])[0]["DOI"]
            except: 
                rows_to_drop.append(index)
    if verbose: print(f"{len(rows_to_drop)} papers removed because they did not have a DOI. ({len(rows_to_drop)/len(df_papers)*100:.0f}%)")
    df_papers.drop(index=rows_to_drop, inplace=True) 
    rows_to_drop = []
    
    # Drop paper if it includes biology
    for index, row in df_papers.iterrows():
        # Go through the entire list of fields for each paper 
        for field in ast.literal_eval(row["field"]):
            if field["category"] == "Biology":
                rows_to_drop.append(index)
    if verbose: print(f"{len(rows_to_drop)} papers removed because biology was in the field. ({len(rows_to_drop)/len(df_papers)*100:.0f}%)")
    df_papers.drop(index=rows_to_drop, inplace=True) 
    rows_to_drop = []
    
    # Drop paper if the fields are not not included in social science fields
    for index, row in df_papers.iterrows():
        is_in_SCF = False 
        # Go through the entire list of fields for each paper 
        for field in ast.literal_eval(row["field"]):
            if field["category"] in social_science_fields:
                is_in_SCF = True
        
        # if the paper was not in social science fields drop it 
        if not is_in_SCF: 
            rows_to_drop.append(index)
    if verbose: print(f"{len(rows_to_drop)} papers removed because thier fields not in Social Science Fields. ({len(rows_to_drop)/len(df_papers)*100:.0f}%)")
    df_papers.drop(index=rows_to_drop, inplace=True) 
    rows_to_drop = []
    
    # Drop paper if the fields are not not included in quantitative data and authors aren't either
    for index, row in df_papers.iterrows():
        is_in_SCF = False 
        # Go through the entire list of fields for each paper 
        for field in ast.literal_eval(row["field"]):
            if field["category"] in quantitative_fields:
                is_in_SCF = True
        
        # Check if the authors are in the quantitative_fields
        for author_field in row["author_field"]: 
            if author_field in quantitative_fields: 
                is_in_SCF = True
       
        # if the paper was not in social science fields drop it 
        if not is_in_SCF: 
            rows_to_drop.append(index)
    if verbose: print(f"{len(rows_to_drop)} papers removed because thier fields and authors are not in Quantitative Fields. ({len(rows_to_drop)/len(df_papers)*100:.0f}%)")
    df_papers.drop(index=rows_to_drop, inplace=True) 
    rows_to_drop = []
    
    return df_papers
    

In [80]:
# Heuristic 
social_science_fields = {"Political Science", "Sociology", "Economics"}
quantitative_fields = {"Mathematics", "Physics", "Computer Science"}

# Load dataframes
df_author = pd.read_csv("df_author.csv")
df_papers = pd.read_csv("df_paper.csv")

# Argment paper dataframe 
df_papers = argument_paper_dataframe(df_papers=df_papers, df_author=df_author)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_papers["author_field"][index] = authors_fields


In [81]:
# Remove invalid entries 
n_papers_before = len(df_papers)
df_papers = generate_CCS_papers(df_papers, social_science_fields, quantitative_fields, verbose=True)

print(f"There are {len(df_papers)} papers left of {n_papers_before} which is {100*len(df_papers)/n_papers_before:.2f}%") 

234018 papers removed because they were to old. (68%)
13109 papers removed because there are more than 9 authors. (12%)
28818 papers removed because they did not have a DOI. (30%)
14192 papers removed because biology was in the field. (21%)
50109 papers removed because thier fields not in Social Science Fields. (93%)
3214 papers removed because thier fields and authors are not in Quantitative  Fields. (86%)
There are 540 papers left of 344000 which is 0.16%


In [82]:
pd.DataFrame.to_csv(df_papers, "df_CSS_paper.csv")