In [68]:
import pandas as pd
import ast
import networkx as nx
import numpy as np
from tqdm import tqdm


In [69]:
# Function that takes in list of authorIds and outputs the corresponding top fields in the same order 
def author_field(ids, df_author):
    """
    Compute the top field of the given authors 

    Args:
        ids (list): containing the ids of the authors in question
        df_author (pandas df): data frame of type author that contains the given ids 
    
    return: 
        fields (list): list of top fields of the given authors 

    """
    # Get boolean array that indicates where the authors are 
    mask = df_author["id"].isin(ids)
    
    # Get the fields 
    fields = list(df_author["field"][mask]) 
    
    return fields 


In [70]:
def argument_paper_dataframe(df_papers, df_author):
    """
    Takes in a paper data frame, arguments it with a new column and puts the fields of the author in that column.

    Args:
        df_papers (pandas dataframe): paper dataframe (like before)
        df_author (pandas dataframe): author dataframe (like before)

    Returns:
        df_papers (pandas dataframe): the dataframe from before, argumentet with the new column
    """
    df_papers["author_field"] = None 

    for index, row in tqdm(df_papers.iterrows()):
        # Currently authors are stored as a string representation of the list so we make it a list again 
        authors = ast.literal_eval(row["authors"]) 
        # Now we need to turn the authors into a list of integers, because the df_paper dataframe stores them as such 
        authors = [eval(id) for id in authors if id is not None]
        #authors = [eval(id) for id in authors]
        # Find the fields of the authors 
        authors_fields = author_field(authors, df_author=df_author)
        df_papers["author_field"][index] = authors_fields
        # print(authors_fields) # debugging 
    # print(df_papers.head()) # debugging 
        
    return df_papers 


In [71]:
# Clean the paper datafram to a social science dataframe
def generate_CCS_papers_1(df_papers, social_science_fields, quantitative_fields, verbose=False):
    
    rows_to_drop = []
    
    # Drop paper if the fields are not not included in social science fields
    for index, row in df_papers.iterrows():
        is_in_SCF = False 
        # Go through the entire list of fields for each paper 
        try: 
            for field in ast.literal_eval(row["field"]):
                if field["category"] in social_science_fields:
                    is_in_SCF = True
        except: 
            is_in_SCF = False 
        
        # if the paper was not in social science fields drop it 
        if not is_in_SCF: 
            rows_to_drop.append(index)
    if verbose: print(f"{len(rows_to_drop)} papers removed because thier fields not in Social Science Fields. ({len(rows_to_drop)/len(df_papers)*100:.0f}%)")
    df_papers.drop(index=rows_to_drop, inplace=True) 
    rows_to_drop = []
    
    # Drop rows that are before 2008
    for index, row in tqdm(df_papers.iterrows()):
        if row["year"] <= 2008: 
            rows_to_drop.append(index)
    if verbose: print(f"{len(rows_to_drop)} papers removed because they were to old. ({len(rows_to_drop)/len(df_papers)*100:.0f}%)")
    df_papers.drop(index=rows_to_drop, inplace=True) 
    rows_to_drop = []
    
        
    # Drop rows that do not contain a DOI
    for index, row in df_papers.iterrows():
        if row["doi"] == None: 
            rows_to_drop.append(index)
        else: 
            try:
                ast.literal_eval(row["doi"])[0]["DOI"]
            except: 
                rows_to_drop.append(index)
    if verbose: print(f"{len(rows_to_drop)} papers removed because they did not have a DOI. ({len(rows_to_drop)/len(df_papers)*100:.0f}%)")
    df_papers.drop(index=rows_to_drop, inplace=True) 
    rows_to_drop = []
    
    # Drop paper if it includes biology
    for index, row in df_papers.iterrows():
        # Go through the entire list of fields for each paper 
        for field in ast.literal_eval(row["field"]):
            if field["category"] == "Biology":
                rows_to_drop.append(index)
    if verbose: print(f"{len(rows_to_drop)} papers removed because biology was in the field. ({len(rows_to_drop)/len(df_papers)*100:.0f}%)")
    df_papers.drop(index=rows_to_drop, inplace=True) 
    rows_to_drop = []
    
    return df_papers


# It is time consuming to add a row in the dataframe, hence it helps that the dataframe is 100 times smaller 
def generate_CCS_papers_2(df_papers, social_science_fields, quantitative_fields, verbose=False):
    
    rows_to_drop = []
    
    # Drop the papers with more than 9 Computational Social Science authors? TODO what does she mean! 
    for index, row in df_papers.iterrows():
        if len(row["author_field"]) > 9: # Count authors, should I check if they are in the author data frame? That's gonna take a while
            rows_to_drop.append(index)
    if verbose: print(f"{len(rows_to_drop)} papers removed because there are more than 9 CSS authors. ({len(rows_to_drop)/len(df_papers)*100:.0f}%)")
    df_papers.drop(index=rows_to_drop, inplace=True) 
    rows_to_drop = []
    
    # Drop paper if the fields are not not included in quantitative data and authors aren't either
    for index, row in df_papers.iterrows():
        is_in_SCF = False 
        # Go through the entire list of fields for each paper 
        for field in ast.literal_eval(row["field"]):
            if field["category"] in quantitative_fields:
                is_in_SCF = True
        
        # Check if the authors are in the quantitative_fields
        for author_field in row["author_field"]: 
            if author_field in quantitative_fields: 
                is_in_SCF = True
       
        # if the paper was not in social science fields drop it 
        if not is_in_SCF: 
            rows_to_drop.append(index)
    if verbose: print(f"{len(rows_to_drop)} papers removed because thier fields and authors are not in Quantitative Fields. ({len(rows_to_drop)/len(df_papers)*100:.0f}%)")
    df_papers.drop(index=rows_to_drop, inplace=True) 
    rows_to_drop = []
    
    return df_papers
    

In [72]:
# Heuristic 
social_science_fields = {"Political Science", "Sociology", "Economics"}
quantitative_fields = {"Mathematics", "Physics", "Computer Science"}

# Load dataframes
df_author = pd.read_csv("df_author.csv")
df_papers = pd.read_csv("df_paper.csv")

# Drop papers, but not based on the authors
n_papers_before = len(df_papers)

print(f"Initially we have {n_papers_before} papers.")
df_papers = generate_CCS_papers_1(df_papers, social_science_fields, quantitative_fields, verbose=True)

# Papers removed so far 
print(f"There are {len(df_papers)} papers left of {n_papers_before} which is {100*len(df_papers)/n_papers_before:.2f}%") 

Initially we have 1078817 papers.
1010697 papers removed because thier fields not in Social Science Fields. (94%)


68120it [00:02, 28072.65it/s]


20011 papers removed because they were to old. (29%)
11330 papers removed because they did not have a DOI. (24%)
98 papers removed because biology was in the field. (0%)
There are 36681 papers left of 1078817 which is 3.40%


In [73]:
# Now the time consuming part 
# Argment paper dataframe 
df_papers = argument_paper_dataframe(df_papers=df_papers, df_author=df_author)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_papers["author_field"][index] = authors_fields
36681it [00:44, 821.19it/s]


In [74]:
# Drop papers based on authors 
df_papers = generate_CCS_papers_2(df_papers, social_science_fields, quantitative_fields, verbose=True)

print(f"There are {len(df_papers)} papers left of {n_papers_before} which is {100*len(df_papers)/n_papers_before:.2f}%") 

165 papers removed because there are more than 9 CSS authors. (0%)
31084 papers removed because thier fields and authors are not in Quantitative Fields. (85%)
There are 5432 papers left of 1078817 which is 0.50%


In [75]:
# Remove duplicates 
n_papers = len(df_papers)
df_papers.drop(columns=["Unnamed: 0", "Unnamed: 0.1"], inplace=True) # weird columns that were somehow created? 
df_papers.drop_duplicates(subset=["id"], inplace=True)
print(f"Removed {n_papers - len(df_papers)} that were duplicates. ({100*(n_papers - len(df_papers)) / n_papers :.2f}%)")
print(f"There are {len(df_papers)} left")

Removed 403 that were duplicates. (92.58%)
There are 5029 left


In [76]:
# Save the agumentet dataframe 
pd.DataFrame.to_csv(df_papers, "df_CSS_paper.csv")

# TODO Check how many unique authors have written these papers. 

In [86]:
# Print 10 top papers 
df_papers.sort_values(by=["citationCount"], ascending=False, inplace=True)
for i, paper in df_papers.head(10).iterrows():
    print(f"'{paper['title']}' has {int(paper['citationCount'])} citations.")

'CRITICAL QUESTIONS FOR BIG DATA' has 3422 citations.
'I tweet honestly, I tweet passionately: Twitter users, context collapse, and the imagined audience' has 3042 citations.
'Exposure to ideologically diverse news and opinion on Facebook' has 1925 citations.
'The sharing economy: Why people participate in collaborative consumption' has 1901 citations.
'The role of social networks in information diffusion' has 1430 citations.
'Social Network Sites as Networked Publics: Affordances, Dynamics, and Implications' has 1349 citations.
'Multiscale mobility networks and the spatial spreading of infectious diseases' has 1156 citations.
'The Leverage Cycle' has 1145 citations.
'The ethics of algorithms: Mapping the debate' has 973 citations.
'The Presentation of Self in the Age of Social Media: Distinguishing Performances and Exhibitions Online' has 954 citations.


In [85]:
# Get unique authors 
authors = set() 

for i, paper in df_papers.iterrows(): 
    paper_authors = set(ast.literal_eval(paper["authors"]))
    authors.update(paper_authors)
    
if None in authors: # If no is in the author list 
    authors.remove(None)

authors = set([int(id) for id in authors]) # convert strings to ints to compare them
total_CSS_authors = set([int(author) for author in df_author["id"]])

CSS_authors = authors & total_CSS_authors
    
print(f"From the {len(df_papers)} papers there are {len(authors)} unique authors, but only {len(CSS_authors)} are from our original data frame.")

From the 5029 papers there are 13338 unique authors, but only 1466 are from our original data frame.
