### Note to reader 
We found it helpfull to make the exercises as scripts for the different weeks, so that it is easier to run on a new device. The scripts used can be found in this github aswell and are the reason that much of our code is seperated into functions. 

In [1]:
# Setup - all imports 
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import urllib3
import time
import numpy as np
import os 

### Webscraping 

In [2]:
# This code was needed to run on certain devices / disables certain warnings that will stop the script
def disable_warnings():
    requests.packages.urllib3.disable_warnings()
    requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ':HIGH:!DH:!aNULL'
    try:
        requests.packages.urllib3.contrib.pyopenssl.util.ssl_.DEFAULT_CIPHERS += ':HIGH:!DH:!aNULL'
    except AttributeError:
    # no pyopenssl support used / needed / available
        pass
    

In [3]:
# 2019 posters 
def get_2019_posters(verbose=False):
    """
    Get all names from the webpage : https://2019.ic2s2.org/posters/
    
    Returns:
        set: A set of all unique names occuring in the page
    """
    # Get the webpage data
    LINK = "https://2019.ic2s2.org/posters/"
    r = requests.get(LINK)
    soup = BeautifulSoup(r.content, features="html.parser")

    # Get each bullet point under class "col-md-8" (all the names are here)
    text = soup.find("div", {"class": "col-md-8"})
    items = text.find_all("li")
    item_list = [str(item) for item in items]

    # Get content between > and <, and seperate at , 
    regex_compiler = re.compile("(?<=\>)(.*?)(?=\<)")

    names = [regex_compiler.findall(item)[0] for item in item_list]
    ind_names = [re.split(', | and', name) for name in names]
    persons =  []
    
    # Collect to one list
    for list in ind_names:
        for name in list:
            persons.append(name)

    # Verbose 
    people_set = set(persons)
    if verbose: print(f"There are {len(people_set)} different people and {len(persons)} name occurences in {LINK}")
    return people_set


In [4]:
# 2019 oral presentations 
def get_2019_oral(verbose=False):
    """
    Get all names from the webpage : https://2019.ic2s2.org/oral-presentations/
    
    Returns:
        set: A set of all unique names occuring in the page
    """
    def find_between( s, first, last ):
        """
        Returns the part of a string that is in the middel of first and last (substrings)

        Args:
            s (String): The string
            first (String): the start "token"
            last (String): the end "token"

        Returns:
            _type_: Substring between "first" and "last"
        """
        try:
            start = s.index( first ) + len( first )
            end = s.index( last, start )
            return s[start:end]
        except ValueError:
            return ""
    
    # Get the webpage data
    LINK = "https://2019.ic2s2.org/oral-presentations/"
    r = requests.get(LINK)
    soup = BeautifulSoup(r.content) 
    
    # All names are between these two titles 
    bet = find_between(str(soup),"1A Misinformation","Evidence of Influence Hierarchies in GitHub’s Cryptocurrency Community" )
    spli = bet.split("<p>") # each <p> has it's own section with a chairname and a list of presenters
    
    # Get the chairname 
    chair_names = []
    for i in spli:
        e = find_between(i, "Chair:", "</em>")
        chair_names.append(e)
    
    # Get the other occupants 
    new_list = []
    for d in spli:
        new_list.append(find_between(d,"</em><br/>","</p>"))
    
    newer_list = []
    #Remove known non-name including files
    for i in range(20): 
        abe = new_list[i].split("<br/>")
        for x in abe:
            x = x[16:]
            if str(x) == "No Presentation":
                pass
            else:
                newer_list.append(x)
    
    # Seperate into two schools - the ones that end with .  and the ones with - 
    # Seperate names at , and remove empty names. 
    # If there is a : then it is not a name, only take what is after. 
    names_list = []
    for i in newer_list:
        if str(i[-1]) == "–":
            lol = str(i[:-2]).split(",")
            for m in lol:
                if m != "":
                    if ":" in m:
                        m = m[m.find(":")+1:]
                    names_list.append(m)
        else:
            imp_ful = 0
            for E in range(len(i)):
                if i[E] == "." and i[E-2] != " ":
                    imp_ful = E
                    break
            namees = i[:imp_ful].split(",")
            for O in namees:
                if O != "":
                    if ":" in O:
                        O = O[O.find(":")+2:]
                    names_list.append(O)
    
    # Delete "No presentation (cancelled)" entries 
    for i in range(len(names_list)):
        if names_list[i-1] == "No presentation (cancelled)":
            names_list.pop(i-1)
    
    # Merge the two lists 
    final_list = []
    for i in names_list+chair_names:
        if i[0] == " ":
            final_list.append(i[1:])
        else:
            final_list.append(i)
        
    return set(final_list)


In [5]:
# 2020 
def get_2020_all(verbose=False):
    """
    Get all names from the webpage : "https://ic2s2.mit.edu/program"
    
    Returns:
        set: A set of all unique names occuring in the page
    """
    # Get page content 
    LINK = "https://ic2s2.mit.edu/program"
    req = requests.get(LINK, verify= False)
    soup = BeautifulSoup(req.content, features="html.parser")
    
    # Scrape link to the page with the actuel content
    text = soup.find("div", {"class": "article-content"})
    str_text = str(text).split("src=")
    docs_link = str_text[-1].split(" ")[0][1:-1]
    
    # All names are stored in the table in the class "waffle" 
    r = requests.get(docs_link)
    soup = BeautifulSoup(r.content, features="html.parser")
    table = soup.find("table", {"class": "waffle"})
    table_rows = table.find_all("tr") # Get all rows 
    
    # Go through each row and put the data into a list (row of dataframe)
    rows = []
    for tr in table_rows[1:]:
        tds = tr.find_all('td')
        row = [td.text.replace("\n","") for td in tds]
        rows.append(row)
    
    # Make the dataframe 
    df = pd.DataFrame(rows)#, columns=header[0:5])
    # Extract names from column 2 (zero indexed) starting from the 1 (zero indexed) element
    names_plus = [name.split(', ') for name in list(df.iloc[:,2][1:])]
    names = []
    for name in names_plus:
        for n in name:
            if len(n) :
                names.append(n)
    if verbose: f"There are {len(set(names))} different people and {len(names)} name occurences in {LINK}"
    return list(set(names))


In [6]:
# 2021 
def get_2021_all(verbose=False): 
    """
    Get all names from the webpage : "https://easychair.org/smart-program/IC2S2-2021/talk_author_index.html"
    
    Returns:
        set: A set of all unique names occuring in the page
    """
    # Get page content 
    LINK = "https://easychair.org/smart-program/IC2S2-2021/talk_author_index.html"
    request = requests.get(LINK)
    soup = BeautifulSoup(request.content)
    
    # All names are in the table "index" and we split at "tr"; each containing one name
    contents = soup.find("table", "index")
    contents = contents.find_all("tr")
    
    # Regex compiler that finds elements between > < (not including)
    regex_compiler = re.compile("\>(.*?)\<")
    names = set()
    counter = 0
    
    # Go through each tr (statement) and split at each td find the first name and surname
    for content in contents: 
        person = str(content.find_all("td")[0])
        titles = regex_compiler.findall(person)[2:-1]
        if len(titles) == 2: # There is not exactly two elements it is not a name (but a Alphabetic code)
            name = titles[1] + " " + titles[0]
            name = name.replace(",", "")
            name = name.strip()
            
            names.add(name) 
            counter +=1

    if verbose: f"There are {len(names)} different people and {counter} name occurences in {LINK}"
    return names


In [7]:
# Helper functions, as the datasets will need to be stored underway as they are very large
def save_data(data, file_name): 
    """
    A function to save a dictionary (or set) 

    Args:
        ids (data): data/set that needs to be stored
        file_name (str): file name 
    """
    np.save(f"{file_name}.npy", data)

def load_data(file_name):
    """
    Loads a data object (dict or set) 

    Args:
        file_name (str): file_name (without prefix)
    Returns:
        set or dict: data in the file
    """
    
    data = np.load(f"{file_name}.npy", allow_pickle=True).item()
    
    return data


In [8]:
# Scrape data from all three years 
def get_all_names(verbose=False, Body = False):
    """
    Collects all four datasets's names into one set 

    Args:
        verbose (bool, optional): True -> prints comments about how many times names appear and how many there are
    """
    if Body: disable_warnings()

    # Run the previous four methods
    names_2019_poster = get_2019_posters(verbose)
    names_2019_oral = get_2019_oral(verbose)
    names_2020_all = get_2020_all(verbose)
    names_2021_all = get_2021_all(verbose)
    
    # Collect into one set
    names_all = set()
    names_all.update(names_2019_poster, names_2019_oral, names_2020_all, names_2021_all)
    
    if verbose: print(f"There are {len(get_all_names())} unique names in total")
    return names_all

# To not repeat this time consuming process we save the results and load them if possible
science_people_file_name = "names_week_1"
if os.path.isfile(science_people_file_name + ".npy"):
    science_people = load_data(science_people_file_name)
else: 
    science_people = get_all_names()
    save_data(science_people, science_people_file_name)

print(f"In total we have {len(science_people)} unique computational social scientists across the three years")



In total we have 2133 unique computational social scientists across the three years


In [9]:
# How many unique authors are there in 2019 split across oral and poster? 
authors_oral_2019 = get_2019_oral(verbose=False)
authors_poster_2019 = get_2019_posters(verbose=False)

print(f'There are {len(authors_oral_2019)} unique authors in the oral presentation for 2019')
print(f'There are {len(authors_poster_2019)} unique authors in the poster presentation for 2019')
print(f'There are {len(set(authors_oral_2019).union(authors_poster_2019))} unique authors in total')

There are 355 unique authors in the oral presentation for 2019
There are 471 unique authors in the poster presentation for 2019
There are 774 unique authors in total


#### Conclusion from week 1
We have found 774 unique authors in the year 2019 with webscraping, and across all three years we have 2133 authors. 

### Find coauthors 

In [12]:
# Function for getting all coauthors to a set of names  
def get_ids_and_coauthors(names, file_name, load_previous=False, verbose=False):
    """
    Takes in a set of names and returns a set of ids of the author and the coauthors to the authors papers

    Args:
        names (set): names of the authors
        file_name (str): file name of where to save progress
        verbose (Boolean): whether the function should speak or not
        
    Returns: 
        ids (set): Set of author ids for all "names" and coauthors on all papers of "names"
        nin_names (set): Set of names that was not in the sematic scholar database 
    """ 
    
    # Base address for requests
    BASE_URL = "https://api.semanticscholar.org/graph/"
    VERSION  = "v1/"
    RESOURCE = "author/search?query="
    ADDITION = "&fields=papers.authors"
    complete_url = BASE_URL + VERSION + RESOURCE
    
    # The set of ids 
    ids = set()
    evaluated_names = set() 
    nin_names = set()
    
    # Check if the problem has been worked on previously 
    if load_previous:
        try:    
            ids = load_data(file_name=file_name)
            evaluated_names = load_data(file_name=file_name + "_evaluated_names")
            nin_names = load_data(file_name=file_name + "_nin_names")
            if verbose: print(f"{len(evaluated_names)} already evaluated of {len(names)}") 
            names = names - evaluated_names
            if verbose:  print(f"Hence there are {len(names)} left ")
        except:
            if verbose : print(f"There are not any progress previously achieved")
        
    
    # Loop over authors 
    for i, name in enumerate(names):
        # We can only do 150 request each five minuts, use the down time to save progress 
        if i % 150 == 149: 
            start_time = time.time()
            # printing 
            if verbose: 
                print(f"Completed searches for {i} out of {len(names)}, but reached limit")
            # Save prograss 
            save_data(ids, file_name=file_name)
            save_data(evaluated_names, file_name=file_name + "_evaluated_names")
            save_data(nin_names, file_name=file_name + "_nin_names")
            time.sleep(60*5+10 + start_time - time.time()) # the +10 is a buffer 
        
        # Make request 
        # print(complete_url + name + ADDITION) # Debugging
        response = requests.get(complete_url + name + ADDITION).json()
        
        # If something goes wrong, it will be reported here 
        try: 
            for paper in response["data"][0]["papers"]: 
                for author in paper["authors"]: 
                    ids.add(author["authorId"]) 
        except: # Usually only occurs if the author has not realeased any papers or is not found 
            print(f"The error occured at search number {i}, the name {name} and the response is: \n {response}")
            nin_names.add(name)
        # In either case the name has been evaluated
        evaluated_names.add(name) 
               
    # Just to not mess whith the other parts of the code (amount of requests)
    if len(names) % 150 > 50: 
        time.sleep(60*5) 
    
    # Save progress for next time 
    save_data(ids, file_name=file_name)
    save_data(evaluated_names, file_name=file_name + "_evaluated_names")
    save_data(nin_names, file_name=file_name + "_nin_names")
            
    return ids, nin_names


In [13]:
# Get all coauthors to the 2133 authors found in week 1 
ids, nin_names = get_ids_and_coauthors(science_people, file_name="ids_dict", load_previous=True, verbose=True) 

print(f"We have found {len(ids)} coauthors of the {len(science_people)} from week 1")
print(f"Of the {len(science_people)} authors, {len(nin_names)} where not found")

There are not any progress previously achieved
The error occured at search number 0, the name  and the response is: 
 {'error': "Missing required parameter: 'query'"}
The error occured at search number 7, the name Mu-jung Cho and the response is: 
 {'total': 0, 'offset': 0, 'data': []}
The error occured at search number 50, the name Ho-Chun Herbert Chang and the response is: 
 {'total': 0, 'offset': 0, 'data': []}
The error occured at search number 53, the name Steve R. Scheinert and the response is: 
 {'total': 0, 'offset': 0, 'data': []}
The error occured at search number 82, the name Deniz Gezerli and the response is: 
 {'total': 0, 'offset': 0, 'data': []}
The error occured at search number 99, the name Mohammed Aleinzi and the response is: 
 {'total': 0, 'offset': 0, 'data': []}
The error occured at search number 127, the name Sandor Lera and the response is: 
 {'total': 0, 'offset': 0, 'data': []}
The error occured at search number 137, the name Aaron Cluaset and the response is:

#### Conclusion and thoughts 
We see that we have now gathered a dataset of more than 100000 authors, which is quite a lot. We have the issue that we can only make about 150 requests to semantic scholar every five minutes and hence we need to batch our id searches, as the process will otherwise take to long. 

This is however not easy as semantic scholar can only handle batches of size 100 when we are also asking for the papers written by each author. 

##### Thoughts after initial struggles 
The amount of data that we want to gather from sematic scholar is quite large, as a dictionary containing the first 10000 author id's as keys takes up 1.7 GB of data. 
Furthermore, semantic scholar can often not handle batches of size 100 if there is two much data in the batch, hence the error handeling in the next couple of methods. 

In [14]:
# Helper function for formatting authors into a dataframe
def format_authors(ids_dict):
    """
    Formats a dictionary with all data of the authors into a simple dataframe

    Args:
        ids_dict (dict): a dictionary with author ids and their papers

    Returns:
        df: a dataframe of the given authors with the data; id, name, alias, citationCount, field
    """
    # Create people dataframe 
    zero_data = np.zeros((len(ids_dict.keys()), 5))
    zero_data[:] = np.nan
    df = pd.DataFrame(zero_data, columns=["id", "name", "aliases", "citationCount", "field"])

    for i, id in enumerate(ids_dict.keys()):
        # ID
        df["id"][i] = id 
        information = ids_dict[str(id)]
        # Name
        df["name"][i] = information["name"]
        # Aliases
        df["aliases"][i] = information["aliases"] 
        # citation count 
        citation_count = 0
        for paper in information["papers"]: 
            citation_count += paper["citationCount"]
        df["citationCount"][i] = citation_count
        # field - count each occurence and take the maximum 
        potential_fields = {}
        for paper in information["papers"]: 
            for fields in paper["s2FieldsOfStudy"]:
                field = fields["category"]
                try:
                    potential_fields[field] += 1
                except: 
                    potential_fields[field] = 1
        if potential_fields == {}: 
            pass
        else: 
            # This is a spicy way to do this o.0
            df["field"][i] = max(potential_fields, key=potential_fields.get)

    return df


In [15]:
# Helper function for formatting papers into a dataframe
def format_papers(ids_dict):
    """
    Formats a dictionary with all data of the authors into a simple dataframe

    Args:
        ids_dict (dict): a dictionary with author ids and their papers

    Returns:
        df: a dataframe of the given authors's papers with the data; id, title, year, DOI, citationCount, field, authors
    """
    # Start by making a dictionary of papers instead of authors
    papers = {}
    for id in ids_dict.keys():
        # get the papers 
        information = ids_dict[str(id)]
        for paper in information["papers"]:
            if paper["paperId"] in papers:
                pass
            else: 
                # Find the author id's
                authors = set()
                for author in paper["authors"]:
                    authors.add(author["authorId"])
                # Make new elements in the dictionary 
                papers[paper["paperId"]] = {"id": paper["paperId"],
                                            "title": paper["title"], 
                                            "year": paper["year"], 
                                            "doi": paper["externalIds"],
                                            "citationCount": paper["citationCount"], 
                                            "field": paper["s2FieldsOfStudy"], 
                                            "authors": authors}
    # Create the paper dataframe 
    zero_data = np.zeros((len(papers.keys()), 7))
    zero_data[:] = np.nan
    df = pd.DataFrame(zero_data, columns=["id", "title", "year", "doi", "citationCount", "field", "authors"])

    for i, id in enumerate(papers.keys()):
        # ID
        df["id"][i] = id
        information = papers[str(id)]
        # title
        df["title"][i] = information["title"]
        # Aliases
        df["year"][i] = information["year"] 
        # DOI 
        df["doi"][i] = [information["doi"]] # Can't have a dict, but it is okay to wrap it with list
        # citation count 
        df["citationCount"][i] = information["citationCount"]
        # field 
        df["field"][i] = information["field"] # Does this work? Yes somehow 
        # authors 
        df["authors"][i] = list(information["authors"])

    return df


In [16]:
# Helper function for formatting paper abstracts into a dataframe
def format_paper_abstracts(ids_dict):
    """
    Formats a dictionary with all data of the authors into a simple dataframe

    Args:
        ids_dict (dict): a dictionary with author ids and their papers

    Returns:
        df: a dataframe of the given authors's papers with the data; id, abstract
    """
    # Start by making a dictionary of papers instead of authors
    papers = {}
    for id in ids_dict.keys():
        # get the papers 
        information = ids_dict[str(id)]
        for paper in information["papers"]:
            if paper["paperId"] in papers:
                pass
            else:
                # Make new elements in the dictionary 
                papers[paper["paperId"]] = {"id": paper["paperId"],
                                            "title": paper["abstract"]}
    # Create the paper dataframe 
    zero_data = np.zeros((len(papers.keys()), 2))
    zero_data[:] = np.nan
    df = pd.DataFrame(zero_data, columns=["id", "abstract"])

    for i, id in enumerate(papers.keys()):
        # ID
        df["id"][i] = id
        information = papers[str(id)]
        # abstract
        df["abstract"][i] = information["abstract"]

    return df


In [17]:
# Helper function that formats a dictionary into dataframes and stores them
def create_dataframes(ids_dict, prefix="", verbose=False):
    """
    Takes in a dictionary of authors and their papers and generates two datasets and stores these

    Args:
        ids_dict (dict): The key is an auther id, the contents is a dictionary with three atributes, "name", "aliases" and "papers".
        prefix (str): If the function will be called multiple times, this is to not overwrite previous stored files
    
    Return: 
        df_author: The above specified dataframe for authors
        df_paper: The above specified dataframe for papers
    """
    # Create dataframes
    # Authors 
    df_author = format_authors(ids_dict)
    if verbose: print("Formatted author dataframe")
    pd.DataFrame.to_csv(df_author, f"df_author{prefix}.csv")
    if verbose: print("Saved author dataframe")
    
    # Papers
    df_paper = format_papers(ids_dict)
    if verbose: print("Formatted paper dataframe")
    pd.DataFrame.to_csv(df_author, f"df_paper{prefix}.csv")
    if verbose: print("Saved paper dataframe")
    
    # Paper abstracts 
    df_paper = format_paper_abstracts(ids_dict)
    if verbose: print("Formatted paper dataframe")
    pd.DataFrame.to_csv(df_author, f"df_paper{prefix}.csv")
    if verbose: print("Saved paper dataframe")
    
    return df_author, df_paper


In [18]:
# The function that finds all papers from each author
def get_data_from_ids(ids, verbose=False, load_previous=True, file_name="ids_enumerated_dict"): 
    """
    Returns a complete data frame of 
    authors (id, name, alias, citationCount, field) and
    papers (id, title, year, DOI, citationCount, field, authors)

    Args:
        ids (set): ids of the authors in question

    Returns:
        file_extension (int): The number of dataframes created
        nin_ids (set): The set of ids that could not be processed  
    """
    ids_dict = {} # This will continually be reset, otherwise it would take up to much space
    evaluated_ids = set()
    nin_ids = set()
    ids_dict["file_extension"] = 0
    
    # Check if the problem has been worked on previously 
    if load_previous:
        try: 
            ids_dict = load_data(file_name=file_name)
            evaluated_ids = load_data(file_name=file_name + "_evaluated_ids")
            nin_ids = load_data(file_name=file_name + "_nin_ids")
            if verbose: print(f"{len(evaluated_ids)} already evaluated of {len(ids)}") 
            ids = ids - evaluated_ids
            if verbose: print(f"Hence there are {len(ids)} left ")
        except:
            print(f"There are no previous progress made")
    
    # Partition the ids into batches of 20 ids, because semantic scholar can only take that many 
    ids = list(ids) # temporary to get results
    default_batch_size = 64
    batch_size = default_batch_size
    n_batches = len(ids) // batch_size + 1
    batches_left = True 
    sent_requests = 0
    index = 0 
    if verbose: print(f"Total number of batches are {n_batches}")
        
    #  Current file name extension
    file_extension = ids_dict["file_extension"]
    file_start = 0 
    file_size = 8000 # Hope this is small enough 
    
    # Use a while loop to go through each batch of ids so that we can change sizes dynamically
    # (This stems from the fact that semantic scholar will return errors if we ask for too much data)
    while(batches_left):
        # To avoid memory overflow convert to pandas 
        if index > file_start + file_size: 
            # Note that the return dataframes from create_dataframes are not used, as we do not have memory enough to keep them
            # (they are only stored to physical memory)
            create_dataframes(ids_dict=ids_dict, prefix=file_extension, verbose=verbose)
            file_start += file_size
            file_extension += 1
            ids_dict = {}
            ids_dict["file_extension"] = file_extension
        
        # Make request for batch 
        batch_url = "https://api.semanticscholar.org/graph/v1/author/batch"
        data = {"ids": ids[index:min(index + batch_size, len(ids))]}
        params = {"fields": "aliases,papers.title,papers.year,papers.externalIds,papers.s2FieldsOfStudy,papers.citationCount,papers.abstract,name,papers.authors"}
        response = requests.post(batch_url, json=data, params=params).json()
        sent_requests += 1
        
        # Assert the response
        if response == {'message': 'Internal server error'}:
            if verbose: print(f"Server error for index {index} with batch size {batch_size}")
            batch_size = batch_size // 2 # Half batch size and try again 
            
            if batch_size == 0: 
                # Save the faulty element that makes semantic scholar give internal errors 
                # and proceed to the next one
                if verbose: print(f"Had to remove {index} which is {ids[index]}")
                batch_size = 1 
                evaluated_ids.update(set(ids[index:min(index + batch_size, len(ids))]))
                nin_ids.update(set(ids[index:min(index + batch_size, len(ids))]))
                index += batch_size 
        else: 
            # If there is something wrong with the request
            try: 
                for person in response: 
                    # Update dictionary 
                    # If something goes wrong, it will be reported here 
                    try: 
                        ids_dict[person["authorId"]] = {"name": person["name"], 
                                "aliases": person["aliases"],
                                "papers": person["papers"]}
                    except: # Usually only occurs if the author has not realeased any papers or is not found 
                        print("Something is wrong with this person (usually it is a None value somehow?)")
                        print(f"The index is {index} with batch size {batch_size}")
            except: 
                # If it messes up print the request and put the ids in nin 
                if verbose:  
                    print(response)
                    print(f"The index is {index} with batch size {batch_size}")
                nin_ids.update(set(ids[index:min(index + batch_size, len(ids))]))
            
            # Update processed ids 
            evaluated_ids.update(set(ids[index:min(index + batch_size, len(ids))]))
            index += batch_size
            batch_size = default_batch_size
        
        # If there has been too many request we need to break - we can probably skip this now because requests take so long 
        if sent_requests % 150 == 149: 
            start_time = time.time()
            # printing 
            if verbose: 
                print(f"Completed searches for {index} out of {len(ids)}, but reached limit")
            # Save prograss 
            save_data(ids_dict, file_name=file_name)
            save_data(evaluated_ids, file_name=file_name + "_evaluated_ids")
            save_data(nin_ids, file_name=file_name + "_nin_ids")
            time.sleep(max(60*5+10 + start_time - time.time(), 0)) # the +10 is a buffer
        
    # Create and save the final dataframes 
    create_dataframes(ids_dict=ids_dict, prefix=file_extension, verbose=verbose)
    
    return file_extension, nin_ids


In [None]:
# Generate all dataframes 
n_files, nin_ids = get_data_from_ids(ids, load_previous=True, verbose=True)

print(f"Of the {len(ids)} authors, {len(nin_ids)} could not be found in the dataset")
print(f"There are {n_files} dataframes of authors, papers and paper abstracts that needs to be merged")

In [None]:
# Merge dataframes 
# USE drop_duplicates after merging all the n_files dataframes. 