# Stanford Encyclopedia of Philosophy - Diversity Analysis
## Data Extraction, Preprocessing and Cleaning

We begin by loading the libraries required for this analysis, as well as the function we defined in an external python file.

In [1]:
import math

import pandas as pd
import requests
import unidecode
from bs4 import BeautifulSoup

from sep_functions import extract_year, extract_authors, extract_title, get_bib_elements

### 1. Scraping the *Stanford Encyclopedia of Philosophy* 

We begin by scraping all the URLs to articles on the Stanford Encyclopedia of Philosophy from its "contents" page.

In [2]:
# Url with overview of all the entries
url = "https://plato.stanford.edu/contents.html"

# Load html content, get page text, parse HTML
r = requests.get(url)
page = r.text
soup = BeautifulSoup(page, 'html.parser')

# Find <a href> elements which begin with "plato.stanford.edu/entries"
links = soup.findAll('a')
entries = []
for link in links:
    try:
        link_text = link.get("href")
        # If the link text is an entry, and it hasn't been added to the list before
        if "entries/" in link_text  and link_text not in entries:
            entries.append(link_text) 
    except:
        pass
    
print(len(entries))    

1740


In [4]:
# Initialize list of bib elements
bib_elements = []

# Initialize a count for the entries
num_entry = 1

# Iterate through the entries and extract all bib elements
for entry in entries:
    print(f"Extracting from entry #{num_entry}: {entry}")
    num_entry +=1
    url = f"https://plato.stanford.edu/{entry}"
    li_elements = get_bib_elements(url)
    print(f"Appending {len(li_elements)} references")
    for li_element in li_elements:
        bib_elements.append(li_element)

Extracting from entry #1: entries/abduction/
Appending 111 references
Extracting from entry #2: entries/abelard/
Appending 89 references
Extracting from entry #3: entries/abhidharma/
Appending 67 references
Extracting from entry #4: entries/abilities/
Appending 77 references
Extracting from entry #5: entries/abner-burgos/
Appending 43 references


In [None]:
# Save the raw reference data to txt file
textfile = open("data/bib_items_complete.txt", "w")
for element in bib_elements:
    textfile.write(str(element) + "\n")
textfile.close()

### 2. Turning the raw text into structured data

In [5]:
# Load the data with the raw references
filepath = "data/bib_items_complete.txt"

# Import references from raw text file
file = open(filepath, "r")
content = file.read()

# Split the content where a li element ends and a new line starts
references = content.split("</li>\n")
file.close()

In [4]:
# Initialize lists to store publication years, author lists, and titles in, as well as different counts
years, authors, titles = [], [], []
yearless = 0
count = 0
fail_count = 0

# Iterate through references
for reference in references:
    
    try:
        # Get publication year
        year = extract_year(reference)
        # Skip if there's no year
        if not year:
            print(f"This is number {count}:")
            print("Yearless")
            print(reference)
            yearless+=1
            continue
        
        # Get publication name
        title  = extract_title(reference)
         
        # Get author names
        # If the author name is ––– (indicating its the same as in the previous ref)
        # grab the author from before (last item in authors list)
        if "–––" in reference:
            names = authors[-1]
        else:
            names = extract_authors(reference, year)
        
        # Append all
        years.append(year)
        titles.append(title)
        authors.append(names)
        
        count+=1
        print(f"{year}, {title}, {names}")
        
    except:
        print(f"This is number {count}:")
        print("General Failure")
        print(reference)
        fail_count+=1
        
# Make sure the list of authors and the list of years is equally long
assert len(authors)==len(years) and len(titles)==len(years)

2001, The Book of Evidence, ['P. Achinstein']
1994, Testimony, Trust, Knowing, ['J. Adler']
1979, Linguistic Communication and Speech Acts, ['K. Bach', ' R.  Harnish']
1998, Philosophy of Science, ['A. Bird']
2010, Quine, Mereology, and Inference to the Best Explanation, ['J. Bigelow']


In [None]:
# Turn into pandas dataframe
df = pd.DataFrame({'Authors' : authors,
                   'Year' : years,
                   'Title' : titles }, 
                  columns=['Authors','Year', 'Title'])

# Write to csv
df.to_csv("data/sep_reference_data.csv")

### 3. Preprocessing the data

In [54]:
# Load data
df = pd.read_csv("data/sep_reference_data.csv")[["Authors", "Year", "Title"]]

# Save initial number of columns
initial_len = len(df)

# Reformat Authors column
df["Authors"] = df["Authors"].apply(lambda x: x[2:-2].replace("'",""))
print(f"Data set with {initial_len} rows.")
df.head()

Data set with 180403 rows.


Unnamed: 0,Authors,Year,Title
0,P. Achinstein,2001,The Book of Evidence
1,J. Adler,1994,"Testimony, Trust, Knowing"
2,"K. Bach, R. Harnish",1979,Linguistic Communication and Speech Acts
3,A. Bird,1998,Philosophy of Science
4,J. Bigelow,2010,"Quine, Mereology, and Inference to the Best Ex..."


We begin by checking how many entries are invalid, i.e. don't have a title, year, or author names. Those rows we will drop.

In [55]:
# Drop rows without author name
df = df[df["Authors"]!="(No names)"]

# Drop rows without title
df[df["Title"]=="(Couldn't determine title)"]
print(f"Number of dropped rows: {initial_len-len(df)}. This is ~{round((initial_len-len(df))/initial_len,3)*100}% of the original data set.")


Number of dropped rows: 8971. This is ~5.0% of the original data set.


In [56]:
# Get first author
df["First author"] = df["Authors"].apply(lambda x: x.split(",")[0])
df.head()

Unnamed: 0,Authors,Year,Title,First author
0,P. Achinstein,2001,The Book of Evidence,P. Achinstein
1,J. Adler,1994,"Testimony, Trust, Knowing",J. Adler
2,"K. Bach, R. Harnish",1979,Linguistic Communication and Speech Acts,K. Bach
3,A. Bird,1998,Philosophy of Science,A. Bird
4,J. Bigelow,2010,"Quine, Mereology, and Inference to the Best Ex...",J. Bigelow


We're now checking whether there are columns where the First author name is suspiciously long (over 35 characters) to analyze those. It will presumably be reasonable to drop them, as they will most likely indicate that something went wrong. (Note: as I do this in preprocessing, I can probably delete this)

In [57]:
# Mark all where first author name is longer than 35 characters
df["Overlong"] = df["First author"].apply(lambda x: len(x)>35)

# How many are we dealing with?
print(f"There are {len(df[df['Overlong']])} overly long first author entries.")

# Print the top ones
df[df["Overlong"]].head()

# Drop them
df = df[df["Overlong"]==False][["Authors","Year","Title","First author"]]

There are 6 overly long first author entries.


As we want to focus only on publications after 1900, we drop all data points with earlier publication year.

In [58]:
df[df["Year"]<"1900"].head()

Unnamed: 0,Authors,Year,Title,First author
261,David Hume,1748,An Enquiry Concerning Human Understanding,David Hume
288,Thomas Reid,1788,Essays on the Active Powers of Man,Thomas Reid
394,Bernhard Zimmels,1886,Leo Hebraeus: Ein Judischer Philosoph der Rena...,Bernhard Zimmels
395,Bernhard Zimmels,1892,"Leone Hebreo, Neue Studien",Bernhard Zimmels
416,Bernard Bolzano,1837,Wissenschaftslehre,Bernard Bolzano


In [59]:
df = df[df["Year"]>="1900"]
len(df)

166437

This should be a reasonable pre-processed and filtered data set. It contains 166 437 entries. We will now move to the feature engineering part.

### 4. Obtain full author names

The idea here is to get a df with two columns, one with the name as they are found in the original df, and one with the full name.

In [60]:
# Number of different first authors (55.9k)
print(f"There are {len(df['First author'].unique())} unique 'Authors' entries")

There are 55970 unique 'Authors' entries


#### 4.1 Match different versions of names

A quick look at the author names reveals that there are often different versions of the same name in the data set (e.g. "Richard Boyd", "Richard N. Boyd"). The first step will be to remove middle names.

In [61]:
# Split authors into list divided by whitespaces, then only retain first and last
df["First author"] = df["First author"].apply(lambda x: x.split(" ")[0]+" "+x.split(" ")[-1])
# Check how many unique names we have now
len(df["First author"].unique())

49775

Next we're removing punctuation to unify abbreviated names (e.g. "L Bovens", "L. Bovens"). Moreover, we're replacing letters with accent (e.g. "A. Hajék").

In [62]:
df["First author"] = df["First author"].apply(lambda x: x.replace(".",""))
# Check how many unique names we have now
len(df["First author"].unique())

49035

In [63]:
# Replace each name by its closest ASCII representation
df["First author"] = df["First author"].apply(lambda x: unidecode.unidecode(x))
# Check how many unique names we have now
len(df["First author"].unique())

48775

#### 4.2 Find the full name in list and match

In [64]:
# Split into two lists, one where the full name is complete 
# and one where its only initials, then try to match to each other

# Initialize empty lists for full names and for initials
full_names = []
abr_names = []

# Sort into full and abbreviated names by checking whether what is before the first whitespace is only uppercase letters
for name in df['First author'].unique():
    if name.split(" ")[0].isupper():
        abr_names.append(name)
    else:
        full_names.append(name)
        
print(f"Full names: {len(full_names)}, Abbreviated names: {len(abr_names)}")    

Full names: 26230, Abbreviated names: 22545


We now try to match abbreviated names to full names. We only do so if there is a unique match for the abbreviated name (e.g. "N Cartwright", "Nancy Cartwright").

In [65]:
# Initialize a list to store matches in
matches = []

# Iterate through abbreviated names
for name in abr_names:
    # extract first letter of first name and last name
    initial = name[0]
    last_name = name.split(" ")[-1]
    # Boolean flag to indicate whether there is a unique match
    matches_temp = []
    # Iterate through full name list
    for full_name in full_names:
        initial_f = full_name[0]
        last_name_f = full_name.split(" ")[-1]
        # Match if it's the same initial, same last name (+ add that first name has to have at least two letters without )
        if initial==initial_f and last_name==last_name_f:
            #print(f"Abbreviated: {name}, Full: {full_name}")
            matches_temp.append(full_name)
    # If there are any specific matches, append to match list
    if matches_temp:
        matches.append([name, matches_temp])

# Find unique matches among all matches
unique_matches = [match for match in matches if len(match[1])==1]

print(f"We found matches for a total of {len(matches)} abbreviated names, of which {len(unique_matches)} are unique.")
        
    

We found matches for a total of 10893 abbreviated names, of which 9035 are unique.


Next, we want to replace all the abbreviated names in our data set that have unique matching full names.

In [66]:
# Define a function which takes as input an abbreviated name and returns a full name if a unique match is available
# and the initial name otherwise
def get_full_name(name):
    # Check whether name is an abbreviated name
    if name.split(" ")[0].isupper():
        # Check whether there's a unique match
        for unique_match in unique_matches:
            # If the name matches, return that name
            if name==unique_match[0]:
                return unique_match[1][0]
    return name

df["First author"] = df["First author"].apply(lambda x: get_full_name(x))
df.head()

#### 4.3 Retrieving full first names from an external website

This works so far - have to save to CSV in steps of 1000s and start from where the list left off to be able to do this in chunks. (WORK IN PROGRESS)

In [85]:
# Determine the names we still need to find
df_temp = df
# Column of Boolean values to flag which names are abbreviated
df_temp["Abr"] = df_temp["First author"].apply(lambda x: x.split(" ")[0].isupper())
# Retrict to abbreviated names
df_temp = df_temp[df_temp["Abr"]].copy().reset_index(drop=True)
# Print length
print(f"There are still {len(df_temp)} abbreviated names.")
df_temp.head()

There are still 33614 abbreviated names.


Unnamed: 0,Authors,Year,Title,First author,Abr
0,J. Adler,1994,"Testimony, Trust, Knowing",J Adler,True
1,J. Bigelow,2010,"Quine, Mereology, and Inference to the Best Ex...",J Bigelow,True
2,R. Boyd,1981,Scientific Realism and Naturalistic Epistemology,R Boyd,True
3,R. Boyd,1984,The Current Status of Scientific Realism,R Boyd,True
4,R. Boyd,1985,Lex Orandi est Lex Credendi,R Boyd,True


In [20]:
# Define headers for URL request
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

In [17]:
# Open df_temp
df_temp = pd.read_csv("df_temp.csv")[["Authors", "Year", "Title","First author", "Abr"]]
len(df_temp[df_temp["Abr"]==True])
df_temp.shape

(33614, 5)

In [37]:
# Initialize count
count = 0

# Find index of first abbreviated name and add count to it (THIS IS WRONG - WE NEED POSITION, NOT)
first_abr = df_temp[df_temp["Abr"]].index[0]
print(first_abr)


# Loop through
for count in range(2):
    
    # Get index by adding count to the determined index of the first abr name 
    index = first_abr+count
    print(index)
    # Get row from data frame
    row = df_temp.iloc[index,:]
    print(row)
    # Retrieve author
    author = row["First author"]
    # Retrieve title and remove HTML tags
    title = BeautifulSoup(row["Title"],"lxml").text
    query = f"{author} {title}".replace(" ","%20")
    path = "https://philpapers.org/s/"+query
    print(path)
    try:
        req = requests.get(path, headers)
        soup = BeautifulSoup(req.content, 'html.parser')
        entry = soup.find_all("li", class_="entry")[0]
    
        # Loop through all names one the site
        for entry in entry.findChildren("span" ,class_="name"):
            name = entry.getText()
            # Set up checks whether name not too long, whether it matches with initials and last name
            length_check = len(name)<50
            first_initial_check = name.split(" ")[0][0]==author.split(" ")[0][0]
            lastname_check = name.split(" ")[-1]==author.split(" ")[-1]
            # Run checks
            if length_check and first_initial_check and lastname_check:
                # Add name to the author column
                df_temp["First author"][index] = name
                # Change the Boolean flag to false
                df_temp["Abr"][index] = False
                # Break loop as soon as a match is found
                print(f"Success: {name}")
                break
    except:
        print(f"BeautifulSoup error with: {author}, {title}")
    # If still no name is found, declare error
    if df_temp["Abr"][index]==True:
        print(f"Error: {name}")
        # Declare error
        df_temp["First author"][index] = "(Error)"
        # Change the Boolean flag to false
        df_temp["Abr"][index] = False

df_temp.head(15)

assert df_temp.shape == (33614, 5)
# Save to file remove for final version
#df_temp.to_csv("df_temp.csv")
    

33612
33612
Authors                                         S. Yablo
Year                                                1993
Title           Philosophy and Phenomenological Research
First author                                     S Yablo
Abr                                                 True
Name: 33612, dtype: object
https://philpapers.org/s/S%20Yablo%20Philosophy%20and%20Phenomenological%20Research
Error: Alonzo Church
Abr name: S Yablo, full name: Alonzo Church, length: True, initials: False, last name: False
(33614, 5)
33613
Authors                                         S. Yablo
Year                                                1999
Title           Philosophy and Phenomenological Research
First author                                     S Yablo
Abr                                                 True
Name: 33613, dtype: object
https://philpapers.org/s/S%20Yablo%20Philosophy%20and%20Phenomenological%20Research
Error: Alonzo Church
Abr name: S Yablo, full name: Alonzo Church,

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp["First author"][index] = "(Error)"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp["Abr"][index] = False


In [41]:
# How many unsuccessful attempts to retrieve?
len(df_temp[df_temp["First author"]!="(Error)"])

14942

We were able to retrieve 14942 full names. Let us analyze these:

In [44]:
df_temp[df_temp["First author"]=="(Error)"].head()

Unnamed: 0,Authors,Year,Title,First author,Abr
11,G. Harman,1973,Thought,(Error),False
16,D. J. Koehler,1991,"Explanation, Imagination, and Confidence in Ju...",(Error),False
17,"B. Koslowski, J. Marasia, M. Chelenza, R....",2008,Information Becomes Evidence when an Explanati...,(Error),False
19,"K Krzyżanowska, S. Wenmackers, I. Douven",2014,Rethinking Gibbard’s Riverboat Argument,(Error),False
30,H. Putnam,1981,"Reason, Truth and History",(Error),False
31,B. Russell,1912,The Problems of Philosophy,(Error),False
37,"A. Minnis, A. B. (eds.) Scott",1988,Medieval Literary Theory and Criticism 1100–1375,(Error),False
38,D. P. Henry,1985,Abelard’s Mereological Terminology,(Error),False
40,L. Cousins,1981,The <em>Paṭṭhāna</em> and the Development of t...,(Error),False
41,L. Cousins,1983,Pali Oral Literature,(Error),False


Now we will match the obtained data with the existing data set:

In [51]:
# Postprocess: deunicode, remove middle name, remove punctuation
# Find closest ASCII representation
df_temp["First author"] = df_temp["First author"].apply(lambda x: unidecode.unidecode(x))
# Remove punctuation
df_temp["First author"] = df_temp["First author"].apply(lambda x: x.replace(".","").replace(";","").replace(",",""))
# Remove middle name
df_temp["First author"] = df_temp["First author"].apply(lambda x: x.split(" ")[0]+" "+x.split(" ")[-1] if x!="(Error)" else x)
# Drop duplicate entries
df_temp = df_temp.drop_duplicates()

df_temp.head()

In [149]:
# Merge the two data sets 
df_updated = pd.merge(
    df,
    df_temp,
    how="left",
    on=["Authors","Year","Title"],
    left_on=None,
    right_on=None,
    left_index=False,
    right_index=False,
    sort=False,
    suffixes=("_x", "_y"),
    copy=True,
    indicator=False,
    validate=None,
)

In [150]:
# if first author_y NaN or author_y "(Error)": first author = first author_x; else first author = first_author_y
df_updated["First author"] = df_updated.apply(lambda x: x["First author_x"] if (str(x["First author_y"])=="nan" or x["First author_y"]=="(Error)") else x["First author_y"], axis = 1)
df_updated = df_updated[["First author","Year", "Title"]]
df_updated.head()


Unnamed: 0,First author,Year,Title
0,Peter Achinstein,2001,The Book of Evidence
1,Jonathan Adler,1994,"Testimony, Trust, Knowing"
2,Kent Bach,1979,Linguistic Communication and Speech Acts
3,Alexander Bird,1998,Philosophy of Science
4,John Bigelow,2010,"Quine, Mereology, and Inference to the Best Ex..."


In [151]:
# Split into two lists, one where the full name is complete (same as above, except df_updated rather than df - put in function)
# and one where its only initials, then try to match to each other

# Initialize empty lists for full names and for initials
full_names = []
abr_names = []

# Sort into full and abbreviated names by checking whether what is before the first whitespace is only uppercase letters
for name in df_updated['First author'].unique():
    if name.split(" ")[0].isupper():
        abr_names.append(name)
    else:
        full_names.append(name)
        
print(f"Full names: {len(full_names)}, Abbreviated names: {len(abr_names)}")    

Full names: 29362, Abbreviated names: 10908


In [152]:
# Initialize a list to store matches in (same as above, no changes)
matches = []

# Iterate through abbreviated names
for name in abr_names:
    # extract first letter of first name and last name
    initial = name[0]
    last_name = name.split(" ")[-1]
    # Boolean flag to indicate whether there is a unique match
    matches_temp = []
    # Iterate through full name list
    for full_name in full_names:
        initial_f = full_name[0]
        last_name_f = full_name.split(" ")[-1]
        # Match if it's the same initial, same last name (+ add that first name has to have at least two letters without )
        if initial==initial_f and last_name==last_name_f:
            #print(f"Abbreviated: {name}, Full: {full_name}")
            matches_temp.append(full_name)
    # If there are any specific matches, append to match list
    if matches_temp:
        matches.append([name, matches_temp])

# Find unique matches among all matches
unique_matches = [match for match in matches if len(match[1])==1]

print(f"We found matches for a total of {len(matches)} abbreviated names, of which {len(unique_matches)} are unique.")
        
    

We found matches for a total of 2381 abbreviated names, of which 923 are unique.


In [153]:
# Define a function which takes as input an abbreviated name and returns a full name if a unique match is available
# and the initial name otherwise (same as above - put all together in function)
def get_full_name(name):
    # Check whether name is an abbreviated name
    if name.split(" ")[0].isupper():
        # Check whether there's a unique match
        for unique_match in unique_matches:
            # If the name matches, return that name
            if name==unique_match[0]:
                return unique_match[1][0]
    return name

df_updated["First author"] = df_updated["First author"].apply(lambda x: get_full_name(x))
df_updated.head()

Unnamed: 0,First author,Year,Title
0,Peter Achinstein,2001,The Book of Evidence
1,Jonathan Adler,1994,"Testimony, Trust, Knowing"
2,Kent Bach,1979,Linguistic Communication and Speech Acts
3,Alexander Bird,1998,Philosophy of Science
4,John Bigelow,2010,"Quine, Mereology, and Inference to the Best Ex..."


We will conduct a final test to see how many entries have full names, and how many have abbreviated names. We'll drop the abbreviated names.

In [167]:
df_updated["Abr"] = df_updated["First author"].apply(lambda x: x.split(" ")[0].isupper())
keep_count = len(df_updated[~df_updated["Abr"]])
drop_count = len(df_updated[df_updated["Abr"]])
print(f" Number of entries to keep: {keep_count}. Number of entries to drop: {drop_count} ({round(drop_count/(keep_count+drop_count),2)*100}%)")

 Number of entries to keep: 146472. Number of entries to drop: 0 (0.0%)


This means we have to drop another 19988 or 12% of entries.

In [155]:
df_updated = df_updated[~df_updated["Abr"]][["First author","Year","Title"]]
df_updated.shape

(146472, 3)

In [157]:
# Save the data set to CSV
df_updated.to_csv("data/dataset_full_names.csv")