# Stanford Encyclopedia of Philosophy - Diversity Analysis
## Data Extraction, Preprocessing and Cleaning

We begin by loading the libraries required for this analysis, as well as the function we defined in an external python file.

In [2]:
import math

import pandas as pd
import requests
import unidecode
from bs4 import BeautifulSoup

from sep_functions import *

### 1. Scraping the *Stanford Encyclopedia of Philosophy* 

We begin by scraping all the URLs to articles on the Stanford Encyclopedia of Philosophy (SEP) from its "contents" page.

In [2]:
# Url with overview of all the entries
url = "https://plato.stanford.edu/contents.html"

# Load html content, get page text, parse HTML
r = requests.get(url)
page = r.text
soup = BeautifulSoup(page, 'html.parser')

# Find <a href> elements which begin with "plato.stanford.edu/entries"
links = soup.findAll('a')
entries = []
for link in links:
    try:
        link_text = link.get("href")
        # If the link text is an entry, and it hasn't been added to the list before
        if "entries/" in link_text  and link_text not in entries:
            entries.append(link_text) 
    except:
        pass
    
print(len(entries))    

1742


Next, we extract all the references from each of the 1740 articles on the SEP.

In [4]:
# Initialize list of bib elements
bib_elements = []

# Initialize a count for the entries
num_entry = 1

# Iterate through the entries and extract all bib elements
for entry in entries:
    print(f"Extracting from entry #{num_entry}: {entry}")
    num_entry +=1
    url = f"https://plato.stanford.edu/{entry}"
    li_elements = get_bib_elements(url)
    print(f"Appending {len(li_elements)} references")
    for li_element in li_elements:
        bib_elements.append(li_element)

Extracting from entry #1: entries/abduction/
Appending 111 references
Extracting from entry #2: entries/abelard/
Appending 89 references
Extracting from entry #3: entries/abhidharma/
Appending 67 references
Extracting from entry #4: entries/abilities/
Appending 77 references
Extracting from entry #5: entries/abner-burgos/
Appending 43 references


We will save the extracted references to a .txt file, so we can return to the raw data if needed.

In [None]:
# Save the raw reference data to txt file
textfile = open("data/bib_items_complete.txt", "w")
for element in bib_elements:
    textfile.write(str(element) + "\n")
textfile.close()

### 2. Turning the raw text into structured data

Now we will focus on turning the raw references into structured tabular data. To this end, we first load the previously saved .txt file.

In [5]:
# Load the data with the raw references
filepath = "data/bib_items_complete.txt"

# Import references from raw text file
file = open(filepath, "r")
content = file.read()

# Split the content where a li element ends and a new line starts
references = content.split("</li>\n")
file.close()

For each line (and hence reference) in the .txt file, we try to extract the publication year, the name of the authors, and the title of the publication. 

In [4]:
# Initialize lists to store publication years, author lists, and titles in, as well as different counts
years, authors, titles = [], [], []
yearless = 0
count = 0
fail_count = 0

# Iterate through references
for reference in references:
    
    try:
        # Get publication year
        year = extract_year(reference)
        # Skip if there is no publication year in the reference
        if not year:
            yearless+=1
            continue
        
        # Get publication name
        title  = extract_title(reference)
         
        # Check if the author name is "–––", indicating its the same as in the previous ref
        if "–––" in reference:
            # Assign the author from the previous reference
            names = authors[-1]
        else:
            # Get author name from the reference
            names = extract_authors(reference, year)
        
        # Append all
        years.append(year)
        titles.append(title)
        authors.append(names)
        
        # Increment the counter
        count+=1
        
    except:
        print(f"This is number {count}:")
        print("General Failure")
        print(reference)
        fail_count+=1
        
# Make sure the list of authors and the list of years is equally long
assert len(authors)==len(years) and len(titles)==len(years)

2001, The Book of Evidence, ['P. Achinstein']
1994, Testimony, Trust, Knowing, ['J. Adler']
1979, Linguistic Communication and Speech Acts, ['K. Bach', ' R.  Harnish']
1998, Philosophy of Science, ['A. Bird']
2010, Quine, Mereology, and Inference to the Best Explanation, ['J. Bigelow']


After extracting the publication year, the authors of the publication, and the publication title, we save the structured, tabular data to a .csv file. 

In [None]:
# Turn into pandas dataframe
df = pd.DataFrame({'Authors' : authors,
                   'Year' : years,
                   'Title' : titles }, 
                  columns=['Authors','Year', 'Title'])

# Write to csv
df.to_csv("data/sep_reference_data.csv")

### 3. Preprocessing the data

Again, we begin by loading the data as saved in the previous section, where we have extracted 180 403 references. 

In [3]:
# Load data
df = pd.read_csv("data/sep_reference_data.csv")[["Authors", "Year", "Title"]]

# Save initial number of columns
initial_len = len(df)

# Reformat Authors column
df["Authors"] = df["Authors"].apply(lambda x: x[2:-2].replace("'",""))
print(f"Data set with {initial_len} rows.")
df.head()

Data set with 180400 rows.


Unnamed: 0,Authors,Year,Title
0,P. Achinstein,2001,The Book of Evidence
1,J. Adler,1994,"Testimony, Trust, Knowing"
2,"K. Bach, R. Harnish",1979,Linguistic Communication and Speech Acts
3,A. Bird,1998,Philosophy of Science
4,J. Bigelow,2010,"Quine, Mereology, and Inference to the Best Ex..."


We begin by checking how many entries are invalid, i.e. don't have a title, year, or author names. Those rows we will drop.

In [4]:
# Drop rows without author name
df = df[df["Authors"]!="(No names)"]

# Drop rows without title
df[df["Title"]=="(Couldn't determine title)"]
print(f"Number of dropped rows: {initial_len-len(df)}. This is ~{round((initial_len-len(df))/initial_len,3)*100}% of the original data set.")


Number of dropped rows: 8973. This is ~5.0% of the original data set.


Next, we will create a new column for the first author of a publication.

In [5]:
# Get first author
df["First author"] = df["Authors"].apply(lambda x: x.split(",")[0])
df.head()

Unnamed: 0,Authors,Year,Title,First author
0,P. Achinstein,2001,The Book of Evidence,P. Achinstein
1,J. Adler,1994,"Testimony, Trust, Knowing",J. Adler
2,"K. Bach, R. Harnish",1979,Linguistic Communication and Speech Acts,K. Bach
3,A. Bird,1998,Philosophy of Science,A. Bird
4,J. Bigelow,2010,"Quine, Mereology, and Inference to the Best Ex...",J. Bigelow


We're now checking whether there are columns where the First author name is suspiciously long (over 35 characters) to analyze those. It will presumably be reasonable to drop them, as they will most likely indicate that something went wrong.

In [6]:
# Mark all where first author name is longer than 35 characters
df["Overlong"] = df["First author"].apply(lambda x: len(x)>35)

# How many are we dealing with?
print(f"There are {len(df[df['Overlong']])} overly long first author entries.")

# Print the top ones
df[df["Overlong"]].head()

# Drop them
df = df[df["Overlong"]==False][["Authors","Year","Title","First author"]]

There are 6 overly long first author entries.


As we want to focus only on publications after 1900, we drop all data points with earlier publication year.

In [7]:
df[df["Year"]<"1900"].head()

Unnamed: 0,Authors,Year,Title,First author
261,David Hume,1748,An Enquiry Concerning Human Understanding,David Hume
288,Thomas Reid,1788,Essays on the Active Powers of Man,Thomas Reid
394,Bernhard Zimmels,1886,Leo Hebraeus: Ein Judischer Philosoph der Rena...,Bernhard Zimmels
395,Bernhard Zimmels,1892,"Leone Hebreo, Neue Studien",Bernhard Zimmels
416,Bernard Bolzano,1837,Wissenschaftslehre,Bernard Bolzano


In [8]:
df = df[df["Year"]>="1900"]
len(df)

166433

This should be a reasonable pre-processed and filtered data set. It contains 166 437 entries. We will now move to the feature engineering part.

### 4. Obtain full author names

As a majority of references do not contain the full first names of authors (but only initials), we now need to turn to the problem of obtaining full author names. Let us first get an idea how many unique first author entries we have, to then see how many of those contain the full first names.

In [9]:
# Number of different first authors (55.9k)
print(f"There are {len(df['First author'].unique())} unique 'Authors' entries")

There are 55968 unique 'Authors' entries


#### 4.1 Match different versions of names

A quick look at the author names reveals that there are often different versions of the same name in the data set (e.g. "Richard Boyd", "Richard N. Boyd"). The first step will be to remove middle names.

In [10]:
# Split authors into list divided by whitespaces, then only retain first and last
df["First author"] = df["First author"].apply(lambda x: x.split(" ")[0]+" "+x.split(" ")[-1])
# Check how many unique names we have now
len(df["First author"].unique())

49774

Next, we're removing punctuation to unify abbreviated names (e.g. "L Bovens", "L. Bovens"). This reduces the number of unique author entries by roughly 700 entries. 

In [11]:
df["First author"] = df["First author"].apply(lambda x: x.replace(".",""))
# Check how many unique names we have now
len(df["First author"].unique())

49034

Moreover, we're replacing letters with accent (e.g. "A. Hajék"). This reduces the number if unique author entries by roughly another 300 entries.

In [12]:
# Replace each name by its closest ASCII representation
df["First author"] = df["First author"].apply(lambda x: unidecode.unidecode(x))
# Check how many unique names we have now
len(df["First author"].unique())

48774

#### 4.2 Find the full name in list and match

The first thing we will do is to try to find the full name of entries where only an authors first name initial is available within the data set. To this end, we split the unique first author entries into two lists: one with entries with full first names, and one with entries with abbreviated first names.

We can then try to match abbreviated names to full names. We only do so if there is a unique match for the abbreviated name (e.g. "N Cartwright", "Nancy Cartwright").

In [13]:
# Get the list of unique "First author" entries
unique_names = df['First author'].unique()

# Split the unique author names into abbreviated names (e.g. L Bovens) and full names (e.g. Luc Bovens)
full_names, abr_names = sort_names(unique_names)
print(f"Full names: {len(full_names)}, Abbreviated names: {len(abr_names)}")  

# Match abbreviated names to corresponding full names
matches = get_matches(full_names, abr_names)

# Find unique matches among all matches
unique_matches = [match for match in matches if len(match[1])==1]
print(f"We found matches for a total of {len(matches)} abbreviated names, of which {len(unique_matches)} are unique.")

Full names: 26230, Abbreviated names: 22544
We found matches for a total of 10893 abbreviated names, of which 9035 are unique.


Next, we want to replace all the abbreviated names in our data set that have unique matching full names.

In [14]:
# Define a function which takes as input an abbreviated name and returns a full name if a unique match is available
# and the initial name otherwise

df["First author"] = df["First author"].apply(lambda x: get_full_name(x, unique_matches))
df.head()

Unnamed: 0,Authors,Year,Title,First author
0,P. Achinstein,2001,The Book of Evidence,Peter Achinstein
1,J. Adler,1994,"Testimony, Trust, Knowing",J Adler
2,"K. Bach, R. Harnish",1979,Linguistic Communication and Speech Acts,Kent Bach
3,A. Bird,1998,Philosophy of Science,Alexander Bird
4,J. Bigelow,2010,"Quine, Mereology, and Inference to the Best Ex...",J Bigelow


#### 4.3 Retrieving full first names from an external website

This works so far - have to save to CSV in steps of 1000s and start from where the list left off to be able to do this in chunks. (WORK IN PROGRESS)

In [16]:
# Determine the names we still need to find
df_temp = df[["Authors", "Year", "Title","First author", "Abr"]]

# Column of Boolean values to flag which names are abbreviated
df_temp["Abr"] = df_temp["First author"].apply(lambda x: x.split(" ")[0].isupper())

# Retrict to abbreviated names
df_temp = df_temp[df_temp["Abr"]].copy().reset_index(drop=True)

# Print length
print(f"There are still {len(df_temp)} abbreviated names.")

There are still 33613 abbreviated names.


(33613, 5)

In [15]:
# Define headers for URL request
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}

In [37]:
# Initialize count
count = 0

# Find index of first abbreviated name and add count to it (THIS IS WRONG - WE NEED POSITION, NOT)
first_abr = df_temp[df_temp["Abr"]].index[0]
print(first_abr)

# Loop through
for count in range(2):
    
    # Get index by adding count to the determined index of the first abr name 
    index = first_abr+count
    # Get row from data frame
    row = df_temp.iloc[index,:]
    # Retrieve author
    author = row["First author"]
    # Retrieve title and remove HTML tags
    title = BeautifulSoup(row["Title"],"lxml").text
    query = f"{author} {title}".replace(" ","%20")
    path = "https://philpapers.org/s/"+query
    
    try:
        req = requests.get(path, headers)
        soup = BeautifulSoup(req.content, 'html.parser')
        entry = soup.find_all("li", class_="entry")[0]
    
        # Loop through all names on the site
        for entry in entry.findChildren("span" ,class_="name"):
            name = entry.getText()
            # Set up checks whether name not too long, whether it matches with initials and last name
            length_check = len(name)<50
            first_initial_check = name.split(" ")[0][0]==author.split(" ")[0][0]
            lastname_check = name.split(" ")[-1]==author.split(" ")[-1]
            # Run checks
            if length_check and first_initial_check and lastname_check:
                # Add name to the author column
                df_temp["First author"][index] = name
                # Change the Boolean flag to false
                df_temp["Abr"][index] = False
                # Break loop as soon as a match is found
                print(f"Success: {name}")
                break
    except:
        print(f"BeautifulSoup error with: {author}, {title}")
    # If still no name is found, declare error
    if df_temp["Abr"][index]==True:
        print(f"Error: {name}")
        # Declare error
        df_temp["First author"][index] = "(Error)"
        # Change the Boolean flag to false
        df_temp["Abr"][index] = False

df_temp.head(15)

assert df_temp.shape == (33614, 5)

33612
33612
Authors                                         S. Yablo
Year                                                1993
Title           Philosophy and Phenomenological Research
First author                                     S Yablo
Abr                                                 True
Name: 33612, dtype: object
https://philpapers.org/s/S%20Yablo%20Philosophy%20and%20Phenomenological%20Research
Error: Alonzo Church
Abr name: S Yablo, full name: Alonzo Church, length: True, initials: False, last name: False
(33614, 5)
33613
Authors                                         S. Yablo
Year                                                1999
Title           Philosophy and Phenomenological Research
First author                                     S Yablo
Abr                                                 True
Name: 33613, dtype: object
https://philpapers.org/s/S%20Yablo%20Philosophy%20and%20Phenomenological%20Research
Error: Alonzo Church
Abr name: S Yablo, full name: Alonzo Church,

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp["First author"][index] = "(Error)"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp["Abr"][index] = False


In [41]:
# How many unsuccessful attempts to retrieve?
len(df_temp[df_temp["First author"]!="(Error)"])

14942

We were able to retrieve 14942 full names. Now we will match the obtained data with the existing data set.

In [51]:
# Postprocess: deunicode, remove middle name, remove punctuation
# Find closest ASCII representation
df_temp["First author"] = df_temp["First author"].apply(lambda x: unidecode.unidecode(x))
# Remove punctuation
df_temp["First author"] = df_temp["First author"].apply(lambda x: x.replace(".","").replace(";","").replace(",",""))
# Remove middle name
df_temp["First author"] = df_temp["First author"].apply(lambda x: x.split(" ")[0]+" "+x.split(" ")[-1] if x!="(Error)" else x)
# Drop duplicate entries
df_temp = df_temp.drop_duplicates()

df_temp.head()

In [149]:
# Merge the two data sets 
df_updated = pd.merge(
    df,
    df_temp,
    how="left",
    on=["Authors","Year","Title"],
    left_on=None,
    right_on=None,
    left_index=False,
    right_index=False,
    sort=False,
    suffixes=("_x", "_y"),
    copy=True,
    indicator=False,
    validate=None,
)

In [150]:
# if first author_y NaN or author_y "(Error)": first author = first author_x; else first author = first_author_y
df_updated["First author"] = df_updated.apply(lambda x: x["First author_x"] if (str(x["First author_y"])=="nan" or x["First author_y"]=="(Error)") else x["First author_y"], axis = 1)
df_updated = df_updated[["First author","Year", "Title"]]
df_updated.head()


Unnamed: 0,First author,Year,Title
0,Peter Achinstein,2001,The Book of Evidence
1,Jonathan Adler,1994,"Testimony, Trust, Knowing"
2,Kent Bach,1979,Linguistic Communication and Speech Acts
3,Alexander Bird,1998,Philosophy of Science
4,John Bigelow,2010,"Quine, Mereology, and Inference to the Best Ex..."


In [153]:
# Get the list of unique "First author" entries
unique_names = df_updated['First author'].unique()

# Split the unique author names into abbreviated names (e.g. L Bovens) and full names (e.g. Luc Bovens)
full_names, abr_names = sort_names(unique_names)
print(f"Full names: {len(full_names)}, Abbreviated names: {len(abr_names)}")  

# Match abbreviated names to corresponding full names
matches = get_matches(full_names, abr_names)

# Find unique matches among all matches
unique_matches = [match for match in matches if len(match[1])==1]
print(f"We found matches for a total of {len(matches)} abbreviated names, of which {len(unique_matches)} are unique.")

# Replace abbreviated author names with full author names
df_updated["First author"] = df_updated["First author"].apply(lambda x: get_full_name(x, unique_matches))
df_updated.head()

Unnamed: 0,First author,Year,Title
0,Peter Achinstein,2001,The Book of Evidence
1,Jonathan Adler,1994,"Testimony, Trust, Knowing"
2,Kent Bach,1979,Linguistic Communication and Speech Acts
3,Alexander Bird,1998,Philosophy of Science
4,John Bigelow,2010,"Quine, Mereology, and Inference to the Best Ex..."


We will conduct a final test to see how many entries have full names, and how many have abbreviated names. We'll drop the abbreviated names.

In [167]:
df_updated["Abr"] = df_updated["First author"].apply(lambda x: x.split(" ")[0].isupper())
keep_count = len(df_updated[~df_updated["Abr"]])
drop_count = len(df_updated[df_updated["Abr"]])
print(f" Number of entries to keep: {keep_count}. Number of entries to drop: {drop_count} ({round(drop_count/(keep_count+drop_count),2)*100}%)")

 Number of entries to keep: 146472. Number of entries to drop: 0 (0.0%)


This means we have to drop another 19988 or 12% of entries.

In [155]:
df_updated = df_updated[~df_updated["Abr"]][["First author","Year","Title"]]
df_updated.shape

(146472, 3)

This will be the data set we will use for our analysis. Of the initial ~180 000 raw entries we were able to extract valid information (including finding the full first name) for more than 145 000. This is over 80% of the initial, raw data. 

In [157]:
# Save the data set to CSV
df_updated.to_csv("data/dataset_full_names.csv")

### 4.4 Retrieving information on affiliation

In [5]:
df = pd.read_csv("data/dataset_full_names.csv")[["First author","Year","Title"]]
df.head()

Unnamed: 0,First author,Year,Title
0,Peter Achinstein,2001,The Book of Evidence
1,Jonathan Adler,1994,"Testimony, Trust, Knowing"
2,Kent Bach,1979,Linguistic Communication and Speech Acts
3,Alexander Bird,1998,Philosophy of Science
4,John Bigelow,2010,"Quine, Mereology, and Inference to the Best Ex..."


In [6]:
# Get list of unique names (29k)
unique_names = df["First author"].unique()

# set URL
url = "https://philpeople.org/find-philosopher/search?utf8=%E2%9C%93&button=&keywords="

In [72]:
# Create empty data frame and save to csv 
#df_aff = pd.DataFrame(columns=["Name","Affiliation"])
#df_aff.to_csv("data/affiliations.csv")

In [8]:
# df_aff = pd.read_csv("data/affiliations.csv")[["Name","Affiliation"]]

index = len(df_aff)
interval = 5_000

# Create empty list
affiliations_list = []

for name in unique_names[index:]:
    # Define philosopher-specific query
    path = url+name.replace(" ","+")
    
    try:
        req = requests.get(path, headers)
        soup = BeautifulSoup(req.content, 'html.parser')
    
        profile_name_tags = soup.find_all("div", class_="profile-name")
        print(f"Name: {name}")
        print(f"Number of profile names: {len(profile_name_tags)}")
        # If there's no profile, append "None and skip to next"
        if len(profile_name_tags)==0:
            affiliations_list.append([name, "None"])
            continue   
        
        for profile_name_tag in profile_name_tags:
            # Extract name on website in ASCII
            profile_name = unidecode.unidecode(profile_name_tag.get('title','No title attribue'))
            print(profile_name)
        
            # Check whether first and last name match
            cond_first_name = profile_name.split(" ")[0]==name.split(" ")[0]
            cond_last_name = profile_name.split(" ")[-1]==name.split(" ")[-1]
            if cond_first_name and cond_last_name:
                # Get the affiliation
                affiliation_tag = profile_name_tag.find_next("span", class_="affil")
                affiliation = affiliation_tag.get('title','None')
                affiliations_list.append([name, affiliation])
                print(f"Result: {name}, Affiliation: {affiliation}")
                break
                
    except:
        affiliations_list.append([name, "None"])

# Create temporary data frame with new entries        
df_aff_temp = pd.DataFrame(affiliations_list, columns=["Name","Affiliation"])

# Concatenate to existing entries
df_aff = pd.concat([df_aff,df_aff_temp])

# Save to CSV
# df_aff.to_csv("data/affiliations.csv")

In [None]:
df_aff = df_aff.rename(columns={"Name":"First author"})

In [38]:
df_counts = df.groupby("First author").count().reset_index()

df_full = pd.merge(
    df_counts,
    df_aff,
    how="left",
    on=["First author"],
    left_on=None,
    right_on=None,
    left_index=False,
    right_index=False,
    sort=False,
    suffixes=("_x", "_y"),
    copy=True,
    indicator=False,
    validate=None,
)

In [41]:
df_full = df_full.rename(columns={"Year":"Count"})[["First author", "Count","Affiliation"]]

In [79]:
df_grouped = df_full.groupby("Affiliation").sum().sort_values("Count", ascending=False).reset_index()
df_grouped.head()

Unnamed: 0,Affiliation,Count
0,,75459
1,New York University,1915
2,Rutgers University - New Brunswick,1824
3,Oxford University,1569
4,University of Notre Dame,1485


In [120]:
# Next step: manually sort out duplicates

replace = {
    "\(PhD\)":"",
    "\(DPhil\)":"",
    "University of Oxford":"Oxford University",
    "University Of Oxford":"Oxford University",
    "UCLA":"University of California, Los Angeles",   
}

# Iterate through the replacements
for k, v in replace.items():
    df_grouped["Affiliation"] = df_grouped["Affiliation"].str.replace(k,v).str.strip()

# Put back together    
df_grouped = df_grouped.groupby("Affiliation").sum().sort_values("Count", ascending=False).reset_index()

  df_grouped["Affiliation"] = df_grouped["Affiliation"].str.replace(k,v).str.strip()


In [123]:
# Save to CSV
df_grouped.to_csv("data/university_counts.csv")