In [5]:
import numpy as np
import pandas as pd
import wikipedia
import re

# Nationality lookup table: column 0 holds the country, column 1 its adjectival.
nat_df = pd.read_csv("corpus_creation/nationality_info.csv")
countries, adjectivals = (nat_df.iloc[:, col].str.strip() for col in (0, 1))

# adjectival -> country
nat_info = {adjectival: country for adjectival, country in zip(adjectivals, countries)}

In [6]:
from pathlib import Path

def find_summary(name):
  """Return the first baseball-related Wikipedia summary found for ``name``.

  Double quotes, single quotes and commas are stripped from ``name`` before
  searching. Search results are fetched in order and the first summary that
  contains the word "baseball" (case-insensitive) is returned.

  Args:
    name: Name to search Wikipedia for.

  Returns:
    The matching summary text, or the literal string "None" when no search
    result yields a summary that mentions baseball.
  """
  name = re.sub(r"[\"\',]", "", name)
  for page in wikipedia.search(name):
    try:
      summary = wikipedia.summary(title=page, auto_suggest=False)
    except (wikipedia.exceptions.DisambiguationError,
            wikipedia.exceptions.PageError):
      # A search hit may be a disambiguation page or a title that does not
      # resolve to a page; neither should abort the lookup, so try the next hit.
      continue
    if "baseball" in summary.lower():
      return summary
  print(f"\"baseball\" not found in any summary for {name}")
  return "None"


######################################################
### make a dictionary of names mapped to summaries ###
######################################################


def get_summaries(names):
  """Look up a summary for every name and tabulate the results.

  Args:
    names: Iterable of names; each one is passed to ``find_summary``.

  Returns:
    A DataFrame with string columns "name" and "summary", one row per
    input name, in input order.
  """
  rows = [[person, find_summary(person)] for person in names]
  return pd.DataFrame(rows, columns=["name", "summary"], dtype=str)

# Cache the summaries on disk:
# if "corpus_creation/summaries.csv" does not exist, build it with get_summaries;
# then load the csv into a df and turn it into a name -> summary dict
def make_summaries(summaries_path="corpus_creation/summaries.csv",
                   names_path="corpus_creation/interviewee_names.csv"):
  """Return a name -> summary dict, building the on-disk cache if needed.

  When ``summaries_path`` does not exist, the names in column '0' of
  ``names_path`` are passed to ``get_summaries`` and the result is written to
  ``summaries_path``. The dict is always built from the CSV on disk.

  Args:
    summaries_path: Location of the cached summaries CSV (columns "name"
      and "summary").
    names_path: CSV whose '0' column lists the names; only read when the
      cache has to be created.

  Returns:
    dict mapping each "name" value to its "summary" value. Values are
    whatever ``pd.read_csv`` parses them to, so a summary that read_csv
    treats as missing comes back as NaN rather than a string.
  """
  cache = Path(summaries_path)
  if not cache.is_file():
    names = pd.read_csv(names_path)['0']
    # index=False keeps a meaningless "Unnamed: 0" column out of the cache.
    get_summaries(names).to_csv(cache, index=False)

  # Always reload from disk so a fresh build and a cached run return
  # identically parsed values.
  summaries = pd.read_csv(cache)
  return dict(zip(summaries['name'], summaries['summary']))

In [None]:
def nationality(name, dict):
  """Infer a person's country from the nationality adjective in their summary.

  The summary is looked up in ``dict`` by ``name`` and scanned, starting at
  the first ")" when there is one, for the first adjectival listed in the
  module-level ``adjectivals``. The match is translated to a country through
  the module-level ``nat_info`` mapping.

  Args:
    name: Key into ``dict``.
    dict: Mapping of name -> summary text. (The parameter shadows the
      builtin; the name is kept so existing callers keep working.)

  Returns:
    The country for the first adjectival found, or None when the summary is
    not a string, no adjectivals are available, or none occurs in the text.

  Raises:
    KeyError: If ``name`` is not a key of ``dict``.
  """
  summary = dict[name]
  if not isinstance(summary, str):
    # Non-text values (e.g. NaN for a missing summary) carry no nationality.
    return None

  # Longest alternatives first so a longer adjectival wins over one that is
  # its prefix; non-string / empty entries cannot be matched and are skipped.
  known = sorted((adj for adj in adjectivals if isinstance(adj, str) and adj),
                 key=len, reverse=True)
  if not known:
    return None
  # Escape each adjectival so it is matched literally, and require that it is
  # not embedded in a longer word (e.g. "Indian" inside "Indians").
  pattern = r'(?<!\w)(?:%s)(?!\w)' % '|'.join(map(re.escape, known))

  # str.find returns -1 when there is no ")"; slicing from -1 would leave only
  # the last character, so fall back to scanning the whole summary.
  start = max(summary.find(")"), 0)
  match = re.search(pattern, summary[start:])
  if match is None:
    return None
  return nat_info[match.group(0)]

In [8]:
# Build the name -> nationality table for every interviewee.
names = pd.read_csv("corpus_creation/interviewee_names.csv")['0']
# make_summaries() creates summaries.csv on a first run, so it must be called
# before the file is read directly; otherwise a fresh run fails on a missing file.
summaries_dict = make_summaries()
sums = pd.read_csv("corpus_creation/summaries.csv")
nats = names.apply(nationality, args=(summaries_dict,))
df = pd.DataFrame({'name': names, 'nationality': nats})
display(df)

Unnamed: 0,name,nationality
0,Michael A. Taylor,United States
1,Hank Aaron,United States
2,Commissioner Rob Manfred,United States
3,Jose Altuve,Venezuela
4,Giancarlo Stanton,United States
...,...,...
1413,Carlos Zambrano,Venezuela
1414,Don Zimmer,United States
1415,Jordan Zimmermann,United States
1416,Ben Zobrist,United States
