In [5]:
import numpy as np
import pandas as pd
import re
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from pathlib import Path

# An ASAP Sports Page for a sport is organized in this manner:
# Each letter of the alphabet -> Each player with surname beginning with that letter ->
# Each interview in which that player appeared.

def parse_interview(url, data):
  """Fetch one interview page and append a row describing it to ``data``.

  The appended row is ``[text, event, date, names]``:
    * ``text``  - all remaining strings of the transcript cell after the
                  strong/i/h1/h2/h3/br/a elements have been removed,
    * ``event`` - text of the first <h1> on the page,
    * ``date``  - text of the first <h2> on the page,
    * ``names`` - list with the text of every <h3> on the page.

  Pages that lack the transcript cell or the <h1>/<h2> headings are skipped
  (nothing is appended) instead of raising AttributeError, mirroring the
  ``None`` guards used by parse_player and parse_letter.

  Args:
    url: address of the interview page.
    data: list that receives the new row (mutated in place).
  """
  page = requests.get(url)
  soup = BeautifulSoup(page.text, 'html.parser')
  section = soup.find(attrs={'style':'padding: 10px;', 'valign':'top'})
  event_tag = soup.find('h1')
  date_tag = soup.find('h2')
  if section is None or event_tag is None or date_tag is None:
    return None
  event = event_tag.get_text()
  date = date_tag.get_text()
  names = [item.get_text() for item in soup.find_all("h3")]
  # The headings are read above because decompose() removes these elements
  # (and their text) from the tree, including from inside ``section``.
  for tag in soup.find_all(["strong", "i", "h1", "h2", "h3", "br", "a"]):
    tag.decompose()
  # join() avoids rebuilding the string once per fragment.
  output = "".join(piece.get_text() for piece in section.find_all(string=True))
  data.append([output, event, date, names])

def parse_player(url, data):
  """Visit a player page and run parse_interview on every link in its table.

  Returns None without touching ``data`` when the page has no table with the
  expected attributes. Rows are appended to ``data`` by parse_interview.
  """
  response = requests.get(url)
  soup = BeautifulSoup(response.text, 'html.parser')
  table_attrs = {'width':'100%', 'cellspacing':'0',
                 'cellpadding':'3', 'border':'0'}
  interview_table = soup.find('table', attrs=table_attrs)
  if interview_table is None:
    return None
  for anchor in interview_table.find_all('a', href=True):
    parse_interview(anchor['href'], data)

def parse_letter(url, data):
  """Visit a letter page and run parse_player on every link in its table.

  Progress over the player links is shown with tqdm. Returns None without
  touching ``data`` when the page has no table with the expected attributes.
  """
  response = requests.get(url)
  soup = BeautifulSoup(response.text, 'html.parser')
  table_attrs = {'width':'100%', 'cellspacing':'0',
                 'cellpadding':'3', 'border':'0'}
  player_table = soup.find('table', attrs=table_attrs)
  if player_table is None:
    return None
  player_links = player_table.find_all('a', href=True)
  for anchor in tqdm(player_links):
    parse_player(anchor['href'], data)

def parse_sport(url):
  """Crawl a sport page and return the rows collected from all its letters.

  The first table matching the expected attributes supplies the letter
  links; parse_letter is run on each one and appends to a shared list, which
  is returned. Raises IndexError when no matching table exists.
  """
  response = requests.get(url)
  soup = BeautifulSoup(response.text, 'html.parser')
  table_attrs = {'width':'100%', 'cellspacing':'0',
                 'cellpadding':'5', 'border':'0'}
  letter_table = soup.find_all('table', attrs=table_attrs)[0]
  collected = []
  for anchor in letter_table.find_all('a', href=True):
    parse_letter(anchor['href'], collected)
  return collected

# assembles raw interview data
def scrape(weblink, rewrite=False):
  file = Path("corpus_creation/summaries.csv")
  if not file.is_file() or rewrite:
    # make the summaries csv
    data = parse_sport(weblink)
    df = pd.DataFrame(data, columns=['text', 'event', 'date', 'names'])
    df.to_csv("corpus_creation/interviews_raw.csv", index=False)
    return df
  df = pd.read_csv("corpus_creation/interviews_raw.csv", converters={'names': pd.eval})
  return df

In [6]:
import Levenshtein

def chunk(row):
  """Split one interview transcript into question-led blocks.

  ``row.iloc[0]`` is the transcript. Two mis-encoded character sequences are
  cleaned up, a marker is inserted before every "Q.", and the text is split
  on that marker. Whatever precedes the first question is discarded, so a
  transcript without any "Q." yields an empty list.
  """
  transcript = row.iloc[0]
  cleaned = re.sub(r"Â", "", re.sub(r"â€™", "\'", transcript))
  # rather crude way of making the questions and answers easier to split
  marked = re.sub(r"Q\.", "999PLACEHOLDER999 Q.", cleaned)
  blocks = re.split(r"999PLACEHOLDER999", marked)
  return blocks[1:]

def find_name(interviewee, names):
  """Return the entry of ``names`` closest to ``interviewee``.

  Closeness is the case-insensitive Levenshtein distance. The first name
  with the smallest distance wins; None is returned when ``names`` is empty
  or no distance is below 10000.
  """
  target = interviewee.lower()
  best_name, best_distance = None, 10000
  for candidate in names:
    distance = Levenshtein.distance(target, candidate.lower())
    if distance >= best_distance:
      continue
    best_name, best_distance = candidate, distance
  return best_name

def separate(row, nationalities_dict):
  """Turn one chunked interview row into question/answer records.

  Args:
    row: positional fields are (0) list of question blocks as produced by
      chunk, (1) event, (2) date, (3) list of interviewee names.
    nationalities_dict: maps a standardized name to a nationality.

  Returns:
    A list of ``[question, answer, event, date, name, nationality]`` lists,
    one per answer. ``nationality`` is None when the matched name has no
    entry in ``nationalities_dict`` (including when no name could be matched).
  """
  interview = row.iloc[0]
  event = row.iloc[1]
  date = row.iloc[2]
  names = row.iloc[3]
  output = []
  for text in interview: # for each question and its following response(s)
    # a new speaker starts on a line that begins with an upper-case label
    # followed by a colon
    q_and_a = re.split(r"\n(?=[A-ZÀ-Ÿ ,.-]+:)", text)
    question = re.sub(r"Q\.", "", q_and_a[0]).strip()
    for answer in q_and_a[1:]: # go through the responses in case there are multiple responders
      interviewee = re.search(r"([A-ZÀ-Ÿ ,.-]+)(:)", answer)
      if interviewee is None:
        continue
      # use levenshtein distance to standardize the names
      name = find_name(interviewee.group(1), names)
      # .get() keeps one unknown or unmatched name from aborting the run
      player_nationality = nationalities_dict.get(name)
      # Cut the label out by position; feeding it back into re.sub would
      # treat characters such as "." in the label as regex syntax.
      answer_noname = (answer[:interviewee.start()] + answer[interviewee.end():]).strip()
      output.append([question, answer_noname, event, date, name, player_nationality])
  return output

def filter(data):
  """Keep only the interviews whose event looks like an MLB player event.

  Steps: drop rows with a duplicate ``text``; keep rows whose ``event``
  matches one of the MLB keywords (case-insensitive); then drop rows whose
  ``event`` contains one of the exclusion keywords (case-sensitive), which
  cover college events and events centering on non-players such as the
  Winter Meetings.
  """
  deduped = data.drop_duplicates(subset='text')

  is_mlb = deduped['event'].str.contains("MLB |NL |AL |WORLD SERIES|HOME RUN CHASE|MEDIA CONFERENCE", case=False, regex=True)
  candidates = deduped[is_mlb]

  is_excluded = candidates['event'].str.contains("NCAA|UNIVERSITY|COLLEGE|COLLEGIATE|STATE|MUNDIAL|ATLANTIC COAST|WINTER MEETINGS")
  excluded_labels = candidates[is_excluded].index
  return candidates.drop(excluded_labels)

def get_events(data, rewrite=False):
  """Return the unique events, cached in ``corpus_creation/events.csv``.

  When the cache file exists and ``rewrite`` is False, its single column is
  returned and ``data`` is ignored. Otherwise the unique values of
  ``data['event']`` are written to the cache (no header, no index) and
  returned.
  """
  cache = Path("corpus_creation/events.csv")
  if cache.is_file() and not rewrite:
    cached = pd.read_csv("corpus_creation/events.csv", header=None)
    return cached[0]
  unique_events = pd.Series(pd.unique(data['event']))
  unique_events.to_csv("corpus_creation/events.csv", header=False, index=False)
  return unique_events

def get_names(data, rewrite=False):
  """Return the unique names, cached in ``corpus_creation/interviewee_names.csv``.

  When the cache file exists and ``rewrite`` is False, its single column is
  returned and ``data`` is ignored. Otherwise the list-valued ``names``
  column of ``data`` is exploded, de-duplicated, written to the cache (no
  header, no index) and returned.
  """
  cache = Path("corpus_creation/interviewee_names.csv")
  if cache.is_file() and not rewrite:
    cached = pd.read_csv("corpus_creation/interviewee_names.csv", header=None)
    return cached[0]
  unique_names = pd.Series(data['names'].explode().unique())
  unique_names.to_csv("corpus_creation/interviewee_names.csv", header=False, index=False)
  return unique_names

def process(data, nationalities_dict):
  """Chunk every interview and flatten the result into one Q&A table.

  Each row of ``data`` is split into question blocks with chunk, rows
  without any question are dropped, and separate expands the rest into one
  record per answer.

  ``data`` itself is left unmodified: the chunked text lives in a copy, so
  the function can be run again on the same frame and does not write into a
  filtered slice of another frame.

  Args:
    data: frame whose first four columns are text, event, date and names.
    nationalities_dict: maps a player name to a nationality.

  Returns:
    pandas.DataFrame with the columns question, answer, event, date, name
    and nationality; empty when there is nothing to process.
  """
  columns = ['question', 'answer', 'event', 'date', 'name', 'nationality']
  if data.empty:
    return pd.DataFrame(columns=columns)

  # assign() replaces ``text`` in a new frame and keeps its column position,
  # which separate relies on (it reads the row by position).
  chunked = data.assign(text=data.apply(chunk, axis=1))
  chunked = chunked[chunked['text'].map(len) > 0]
  if chunked.empty:
    # apply() on an empty frame returns a frame, not a Series of lists
    return pd.DataFrame(columns=columns)

  separated = chunked.apply(separate, args=(nationalities_dict,), axis=1)
  separated_flattened = [x for xs in separated for x in xs]

  return pd.DataFrame(separated_flattened, columns=columns)

In [7]:
import wikipedia

def get_nat_info():
  """Load the adjectival -> country lookup from nationality_info.csv.

  The first CSV column is read as the country (e.g. Mexico, Japan) and the
  second as the adjectival (e.g. Mexican, Japanese); both are stripped of
  surrounding whitespace.

  Returns:
    (adjectivals, nat_info): the stripped adjectival column and a dict that
    maps each adjectival to its country.
  """
  table = pd.read_csv("corpus_creation/nationality_info.csv") # need this file
  country_col = table.iloc[:,0].str.strip()
  adjectivals = table.iloc[:,1].str.strip()
  # maps the adjectival to the country
  nat_info = {adj: country for adj, country in zip(adjectivals, country_col)}
  return adjectivals, nat_info

def find_summary(name):
  """Return the first Wikipedia summary for ``name`` that mentions baseball.

  Quotes and commas are stripped from ``name`` before searching. Each search
  result is tried in order; results whose title is ambiguous or has no page
  are skipped. If the word "baseball" does not appear in any summary, we
  assume the person is not an MLB player: a message is printed and the
  string "None" is returned.
  """
  name = re.sub(r"[\"\',]", "", name)
  results = wikipedia.search(name)
  for page in results:
    try:
      summary = wikipedia.summary(title=page, auto_suggest=False)
    except (wikipedia.exceptions.DisambiguationError,
            wikipedia.exceptions.PageError):
      # one unusable search result should not abort the whole lookup
      continue
    if "baseball" in summary.lower():
      return summary
  print(f"\"baseball\" not found in any summary for {name}")
  return "None"

def get_summaries(names):
  """Build a two-column (name, summary) string table for ``names``.

  find_summary is called once per name, in order.
  """
  rows = [[player, find_summary(player)] for player in names]
  return pd.DataFrame(rows, columns=["name", "summary"], dtype=str)

def make_summaries(names, rewrite=False):
  """Return a dict mapping player name to Wikipedia summary.

  The summaries live in ``corpus_creation/summaries.csv``. The file is
  (re)built with get_summaries when it is missing or ``rewrite`` is True;
  in every case the dict is built from what is read back from the file.
  """
  csv_path = Path("corpus_creation/summaries.csv")
  needs_build = rewrite or not csv_path.is_file()
  if needs_build:
    # make the summaries csv
    get_summaries(names).to_csv("corpus_creation/summaries.csv", index=False)

  table = pd.read_csv("corpus_creation/summaries.csv")
  return dict(zip(table['name'], table['summary']))

def nationality(name, dict, adjectivals, nat_info):
  """Return the country of ``name`` as inferred from its summary, or None.

  The first adjectival (e.g. "Japanese") found after the first ")" of the
  summary is looked up in ``nat_info``. Presumably the ")" closes the
  parenthetical that precedes the nationality in most summaries; when the
  summary contains no ")" at all, the whole summary is searched.

  Args:
    name: key into ``dict``.
    dict: maps name to summary; non-string summaries (e.g. NaN) yield None.
    adjectivals: iterable of adjectival strings to look for.
    nat_info: maps each adjectival to its country.

  Returns:
    The country string, or None when there is no usable summary or no
    adjectival is found.

  Raises:
    KeyError: when ``name`` is not a key of ``dict``.
  """
  summary = dict[name]
  if not isinstance(summary, str):
    return None
  # escape so every adjectival is matched literally
  pattern = '(?:%s)' % '|'.join(re.escape(adj) for adj in adjectivals)

  # find() returns -1 when there is no ")"; slicing from -1 would leave only
  # the last character, so fall back to the start of the summary.
  pos = summary.find(")")
  start = pos if pos != -1 else 0
  match = re.search(pattern, summary[start:])
  if match is None:
    return None
  return nat_info[match.group(0)]

def get_nat_dict(names, summaries_dict, rewrite=False):
  """Return a dict mapping player name to player nationality.

  The mapping is cached in ``corpus_creation/player_nationalities.csv``.
  When the cache exists and ``rewrite`` is False, its first two columns are
  read, stripped and zipped into the dict; ``names`` and ``summaries_dict``
  are not used. Otherwise nationality is applied to every entry of
  ``names``, the result is written to the cache and returned.
  """
  cache = Path("corpus_creation/player_nationalities.csv")
  if cache.is_file() and not rewrite:
    cached = pd.read_csv("corpus_creation/player_nationalities.csv")
    cached_players = cached.iloc[:,0].str.strip()
    cached_nats = cached.iloc[:,1].str.strip()
    return dict(zip(cached_players, cached_nats))

  adjectivals, nat_info = get_nat_info()
  lookup_args = (summaries_dict, adjectivals, nat_info)
  nationalities = names.apply(nationality, args=lookup_args)
  table = pd.DataFrame({'name': names, 'nationality': nationalities})
  table.to_csv("corpus_creation/player_nationalities.csv", index=False)
  return dict(zip(names, nationalities))

In [8]:
# Driver cell: runs the whole pipeline and writes the final Q&A table.
#
# rw_flag is passed as ``rewrite`` to every cached step below. With False,
# each step reuses its CSV under corpus_creation/ when that file exists.
# Flipping this flag to True makes every step rebuild its data from scratch
# (scraping and Wikipedia lookups), which takes a long time.
rw_flag = False

# putting it all together
# sport page whose letter -> player -> interview links are crawled by scrape
weblink = "https://www.asapsports.com/showcat.php?id=2"
raw_data = scrape(weblink, rewrite=rw_flag)
# keep MLB player events only
mlb_data = filter(raw_data)

# unique events and interviewee names of the filtered interviews
events = get_events(mlb_data, rewrite=rw_flag)
names = get_names(mlb_data, rewrite=rw_flag)

# name -> Wikipedia summary, then name -> nationality derived from it
summaries_dict = make_summaries(names, rewrite=rw_flag)
nationalities_dict = get_nat_dict(names, summaries_dict, rewrite=rw_flag)

# dropna() removes every record with a missing field, e.g. a player whose
# nationality could not be determined
processed_mlb_data = process(mlb_data, nationalities_dict).dropna()
processed_mlb_data.to_csv("data/sportsQnA.csv", index=False)
# NOTE(review): display is not defined or imported in this file; presumably
# the notebook environment provides it - confirm before running as a script
display(processed_mlb_data)

Unnamed: 0,question,answer,event,date,name,nationality
0,"From 2019, you guys had multiple games where y...",I think very similar to what we were thinking ...,AL WILD CARD SERIES: BLUE JAYS VS TWINS,"October 4, 2023",Michael A. Taylor,United States
1,"When you came in here, you kind of had the exp...",It's tough. I see the work that he puts in to...,AL WILD CARD SERIES: BLUE JAYS VS TWINS,"October 4, 2023",Michael A. Taylor,United States
2,"We were discussing about how, when you make a ...","I'm just looking to make the play, and I want ...",AL WILD CARD SERIES: BLUE JAYS VS TWINS,"October 4, 2023",Michael A. Taylor,United States
3,"It seems like kind of a curse, is it?",It is. I get excited. I'm happy to make the ...,AL WILD CARD SERIES: BLUE JAYS VS TWINS,"October 4, 2023",Michael A. Taylor,United States
4,"When you did get traded over, (indiscernible) ...",I really didn't know what to expect. I knew t...,AL WILD CARD SERIES: BLUE JAYS VS TWINS,"October 4, 2023",Michael A. Taylor,United States
...,...,...,...,...,...,...
77744,A lot has been made all year of how flexible t...,"Yeah, using guys in different leverage situati...",AL DIVISION SERIES: RAYS VS RED SOX,"October 11, 2021",Mike Zunino,United States
77745,"After catching 13 innings, is there any though...","Yeah, there will be plenty of time to rest in ...",AL DIVISION SERIES: RAYS VS RED SOX,"October 11, 2021",Mike Zunino,United States
77746,I know you spoke of how good of a team Boston ...,"Yeah, it just sort of speaks about the game of...",AL DIVISION SERIES: RAYS VS RED SOX,"October 11, 2021",Mike Zunino,United States
77747,Schwarber playing with the crowd last night. ...,"Yeah, I think it shows that the atmosphere the...",AL DIVISION SERIES: RAYS VS RED SOX,"October 11, 2021",Mike Zunino,United States
