In [1]:
import numpy as np
import pandas as pd
import re
import Levenshtein

def chunk(row):
  """Break one raw interview transcript into question-led blocks.

  Args:
    row: pandas row (Series) whose first positional element is the
      transcript text (a string).

  Returns:
    list of str, one entry per "Q." found in the text. Each entry starts
    with " Q." and runs up to the next "Q."; whatever precedes the first
    "Q." is discarded. Empty when the text contains no "Q.".
  """
  # (pattern, replacement) pairs, applied in this order
  rewrites = (
    (r"â€™", "\'"),  # mojibake form of the curly apostrophe -> plain apostrophe
    (r"Â", ""),  # drop stray "Â" characters
    (r"Q\.", "999PLACEHOLDER999 Q."),  # tag every question start with a sentinel
  )
  transcript = row.iloc[0]
  for pattern, replacement in rewrites:
    transcript = re.sub(pattern, replacement, transcript)
  # element 0 is the preamble ahead of the first sentinel, so skip it
  return re.split(r"999PLACEHOLDER999", transcript)[1:]

# helper function to fix OCR errors in, and standardize, interviewee names
def find_name(interviewee, names):
  """Return the entry of `names` closest to `interviewee` by edit distance.

  Candidates are compared with case-insensitive Levenshtein distance.

  Args:
    interviewee: speaker label as it appears in the transcript (str).
    names: iterable of candidate names (str).

  Returns:
    The element of `names`, in its original casing, with the smallest
    distance to `interviewee`. Ties go to the earliest candidate.
    None when `names` is empty.
  """
  interviewee_lower = interviewee.lower()
  # Built-in min() keeps the first of several equally-close candidates and
  # puts no upper bound on the distance a candidate may have.
  return min(
    names,
    key=lambda name: Levenshtein.distance(interviewee_lower, name.lower()),
    default=None,
  )

# helper function to separate the questions and answers
# into question/answer pairs, each with the event, date, and interviewee attached

def separate(row):
  """Expand one chunked interview row into flat question/answer records.

  Args:
    row: pandas row (Series), read positionally:
      [0] list of question blocks (strings, as returned by `chunk`),
      [1] event, [2] date, [3] candidate interviewee names.

  Returns:
    list of [question, answer, event, date, name] lists, one per speaker
    turn that follows each question. `name` is whatever `find_name`
    returns for the speaker label found in the transcript.
  """
  interview = row.iloc[0]
  event = row.iloc[1]
  date = row.iloc[2]
  names = row.iloc[3]
  output = []
  for text in interview: # for each question and its following response(s)
    # a speaker turn starts on a new line with an upper-case label followed by ":"
    q_and_a = re.split(r"\n(?=[A-ZÀ-Ÿ ,.-]+:)", text)
    question = re.sub(r"Q\.", "", q_and_a[0]).strip()
    for answer in q_and_a[1:]: # go through the responses in case there are multiple responders
      interviewee = re.search(r"([A-ZÀ-Ÿ ,.-]+)(:)", answer)
      # use levenshtein distance to standardize the names
      name = find_name(interviewee.group(1), names)
      # Cut the label out by its matched span instead of passing it to
      # re.sub as a pattern: the label is literal text (a "." in it would
      # act as a regex wildcard) and only this one occurrence should go.
      answer_noname = (answer[:interviewee.start()] + answer[interviewee.end():]).strip()
      output.append([question, answer_noname, event, date, name])
  return output

In [None]:
# Load the raw interviews and drop rows whose transcript text is repeated.
# NOTE(review): pd.eval evaluates the contents of every 'names' cell as an
# expression, so this is only safe with a trusted CSV. ast.literal_eval would
# be a stricter parser — confirm it accepts the stored format before switching.
data = pd.read_csv("corpus_creation/interviews_raw.csv", converters={'names': pd.eval})
data = data.drop_duplicates(subset='text')

# Keep events matching the inclusion pattern (case-insensitive). na=False
# excludes rows with a missing event instead of putting NaN into the boolean
# mask, which boolean indexing rejects.
mlb_data = data[data['event'].str.contains("MLB |NL |AL |WORLD SERIES|HOME RUN CHASE|MEDIA CONFERENCE", case=False, regex=True, na=False)]
# Then remove events matching the exclusion pattern.
# NOTE(review): this match is case-sensitive, unlike the inclusion above — confirm that is intended.
to_drop = mlb_data[mlb_data['event'].str.contains("NCAA|UNIVERSITY|COLLEGE|COLLEGIATE|STATE|MUNDIAL|ATLANTIC COAST|WINTER MEETINGS", na=False)].index
mlb_data = mlb_data.drop(to_drop)

# Replace each transcript with its list of question blocks and keep only
# interviews that contain at least one question.
mlb_data['text'] = mlb_data.apply(chunk, axis=1)
chunked = mlb_data[mlb_data['text'].map(len) > 0]

# One list of records per interview, flattened into a single table.
separated = chunked.apply(separate, axis=1)
separated_flattened = [x for xs in separated for x in xs]
df = pd.DataFrame(separated_flattened, columns=['question', 'answer', 'event', 'date', 'name'])
display(df)

1795


In [None]:
# Distinct interviewee names and events, in order of first appearance in df.
names = pd.Series(df['name'].unique())
events = pd.Series(df['event'].unique())

# Write the two lookup lists, then the full question/answer table.
names.to_csv("corpus_creation/interviewee_names.csv", index=False)
events.to_csv("corpus_creation/events.csv", index=False)
df.to_csv("corpus_creation/sportsQnA.csv", index=False)