In [None]:
import numpy as np
import pandas as pd
import re
import requests
import string
import argparse
from tqdm import tqdm
from bs4 import BeautifulSoup

def parse_interview(url, data):
  """Download one interview page and append its record to ``data``.

  The appended record is ``[text, event, date, names]``:
  ``event`` is the text of the first ``<h1>``, ``date`` the text of the
  first ``<h2>``, ``names`` the texts of every ``<h3>``, and ``text`` the
  concatenated strings left in the transcript cell after the tags listed
  below have been removed.

  Args:
    url: Address of the interview page to fetch.
    data: List that receives the record; it is mutated in place.

  Returns:
    None. A page that lacks the transcript cell is skipped and nothing is
    appended for it.

  Raises:
    requests.exceptions.RequestException: If the request fails or does not
      answer within the timeout.
  """
  # Without a timeout a single stalled connection blocks the crawl forever.
  page = requests.get(url, timeout=30)
  soup = BeautifulSoup(page.text, 'html.parser')

  # The transcript lives in the cell carrying exactly these attributes.
  section = soup.find(attrs={'style':'padding: 10px;', 'valign':'top'})
  if section is None:
    # Not a transcript page (e.g. an error page): nothing to extract.
    return None

  # Headings must be read before they are stripped out of the tree below.
  event_tag = soup.find('h1')
  date_tag = soup.find('h2')
  event = event_tag.get_text() if event_tag is not None else ""
  date = date_tag.get_text() if date_tag is not None else ""
  names = [item.get_text() for item in soup.find_all("h3")]

  # Drop headings, bold/italic runs, links and line breaks so that only the
  # remaining strings of the cell end up in the transcript text.
  for tag in soup.find_all(["strong", "i", "h1", "h2", "h3", "br", "a"]):
    tag.decompose()

  # join() builds the text in one pass instead of repeated concatenation.
  output = "".join(p.get_text() for p in section.find_all(string=True))
  data.append([output, event, date, names])

def parse_player(url, data):
  """Fetch a page of interview links and parse every linked interview.

  Looks for the table with ``width=100%``, ``cellspacing=0``,
  ``cellpadding=3`` and ``border=0`` and passes the ``href`` of each anchor
  inside it to ``parse_interview``.

  Args:
    url: Address of the page to fetch.
    data: List handed on to ``parse_interview``, which appends to it.

  Returns:
    None. If the table is not present the page is skipped.

  Raises:
    requests.exceptions.RequestException: If the request fails or does not
      answer within the timeout.
  """
  # Without a timeout a single stalled connection blocks the crawl forever.
  page = requests.get(url, timeout=30)
  soup = BeautifulSoup(page.text, 'html.parser')
  table = soup.find('table', attrs={'width':'100%', 'cellspacing':'0',
                                    'cellpadding':'3', 'border':'0'})
  if table is None:
    return None
  for link in table.find_all('a', href=True):
    parse_interview(link['href'], data)

def parse_letter(url, data):
  """Fetch a page of links and run ``parse_player`` on each of them.

  Looks for the table with ``width=100%``, ``cellspacing=0``,
  ``cellpadding=3`` and ``border=0`` and passes the ``href`` of each anchor
  inside it to ``parse_player``. The links are wrapped in ``tqdm`` so every
  call to this function displays its own progress bar.

  Args:
    url: Address of the page to fetch.
    data: List handed on to ``parse_player`` for the collected records.

  Returns:
    None. If the table is not present the page is skipped.

  Raises:
    requests.exceptions.RequestException: If the request fails or does not
      answer within the timeout.
  """
  # Without a timeout a single stalled connection blocks the crawl forever.
  page = requests.get(url, timeout=30)
  soup = BeautifulSoup(page.text, 'html.parser')
  table = soup.find('table', attrs={'width':'100%', 'cellspacing':'0',
                                    'cellpadding':'3', 'border':'0'})
  if table is None:
    return None
  for link in tqdm(table.find_all('a', href=True)):
    parse_player(link['href'], data)

def parse_sport(url):
  """Crawl everything reachable from one category page.

  Takes the first table with ``width=100%``, ``cellspacing=0``,
  ``cellpadding=5`` and ``border=0`` and passes the ``href`` of each anchor
  inside it to ``parse_letter``, which fills a shared list of records.

  Args:
    url: Address of the category page to fetch.

  Returns:
    The list of records accumulated by the nested parsers.

  Raises:
    IndexError: If the page contains no matching table.
    requests.exceptions.RequestException: If the request fails or does not
      answer within the timeout.
  """
  # Without a timeout a single stalled connection blocks the crawl forever.
  page = requests.get(url, timeout=30)
  soup = BeautifulSoup(page.text, 'html.parser')
  tables = soup.find_all('table', attrs={'width':'100%', 'cellspacing':'0',
                                         'cellpadding':'5', 'border':'0'})
  if not tables:
    # Same exception type as indexing an empty result, but it names the page.
    raise IndexError(f"no index table found at {url}")

  data = []
  for link in tables[0].find_all('a', href=True):
    parse_letter(link['href'], data)
  return data

def scrape():
  """Crawl the category page and save the records as a CSV file.

  The records returned by ``parse_sport`` are written to
  ``corpus_creation/interviews_raw.csv`` with the columns ``text``,
  ``event``, ``date`` and ``names`` and without the index column.

  Raises:
    OSError: If the output directory cannot be created or the file cannot
      be written.
  """
  import os

  out_path = "corpus_creation/interviews_raw.csv"
  # Create the target directory up front: discovering that it is missing
  # only after the whole crawl has finished would throw the results away.
  os.makedirs(os.path.dirname(out_path), exist_ok=True)

  data = parse_sport("https://www.asapsports.com/showcat.php?id=2")
  df = pd.DataFrame(data, columns=['text', 'event', 'date', 'names'])
  df.to_csv(out_path, index=False)

100%|██████████| 157/157 [05:32<00:00,  2.12s/it]
100%|██████████| 320/320 [12:10<00:00,  2.28s/it]  
100%|██████████| 319/319 [12:00<00:00,  2.26s/it]
100%|██████████| 195/195 [05:48<00:00,  1.79s/it]
100%|██████████| 86/86 [02:36<00:00,  1.82s/it]
100%|██████████| 145/145 [05:54<00:00,  2.45s/it]
100%|██████████| 245/245 [09:47<00:00,  2.40s/it]
100%|██████████| 291/291 [09:25<00:00,  1.94s/it]
100%|██████████| 26/26 [00:44<00:00,  1.73s/it]
100%|██████████| 107/107 [03:30<00:00,  1.97s/it]
100%|██████████| 146/146 [04:24<00:00,  1.81s/it]
100%|██████████| 212/212 [08:39<00:00,  2.45s/it]
100%|██████████| 430/430 [17:56<00:00,  2.50s/it]  
100%|██████████| 83/83 [02:10<00:00,  1.58s/it]
100%|██████████| 81/81 [02:59<00:00,  2.21s/it]
100%|██████████| 204/204 [07:11<00:00,  2.12s/it]
100%|██████████| 11/11 [00:28<00:00,  2.62s/it]
100%|██████████| 229/229 [08:39<00:00,  2.27s/it]
100%|██████████| 392/392 [14:06<00:00,  2.16s/it]  
100%|██████████| 146/146 [06:35<00:00,  2.71s/it]
100%