In [1]:
import os
import json
import numpy as np
import requests
from bs4 import BeautifulSoup

# Get Transcripts (~ 20-25 mins)

In [2]:
def extract_speech(p_tag):
    speaker = p_tag.find('span', class_='speaker')
    speech_content = ''.join(str(tag) for tag in speaker.next_siblings)
    soup = BeautifulSoup(speech_content, 'html.parser')
    return soup.get_text().strip(': ')

def get_transcript(base_url, video_link):
    url = base_url + video_link
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    texts = soup.find_all('p')
    transcript = []
    speakers = set()
    for text in texts:
        if text.span is None:
            continue
        if text.span.get('class')[0] != 'speaker':
            continue
        speaker = str(text.span.text).strip()
        content = extract_speech(text)
        if 'Steve Jobs' in speaker or 'SJ' in speaker:
            speaker = 'SJ' #standardize for steve jobs, our target
        transcript_item = {'speaker': speaker, 'content': content}
        transcript.append(transcript_item)
        speakers.add(speaker)
    if len(speakers) < 2:
        return []
    return transcript

In [3]:
base_url = 'https://allaboutstevejobs.com/videos/misc/'
response = requests.get(base_url)
soup = BeautifulSoup(response.text, 'html.parser')
videos = soup.find_all('a', class_='box-link')
print(len(videos))
video_links = [video.attrs['href'][18:] for video in videos]

71


In [4]:
transcripts = {}
for video_link in video_links:
    transcript = get_transcript(base_url, video_link)
    if transcript:
        transcripts[str(video_link).strip()] = transcript
print(len(transcripts))
json.dump(transcripts, open('transcripts.json', 'w'))

  soup = BeautifulSoup(speech_content, 'html.parser')


8


# QA Pairs (~10-15 mins)

In [9]:
def get_qa(transcript):
    short_qas = []
    long_qas = []
    all_qas = []
    for i in range(1, len(transcript)):
        if transcript[i]['speaker'] == 'SJ' and transcript[i-1]['speaker'] != 'SJ':
            item = {'question': transcript[i-1]['content'], 'answer': transcript[i]['content']}
            if len(transcript[i]['content']) < 100:
                short_qas.append(item)
            else:
                long_qas.append(item)
            all_qas.append(item)
    return short_qas, long_qas, all_qas

In [11]:
short_qa_pairs = {}
long_qa_pairs = {}
all_qa_pairs = {}
transcripts = json.load(open('transcripts.json'))
for video, transcript in transcripts.items():
    short_qas, long_qas, all_qas = get_qa(transcript)
    short_qa_pairs[video] = short_qas
    long_qa_pairs[video] = long_qas
    all_qa_pairs[video] = all_qas #in order

json.dump(short_qa_pairs, open('short_qa_pairs.json', 'w'))
json.dump(long_qa_pairs, open('long_qa_pairs.json', 'w'))
json.dump(all_qa_pairs, open('all_qa_pairs.json', 'w'))

In [16]:
#basic filtering on qa pairs for some decent quality
useful_qa_pairs = []
for video, qas in long_qa_pairs.items():
    for qa in qas:
        if qa['question'][-1] == '?':
            useful_qa_pairs.append((qa, video))
print(len(useful_qa_pairs))
json.dump(useful_qa_pairs, open('useful_qa_pairs.json', 'w'))

108
