# ![](https://ga-dash.s3.amazonaws.com/production/assets/logo-9f88ae6c9c3871690e33280fcf557f33.png) Capstone Project

Notebook 1: Data Retrieval - Web Scraping

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def _episode_url(href, name):
    """Translate a listing-page href into the URL of the episode's script page.

    Assumes the href has the shape
    ``/TV Transcripts/<name> - <title> Script.html`` -- the shape the
    original fixed offsets (4:24 and 27:-12) matched for an 8-letter name.
    TODO confirm this holds for shows other than Seinfeld.
    """
    # 16 == len('/TV Transcripts/'), so the show name ends at 16 + len(name).
    name_end = 16 + len(name)
    # 'Transcripts/<name>' -> 'transcripts/<name>'.
    # NOTE(review): this also lower-cases any 'T' inside the show name.
    section = href[4:name_end].replace('T', 't')
    # Skip the 3-char ' - ' separator and drop the 12-char ' Script.html'.
    title = href[name_end + 3:-12].replace(' ', '-')
    return 'https://www.imsdb.com/' + section + '-' + title + '.html'


def _parse_dialogue(nodes):
    """Group the children of a script's <pre> element into (speaker, text) pairs.

    A non-empty <b> element starts a new speaker; every other node is text
    belonging to the current speaker. Text seen before the first <b> is
    paired with an empty speaker name.
    """
    def as_entry(speaker, chunks):
        # Collapse every whitespace run (newlines + indentation) to a single
        # space. Deleting pairs of spaces instead would glue words together
        # whenever the run has an even length.
        return (speaker, ' '.join(' '.join(chunks).split()))

    entries = []
    speaker = ''
    chunks = []
    for node in nodes:
        if getattr(node, 'name', None) == 'b':
            label = node.text.strip()
            if label:
                if chunks:
                    entries.append(as_entry(speaker, chunks))
                    chunks = []
                speaker = label
        else:
            # Plain strings are used directly; any other element contributes
            # its text instead of failing on a non-existent .strip() method.
            raw = node if isinstance(node, str) else node.get_text()
            text = raw.strip()
            if text:
                chunks.append(text)
    # The trailing entry is always emitted, even when empty, so the number of
    # closing entries per episode stays stable for code that slices them off.
    entries.append(as_entry(speaker, chunks))
    return entries


def get_script(name):
    """Scrape the episode scripts of a TV show from IMSDb.

    Parameters
    ----------
    name : str
        Show name as used in the IMSDb listing URL, e.g. ``'Seinfeld'``.

    Returns
    -------
    list of list of (str, str)
        One list per successfully scraped episode, each holding
        ``(character, line)`` tuples in script order. Episodes whose page
        cannot be fetched or parsed are reported and skipped. An empty list
        is returned when the listing page itself cannot be accessed.
    """
    timeout = 30  # seconds; keeps one stalled request from hanging the run

    # accessing the script page
    url = 'https://www.imsdb.com/TV/' + name + '.html'
    res = requests.get(url, timeout=timeout)
    if res.status_code != 200:
        print(f'UNABLE TO ACCESS {name.upper()} SCRIPT PAGE...')
        return []
    print(f'ACCESSING <{name.upper()}> SCRIPT PAGE...')
    soup = BeautifulSoup(res.content, 'lxml')

    # locate the actual link for scripts
    # NOTE(review): the first 64 and last 7 anchors are presumably site
    # navigation; these offsets depend on the current page layout.
    links = soup.find_all('a')[64:-7]
    print(f'{len(links)} SCRIPTS FOUND!')

    # find the right url to each episode webpage
    ep_urls = [_episode_url(anchor.attrs['href'], name) for anchor in links]
    print('ON OUR WAY TO EPISODES!')

    # access each episode page and scrape its script
    # credit to Dan Wilhelm for helping me out on these codes. Thanks so much!
    script_collection = []
    total = len(ep_urls)
    for ep_no, ep_url in enumerate(ep_urls, start=1):
        try:
            ep_res = requests.get(ep_url, timeout=timeout)
        except requests.RequestException:
            ep_res = None
        if ep_res is None or ep_res.status_code != 200:
            # Skip the episode: parsing must never reuse a previous page.
            print('ERROR ACCESSING EPISODE SCRIPTS...')
            continue

        ep_soup = BeautifulSoup(ep_res.content, 'lxml')
        script = ep_soup.find('td', {'class': 'scrtext'})
        script_new = script.find('pre') if script is not None else None
        if script_new is None:
            # Page loaded but does not contain the expected script markup.
            print('ERROR ACCESSING EPISODE SCRIPTS...')
            continue

        script_collection.append(_parse_dialogue(script_new.contents))
        print(f'GENERATING SCRIPT OF EP.{ep_no} of {total}')

    print('ALL SCRIPTS RETRIEVED!')
    return script_collection

In [None]:
# scrape every listed 'Seinfeld' episode script; get_script prints its progress as it goes
all_scripts = get_script('Seinfeld')

ACCESSING <SEINFELD> SCRIPT PAGE...
176 SCRIPTS FOUND!
ON OUR WAY TO EPISODES!
GENERATING SCRIPT OF EP.1 of 176
GENERATING SCRIPT OF EP.2 of 176
GENERATING SCRIPT OF EP.3 of 176
GENERATING SCRIPT OF EP.4 of 176


In [None]:
# peek at the second (character, line) entry of the first scraped episode
all_scripts[0][1]

In [None]:
# converting scraped scripts to dataframe

def convert_df(all_scripts, with_title_end=False):
    """Stack scraped episode scripts into one character/line dataframe.

    Parameters
    ----------
    all_scripts : iterable of list of (character, line) tuples
        One list per episode.
    with_title_end : bool, default False
        When false, each episode is trimmed to ``script[1:-2]``, dropping its
        first entry (the title) and its last two entries (the end of the
        script). When true, every entry is kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``character`` and ``line``, episodes in input order. The row
        index restarts at 0 for every episode. An empty frame with the two
        columns is returned when there is nothing to stack.
    """
    columns = ['character', 'line']

    # Build one frame per episode and concatenate once at the end; growing a
    # dataframe with concat inside the loop re-copies all earlier rows on
    # every iteration.
    frames = []
    for script in all_scripts:
        rows = script if with_title_end else script[1:-2]
        if rows:
            frames.append(pd.DataFrame(rows, columns=columns))

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, axis=0)

In [None]:
# generate dataframe with all lines; the default with_title_end=False drops
# each episode's first entry and its last two entries (script[1:-2])

without_title = convert_df(all_scripts)
without_title.head()

In [None]:
# generate dataframe with all lines, keeping every entry of each episode
# (including the title and end entries) via with_title_end=True

with_title = convert_df(all_scripts, with_title_end=True)
with_title.head()

In [None]:
# save both dataframes as CSV files under ../data; index=False keeps the
# row index out of the written files

without_title.to_csv('../data/seinfeld_scripts_no_title.csv', index=False)
with_title.to_csv('../data/seinfeld_scripts_with_title.csv', index=False)