In [32]:
import pandas as pd
import re 
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urljoin

In [33]:
# Lookup table of Doctors: the key is the Doctor's "name" exactly as it appears
# on the site we scrape (the alt text of the Doctor's image link), the value is
# his/her incarnation number.
doctors = {
        'Jodie Whittaker':13,
        'Twelfth Doctor':12,
        'Seventh Doctor':7,
        'First Doctor':1,
        'Second Doctor':2,
        'Eleventh Doctor':11,
        'Third Doctor':3,
        'Fourth Doctor':4,
        'Eighth Doctor':8,
        'Tenth Doctor':10,
        'Sixth Doctor':6,
        # was 4, which collided with the Fourth Doctor's id
        'Fifth Doctor':5,
        'Ninth Doctor':9
    }

In [34]:
def get_episode(episode_link):
    '''
    Split an episode link into a numeric (season, episode, part) tuple.

    Examples:

    1-0.html              -> (1, 0, 0)
    1-1-1.html            -> (1, 1, 1)
    A.html                -> (0, 0, 0)
    ../DoctorWho/27-1.htm -> (27, 1, 0)

    The numeric ids are required in order to sort the dataset by the order of
    appearance of the episodes. This is critical for a dataset built with
    scrapy, because scrapy crawls the links asynchronously.

    Only the file name is inspected: a directory prefix, query string or
    fragment is ignored. A name with fewer than three '-'/'.' separated pieces,
    or whose leading pieces are not integers, yields (0, 0, 0).
    '''
    # keep only the file name: drop query string / fragment, then any directory part
    file_name = re.split(r'[?#]', episode_link, maxsplit=1)[0].rsplit('/', 1)[-1]
    # '-' and '.' are the separators; inside a character class '|' would be a
    # literal pipe, not an "or", so it is not listed
    pieces = re.split(r'[-.]', file_name)
    if len(pieces) < 3:
        return (0, 0, 0)
    try:
        season_id = int(pieces[0])
        episode_id = int(pieces[1])
        # a third number is only present for names shaped like "1-1-1.html"
        part_id = int(pieces[2]) if len(pieces) == 4 else 0
    except ValueError:
        # non-numeric names sort first, like "A.html"
        return (0, 0, 0)
    return (season_id, episode_id, part_id)

In [35]:
# A talk line starts with a speaker tag: an upper-case name (letters, digits,
# spaces and a little punctuation), optionally followed by bracketed qualifiers
# such as "[OC]", then ": " and the spoken text.
_SPEAKER_LINE = re.compile(r"^([A-Z][A-Z0-9 .,'&+/-]*?(?:\s*[\[(][^\])]*[\])])*)\s*: (.*)$")


def parse_lines(lines):
    '''
    Take the script text as an iterable of lines and sequentially build a
    dataset from it in the following way:
    1. Marks and enumerates parts and scenes by id and name
    2. Tags each line as a talk, context, location or episode heading
    3. Splits the talk lines into the "talker" and the "talk" itself

    part_id and part_name are required for marking the scripts divided into
    parts or episodes.

    A line that matches none of the known shapes is treated as a continuation:
    it keeps the phrase type and talker of the previous line. Parsing stops at
    the first line starting with '<Back'.

    Example

    (narrative)
    [Tardis]
    DOCTOR: Hello, Dalek
    DALEK: EXTERMINATE

    part_id, part_name, scene_id, scene_name, text,          phrase_type, detail
    0        ''         0         ''          (narrative)    context      ''
    0        ''         1         [Tardis]    ''             location     ''
    0        ''         1         [Tardis]    Hello, Dalek   talk         DOCTOR
    0        ''         1         [Tardis]    EXTERMINATE    talk         DALEK

    Returns a list of
    [part_id, part_name, scene_id, scene_name, text, phrase_type, detail] lists.
    '''
    details = ''
    phrase_type = 'talk'
    scene_id = 0
    scene_name = ''
    part_id = 0
    part_name = ''
    results = []
    for line in lines:
        # collapse embedded newlines once, so every branch tests and stores the same text
        clean = line.replace('\n', ' ').strip()
        # the speaker tag must open the line: a ": " in the middle of a sentence is not a talk
        speaker_match = _SPEAKER_LINE.match(clean)
        # scene (location) is found
        if clean.startswith('['):
            scene_id += 1
            scene_name = clean
            details = ''
            text = ''
            phrase_type = 'location'
        # context is found
        elif clean.startswith('('):
            details = ''
            text = clean
            phrase_type = 'context'
        # talk is found
        elif speaker_match is not None:
            details = speaker_match.group(1).strip()
            # everything after the speaker tag is kept, including any later colons
            text = speaker_match.group(2).strip()
            phrase_type = 'talk'
        # back link is found (to handle some xpath problem, relevant for scrapy)
        elif clean.startswith('<Back'):
            break
        # episode or part is found
        elif clean.startswith('Episode') or clean.startswith('Part'):
            part_id += 1
            part_name = clean
            details = ''
            text = ''
            phrase_type = 'episode'
        # handle the current line as belonging to the previously found type
        else:
            text = clean
        results.append([part_id, part_name, scene_id, scene_name, text, phrase_type, details])
    return results

In [36]:
start_url = 'http://www.chakoteya.net/DoctorWho/'


def _fetch_soup(url):
    """Download `url` and return it parsed as a BeautifulSoup tree.

    'html5lib' is used because it knows how to handle unmatched html tags.
    The HTTP response is closed as soon as the page has been parsed.
    """
    with urlopen(url) as response:
        return BeautifulSoup(response, 'html5lib')


results = []
# handle the main page: we are interested only in <a href=...><img/></a> links,
# each of which leads to one Doctor's episode list
main_page = _fetch_soup(start_url)
counter = 0
for a in main_page.find_all('a'):
    # stop once as many image links as there are entries in `doctors` were handled
    if counter == len(doctors):
        break
    img = a.find('img')
    if img is None:
        continue
    counter += 1
    # the image alt text is the Doctor's name, which gives us the Doctor's id
    doctor_name = img.attrs['alt']
    doctor_id = doctors[doctor_name]
    # scrape the content of the found link
    doctor_url = urljoin(start_url, a.attrs['href'])
    doctor_page = _fetch_soup(doctor_url)
    # the table with border="1" is the table that contains the list of episodes
    episode_table = doctor_page.find_all('table', {"border": "1"})[0]
    for tr in episode_table.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) != 3:
            continue
        episode_anchors = tds[0].find_all('a')
        # keep only cells that belong to the body of the table (checked through the
        # bgcolor attribute; .get() tolerates a cell without it) and that link to a script
        if tds[0].get('bgcolor') == '#006b9f' or not episode_anchors:
            continue
        episode_link = episode_anchors[0].attrs['href']
        episode_name = episode_anchors[0].text
        # parse the link to get season and episode ids
        ord_season_id, episode_id_1, episode_id_2 = get_episode(episode_link)
        # always resolve against the Doctor's page, never against the previous episode's URL
        episode_url = urljoin(doctor_url, episode_link)
        episode_page = _fetch_soup(episode_url)
        # the script text is taken from the first td element of the page
        td = episode_page.find_all('td')[0]
        # remove scripts and styles, so that only plain text strings remain
        for tag in td(['script', 'style']):
            tag.decompose()
        # turn the list of sentences into one dataset record per line
        for record in parse_lines(td.stripped_strings):
            results.append([doctor_name, doctor_id, episode_name, episode_link.split('.')[0],
                            ord_season_id, episode_id_1, episode_id_2,
                            record[0], record[1], record[2], record[3], record[4],
                            record[5], record[6]])
                  

In [37]:
# number of records collected by the scraping loop
len(results)

331896

In [38]:
#len(results)

331896

In [39]:
# assemble the final dataset: one row per scraped record, one named column per field
ds = pd.DataFrame(
    results,
    columns=[
        'doctor_name',
        'doctor_id',
        'episode_name',
        'episodid',
        'ord_season_id',
        'episode_id_1',
        'episode_id_2',
        'part_id',
        'part_name',
        'scene_id',
        'scene_name',
        'text',
        'phrase_type',
        'detail',
    ],
)

In [40]:
# preview the first rows of the dataset
ds.head()

Unnamed: 0,doctor_name,doctor_id,episode_name,episodid,ord_season_id,episode_id_1,episode_id_2,part_id,part_name,scene_id,scene_name,text,phrase_type,detail
0,First Doctor,1,Pilot - An Unearthly Child,1-0,1,0,0,0,,1,[Scrap Yard],,location,
1,First Doctor,1,Pilot - An Unearthly Child,1-0,1,0,0,0,,1,[Scrap Yard],"(Night, a policeman is patrolling his beat pas...",context,
2,First Doctor,1,Pilot - An Unearthly Child,1-0,1,0,0,0,,2,[School],,location,
3,First Doctor,1,Pilot - An Unearthly Child,1-0,1,0,0,0,,2,[School],(The bell is ringing for end of classes),context,
4,First Doctor,1,Pilot - An Unearthly Child,1-0,1,0,0,0,,2,[School],"Wait in here please, Susan. I won't be long.",talk,BARBARA


In [41]:
#ds[(ds['doctor_id']==1) & (ds['ord_season_id']=="3")] 

In [42]:
# summary of the dataset: row count, per-column non-null counts, dtypes and memory usage
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331896 entries, 0 to 331895
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   doctor_name    331896 non-null  object
 1   doctor_id      331896 non-null  int64 
 2   episode_name   331896 non-null  object
 3   episodid       331896 non-null  object
 4   ord_season_id  331896 non-null  int64 
 5   episode_id_1   331896 non-null  int64 
 6   episode_id_2   331896 non-null  int64 
 7   part_id        331896 non-null  int64 
 8   part_name      331896 non-null  object
 9   scene_id       331896 non-null  int64 
 10  scene_name     331896 non-null  object
 11  text           331896 non-null  object
 12  phrase_type    331896 non-null  object
 13  detail         331896 non-null  object
dtypes: int64(6), object(8)
memory usage: 35.5+ MB
