# Twilight Zone

In [3]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import time
import re
import os
import nltk

## Web scrapping 

In [4]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)
def countWords(string): 
    OUT = False
    IN = True
    state = OUT 
    wc = 0
  
    # Scan all characters one by one 
    for i in range(len(string)): 
  
        # If next character is a separator,  
        # set the state as OUT 
        if (string[i] == ' ' or string[i] == '\n' or
            string[i] == '\t'): 
            state = OUT 
  
        # If next character is not a word  
        # separator and state is OUT, then  
        # set the state as IN and increment  
        # word count 
        elif state == OUT: 
            state = IN 
            wc += 1
            
    return wc

In [6]:
#pip install wikipedia -- allows for use of Wikipedia API
import wikipedia
titles = ['Where Is Everybody?','One for the Angels','Walking Distance','The Lonely (The Twilight Zone)','Time Enough at Last',
         'A Stop at Willoughby','The After Hours','The Howling Man','Eye of the Beholder (1960 Twilight Zone episode)',
         'Nick of Time (The Twilight Zone)', 'Will the Real Martian Please Stand Up?', "It's a Good Life (The Twilight Zone)",'Nothing in the Dark',
         'To Serve Man (The Twilight Zone)', 'Nightmare at 20,000 Feet'] #real episode list here

In [7]:
def twilight_zone_nes(episodes):
    casts = []
    for ep in episodes:
        casts.append(wikipedia.WikipediaPage(title = ep).section('Cast'))
        
    nes = []
    for cast in casts:
        if cast is not None:
            chars = cast.split('\n')
            for char in chars:
                nes.append(char.split('as')[-1].strip())
        else:
            print('No cast info for episode: %s...\nCheck article name or fandom wiki...'%episodes[casts.index(cast)])
    print(nes)
    nes.append('Narrator')
    return list(set(nes))

In [8]:
n = twilight_zone_nes(titles)


No cast info for episode: Time Enough at Last...
Check article name or fandom wiki...
No cast info for episode: Time Enough at Last...
Check article name or fandom wiki...
No cast info for episode: Time Enough at Last...
Check article name or fandom wiki...
['Mike Ferris', 'General', 'Colonel', 'Lewis J. "Lou" Bookman', 'Mr. Death', 'Maggie Polanski', 'Martin Sloan', "Martin's Dad", "Martin's Mom", 'The Wilcox Boy', 'Young Martin', 'Charlie', 'Mr. Wilson', 'James A. Corry', 'Alicia', 'Allenby', 'Adams', 'Carstairs', 'David Ellington', 'Brother Jerome', 'The Howling Man', 'Brother Christophorus', 'Housekeeper', 'Janet Tyler (under bandages)', 'ked)', 'Doctor Bernardi', 'Nurse', 'Walter Smith', 'The Leader', 'Nurse #2', 'Don Carter', 'Pat Carter', 'Mechanic (Lars)', 'Counter Man', 'Man', 'Woman', 'Ross, the businessman', 'Ethel McConnell, the dancer', 'Avery, the crazy man', 'Haley, the cook', 'Trooper Bill Padgett', 'Olmstead, the bus driver', 'Trooper Dan Perry', 'Rose Kramer, the olde

In [9]:
missing_eps = ['Time_Enough_at_Last','A Stop at Willoughby','The After Hours']

for episode in missing_eps:
    time.sleep(10)
    wikis = 'https://twilightzone.fandom.com/wiki/' + episode
    s = simple_get(wikis)
    html = BeautifulSoup(s, 'html.parser')
    title = str(html.find("meta",  property="og:title"))

    cast = re.search(r'(id="Cast"[\<\w\>\s\=\"\-\(\)\/;\[\]\'\.\:]*)(<\/ul>)',str(html))[0]
    markup = cast.encode("utf-8")
    content = markup.decode("utf-8", "ignore")
    content = content.replace('\n','')
    names = re.findall(r'(as )(<a[\s\w\"\=\/\(\)\-]*>)?([\w\s\"\.\-]*)',str(content))
    #names = re.findall(r'(as )(<a[\w\s\=\/\"\(\)\-]*>)([\w\s]*)', str(content))
    #print(content)
    #print(names)
    
    for i,p in enumerate(names):
        #print(p[-1]) 
        n.append(p[-1])
# two characters the regex missed for some reason        
n.append('Mr. Carsville')
n.append('Mrs. Chester')
# actor the regex picked up
n.remove('Mavis Neal') 
# wiki pulled only a part of the name
n.remove('ked)')
n.append('Janet Tyler (unmasked)')

In [10]:
named_entity_list = []
for i,j in enumerate(n):
    
    #cleaning up the names a bit
    main_name = re.sub(r'(, [\w\s]*)','',j)
    main_name = re.sub(r'(\s\([\w\s]*\))','',main_name)
    main_name = re.sub(r'\)','',main_name)
    
    fin_tag = ''
        
    if (countWords(main_name.strip()) == 1) or (countWords(main_name.strip()) >= 3):
        fin_tag = ',' + re.sub(r'([\w]{3})([\w\s]*)',r'\1',main_name).upper()
    elif countWords(main_name.strip()) == 2:
        fin_tag = ',' + re.sub(r'([\w]{1})([\w\']*)\s([\w]*)',r'\1\3', main_name)
    
    main = main_name + ',CHR' + fin_tag 
    
    named_entity_list.append(main)
    
for i in named_entity_list:
    if i == ',CHR':
        named_entity_list.remove(i)

tupled_list = []
for i,p in enumerate(named_entity_list):
    tupled_list.append(tuple(named_entity_list[i].split(',')))


In [11]:
print(tupled_list)

[('The Wilcox Boy', 'CHR', 'THE'), ('Haley', 'CHR', 'HAL'), ('Rose Kramer', 'CHR', 'RKramer'), ('Peter Kramer', 'CHR', 'PKramer'), ('Anthony Fremont', 'CHR', 'AFremont'), ('Olmstead', 'CHR', 'OLM'), ('Alicia', 'CHR', 'ALI'), ('Janet Tyler', 'CHR', 'JTyler'), ('Avery', 'CHR', 'AVE'), ('Counter Man', 'CHR', 'CMan'), ('The Leader', 'CHR', 'TLeader'), ('Man', 'CHR', 'MAN'), ('Allenby', 'CHR', 'ALL'), ('Leveque', 'CHR', 'LEV'), ('Gremlin', 'CHR', 'GRE'), ('Trooper Bill Padgett', 'CHR', 'TRO'), ('Don Carter', 'CHR', 'DCarter'), ('Nurse', 'CHR', 'NUR'), ("Martin's Mom", 'CHR', 'MMom'), ('Pat Carter', 'CHR', 'PCarter'), ('Mr. Death', 'CHR', 'Mr. Death'), ('Young Martin', 'CHR', 'YMartin'), ('Scientist', 'CHR', 'SCI'), ('Colonel', 'CHR', 'COL'), ('Maggie Polanski', 'CHR', 'MPolanski'), ('Julia Wilson', 'CHR', 'JWilson'), ('Walter Smith', 'CHR', 'WSmith'), ('James A. Corry', 'CHR', 'JAM. COR'), ('Flight Engineer', 'CHR', 'FEngineer'), ('Patty', 'CHR', 'PAT'), ('Ross', 'CHR', 'ROS'), ('Theodore M

In [12]:
tupled_list.sort()

f = open('named_entity_tz.txt','w+', encoding='utf-8')
for s in tupled_list:
    f.write("%s, %s, %s\n" % s)
f.close()