# Hitchhiker's Guide

In [2]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import time
import re
import os

## Grabbing Data from Wiki

In [3]:
def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Parameters:
        url: address to fetch.
        timeout: seconds handed to requests as its `timeout` argument, so a
            stalled server cannot hang the notebook forever. A timeout is
            reported as a RequestException and handled like any other
            request failure.

    Returns:
        The response body (`resp.content`) when `is_good_response` accepts
        the response (status 200 with an HTML content type); None when the
        response is rejected or the request fails. Request failures are
        reported through `log_error`.
    """
    try:
        # closing() releases the streamed connection when the block exits.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains 'html' (case-insensitive). A response without a
    Content-Type header is rejected instead of raising KeyError.
    """
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()


def log_error(e):
    """
    Report an error by printing it.

    Single place where errors are reported, so the destination can be
    changed later without touching the callers.
    """
    print(e)
    
def countWords(string):
    """
    Count the words in `string`.

    A word is a maximal run of characters that are not a space, a newline
    or a tab; any other character (including other whitespace such as
    '\\r') counts as part of a word.
    """
    separators = (' ', '\n', '\t')
    inside_word = False
    total = 0

    for ch in string:
        if ch in separators:
            # A separator always ends the current word, if any.
            inside_word = False
        elif not inside_word:
            # First character after a separator (or at the start) opens a
            # new word.
            inside_word = True
            total += 1

    return total

In [4]:
# Wiki page for the book; the section cells below all parse this one page.
hh = 'https://hitchhikers.fandom.com/wiki/The_Hitchhiker%27s_Guide_to_the_Galaxy_(book)'
hh_html = simple_get(hh)

# simple_get returns None when the request fails or the response is not a
# 200 HTML page. Stop here with a clear message rather than handing None to
# BeautifulSoup.
if hh_html is None:
    raise RuntimeError('Could not download {0}'.format(hh))

hh_soup = BeautifulSoup(hh_html,'html.parser')

# Collects one "name,TAG[,SUBTAG]" string per entity; the cells below append to it.
named_entity_list = []

### Characters: Main (with aliases) and Minor

In [5]:
# Cut the character section out of the page: from its <h3> heading through
# the closing </ul> of the list that follows.
hh_char = re.search(r'(<h3><span\s[\w\=\"\-\s\_]*>[\w\s\<\/\>\=\"]*<a\s[\w\=\"\/\_\%\(\)\?\&\;\s\>\<\-\:\,]*<\/h3>)([\<\w\>\s\=\"\/\_\-\(\)]*)(<\/ul>)',str(hh_soup))[0]
# Each match is (opening <a> tag, link text, </a>, trailing text); the link
# text is the character name.
all_char = re.findall(r'(<a\s[\w\=\/\"\s\(\)\-]*>)([\w\s]*)(<\/a>)([\w\s\(\)\;]*)',hh_char)

hh_char_list = []
for link in all_char:
    main_name = link[1]
    n_words = countWords(main_name.strip())

    # Short tag derived from the name:
    #   1 word or 3+ words -> first three word characters, upper-cased
    #   2 words            -> first initial + second word (e.g. DThought)
    #   empty name         -> no tag
    fin_tag = ''
    if n_words == 1 or n_words >= 3:
        fin_tag = ',' + re.sub(r'([\w]{3})([\w\s]*)',r'\1',main_name).upper()
    elif n_words == 2:
        fin_tag = ',' + re.sub(r'([\w]{1})([\w\']*)\s([\w]*)',r'\1\3', main_name)

    named_entity_list.append(main_name + ',CHR' + fin_tag)


mains = ['Arthur_Dent','Ford_Prefect','Zaphod_Beeblebrox']

# Page slug -> tag attached to every alias found on that page. Tagging by
# page (rather than by how many aliases were scraped) means a change in the
# wiki's alias count cannot mislabel entries or leave the tag undefined.
alias_tags = {
    'Arthur_Dent': 'ADent',
    'Ford_Prefect': 'FPrefect',
    'Zaphod_Beeblebrox': 'ZBeeblebrox',
}

for page in mains:
    time.sleep(10)  # pause between requests to the wiki
    url = 'https://hitchhikers.fandom.com/wiki/' + page
    html = simple_get(url)
    if html is None:
        # simple_get returns None on a failed request or a non-HTML response.
        raise RuntimeError('Could not download {0}'.format(url))

    soup = BeautifulSoup(html,'html.parser')
    # Infobox value cells. attrs is a dict here; a set literal
    # {'class', '...'} is not the documented attrs form.
    pi = soup.find_all('div', {'class': 'pi-data-value pi-font'})
    s = re.findall(r'<div class[\w\=\"\-\s]*>([\w\<\>\"\s\(\=\/\)\-\#\[\]]*)',str(pi))
    if not s:
        raise RuntimeError('No infobox values found on {0}'.format(url))

    # NOTE(review): which infobox cell holds the aliases is inferred from the
    # number of cells (first cell when fewer than 7, second when exactly 7,
    # none otherwise) - presumably tied to the page layout at scrape time;
    # confirm against the live pages.
    aliases = []
    if len(s) < 7:
        aliases.append(s[0])
    elif len(s) == 7:
        aliases.append(s[1])

    # Two passes: first isolate the <li>"... fragments, then split each into
    # (prefix, alias text).
    a = re.findall(r'(<li>[\"\w\s\-]*)',str(aliases))
    a = re.findall(r'(<li>\")([\w\s\-]*)', str(a))

    for alias in a:
        named_entity_list.append(alias[1] + ',CHR,' + alias_tags[page])

### Creatures and Animals

In [6]:
# Cut the "Races and species" section out of the page (heading through the
# closing </ul>), then pull every link out of it.
hh_cran = re.search(r'(<h3><span\s[\w\=\"\-]*\sid\=\"Races\_and\_species\">[\w\s\<\/\>\=\"\%\(\)\?\&\;\:\,\-]*<\/h3>)([\<\s\w\>\=\"\/\-\(\)\%\']*)(<\/ul>)',str(hh_soup))[0]
all_cran = re.findall(r'(<a\s[\w\=\/\"\s\(\)\-\%\\\']*>)([\w\s\\\']*)(<\/a>)([\w\s\(\)\;]*)',hh_cran)

hh_cran_list = []
# The second group of each match is the link text, i.e. the species name.
named_entity_list.extend(link[1] + ',CrAn,RACES' for link in all_cran)


### Locations

In [7]:
# Cut the "Places" section out of the page (heading through the closing
# </ul>), then pull every link out of it.
hh_loc = re.search(r'(<h3><span\s[\w\=\"\-]*\sid\=\"Places\">[\w\s\<\/\>\=\"\%\(\)\?\&\;\:\,\-]*<\/h3>)([\<\s\w\>\=\"\/\-\(\)\%\']*)(<\/ul>)',str(hh_soup))[0]
all_loc = re.findall(r'(<a\s[\w\=\/\"\s\(\)\-\%\\\']*>)([\w\s\\\']*)(<\/a>)([\w\s\(\)\;]*)',hh_loc)

hh_loc_list = []
# The second group of each match is the link text, i.e. the place name.
# Locations carry no sub-tag, so these entries have two fields only.
named_entity_list.extend(link[1] + ',LOC' for link in all_loc)

### Misc

#### Ships

In [8]:
# Cut the "Ships" section out of the page (heading through the closing
# </ul>), then pull every link out of it.
hh_ships = re.search(r'(<h3><span\s[\w\=\"\-]*\sid\=\"Ships\">[\w\s\<\/\>\=\"\%\(\)\?\&\;\:\,\-]*<\/h3>)([\<\s\w\>\=\"\/\-\(\)]*)(<\/ul>)',str(hh_soup))[0]
all_ships = re.findall(r'(<a\s[\w\=\/\"\s\(\)\-\%\\\']*>)([\w\s\\\']*)(<\/a>)([\w\s\(\)\;]*)', hh_ships)

# Link text (second group) tagged as a vehicle; kept in its own list so the
# misc cell further down can merge it into named_entity_list.
hh_ships_list = [link[1] + ',MISC,VEH' for link in all_ships]

#### Devices

In [9]:
# Cut the "Devices" section out of the page (heading through the closing
# </ul>), then pull every link out of it.
hh_devs = re.search(r'(<h3><span\s[\w\=\"\-]*\sid\=\"Devices\">[\w\s\<\/\>\=\"\%\(\)\?\&\;\:\,\-]*<\/h3>)([\<\s\w\>\=\%\_\'\"\/\-\(\)]*)(<\/ul>)',str(hh_soup))[0]
all_devs = re.findall(r'(<a\s[\w\=\/\"\s\(\)\-\%\\\']*>)([\-\w\s\']*)(<\/a>)([\w\s\(\)\;]*)', hh_devs)

# Device names are the link text (second group) of each match.
hh_devs_list = [link[1] for link in all_devs]

# Entries listed under "Devices" that are not electronics get their own
# suffix; everything else defaults to ',MISC,ELEC'.
special_suffix = {
    "The Hitchhiker's Guide to the Galaxy": ',MISC,TEXT',
    'Towel': ',MISC',
    'Hyperspace': ',MISC',
}
hh_devs_tagged = [
    device + special_suffix.get(device, ',MISC,ELEC')
    for device in hh_devs_list
]

#### Publications

In [10]:
# Cut the "Publications" section out of the page (heading through the
# closing </ul>), then pull every link out of it.
hh_pubs = re.search(r'(<h3><span\s[\w\=\"\-]*\sid\=\"Publications\">[\w\s\<\/\>\=\"\%\(\)\?\&\;\:\,\-]*<\/h3>)([\s\<\w\>\=\"\(\)\-\/\%\'\?]*)(<\/ul>)',str(hh_soup))[0]
all_pubs = re.findall(r'(<a\s[\w\=\/\"\s\(\)\-\%\\\']*>)([\-\w\s\']*)(<\/a>)([\w\s\(\)\;]*)', hh_pubs)

# Link text (second group) tagged as a text; kept in its own list so the
# misc cell further down can merge it into named_entity_list.
hh_pubs_list = [link[1] + ',MISC,TEXT' for link in all_pubs]

# Echo the tagged titles, one per line.
for entry in hh_pubs_list:
    print(entry)

Celestial Home Care Omnibus,MISC,TEXT
Encyclopedia Galactica,MISC,TEXT
Fifty More Things to do in Zero Gravity,MISC,TEXT
Where God Went Wrong,MISC,TEXT
Some More of God's Greatest Mistakes,MISC,TEXT
Well That About Wraps It Up For God,MISC,TEXT
My Favourite Bathtime Gurgles,MISC,TEXT
Veet Voojagig's Story,MISC,TEXT


In [11]:
# Merge the three MISC groups into the master list, in this order:
# devices, publications, ships.
lists = [hh_devs_tagged, hh_pubs_list, hh_ships_list]

hh_misc_list = []
for entry in (item for group in lists for item in group):
    print(entry)
    named_entity_list.append(entry)

The Hitchhiker's Guide to the Galaxy,MISC,TEXT
Towel,MISC
Sub-Etha Sens-O-Matic,MISC,ELEC
Electronic Thumb,MISC,ELEC
Hyperspace,MISC
Digital Watch,MISC,ELEC
Infinite Improbability Drive,MISC,ELEC
Nutri-Matic Drink Synthesizer,MISC,ELEC
Kill-o-Zap Gun,MISC,ELEC
Celestial Home Care Omnibus,MISC,TEXT
Encyclopedia Galactica,MISC,TEXT
Fifty More Things to do in Zero Gravity,MISC,TEXT
Where God Went Wrong,MISC,TEXT
Some More of God's Greatest Mistakes,MISC,TEXT
Well That About Wraps It Up For God,MISC,TEXT
My Favourite Bathtime Gurgles,MISC,TEXT
Veet Voojagig's Story,MISC,TEXT
Heart of Gold,MISC,VEH
Vogon Constructor Fleet,MISC,VEH
Blagulon Kappa Policecraft,MISC,VEH


In [16]:
# Turn each "name,TAG[,SUBTAG]" string into a tuple of its comma-separated
# fields.
tupled_list = [tuple(entry.split(',')) for entry in named_entity_list]
tupled_list

[('Admiral', 'CHR', 'ZBeeblebrox'),
 ('Ape Man', 'CHR', 'ADent'),
 ('Arcturan MegaDonkey', 'CrAn', 'RACES'),
 ('Arcturus', 'CrAn', 'RACES'),
 ("Arthur Dent's House", 'LOC'),
 ('Arthur Dent', 'CHR', 'ADent'),
 ('Azgoth', 'CrAn', 'RACES'),
 ('Babel Fish', 'CrAn', 'RACES'),
 ('Bang Bang', 'CHR', 'BBang'),
 ('Barman', 'CHR', 'BAR'),
 ('Benjy', 'CHR', 'BEN'),
 ('Betelgeuse V', 'LOC'),
 ('Betelgeuse VII', 'LOC'),
 ('Betelgeusian', 'CrAn', 'RACES'),
 ('Bethselamin', 'LOC'),
 ('Blagulon Kappa Policecraft', 'MISC', 'VEH'),
 ('Blagulon Kappa', 'CrAn', 'RACES'),
 ('Blagulon Kappa', 'LOC'),
 ('Bowl of Petunias', 'CHR', 'BOW'),
 ('Brantisvogan', 'LOC'),
 ('Celestial Home Care Omnibus', 'MISC', 'TEXT'),
 ('Chimp Man', 'CHR', 'ADent'),
 ('Cruxwan University', 'LOC'),
 ('Damogran Frond Crested Eagle', 'CrAn', 'RACES'),
 ('Damogran', 'LOC'),
 ('Dangrabad Beta', 'LOC'),
 ('Deep Thought', 'CHR', 'DThought'),
 ('Dentrassi', 'CrAn', 'RACES'),
 ('Digital Watch', 'MISC', 'ELEC'),
 ('Dolphin', 'CrAn', 'RACES'

In [20]:
# Spot check: number of fields in one entry. Entries have either two fields
# (e.g. 'name,LOC') or three (e.g. 'name,CHR,tag').
len(tupled_list[11])

2

In [21]:
# Sort in place so the exported file is in alphabetical order by name.
tupled_list.sort()

# `with` closes the file even if a write fails. Plain 'w' is enough because
# nothing is read back, and the explicit UTF-8 encoding keeps non-ASCII names
# from depending on the platform's default encoding.
with open('named_entity_hhgttg.txt', 'w', encoding='utf-8') as f:
    for s in tupled_list:
        # One entry per line, fields separated by ", ". For two- and
        # three-field entries this is the same text as "%s, %s" / "%s, %s, %s";
        # entries of any other length are written too instead of being
        # silently skipped.
        f.write(', '.join(s) + '\n')