In [88]:
#re.findall('\(.*?\)',b1)
re.findall('\(.*?\)',f1)

['(2004)']

In [738]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re

import isbn_hyphenate

In [655]:
wikipages = ['https://en.wikipedia.org/wiki/List_of_children%27s_books_made_into_feature_films?oldformat=true',
             'https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(0%E2%80%939,_A%E2%80%93C)?oldformat=true',
             'https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(D%E2%80%93J)?oldformat=true',
             'https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(K%E2%80%93R)?oldformat=true',
             'https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(S%E2%80%93Z)?oldformat=true'
            ]

In [656]:
def add_to_rows(page_soup):
    '''
    Method extracts row elements from the soup tags of individual pages 
    and appends them to the `rows` list.
    
    '''
    
    global rows
    
    tables = page_soup.find_all('table', {'class': 'wikitable'})

    for alphabet in tables:
        rows += alphabet.findAll('tr')
    
    print("Tables added: {}, Rows added: {}".format(len(tables), len(rows)))

In [663]:
def working_book_link(book_link):
    if '/wiki/' in book_link:
        return 'https://en.wikipedia.org' + book_link
    return book_link

In [658]:
def add_to_book_film_dict():
    '''
    Method parses newly updated rows for book/film title, book author,
    number of adaptions, and list of adaptations, and adds them to the
    `book_film_dict` as a nested dictionary. 
    '''
    global sum_adaptations
    
    for row in rows:
        cols = row.findAll('td')

        try:
            if (len(cols[0].findAll('a')) == 2) and (cols[1].find('a') is not None):
                series = cols[0].text.replace('\n', '').split(',')[0]

                book_title = cols[0].findAll('a')[0]['title']
                author = cols[0].findAll('a')[-1]['title']

                book_wiki_url = working_book_link(cols[0].findAll('a')[0]['href'])

                try:
                    adaptations = clean_adaptations(get_adaptations(cols[1].findAll('a')))
                except KeyError:
                    adaptations = []
                    pass

                sum_adaptations += len(adaptations)

                book_film_dict[series] = {'author': author,
                                               'book_title': book_title,
                                               'book_wiki_url': book_wiki_url,
                                               'count': len(adaptations),
                                               'adaptations': adaptations}

        except IndexError or KeyError:
            pass
        
    print("No. of rows/original books in dict: {}, No. of adaptations: {} \n"
          .format(len(book_film_dict), sum_adaptations))

In [659]:
def get_adaptations(href_list):
    adaptations = []
    
    if len(href_list) == 1:
        adaptations.append(href_list[0]['title'])
        return adaptations
        
    for index, tag in enumerate(href_list):
        year_search = re.search(r'\d\d\d\d', tag['href'])
        
        if year_search is not None:  #There is a 4 digit number in the title
            year_search = year_search.group()
        
            if year_search not in tag['title']:
                version = tag['title'] + ' (' + year_search + ')'
                adaptations.append(version)
            else:
                adaptations.append(tag['title'])
        else:
            adaptations.append(tag['title'])
            
    return adaptations

def clean_adaptations(adaptations):
    cleaned = []
    
    for title in adaptations:
        if 'TV' in title or 'miniseries' in title:
            continue
        if 'page does not exist' in title:
            continue
        else:
            cleaned.append(title)
            
    return cleaned

In [664]:
rows = []
book_film_dict = {}
sum_adaptations = 0

for url in wikipages:
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'lxml')
    
    add_to_rows(soup)
    add_to_book_film_dict()
    
print("Of the {} rows parsed, {} were added to the dictionary ~ {}%"
      .format(len(rows), len(book_film_dict), (len(book_film_dict) * 100/len(rows))))

Tables added: 25, Rows added: 376
No. of rows/original books in dict: 259, No. of adaptations: 754 

Tables added: 3, Rows added: 798
No. of rows/original books in dict: 553, No. of adaptations: 2046 

Tables added: 7, Rows added: 1245
No. of rows/original books in dict: 813, No. of adaptations: 3917 

Tables added: 8, Rows added: 1662
No. of rows/original books in dict: 1056, No. of adaptations: 6426 

Tables added: 7, Rows added: 2034
No. of rows/original books in dict: 1237, No. of adaptations: 9466 

Of the 2034 rows parsed, 1237 were added to the dictionary ~ 60.816125860373646%


## Don't touch above

In [665]:
book_film_dict['Adventures of Huckleberry Finn (1884)']['book_wiki_url']

'https://en.wikipedia.org/wiki/Adventures_of_Huckleberry_Finn'

In [785]:
i = 0
for key in book_film_dict.keys():
    
    x = book_film_dict[key]['book_wiki_url']
    print(x, get_isbn(x))
    i += 1
    
    if (i > 30):
        break

https://en.wikipedia.org/wiki/Adventures_of_Huckleberry_Finn 29489461
https://en.wikipedia.org/wiki/The_Adventures_of_Pinocchio not formatted correctly
https://en.wikipedia.org/wiki/The_Adventures_of_Tom_Sawyer 47052486
https://en.wikipedia.org/wiki/Alexander_and_the_Terrible,_Horrible,_No_Good,_Very_Bad_Day 0-689-30072-7
https://en.wikipedia.org/wiki/Alice_series not formatted correctly
https://en.wikipedia.org/wiki/Alisa_Selezneva not formatted correctly
https://en.wikipedia.org/wiki/Angus,_Thongs_and_Full-Frontal_Snogging 1-85340-519-1
https://en.wikipedia.org/wiki/Anne_of_Green_Gables not formatted correctly
https://en.wikipedia.org/wiki/The_Apple_Dumpling_Gang not formatted correctly
https://en.wikipedia.org/wiki/Aquamarine_(novel) 978-1-4052-0363-0
https://en.wikipedia.org/wiki/Argument_About_Basia not formatted correctly
https://en.wikipedia.org/wiki/Arrow_to_the_Sun 0-670-13369-8
https://en.wikipedia.org/wiki/Arthur_Read not formatted correctly
https://en.wikipedia.org/wiki/Bal

we lose a bunch of old books, but it's okay since inflation would have destroyed old movie sales as well. i guess it wokrs out

In [784]:
def get_isbn(wiki_url):
    page = requests.get(wiki_url).text
    soup = BeautifulSoup(page, 'lxml')
    
    infobox = soup.find('table', {'class': 'infobox vcard'})
    
    if infobox is None:
        return 'not formatted correctly'
    
    for row in infobox.findAll('tr'):
        #ISBN directly available
        if ('ISBN' in row.text):
            isbn = re.sub('[^0-9]','', row.findAll('a')[-1].text)
            try:
                isbn = isbn_hyphenate.hyphenate(isbn)
                return isbn
            except:
                return 'not formatted correctly'
                
        #only OCLC available
        elif row.find('a', {'title': 'OCLC'}) is not None:
            return (row.find('td').text)
        
    #Nothing found directly on Infobox
    return 'not formatted correctly'      