In [88]:
#re.findall('\(.*?\)',b1)
re.findall('\(.*?\)',f1)

['(2004)']

In [738]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re

import isbn_hyphenate

In [655]:
wikipages = ['https://en.wikipedia.org/wiki/List_of_children%27s_books_made_into_feature_films?oldformat=true',
             'https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(0%E2%80%939,_A%E2%80%93C)?oldformat=true',
             'https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(D%E2%80%93J)?oldformat=true',
             'https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(K%E2%80%93R)?oldformat=true',
             'https://en.wikipedia.org/wiki/List_of_fiction_works_made_into_feature_films_(S%E2%80%93Z)?oldformat=true'
            ]

In [656]:
def add_to_rows(page_soup):
    '''
    Method extracts row elements from the soup tags of individual pages 
    and appends them to the `rows` list.
    
    '''
    
    global rows
    
    tables = page_soup.find_all('table', {'class': 'wikitable'})

    for alphabet in tables:
        rows += alphabet.findAll('tr')
    
    print("Tables added: {}, Rows added: {}".format(len(tables), len(rows)))

In [663]:
def working_book_link(book_link):
    if '/wiki/' in book_link:
        return 'https://en.wikipedia.org' + book_link
    return book_link

In [789]:
def add_to_book_film_dict():
    '''
    Method parses newly updated rows for book/film title, book author,
    number of adaptions, and list of adaptations, and adds them to the
    `book_film_dict` as a nested dictionary. 
    '''
    global sum_adaptations
    
    for row in rows:
        cols = row.findAll('td')

        try:
            if (len(cols[0].findAll('a')) == 2) and (cols[1].find('a') is not None):
                series = cols[0].text.replace('\n', '').split(',')[0]

                book_title = cols[0].findAll('a')[0]['title']
                author = cols[0].findAll('a')[-1]['title']

                book_wiki_url = working_book_link(cols[0].findAll('a')[0]['href'])

                try:
                    adaptations = clean_adaptations(get_adaptations(cols[1].findAll('a')))
                except KeyError:
                    adaptations = []
                    pass

                sum_adaptations += len(adaptations)

                book_film_dict[series] = {'author': author,
                                            'book_title': book_title,
                                           'book_wiki_url': book_wiki_url,
                                           'count': len(adaptations),
                                           'adaptations': adaptations,
                                            'isbn': '', 'oclc':'', 'usable': False}

        except IndexError or KeyError:
            pass
        
    print("No. of rows/original books in dict: {}, No. of adaptations: {} \n"
          .format(len(book_film_dict), sum_adaptations))

In [659]:
def get_adaptations(href_list):
    adaptations = []
    
    if len(href_list) == 1:
        adaptations.append(href_list[0]['title'])
        return adaptations
        
    for index, tag in enumerate(href_list):
        year_search = re.search(r'\d\d\d\d', tag['href'])
        
        if year_search is not None:  #There is a 4 digit number in the title
            year_search = year_search.group()
        
            if year_search not in tag['title']:
                version = tag['title'] + ' (' + year_search + ')'
                adaptations.append(version)
            else:
                adaptations.append(tag['title'])
        else:
            adaptations.append(tag['title'])
            
    return adaptations

def clean_adaptations(adaptations):
    cleaned = []
    
    for title in adaptations:
        if 'TV' in title or 'miniseries' in title:
            continue
        if 'page does not exist' in title:
            continue
        else:
            cleaned.append(title)
            
    return cleaned

In [809]:
rows = []
book_film_dict = {}
sum_adaptations = 0

for url in wikipages:
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'lxml')
    
    add_to_rows(soup)
    add_to_book_film_dict()
    
print("Of the {} rows parsed, {} were added to the dictionary ~ {}%"
      .format(len(rows), len(book_film_dict), (len(book_film_dict) * 100/len(rows))))

Tables added: 25, Rows added: 376
No. of rows/original books in dict: 259, No. of adaptations: 754 

Tables added: 3, Rows added: 798
No. of rows/original books in dict: 553, No. of adaptations: 2046 

Tables added: 7, Rows added: 1245
No. of rows/original books in dict: 813, No. of adaptations: 3917 

Tables added: 8, Rows added: 1662
No. of rows/original books in dict: 1056, No. of adaptations: 6426 

Tables added: 7, Rows added: 2034
No. of rows/original books in dict: 1237, No. of adaptations: 9466 

Of the 2034 rows parsed, 1237 were added to the dictionary ~ 60.816125860373646%


## Don't touch above

In [828]:
get_isbn_oclc('https://en.wikipedia.org/wiki/Adventures_of_Huckleberry_Finn')

['broken', 'broken']

In [837]:
broken_count = 0
usable_count = 0
index = 0

for key in book_film_dict.keys():
    
    link = book_film_dict[key]['book_wiki_url']
    
    try:
        book_film_dict[key]['isbn'] = get_isbn(link)
    
        if book_film_dict[key]['isbn'] == 'broken':
            book_film_dict[key]['oclc'] = get_oclc(link)
            if book_film_dict[key]['oclc'] == 'broken':
                book_film_dict[key]['usable'] = False
                #Nothing changes, the book is unusable, and the 'usable' flag remains False
        else:
            book_film_dict[key]['usable'] = True
            usable_count += 1
    except NameError:
        pass
        
    if (index % 50 == 0):
        broken_count = index - usable_count
        print("Usable: {}, Broken: {}, Total: {}".format(usable_count, broken_count, index))
    
    index += 1
        


Usable: 0, Broken: 0, Total: 0
Usable: 22, Broken: 28, Total: 50
Usable: 46, Broken: 54, Total: 100
Usable: 69, Broken: 81, Total: 150
Usable: 93, Broken: 107, Total: 200
Usable: 114, Broken: 136, Total: 250
Usable: 135, Broken: 165, Total: 300
Usable: 155, Broken: 195, Total: 350
Usable: 165, Broken: 235, Total: 400
Usable: 178, Broken: 272, Total: 450
Usable: 188, Broken: 312, Total: 500
Usable: 204, Broken: 346, Total: 550
Usable: 227, Broken: 373, Total: 600
Usable: 248, Broken: 402, Total: 650
Usable: 272, Broken: 428, Total: 700
Usable: 287, Broken: 463, Total: 750
Usable: 310, Broken: 490, Total: 800
Usable: 328, Broken: 522, Total: 850
Usable: 346, Broken: 554, Total: 900
Usable: 363, Broken: 587, Total: 950
Usable: 380, Broken: 620, Total: 1000
Usable: 393, Broken: 657, Total: 1050
Usable: 414, Broken: 686, Total: 1100
Usable: 437, Broken: 713, Total: 1150
Usable: 453, Broken: 747, Total: 1200


we lose a bunch of old books, but it's okay since inflation would have destroyed old movie sales as well. i guess it wokrs out

In [823]:
def get_isbn(wiki_url):
    try:
        page = requests.get(wiki_url).text
        soup = BeautifulSoup(page, 'lxml')
    except MissingSchema:
        return 'broken'  
    
    infobox = soup.find('table', {'class': 'infobox vcard'})
    
    if infobox is None:
        return 'broken'
    
    for row in infobox.findAll('tr'):
        #ISBN directly available
        if ('ISBN' in row.text):
            isbn = re.sub('[^0-9]','', row.findAll('a')[-1].text)
            try:
                isbn = isbn_hyphenate.hyphenate(isbn)
                return isbn
            except:
                return 'broken'
        
    #Nothing found directly on Infobox
    return 'broken' 

In [824]:
# Only to be used if get_isbn() returns 'not formatted correctly'
def get_oclc(wiki_url):
    try:
        page = requests.get(wiki_url).text
        soup = BeautifulSoup(page, 'lxml')
    except MissingSchema:
        return 'broken'

    infobox = soup.find('table', {'class': 'infobox vcard'})
    
    if infobox is None:
        return 'broken'
    
    for row in infobox.findAll('tr'):
        #only OCLC available
        if row.find('a', {'title': 'OCLC'}) is not None:
            return (row.find('td').text)
        
    #Nothing found directly on Infobox
    return 'broken'     

## Don't touch above

In [850]:
book_film_dict['Adventures of Huckleberry Finn (1884)']

{'author': 'Mark Twain',
 'book_title': 'Adventures of Huckleberry Finn',
 'book_wiki_url': 'https://en.wikipedia.org/wiki/Adventures_of_Huckleberry_Finn',
 'count': 14,
 'adaptations': ['Huck and Tom',
  'Huckleberry Finn (1920 film)',
  'Huckleberry Finn (1931 film)',
  'The Adventures of Huckleberry Finn (1939 film)',
  'The Adventures of Huckleberry Finn (1955 film)',
  'The Adventures of Huckleberry Finn (1960 film)',
  'Huckleberry Finn (1974 film)',
  'Hopelessly Lost',
  'Huckleberry Finn (1975 film)',
  'The Adventures of Mark Twain (1985 film)',
  'Back to Hannibal: The Return of Tom Sawyer and Huckleberry Finn',
  'The Adventures of Huck Finn (1993 film)',
  'Tom and Huck',
  "Tomato Sawyer and Huckleberry Larry's Big River Rescue"],
 'isbn': 'broken',
 'oclc': '29489461',
 'usable': True}

In [874]:
book_film = pd.DataFrame()
rows = []

for key in book_film_dict.keys():
    for movie in book_film_dict[key]['adaptations']:
        row = [key, book_film_dict[key]['author'], 
               movie, 
               book_film_dict[key]['count'], 
               book_film_dict[key]['isbn'],
               book_film_dict[key]['oclc'],
               book_film_dict[key]['usable']]
        rows.append(row)
    
book_film = pd.DataFrame(data=rows)
book_film.rename(columns={0: 'book_title', 
                          1: 'author', 
                          2: 'movie', 
                          3: 'total_ad_count', 
                          4: 'isbn', 5: 'oclc', 
                          6: 'usable'},
                inplace = True)

book_film.loc[book_film['usable'] == True][35:60]

Unnamed: 0,book_title,author,movie,total_ad_count,isbn,oclc,usable
74,Beastly (2007),Alex Flinn,Beastly,1,1-4178-2861-7,broken,True
75,Because of Winn-Dixie (2000),Kate DiCamillo,Because of Winn-Dixie (film),1,0-7636-0776-2,41601218,True
80,The BFG (1982),Roald Dahl,The BFG (1989 film),2,0-224-02040-4,broken,True
81,The BFG (1982),Roald Dahl,The BFG (2016 film),2,0-224-02040-4,broken,True
82,Big Fish: A Novel of Mythic Proportions (1998),Daniel Wallace (author),Big Fish,1,1-56512-217-8,39269578,True
83,The Big Six (1940),Arthur Ransome,Swallows and Amazons Forever!,1,broken,9647950,True
93,Blood and Chocolate (1997),Annette Curtis Klause,Blood & Chocolate (film),1,0-440-22668-6,42412510,True
94,The Book Thief (2005),Markus Zusak,The Book Thief (film),1,broken,183612599,True
95,The Borrowers (1952),Mary Norton (author),The Borrowers (1973 film),3,broken,7557055,True
96,The Borrowers (1952),Mary Norton (author),The Borrowers (1997 film),3,broken,7557055,True
