In [39]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup
import re

# Scraping missing data
In this notebook, we scrape the data that is missing for our data story: analysing the ideal actor profile for each genre over time.

The missing data is:
1. height of every actor
2. movie success metric: imdb review score
3. ...???

## Scraping the height of every actor
First, we check which actors' heights are unknown in the given data. Second, we scrape the unknown heights, using IMDb as the source.

In [3]:
# Load the character metadata. The column names are passed in explicitly and
# follow the readme file of the data set publisher.
character_columns = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie release date',
    'Character name',
    'Actor date of birth',
    'Actor gender',
    'Actor height (in meters)',
    'Actor ethnicity (Freebase ID)',
    'Actor name',
    'Actor age at movie release',
    'Freebase character/actor map ID',
    'Freebase character ID',
    'Freebase actor ID',
]
character_metadata = pd.read_csv(
    './data/character.metadata.tsv',
    sep='\t',
    names=character_columns,
)

In [4]:
# Determine which actors have an unknown height: keep the rows whose height is
# missing, reduce them to the actor name, and list every name only once.
height_is_missing = character_metadata["Actor height (in meters)"].isna()
actors_unknown_height = character_metadata.loc[height_is_missing, ["Actor name"]].drop_duplicates()
actors_unknown_height.head()

Unnamed: 0,Actor name
7,Richard Cetrone
9,Duane Davis
10,Lobo Sebastian
11,Rodney A. Grant
13,Rick Edelstein


In [5]:
# (rows, columns): one row per distinct actor name that has a missing height
actors_unknown_height.shape

(121417, 1)

In [6]:
# Load the IMDb name table (source: https://datasets.imdbws.com/).
# NOTE: missing values show up as the literal string \N in the preview below
# (e.g. deathYear); no NA conversion is configured here, so they stay strings.
imdb_names_path = './data/name.basics.tsv/data.tsv'
actors_imdb = pd.read_csv(
    imdb_names_path,
    sep='\t',
)
actors_imdb.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0031983,tt0072308,tt0053137"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0071877,tt0037382,tt0038355,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0049189,tt0056404,tt0057345,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0072562,tt0080455,tt0078723,tt0077975"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0083922,tt0060827,tt0050986,tt0050976"


In [7]:
# (rows, columns) of the complete IMDb name table
actors_imdb.shape

(12081796, 6)

In [8]:
# Look up candidate IMDb ids for the actors whose height is unknown by joining
# the actor name against the IMDb primaryName (left join: actors without an
# IMDb match are kept, with NaN in the IMDb columns).
#
# The left side is rebuilt from the de-duplicated "Actor name" column so that
# this cell is idempotent. Merging the already-merged frame again would add the
# IMDb columns a second time (nconst_x / nconst_y, ...) and break later cells
# that read the "nconst" column.
actors_unknown_height = (
    actors_unknown_height[["Actor name"]]
    .drop_duplicates()
    .merge(actors_imdb, left_on="Actor name", right_on='primaryName', how="left")
)
actors_unknown_height.head()

# note: one name can match several IMDb ids, so an actor may appear on multiple rows

Unnamed: 0,Actor name,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,Richard Cetrone,nm0149150,Richard Cetrone,1961,\N,"stunts,actor","tt3498820,tt3778644,tt0451279,tt2975590"
1,Duane Davis,nm0204523,Duane Davis,\N,\N,"actor,producer","tt0228333,tt0107889,tt0094721,tt0095742"
2,Duane Davis,nm12443572,Duane Davis,\N,\N,actor,tt13620374
3,Duane Davis,nm2483556,Duane Davis,\N,\N,actor,tt0816237
4,Duane Davis,nm2836184,Duane Davis,\N,\N,actor,tt1141256


In [9]:
# (rows, columns) after the name join; the row count grows because one actor
# name can match several IMDb ids
actors_unknown_height.shape

(393614, 7)

In [52]:
def _parseHeightCm(heightText):
    """Return the metric height in whole centimetres found in ``heightText``, or None.

    Looks for a parenthesised value in metres such as ``(1.87 m)``. The gap
    between the number and the unit may be any whitespace, including the
    non-breaking space.
    """
    match = re.search(r'\(\s*(\d+(?:[.,]\d+)?)\s*m\s*\)', heightText)
    if match is None:
        return None
    # Convert through float so that "1.9" becomes 190; merely deleting the
    # decimal point would turn it into 19.
    return int(round(float(match.group(1).replace(",", ".")) * 100))


def getImdbActorInfo(imdbActorId, timeout=10):
    """Scrape birthday, birthplace and height of one actor from the IMDb bio page.

    Parameters
    ----------
    imdbActorId : str
        IMDb name id, e.g. "nm0000226".
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    list or str
        ``[birthday, birthplace, height]``: the birthday as found in the
        ``datetime`` attribute of the page's ``time`` tag, the birthplace as
        text, and the height in centimetres (int). A single field that cannot
        be found is reported as "NA" so that the remaining fields are not
        lost. The plain string "NA" is returned when the id is not a
        non-empty string, when fetching or parsing fails, or when none of the
        three fields is present.
    """
    # A left merge leaves NaN where no IMDb id was found; answer those (and any
    # other non-string or empty id) without issuing a request.
    if not isinstance(imdbActorId, str) or not imdbActorId:
        return "NA"

    try:
        page = requests.get('https://www.imdb.com/name/' + imdbActorId + '/bio/', timeout=timeout)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

        table = soup.find('table', attrs={'id': 'overviewTable'})
        if table is None:
            return "NA"

        timeTag = table.find("time")
        birthday = timeTag.get('datetime', "NA") if timeTag is not None else "NA"

        # The first link whose target mentions "birth_place" holds the birthplace.
        birthplace = next(
            (a.get_text() for a in table.find_all("a", href=True) if "birth_place" in a["href"]),
            "NA",
        )

        # The height value sits in the cell right after the "Height" label cell.
        height = "NA"
        cells = table.find_all('td')
        for labelCell, valueCell in zip(cells, cells[1:]):
            if "Height" in labelCell.contents:
                parsedHeight = _parseHeightCm(valueCell.get_text())
                if parsedHeight is not None:
                    height = parsedHeight
                break
    except Exception:
        # Best effort: any network or parsing problem yields "NA". Unlike a
        # bare ``except:`` this does not swallow KeyboardInterrupt, so a long
        # scraping run can still be interrupted.
        return "NA"

    if birthday == "NA" and birthplace == "NA" and height == "NA":
        return "NA"
    return [birthday, birthplace, height]

In [53]:
# Try the scraping function defined above on a single IMDb id.
sample_actor_info = getImdbActorInfo("nm0000226")
print(sample_actor_info)

['1968-9-25', 'Philadelphia, Pennsylvania, USA', 187]


In [58]:
# Scrape the personal info (birthday, birthplace, height) on IMDb for every candidate id.
# TODO make more efficient: takes far too long (getImdbActorInfo is called once per row)
birthdays, birthplaces, heights = [], [], []
for imdbId in actors_unknown_height["nconst"]:
    scraped = getImdbActorInfo(imdbId)
    # "NA" is the scraper's failure marker; expand it to one marker per field
    if scraped == "NA":
        birthday, birthplace, height = "NA", "NA", "NA"
    else:
        birthday, birthplace, height = scraped[0], scraped[1], scraped[2]
    birthdays.append(birthday)
    birthplaces.append(birthplace)
    heights.append(height)

# attach the scraped values as new columns, in row order
for column, values in (("birthday", birthdays), ("birthplace", birthplaces), ("height", heights)):
    actors_unknown_height[column] = values

## Scraping imdb score of every given movie
The following code is untested and obsolete:
```
def getImdbRating(imdbMovieId):
    try:
        page = requests.get('https://www.imdb.com/title/'+imdbMovieId+'/')
        soup = BeautifulSoup(page.text, 'html.parser')

        imdbScore = soup.find('div', attrs={'data-testid':'hero-rating-bar__aggregate-rating__score'}).find("span").contents
        return(imdbScore)
    except:
        return("NA")

# get imdb score from this example url: https://www.imdb.com/title/tt0116629/
page = requests.get('https://www.imdb.com/title/tt0116629/')
soup = BeautifulSoup(page.text, 'html.parser')
imdbScore = soup.find('div', attrs={'data-testid':'hero-rating-bar__aggregate-rating__score'}).find("span").contents
print(imdbScore)

#test upper scraping function
print(getImdbRating("tt0116629"))
```