In [12]:
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
def retrieve_person_details(person_id, timeout=10):
    """Fetch a person's name and birth date from their IMDb name page.

    The page is downloaded, its first ``application/ld+json`` script tag is
    parsed as JSON, and ``name`` / ``birthDate`` are read from the
    ``mainEntity`` object of that document.

    Args:
        person_id: Identifier interpolated into the name-page URL
            (e.g. ``"nm0000122"``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with keys ``person_id``, ``name`` and ``birthdate`` on
        success. ``None`` (after printing a diagnostic) when the page cannot
        be fetched, carries no usable JSON-LD, or lacks either field.
    """
    url = f"https://www.imdb.com/name/{person_id}/"

    # The recorded run of this cell failed for every id when using the
    # library's default headers. NOTE(review): presumably the site rejects
    # the default User-Agent; confirm against the live site.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
    }

    try:
        # A timeout keeps one stalled connection from hanging the notebook.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Connection errors, timeouts, invalid URLs, ...
        print(f"Failed to retrieve {url}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    script_tag = soup.find('script', {'type': 'application/ld+json'})

    if script_tag is None:
        print(f"No script tag found on {url}")
        return None

    try:
        # ``.string`` can be None, which makes json.loads raise TypeError;
        # malformed JSON raises a ValueError subclass.
        data = json.loads(script_tag.string)
    except (TypeError, ValueError):
        print(f"Invalid JSON-LD on {url}")
        return None

    # Guard against a top-level list or a null / non-object 'mainEntity'.
    main_entity = data.get('mainEntity') if isinstance(data, dict) else None
    if not isinstance(main_entity, dict):
        print(f"No 'mainEntity' data found on {url}")
        return None

    name = main_entity.get('name')
    birthdate = main_entity.get('birthDate')

    if name is None or birthdate is None:
        print(f"Incomplete data on {url}")
        return None

    return {'person_id': person_id, 'name': name, 'birthdate': birthdate}

# Load the movie -> person mapping from disk.
df = pd.read_csv('./data/movie_to_persons.csv')

# Distinct person ids found in the 'person_id' column.
unique_person_ids = pd.unique(df['person_id'])

# Smoke-test the scraper: show whatever it returns for the first five ids.
for person_id in unique_person_ids[:5]:
    print(retrieve_person_details(person_id))


Failed to retrieve https://www.imdb.com/name/nm0000122/
None
Failed to retrieve https://www.imdb.com/name/nm0841501/
None
Failed to retrieve https://www.imdb.com/name/nm0615306/
None
Failed to retrieve https://www.imdb.com/name/nm0074788/
None
Failed to retrieve https://www.imdb.com/name/nm0906618/
None


In [13]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

def retrieve_person_details(person_id, timeout=10):
    """Fetch a person's name and birth date from their IMDb name page.

    Downloads the page, parses its first ``application/ld+json`` script tag
    and reads ``name`` / ``birthDate`` from the ``mainEntity`` object.

    Args:
        person_id: Identifier interpolated into the name-page URL
            (e.g. ``"nm0000122"``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with keys ``person_id``, ``name`` and ``birthdate``
        (``'N/A'`` when the page has no ``birthDate``). ``None`` (after
        printing a diagnostic) when the page cannot be fetched or its
        JSON-LD cannot be parsed or lacks ``mainEntity`` / ``name``.
    """
    url = f"https://www.imdb.com/name/{person_id}/"

    # The recorded run of this cell failed for every id when using the
    # library's default headers. NOTE(review): presumably the site rejects
    # the default User-Agent; confirm against the live site.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
    }

    try:
        # A timeout keeps one stalled connection from hanging the notebook.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raises for 4xx and 5xx status codes
    except (requests.RequestException, ValueError):
        print(f"Failed to retrieve {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    script_tag = soup.find('script', {'type': 'application/ld+json'})
    if script_tag is None:
        print(f"'application/ld+json' not found in {url}")
        return None

    try:
        data = json.loads(script_tag.string)
        name = data['mainEntity']['name']
        birthdate = data['mainEntity'].get('birthDate', 'N/A')  # Some pages may not have birthdate information
    except (json.JSONDecodeError, AttributeError, KeyError, TypeError):
        # KeyError: 'mainEntity' or 'name' is absent.
        # TypeError: the tag's ``.string`` is None, or the payload (or
        # 'mainEntity') is not a JSON object.
        print(f"Failed to parse JSON or access its 'name' or 'birthDate' fields from {url}")
        return None

    return {"person_id": person_id, "name": name, "birthdate": birthdate}

unique_person_ids = ['nm0000122', 'nm0841501']  # Just using 2 IDs as an example

# Collect only the lookups that succeeded (the scraper returns None on failure).
data = []
for person_id in unique_person_ids:
    person_data = retrieve_person_details(person_id)
    if person_data is not None:
        data.append(person_data)

# Pin the column list so the CSV always gets a header row. Without it, a run
# in which every lookup fails writes an empty file that pd.read_csv rejects.
df = pd.DataFrame(data, columns=["person_id", "name", "birthdate"])
df.to_csv("./data/person_details.csv", index=False)


Failed to retrieve https://www.imdb.com/name/nm0000122/
Failed to retrieve https://www.imdb.com/name/nm0841501/


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
from tqdm import tqdm

def retrieve_person_details(person_id, timeout=10):
    """Fetch a person's name and birth year from their IMDb name page.

    Downloads the page, parses its first ``application/ld+json`` script tag
    and reads ``name`` / ``birthDate``. Top-level keys are preferred; when
    one is absent the ``mainEntity`` object is consulted as a fallback.

    Failures are deliberately silent so that a long scraping loop keeps
    going: the returned row simply carries ``None`` values.

    Args:
        person_id: Identifier interpolated into the name-page URL
            (e.g. ``"nm0000122"``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with keys ``person_id``, ``name`` and ``birth_year``.
        ``birth_year`` is the text before the first ``-`` of ``birthDate``.
        ``name`` and/or ``birth_year`` are ``None`` when unavailable.
    """
    url = f"https://www.imdb.com/name/{person_id}/"
    empty = {"person_id": person_id, "name": None, "birth_year": None}

    # NOTE(review): presumably the site rejects the library's default
    # User-Agent, hence the browser-like headers; confirm against the live site.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole loop.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # One unreachable page must not abort the run and lose prior results.
        return empty

    soup = BeautifulSoup(response.text, 'html.parser')
    script = soup.find('script', type='application/ld+json')
    if script is None:
        return empty

    try:
        # ``.string`` can be None (TypeError); bad JSON raises ValueError.
        data = json.loads(script.string)
    except (TypeError, ValueError):
        return empty
    if not isinstance(data, dict):
        return empty

    # Fall back to 'mainEntity' for values missing at the top level.
    main_entity = data.get('mainEntity')
    if not isinstance(main_entity, dict):
        main_entity = {}

    name = data.get('name') or main_entity.get('name')
    birth_date = data.get('birthDate') or main_entity.get('birthDate')

    # Keep only the leading component of a 'YYYY-MM-DD' style value.
    birth_year = str(birth_date).split('-')[0] if birth_date else None

    return {"person_id": person_id, "name": name, "birth_year": birth_year}


# Load data from csv
df = pd.read_csv('./data/movie_to_persons.csv')

# Create a set to hold unique person_ids. Missing values are dropped so no
# request is made for a non-existent ".../name/nan/" page.
unique_person_ids = set(df['person_id'].dropna())

# Prepare list to hold person details
person_details = []

# Retrieve details for each unique person_id. Iterating in sorted order makes
# the output row order reproducible; plain set iteration order for strings
# changes between interpreter runs.
for person_id in tqdm(sorted(unique_person_ids), desc="Processing persons", unit="person"):
    person_details.append(retrieve_person_details(person_id))

# Create a DataFrame from the details and save it to a csv. The explicit
# column list guarantees a header row even when there are no ids to process.
person_df = pd.DataFrame(person_details, columns=["person_id", "name", "birth_year"])
person_df.to_csv('./data/person_details.csv', index=False)
