In [1]:
# from imdb_helper_functions import *
import re
import urllib
import urllib.parse

In [2]:
import requests
from bs4 import BeautifulSoup
import time
import pickle

def get_actor_soup(actor_url, sleep_timer=0.5):
    """Download an actor page and parse it with BeautifulSoup.

    Args:
        actor_url: URL of the actor page to download.
        sleep_timer: Seconds to sleep after a successful download
            (presumably to throttle consecutive requests).

    Returns:
        The parsed BeautifulSoup document, or None when the request fails
        (network error, timeout, or any status code other than 200).
    """
    try:
        # A timeout keeps a stalled connection from blocking the notebook forever.
        response = requests.get(actor_url, timeout=30)
    except requests.RequestException as e:
        # Network-level failures follow the same "print and return None" contract
        # as a non-200 response instead of propagating to the caller.
        print(f'Error: Failed to get actor soup for {actor_url} - error {e}')
        return None

    if response.status_code != 200:
        print(f'Error: Failed to get actor soup for {actor_url}')
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    time.sleep(sleep_timer)
    return soup

def get_movie_soup(movie_url, sleep_timer=0.5):
    """Download the 'fullcredits/' page of a movie and parse it with BeautifulSoup.

    Args:
        movie_url: URL of the movie page. A trailing slash is optional, and any
            query string or fragment is dropped before 'fullcredits/' is appended.
        sleep_timer: Seconds to sleep after a successful download
            (presumably to throttle consecutive requests).

    Returns:
        The parsed BeautifulSoup document, or None when the request fails
        (network error, timeout, or any status code other than 200).
    """
    # Build the credits URL from the bare path: plain concatenation produced
    # '.../tt123fullcredits/' for URLs without a trailing slash and appended the
    # suffix to the query string for URLs such as '.../tt123/?ref_=x'.
    base_url = movie_url.split('#', 1)[0].split('?', 1)[0]
    full_url = base_url.rstrip('/') + '/fullcredits/'

    try:
        # A timeout keeps a stalled connection from blocking the notebook forever.
        response = requests.get(full_url, timeout=30)
    except requests.RequestException as e:
        print(f'Error: Failed to get movie soup for {movie_url} - error {e}')
        return None

    if response.status_code != 200:
        print(f'Error: Failed to get movie soup for {movie_url}')
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    time.sleep(sleep_timer)
    return soup

def get_actor_name(actor_url, sleep_timer=0.5):
    """Download an actor page and return the text of the <span> inside its first <h1>.

    Args:
        actor_url: URL of the actor page to download.
        sleep_timer: Seconds to sleep after the page was downloaded and inspected
            (presumably to throttle consecutive requests).

    Returns:
        The name text, or None when the request fails (network error, timeout,
        non-200 status) or the page has no <h1>/<span> to read the name from.
    """
    name = None

    try:
        # A timeout keeps a stalled connection from blocking the notebook forever.
        response = requests.get(actor_url, timeout=30)
    except requests.RequestException as e:
        print(f'Error: Failed to get actor soup for {actor_url} - error {e}')
        return name

    if response.status_code != 200:
        print(f'Error: Failed to get actor soup for {actor_url}')
        return name

    soup = BeautifulSoup(response.text, "html.parser")

    # Walk the markup defensively: indexing find_all('h1')[0] and chaining
    # .find('span').text raised IndexError/AttributeError on unexpected pages,
    # so the "failed to parse" message below could never be reached.
    heading = soup.find('h1')
    name_span = heading.find('span') if heading is not None else None
    if name_span is not None:
        name = name_span.text

    if not name:
        print(f'Error: Failed to parse name from page {actor_url}')

    time.sleep(sleep_timer)

    return name

def get_from_cache(file_name):
    """Load a pickled cache object from disk.

    Args:
        file_name: Path of the pickle file to read.

    Returns:
        The unpickled object, or an empty dict when the file does not exist or
        is empty/truncated and therefore cannot be unpickled.
    """
    try:
        with open(file_name, 'rb') as f:
            # NOTE: unpickling can execute arbitrary code - only load cache files
            # that were written by this notebook.
            return pickle.load(f)
    except FileNotFoundError:
        return dict()
    except (EOFError, pickle.UnpicklingError) as e:
        # An empty or truncated cache file (e.g. left by an interrupted write)
        # is treated as "no cache" instead of aborting the caller.
        print(f'Error: Failed to read cache data from {file_name} - error {e}')
        return dict()

def store_to_cache(file_name, data):
    """Pickle `data` into `file_name`, printing an error instead of raising on failure.

    Args:
        file_name: Path of the pickle file to (over)write.
        data: Object to serialize with pickle.
    """
    try:
        # Serialize before opening the file: open(..., 'wb') truncates immediately,
        # so pickling straight into the file destroyed the previous cache whenever
        # `data` turned out to be unpicklable.
        payload = pickle.dumps(data)
        with open(file_name, 'wb') as f:
            f.write(payload)
    except Exception as e:
        print(f'Error: Failed to store cache data - error {e}')
        
def get_movie_description(movie_url):
    """Download a movie page and return the text of its description block.

    The description is read from the first <div> nested inside the first
    <div class="ipc-html-content ipc-html-content--base"> of the page.

    Args:
        movie_url: URL of the movie page to download.

    Returns:
        The description text, or '' (after printing a message) when the page
        cannot be downloaded or the description block is not found.
    """
    descr = None
    try:
        # A timeout keeps a stalled connection from blocking the notebook forever.
        response = requests.get(movie_url, timeout=30)
        # Explicit status check instead of `assert`, which is removed under `python -O`.
        if response.status_code == 200:
            m_soup = BeautifulSoup(response.text, "html.parser")
            section = m_soup.find('div', attrs={'class': 'ipc-html-content ipc-html-content--base'})
            inner = section.find('div') if section is not None else None
            if inner is not None:
                descr = inner.text
    except Exception:
        # Best effort: any download/parsing error counts as "no description".
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through,
        # so a long scraping loop can still be interrupted.
        descr = None

    if descr is None:
        print(f'Failed to get movie description for {movie_url}')
        return ''

    return descr

In [3]:
def get_actors_by_movie_soup(cast_page_soup, num_of_actors_limit=None, logging=False):
    """Extract (actor name, actor URL) pairs from a parsed full-credits page.

    The cast is read from the rows with class 'odd' or 'even' of the
    <table class="cast_list">: the name is the text of the row's second cell and
    the link is the href of the row's second <a>, resolved against https://imdb.com.

    Args:
        cast_page_soup: Parsed page to search (a BeautifulSoup document).
        num_of_actors_limit: Stop after this many actors were collected;
            None (or 0) means no limit.
        logging: When true, print every collected pair and the "no cast" error.

    Returns:
        A list of (name, absolute_url) tuples; empty when no cast rows are found.
    """
    main_url = 'https://imdb.com'
    cast_list = []

    cast = cast_page_soup.find('table', attrs={'class': 'cast_list'})
    # A page without a cast table used to raise AttributeError on None here.
    cast_members = cast.find_all('tr', attrs={'class': ['odd', 'even']}) if cast is not None else []

    if not cast_members:
        if logging:
            print(f'Error: Failed to get cast info')
        return []

    for m in cast_members:
        cells = m.select('tr > td')
        anchors = m.select('a')

        # Rows with fewer than two cells/links are reported and skipped below
        # instead of aborting the whole page with an IndexError.
        name = cells[1].get_text().replace('\n', '').strip() if len(cells) > 1 else ''
        link = anchors[1].get('href') if len(anchors) > 1 else None

        if name and link:
            entry = (name, urllib.parse.urljoin(main_url, link))
            cast_list.append(entry)
            if logging:
                print(entry)
        else:
            print(f'Error: could not get name or link info for {m}')

        if num_of_actors_limit and len(cast_list) >= num_of_actors_limit:
            break

    return cast_list

In [4]:
def get_movies_by_actor_soup(actor_page_soup, num_of_movies_limit=None, logging=True):
    """Extract (movie title, movie URL) pairs from a parsed actor page.

    Only <div> rows with class 'filmo-row odd' / 'filmo-row even' whose id starts
    with 'actor-' or 'actress-' are considered. A row is kept when it contains
    exactly one link and none of its text contains an opening parenthesis
    (presumably this filters out annotated entries such as '(TV Series)' -
    TODO confirm against the page markup).

    Args:
        actor_page_soup: Parsed page to search (a BeautifulSoup document).
        num_of_movies_limit: Stop after this many movies were collected;
            None (or 0) means no limit.
        logging: When true, print every collected pair.

    Returns:
        A list of (title, absolute_url) tuples; empty when no rows are found.
    """
    movie_list = []
    main_url = 'https://imdb.com'
    # Raw string: "\(" in a normal string literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). Compiled once instead of once per row.
    parenthesis_pattern = re.compile(r"\(")

    films = actor_page_soup.find_all('div', attrs={'class': ['filmo-row odd', 'filmo-row even']})

    if not films:
        return []

    for film in films:
        # A row without an id attribute is skipped instead of raising KeyError.
        tag_id = (film.get("id") or '').split('-')
        if tag_id[0] not in ['actress', 'actor']:
            continue

        f_attr = film.find_all('a')
        if len(f_attr) != 1 or film.find_all(string=parenthesis_pattern):
            continue

        film_name = f_attr[0].text.replace('\n', '').strip()
        film_url = urllib.parse.urljoin(main_url, f_attr[0]['href'])
        if logging:
            print((film_name, film_url))

        movie_list.append((film_name, film_url))

        if num_of_movies_limit and len(movie_list) >= num_of_movies_limit:
            break

    return movie_list


In [5]:
def get_movie_distance(actor_start_url, actor_end_url,
                       num_of_actors_limit=None, num_of_movies_limit=None, logging=False,
                       max_level=3):
    """Breadth-first search for the "movie distance" between two actors.

    Level 1 means the end actor appears in the cast of one of the start actor's
    movies; level 2 means the end actor appears in a movie of one of those
    co-actors, and so on. Movie casts and actor filmographies are read from and
    written back to the 'c_movies' / 'c_actors' pickle caches, keyed by movie
    title and actor name.

    NOTE(review): movies and actors are de-duplicated and cached by title/name
    only, so two different movies with the same title are treated as one.

    Args:
        actor_start_url: URL of the start actor's page.
        actor_end_url: URL of the end actor's page.
        num_of_actors_limit: Passed to get_actors_by_movie_soup for fresh downloads.
        num_of_movies_limit: Passed to get_movies_by_actor_soup for fresh downloads.
        logging: When true, print progress information.
        max_level: Number of levels to search before giving up (default 3).

    Returns:
        The level (1..max_level) at which the end actor was found, or -1 when the
        actor was not found or one of the two actor names could not be read.
    """
    start_name = get_actor_name(actor_start_url)
    end_name = get_actor_name(actor_end_url)

    if logging:
        print(f'Start name: {start_name}, End name: {end_name}')

    if start_name is None or end_name is None:
        # Without both names no cast entry can ever match; skip the crawl.
        print('Error: Failed to resolve actor names, distance can not be computed')
        return -1

    c_movies = get_from_cache('c_movies')
    c_actors = get_from_cache('c_actors')

    seen_actors = {start_name}
    seen_movies = set()

    try:
        # pre-seed level 0: the start actor's own movies (downloaded only on a cache miss)
        movies_curr_level = c_actors.get(start_name)
        if not movies_curr_level:
            a_soup = get_actor_soup(actor_start_url)
            movies_curr_level = []
            if a_soup is not None:
                movies_curr_level = get_movies_by_actor_soup(a_soup, num_of_movies_limit, False)
            c_actors[start_name] = movies_curr_level

        for level in range(1, max_level + 1):

            if logging:
                print(f'*** Starting level {level} ***')

            # collect the casts of all movies of the current level
            actors_curr_level = []
            total_m_count = len(movies_curr_level)

            for m_count, movie in enumerate(movies_curr_level, start=1):
                if movie[0] in seen_movies:
                    continue
                seen_movies.add(movie[0])

                actors = c_movies.get(movie[0])
                if not actors:
                    m_soup = get_movie_soup(movie[1])
                    # A failed download counts as an empty cast; previously
                    # `actors` stayed None and iterating over it raised TypeError.
                    actors = []
                    if m_soup is not None:
                        actors = get_actors_by_movie_soup(m_soup, num_of_actors_limit, False)
                    c_movies[movie[0]] = actors

                if any(actor[0] == end_name for actor in actors):
                    return level

                actors_curr_level += actors

                if logging:
                    print(f'Getting actors for movie {m_count} out of {total_m_count} >>> {movie[0]}')

            if level == max_level:
                # Filmographies gathered now would never be searched, so skip
                # the (slow) downloads for the last level.
                break

            # check actors and build the movies list for the next level
            movies_curr_level = []
            total_a_count = len(actors_curr_level)

            for a_count, actor in enumerate(actors_curr_level, start=1):
                if actor[0] in seen_actors:
                    continue
                seen_actors.add(actor[0])

                movies = c_actors.get(actor[0])
                if not movies:
                    a_soup = get_actor_soup(actor[1])
                    # Same as above: a failed download yields an empty filmography.
                    movies = []
                    if a_soup is not None:
                        movies = get_movies_by_actor_soup(a_soup, num_of_movies_limit, False)
                    c_actors[actor[0]] = movies

                movies_curr_level += movies

                if logging:
                    print(f'... Getting movies for actor {a_count} out of {total_a_count} >>> {actor[0]}')

        return -1
    finally:
        # Persist whatever was collected on every exit path, including errors,
        # so already downloaded pages are not fetched again on the next run.
        store_to_cache('c_movies', c_movies)
        store_to_cache('c_actors', c_actors)


In [6]:
def get_movie_descriptions_by_actor_soup(actor_page_soup, logging=False):
    """Collect the non-empty descriptions of all movies listed on an actor page.

    The movies come from get_movies_by_actor_soup (no limit) and each description
    from get_movie_description; movies with an empty description are left out.

    Args:
        actor_page_soup: Parsed actor page handed to get_movies_by_actor_soup.
        logging: Forwarded to get_movies_by_actor_soup; when true, progress and
            every collected description are printed as well.

    Returns:
        A list with the description texts, in the order of the movies.
    """
    movies = get_movies_by_actor_soup(actor_page_soup, None, logging)

    # Guard clause: nothing to describe.
    if not movies:
        print(f'Can not get movies by actor soup')
        return []

    total = len(movies)
    collected = []

    for position, movie in enumerate(movies, start=1):
        text = get_movie_description(movie[1])

        if len(text) == 0:
            continue

        collected.append(text)

        if logging:
            print(f'Getting description for movie - {movie[0]} - {position} out of {total} movies')
            print(text)

    return collected

In [8]:
import itertools
import datetime

# Actors to compare. in_names and in_links are parallel lists: the loops below
# pair in_names[i] with in_links[i].
in_names = [
    'Dwayne Johnson',
    'Chris Hemsworth',
    'Robert Downey Jr.',
    'Akshay Kumar',
    'Jackie Chan',
    'Bradley Cooper',
    'Adam Sandler',
    'Scarlett Johansson',
    'Sofia Vergara',
    'Chris Evans',
]

in_links = [
    'https://www.imdb.com/name/nm0425005/',
    'https://www.imdb.com/name/nm1165110/',
    'https://www.imdb.com/name/nm0000375/',
    'https://www.imdb.com/name/nm0474774/',
    'https://www.imdb.com/name/nm0000329/',
    'https://www.imdb.com/name/nm0177896/',
    'https://www.imdb.com/name/nm0001191/',
    'https://www.imdb.com/name/nm0424060/',
    'https://www.imdb.com/name/nm0005527/',
    'https://www.imdb.com/name/nm0262635/',
]

# Square matrix (one row and one column per actor) with every cell set to 0;
# each row is its own list object.
result = [[0] * len(in_names) for _ in in_names]

In [21]:
# Positions of the actors in in_names / in_links.
numbers = list(range(len(in_names)))

# Every unordered pair of positions (i < j), so only the cells above the
# diagonal of `result` are filled in.
combinations = itertools.combinations(numbers, 2)
print(datetime.datetime.now())

for c in combinations:
    start_idx, end_idx = c
    # Both the actor limit and the movie limit are set to 5.
    distance = get_movie_distance(in_links[start_idx], in_links[end_idx], 5, 5, logging=False)
    print(f'{datetime.datetime.now()} ***** Distance {distance} from {in_names[start_idx]} to {in_names[end_idx]}*****')

    result[start_idx][end_idx] = distance

2021-12-28 13:34:59.351863
2021-12-28 13:35:05.960848 ***** Distance 2 from Dwayne Johnson to Chris Hemsworth*****
2021-12-28 13:35:10.431456 ***** Distance 2 from Dwayne Johnson to Robert Downey Jr.*****
2021-12-28 13:35:22.027377 ***** Distance 3 from Dwayne Johnson to Akshay Kumar*****
2021-12-28 13:35:32.587883 ***** Distance 3 from Dwayne Johnson to Jackie Chan*****
2021-12-28 13:35:44.855652 ***** Distance 3 from Dwayne Johnson to Bradley Cooper*****
2021-12-28 13:35:57.146345 ***** Distance 3 from Dwayne Johnson to Adam Sandler*****
2021-12-28 13:36:03.167036 ***** Distance 2 from Dwayne Johnson to Scarlett Johansson*****
2021-12-28 13:37:17.182599 ***** Distance -1 from Dwayne Johnson to Sofia Vergara*****
2021-12-28 13:37:23.208164 ***** Distance 2 from Dwayne Johnson to Chris Evans*****
2021-12-28 13:37:28.925889 ***** Distance 1 from Chris Hemsworth to Robert Downey Jr.*****
2021-12-28 13:37:39.518770 ***** Distance 3 from Chris Hemsworth to Akshay Kumar*****
2021-12-28 13:3

In [19]:
# Reload a previously saved distance matrix. get_from_cache returns an empty dict
# when the 'result' cache file does not exist; in that case keep the matrix that
# is already in memory instead of replacing it with an empty dict.
result = get_from_cache('result') or result

In [20]:
# Saving the matrix is disabled here; uncomment the next line to write it to the
# 'result' cache file:
# store_to_cache('result', result)
result

[[0, 2, 2, 3, 3, 3, 3, 2, -1, 2],
 [0, 0, 1, 3, -1, 3, 3, 1, 3, 1],
 [0, 0, 0, -1, -1, 3, -1, 1, 3, 1],
 [0, 0, 0, 0, -1, 2, 3, 2, -1, 2],
 [0, 0, 0, 0, 0, -1, -1, -1, -1, -1],
 [0, 0, 0, 0, 0, 0, 3, 3, 3, 2],
 [0, 0, 0, 0, 0, 0, 0, 2, 3, 3],
 [0, 0, 0, 0, 0, 0, 0, 0, -1, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [21]:
import numpy as np
import pandas as pd

In [22]:
# Wrap the distance matrix in a DataFrame: columns are labelled with the actor
# names, rows keep the default integer index (row i corresponds to in_names[i]).
df = pd.DataFrame(result, columns=in_names)

In [23]:
# Display the distance table.
df

Unnamed: 0,Dwayne Johnson,Chris Hemsworth,Robert Downey Jr.,Akshay Kumar,Jackie Chan,Bradley Cooper,Adam Sandler,Scarlett Johansson,Sofia Vergara,Chris Evans
0,0,2,2,3,3,3,3,2,-1,2
1,0,0,1,3,-1,3,3,1,3,1
2,0,0,0,-1,-1,3,-1,1,3,1
3,0,0,0,0,-1,2,3,2,-1,2
4,0,0,0,0,0,-1,-1,-1,-1,-1
5,0,0,0,0,0,0,3,3,3,2
6,0,0,0,0,0,0,0,2,3,3
7,0,0,0,0,0,0,0,0,-1,1
8,0,0,0,0,0,0,0,0,0,2
9,0,0,0,0,0,0,0,0,0,0


In [24]:
# Write the table to distances.csv; index=False leaves out the row index column.
df.to_csv('distances.csv', index=False)

In [25]:
# Read the CSV back in and display it to check what was written.
df1 = pd.read_csv ('distances.csv')
df1

Unnamed: 0,Dwayne Johnson,Chris Hemsworth,Robert Downey Jr.,Akshay Kumar,Jackie Chan,Bradley Cooper,Adam Sandler,Scarlett Johansson,Sofia Vergara,Chris Evans
0,0,2,2,3,3,3,3,2,-1,2
1,0,0,1,3,-1,3,3,1,3,1
2,0,0,0,-1,-1,3,-1,1,3,1
3,0,0,0,0,-1,2,3,2,-1,2
4,0,0,0,0,0,-1,-1,-1,-1,-1
5,0,0,0,0,0,0,3,3,3,2
6,0,0,0,0,0,0,0,2,3,3
7,0,0,0,0,0,0,0,0,-1,1
8,0,0,0,0,0,0,0,0,0,2
9,0,0,0,0,0,0,0,0,0,0


In [15]:
import os

In [17]:
# For every actor: download the actor page, collect the descriptions of the
# listed movies and write them, one per line, to txt_files/<Actor_Name>.txt.
numbers = [i for i in range(len(in_names))]
save_dir = 'txt_files'

# Create the output directory once, before the loop; exist_ok avoids the
# separate existence check on every iteration.
os.makedirs(save_dir, exist_ok=True)

for n in numbers:

    response = requests.get(in_links[n])
    assert response.status_code == 200
    soup = BeautifulSoup(response.text, "html.parser")
    descr = get_movie_descriptions_by_actor_soup(soup, False)
    print(f'{datetime.datetime.now()} got {len(descr)} movie descriptions for {in_names[n]}')

    # File name: the actor name with spaces replaced by '_' and dots removed.
    save_path = os.path.join(save_dir, in_names[n].replace(' ', '_').replace('.', '') + '.txt')

    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # raises UnicodeEncodeError for descriptions with characters outside it.
    with open(save_path, 'w', encoding='utf-8') as f:
        for d in descr:
            f.write(d + '\n')
        print(f'{datetime.datetime.now()} writing file {save_path}')
    
    
    

2021-12-29 19:20:39.995603 got 28 movie descriptions for Dwayne Johnson
2021-12-29 19:20:39.996565 writing file txt_files\Dwayne_Johnson.txt
2021-12-29 19:21:04.633744 got 25 movie descriptions for Chris Hemsworth
2021-12-29 19:21:04.635179 writing file txt_files\Chris_Hemsworth.txt
2021-12-29 19:22:15.180654 got 63 movie descriptions for Robert Downey Jr.
2021-12-29 19:22:15.181616 writing file txt_files\Robert_Downey_Jr.txt
Failed to get movie description for https://imdb.com/title/tt12385290/
Failed to get movie description for https://imdb.com/title/tt5079992/
2021-12-29 19:25:03.585973 got 128 movie descriptions for Akshay Kumar
2021-12-29 19:25:03.587723 writing file txt_files\Akshay_Kumar.txt
Failed to get movie description for https://imdb.com/title/tt8702944/
2021-12-29 19:26:29.579913 got 79 movie descriptions for Jackie Chan
2021-12-29 19:26:29.580874 writing file txt_files\Jackie_Chan.txt
2021-12-29 19:27:01.269076 got 35 movie descriptions for Bradley Cooper
2021-12-29 19: