In [167]:
import requests, ssl, re, sys
from bs4 import BeautifulSoup

root_url = "https://www.imdb.com/title/tt5013056/?ref_=nv_sr_1"

In [168]:
class Movie:
    def __init__(self, url):
        self.url = url
        self.title = ''
        self.year = ''
        self.genres = []
        self.rating = ''
        self.duration = ''
        self.summary = ''
        self.director = ''
        self.writer = ''
        self.cast = []
        self.producers = []
        self.exec_producers = []
        self.imdb_score = 0.0
        self.rt_critic_score = 0
        self.rt_audience_score = 0
        
    def __str__(self):
        return self.title + ' (' + self.year + ')'

In [189]:
def get_soup(movie_url):
    try: 
        html = requests.get(movie_url).content
        soup = BeautifulSoup(html, 'html.parser')
        return soup
    except:
        print("error getting movie")
        return None

In [190]:
# Get HTML parser object
soup = get_soup(root_url)
if not soup:
    print("Exiting at {}".format(root_url))
    sys.exit()

# If successful, create Movie object
movie = Movie(root_url)

In [191]:
# Get HTML element referring to movie headline
movie_header = soup.select('.title_wrapper')[0]

In [192]:
# Get title and year
headline = movie_header.h1.text
movie.title = ' '.join(headline.split('\xa0')[:-1])
movie.year = headline.split('\xa0')[-1].strip()[1:-1]

In [193]:
# Get rating
info = movie_header.select('div.subtext')[0]
movie.rating = info.contents[0].strip()

In [194]:
# Get duration
time_tag = info.find('time')

try:
    duration = re.match("PT(?P<duration>\d+)M", time_tag['datetime']).group('duration')
except:
    timestamp = time_tag.text.strip()
    hours, minutes = timestamp.split(' ')
    hours = int(hours[:-1])
    minutes = int(minutes[:-3])
    duration = hours * 60 + minutes

movie.duration = duration

In [195]:
# Get genres
for tag in movie_header.find_all('a'):
    if '?genres=' in tag['href']:
        movie.genres.append(tag.text.strip())

In [196]:
# Get HTML element referring to movie details
movie_details = soup.select('.plot_summary')[0]

In [197]:
# Get summary
movie.summary = movie_details.select('div.summary_text')[0].text.strip()

In [198]:
# Get other details
details = [el.text.strip().split('\n') for el in movie_details.find_all('div')[1:]]
for item in details:
    if 'Director' in item[0]:
        directors = [d.strip() for d in item[1].split(',')]
        movie.director = directors if len(directors) > 1 else directors[0]
    elif 'Writer' in item[0]:
        writers = [w.strip() for w in item[1].split(',')]
        movie.writer = writers if len(writers) > 1 else writers[0]
    elif 'Stars' in item[0]:
        cast = [actor.strip() for actor in item[1].split(',')]
        if '|' in cast[-1]:
            cast[-1] = ' '.join(cast[-1].split(' ')[:-1])
        movie.cast = cast

In [199]:
# Get IMDB score
score_text = soup.select('div.imdbRating')[0].select('div.ratingValue')[0].text.strip()
movie.imdb_score = score_text.split('/')[0]

In [204]:
# Get Rotten Tomatoes scores
movie_slug = '_'.join(movie.title.split(' ')).lower()
url = 'http://www.rottentomatoes.com/m/{}'.format(movie_slug)
rt_soup = get_soup(url)
def check_rt_year(soup):
    year = rt_soup.select('.h3.year')[0].text.strip()
    if year[0] == '(':
        year = year[1:]
    if year[-1] == ')':
        year = year[:-1]
    return year

if check_year

1958


In [186]:
print("CURRENT MOVIE")
print(vars(movie))
# print("")
# print("CURRENT DOM ELEMENT")
# print(movie_header.prettify())

CURRENT MOVIE
{'url': 'https://www.imdb.com/title/tt5013056/?ref_=nv_sr_1', 'title': 'Dunkirk', 'year': '2017', 'genres': ['Action', 'Drama', 'History'], 'rating': 'PG-13', 'duration': '106', 'summary': 'Allied soldiers from Belgium, the British Empire and France are surrounded by the German Army, and evacuated during a fierce battle in World War II.', 'director': 'Christopher Nolan', 'writer': 'Christopher Nolan', 'cast': ['Fionn Whitehead', 'Barry Keoghan', 'Mark Rylance'], 'producers': [], 'exec_producers': [], 'imdb_score': '7.9', 'rt_critic_score': 0, 'rt_audience_score': 0, 'score': '7.9'}
