In [1]:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
headers = {"Accept-Language": "en-US, en;q=0.5"}
url = "https://www.imdb.com/search/title/?groups=top_1000&ref_=adv_prv"
results = requests.get(url, headers=headers)

In [3]:
soup = BeautifulSoup(results.text, "html.parser")

In [4]:
def parse_movie(foobar):
    def get_title(foobar):
        return foobar.h3.a.text
    
    def get_year(foobar):
        return foobar.h3.find('span', class_='lister-item-year').text
    
    def get_runtime(foobar):
        return foobar.find('span', class_='runtime').text if foobar.p.find('span', class_='runtime') else None
    
    def get_imdb_rating(foobar):
        return float(foobar.strong.text)
    
    def get_metascore(foobar):
        return foobar.find('span', class_='metascore').text if foobar.find('span', class_='metascore') else None
    
    def get_votes(foobar):
        nv = foobar.find_all('span', attrs={'name': 'nv'})
        return nv[0].text
    
    def get_gross(foobar):
        nv = foobar.find_all('span', attrs={'name': 'nv'})
        return nv[1].text if len(nv) > 1 else ''  
    
    return {
        'title': get_title(foobar),
        'year': get_year(foobar),
        'runtime': get_runtime(foobar),
        'imdb_rating': get_imdb_rating(foobar),
        'meta_score': get_metascore(foobar),
        'votes': get_votes(foobar),
        'us_gross': get_gross(foobar)}


In [5]:
movies = [parse_movie(foo) for foo in soup.find_all('div', class_='lister-item mode-advanced')]

In [6]:
print(movies)

[{'title': 'Soul', 'year': '(2020)', 'runtime': '100 min', 'imdb_rating': 8.2, 'meta_score': '85        ', 'votes': '60,121', 'us_gross': ''}, {'title': "It's a Wonderful Life", 'year': '(1946)', 'runtime': '130 min', 'imdb_rating': 8.6, 'meta_score': '89        ', 'votes': '401,027', 'us_gross': ''}, {'title': 'Home Alone', 'year': '(1990)', 'runtime': '103 min', 'imdb_rating': 7.6, 'meta_score': '63        ', 'votes': '478,684', 'us_gross': '$285.76M'}, {'title': 'A Christmas Story', 'year': '(1983)', 'runtime': '93 min', 'imdb_rating': 7.9, 'meta_score': '77        ', 'votes': '131,737', 'us_gross': '$20.61M'}, {'title': 'Die Hard', 'year': '(1988)', 'runtime': '132 min', 'imdb_rating': 8.2, 'meta_score': '72        ', 'votes': '785,798', 'us_gross': '$83.01M'}, {'title': 'Klaus', 'year': '(2019)', 'runtime': '96 min', 'imdb_rating': 8.2, 'meta_score': '65        ', 'votes': '96,936', 'us_gross': ''}, {'title': 'The Sound of Music', 'year': '(1965)', 'runtime': '172 min', 'imdb_rati

In [7]:
movies_df = pd.DataFrame(movies)

In [8]:
movies_df['year'] = movies_df['year'].str.extract('(\d+)').astype(int)
movies_df['runtime'] = movies_df['runtime'].str.extract('(\d+)').astype(int)
movies_df['meta_score'] = movies_df['meta_score'].astype(int)
movies_df['votes'] = movies_df['votes'].str.replace(',', '').astype(int)
movies_df['us_gross'] = movies_df['us_gross'].map(lambda x: x.lstrip('$').rstrip('M'))
movies_df['us_gross'] = pd.to_numeric(movies_df['us_gross'])
# movies_df['us_gross'] = pd.to_numeric(movies['us_gross'], errors='coerce')

In [9]:
movies_df.head()

Unnamed: 0,title,year,runtime,imdb_rating,meta_score,votes,us_gross
0,Soul,2020,100,8.2,85,60121,
1,It's a Wonderful Life,1946,130,8.6,89,401027,
2,Home Alone,1990,103,7.6,63,478684,285.76
3,A Christmas Story,1983,93,7.9,77,131737,20.61
4,Die Hard,1988,132,8.2,72,785798,83.01


In [10]:
movies_df.dtypes

title           object
year             int64
runtime          int64
imdb_rating    float64
meta_score       int64
votes            int64
us_gross       float64
dtype: object