In [149]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing

def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float or tuple, optional
        Passed straight to `requests.get`. Without it a stalled server
        would block the call forever; a timeout surfaces as a
        `RequestException` and is handled like any other request error.

    Returns
    -------
    bytes or None
        The raw response body (`resp.content`) when `is_good_response`
        accepts the response. None when the response is rejected or when
        the request raises a `RequestException` (the error is printed).
    """
    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'
    }
    try:
        # closing() releases the streamed connection even on early return.
        with closing(get(url, stream=True, headers=headers, timeout=timeout)) as resp:
            return resp.content if is_good_response(resp) else None
    except RequestException as e:
        print('The following error occurred during HTTP GET request to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns true if the response seems to be HTML, false otherwise.

    A response qualifies when its status code is 200 and its
    `Content-Type` header (compared case-insensitively) contains 'html'.
    A response with no `Content-Type` header is reported as not good
    instead of raising `KeyError`.
    """
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()

In [150]:
from bs4 import BeautifulSoup
import re
import pandas as pd
from math import ceil
import argparse

# IMDb advanced title search, spelled out one query parameter per line:
# feature films released 1930-01-01..2020-01-01 with num_votes=5000,
# sorted by user rating (descending), starting at result 1.
IMDB_URL = (
    'https://www.imdb.com/search/title/'
    '?title_type=feature'
    '&release_date=1930-01-01,2020-01-01'
    '&num_votes=5000,'
    '&sort=user_rating,desc'
    '&start=1'
    '&view=advanced'
)

In [205]:
# Scrape settings: first release year, last release year (both inclusive),
# and the minimum number of movies wanted for each year in that range.
s, e, m = 1930, 2020, 5

In [206]:
# Empty table that the scraping loop fills with one row per movie.
result = pd.DataFrame(
    columns=[
        'title',
        'rating',
        'year_of_release',
        'director',
        'actors',
        'runtime',
        'url',
    ]
)

In [203]:
# One flag per year in [s, e]: True when `result` already holds at least
# `m` movies released that year.
ass = [sum(result.year_of_release == year) >= m for year in range(s, e + 1)]

# The scrape is complete only when every year has met its quota.
cond = all(ass)

In [210]:
def _parse_movie(movie):
    """
    Build one record (dict) from a `lister-item-content` element.

    Raises AttributeError, IndexError, KeyError, TypeError or ValueError
    when an expected element or field is missing or malformed; the caller
    uses that to skip the movie.
    """
    header = movie.find("h3", {"class": "lister-item-header"})
    title = header.find("a").getText()
    rating = (
        movie.find('div', {'class': 'ratings-bar'})
        .find('div', {'class': "inline-block ratings-imdb-rating"})
        .find("strong")
        .getText()
    )
    # Take the first 4-digit run instead of a fixed slice, so a label with
    # extra text before the year still parses (presumably IMDb prefixes
    # some years with a disambiguation marker -- TODO confirm).
    year_text = header.find("span", {"class": "lister-item-year text-muted unbold"}).getText()
    year_of_release = int(re.search(r'\d{4}', year_text).group())
    # Third <p> is split on '|': director part first, actors part second.
    credits = movie.find_all('p')[2].getText().split('|')
    director = credits[0].split(':')[1].replace('\n', '')
    actors = credits[1].replace('\n', '').split(':')[1].replace(',', ';')
    # Drop the last four characters of the runtime label before int().
    runtime = int(movie.find('span', {'class': 'runtime'}).getText()[:-4])
    url = movie.find('a')['href']
    return {'title': title,
            'rating': rating,
            'year_of_release': year_of_release,
            'director': director,
            'actors': actors,
            'runtime': runtime,
            'url': url}


i = 1

# Walk the search results 50 at a time until every year in [s, e] has at
# least `m` movies, or until the listing is exhausted / a fetch fails.
while not cond:

    print(i)

    IMDB_URL = f'https://www.imdb.com/search/title/?title_type=feature&release_date={s}-01-01,{e}-01-01&num_votes=5000,&sort=user_rating,desc&start={i}&view=advanced'

    html = simple_get(IMDB_URL)
    if html is None:
        # simple_get returns None on request errors and non-HTML responses.
        print(f'No HTML returned for start={i}; stopping.')
        break

    dom = BeautifulSoup(html, 'html.parser')
    main_html = dom.find("div", {"id": "main"})
    movies_list = [] if main_html is None else main_html.find_all("div", {'class': 'lister-item-content'})
    if not movies_list:
        # An empty page means there is nothing left to scrape; without this
        # guard the loop would request further pages forever.
        print(f'No movies listed at start={i}; stopping.')
        break

    page_records = []
    for movie in movies_list:
        try:
            page_records.append(_parse_movie(movie))
        except (AttributeError, IndexError, KeyError, TypeError, ValueError):
            # Best effort: skip entries missing any of the expected fields.
            continue

    if page_records:
        # One concat per page rather than one per row.
        result = pd.concat([result, pd.DataFrame(page_records)], ignore_index=True)

    i += 50

    # Re-evaluate the stop condition: every year needs at least `m` movies.
    year_counts = result.year_of_release.value_counts()
    cond = all(year_counts.get(j, 0) >= m for j in range(s, e + 1))

1
51
101
151
201
251
301
351
401
451
501
551
601
651
701
751
801
851
901
951
1001
1051
1101
1151
1201
1251
1301
1351
1401
1451
1501
1551
1601
1651
1701
1751
1801
1851
1901
1951
2001
2051
2101
2151
2201
2251
2301
2351
2401
2451
2501
2551
2601
2651
2701
2751
2801
2851
2901
2951
3001
3051
3101
3151
3201
3251
3301
3351
3401
3451
3501
3551
3601
3651
3701
3751
3801
3851
3901
3951
4001
4051
4101
4151
4201
4251
4301
4351
4401
4451
4501
4551
4601
4651
4701
4751
4801
4851
4901
4951
5001
5051
5101
5151
5201
5251
5301
5351
5401
5451
5501
5551
5601
5651
5701
5751
5801
5851
5901
5951
6001
6051
6101
6151
6201
6251
6301
6351
6401
6451
6501
6551
6601
6651
6701
6751
6801
6851
6901
6951
7001
7051
7101
7151
7201
7251
7301
7351
7401
7451
7501
7551
7601
7651
7701
7751
7801
7851
7901
7951
8001
8051
8101
8151
8201
8251
8301
8351
8401
8451
8501
8551
8601
8651
8701
8751
8801
8851
8901
8951
9001
9051
9101
9151
9201
9251
9301
9351
9401
9451
9501
9551
9601
9651
9701
9751
9801
9851
9901
9951
10001
10051
10101
10151

KeyboardInterrupt: 

In [212]:
# Write the scraped table to movies.csv, leaving out the row index.
result.to_csv(path_or_buf='movies.csv', index=False)