### Load required libraries/packages

In [41]:
# pip install requests
# pip install html5lib
# pip install bs4

In [42]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import os

#### Take a large sample of movies for each genre by scraping IMDb movie ratings and details using Python, and save the details into .csv files (one per genre)

### Parsing details of top 250 movies (by user ratings) for each genre
Specifications: feature films with a rating count of at least 20,000, sorted by IMDb rating (descending)

In [43]:
# HTTP request headers passed to the requests calls in this notebook.
# The User-Agent string identifies the client as desktop Firefox 96 on macOS
# (presumably so IMDb serves the regular browser page to the scraper -- TODO confirm).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:96.0) Gecko/20100101 Firefox/96.0'}

def get_genre_url_dict(url):
    '''
    Returns a dictionary of genres (keys) and their IMDb urls (values) to that genre.

        Params:
            url (str) : A string of the url to the IMDb 'Browse Movies by Genre' page, containing genres.

        Returns:
            dict : Maps each collected genre name (str) to an IMDb title-search url for that genre
                   (title_type=feature, num_votes of at least 20000, sorted by user_rating
                   descending, count=250).

        Raises:
            requests.exceptions.RequestException : If the page cannot be fetched within the
                   timeout or the server answers with an HTTP error status.
    '''
    # Built once; '{}' is filled with the genre name below.
    search_url = "https://www.imdb.com/search/title/?title_type=feature&num_votes=20000,&genres={}&sort=user_rating,desc&count=250"

    with requests.Session() as session:
        # A timeout keeps an unresponsive server from blocking the call forever.
        r = session.get(url, headers=HEADERS, timeout=30)
        # Fail loudly instead of parsing an error page into an empty result.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

    # parsing movie-only genres: chips are collected only after the first
    # 'thriller' chip has been seen, up to and including the second one.
    genres_ls = []
    thrillers_seen = 0
    for chip in soup.select('div.ipc-chip-list__scroller span'):
        label = chip.contents[0] if chip.contents else None
        # Skip empty spans and spans whose first child is a nested tag, not text.
        if not isinstance(label, str):
            continue
        if thrillers_seen == 1:
            # str() detaches the text from the parse tree so the dict keys
            # do not keep the whole document alive.
            genres_ls.append(str(label))
        if label.lower() == 'thriller':
            thrillers_seen += 1

    # Getting URLs of different pages
    return {genre: search_url.format(genre) for genre in genres_ls}

def _split_directors_and_stars(credits):
    '''
    Splits the flattened credit strings of one movie at the '|' separator.

        Params:
            credits (list of str) : Stripped, non-empty text nodes of the credits paragraph.

        Returns:
            tuple or None : (directors, stars) as two lists, or None when there is no '|' entry.
    '''
    if '|' not in credits:
        return None
    separator = credits.index('|')
    # The entry right after '|' is skipped as well (presumably the 'Stars:' label).
    return credits[:separator], credits[separator + 2:]


def get_movies(url, interval, file_name):
    '''
    Scrapes the movies listed on a single IMDb genre page and saves them as a .csv file in 'datasets/'.

    The order of the rows is the order of the page, i.e. whatever sorting the url requests.
    A movie for which any of the indexed lookups fails (IndexError, e.g. no certificate or
    runtime element) is skipped entirely. Nothing is returned.

        Params:
            url (str) : A string of the url to the IMDb page of the movie genre.
            interval (int) : Number of seconds to pause before the request is sent.
            file_name (str) : The name of the resulting .csv file to be saved

        Raises:
            requests.exceptions.RequestException : If the page cannot be fetched within the
                   timeout or the server answers with an HTTP error status.
    '''
    import time  # local import: only needed for the pause below

    # create directory to store datasets (.csv files)
    if not os.path.exists('datasets/'):
        os.makedirs('datasets/', 0o777, exist_ok=True)
        # the mode given to makedirs is subject to the umask, so set it explicitly
        os.chmod('datasets/', 0o777)

    # Pause before hitting the server, as documented for `interval`
    if interval and interval > 0:
        time.sleep(interval)

    # Sending a request to the specified URL
    resp = requests.get(url, headers=HEADERS, timeout=30)
    # Fail loudly instead of writing an empty .csv for an error page
    resp.raise_for_status()

    # Converting the response to Beautiful Soup Object
    content = BeautifulSoup(resp.content, 'html.parser')
    movie_list = []
    # Iterating through the list of movies
    for movie in content.select('.lister-item-content'):

        try:
            metascore = movie.find('span', class_='metascore')
            # Creating a python dictionary; '-' marks a value that was not found
            data = {
                "title": movie.select('.lister-item-header')[0].a.get_text().strip(),
                "year": movie.select('.lister-item-year')[0].get_text().strip(),
                "certificate": movie.select('.certificate')[0].get_text().strip(),
                "time": movie.select('.runtime')[0].get_text().strip(),
                "genre": movie.select('.genre')[0].get_text().strip(),
                "rating": movie.select('.ratings-imdb-rating')[0].get_text().strip(),
                "metascore": metascore.text if metascore else '-',
                "simple_desc": movie.select('.text-muted')[2].get_text().strip(),
                "directors": '-',
                "stars": '-',
                "votes": '-',
                "gross": '-',
            }

            # adding directors and stars (main actors): locate the text node(s)
            # carrying the 'Director...:' label, then flatten their paragraph
            credit_labels = movie.find_all(string=lambda t: ('Director' in t) and (':' in t))

            for label in credit_labels:
                # [1:] drops the first text node of the paragraph; bare commas are dropped too
                credits = [t.strip() for t in label.find_previous('p').find_all(string=True)[1:] if t.strip() and t.strip() != ',']
                split = _split_directors_and_stars(credits)
                if split is None:
                    # No separator: report the unexpected layout and keep the '-'
                    # placeholders rather than reusing another movie's credits.
                    print(credits)
                    continue
                data["directors"], data["stars"] = split

            # adding votes, gross
            nv = movie.find_all('span', attrs={'name': 'nv'})
            data['votes'] = nv[0].text
            data['gross'] = nv[1].text if len(nv) > 1 else '-'

        except IndexError:
            continue
        movie_list.append(data)
    dataframe = pd.DataFrame(movie_list)
    dataframe.to_csv('datasets/' + file_name)

    
def get_all_movies(csv_list):
    '''
    Returns a single Pandas.dataframe containing movie data for movies across all genres

        Params:
            csv_list (iterable of str) : Paths of the .csv files to read and stack.

        Returns:
            pandas.DataFrame : The rows of all files in the given order with a fresh 0..n-1
                               index; an empty DataFrame when csv_list is empty.
    '''
    frames = [pd.read_csv(path) for path in csv_list]

    # pd.concat raises ValueError on an empty list, so handle that case explicitly
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

In [44]:
# Scrape every genre found on IMDb's genre overview page into its own .csv file
# and remember the path of each file that was written.
base_url = 'https://www.imdb.com/feature/genre/?ref_=nv_ch_gr'
url_dict = get_genre_url_dict(base_url)
csv_list = []
for genre, url in url_dict.items():
    file_name = genre + '.csv'
    get_movies(url, 1, file_name)
    csv_list.append('datasets/' + file_name)
    print("Saved:", file_name)

Saved: Action.csv
Saved: Adventure.csv
Saved: Animation.csv
Saved: Biography.csv
Saved: Comedy.csv
Saved: Crime.csv
Saved: Documentary.csv
Saved: Drama.csv
Saved: Family.csv
Saved: Fantasy.csv
Saved: Film-Noir.csv
Saved: History.csv
Saved: Horror.csv
Saved: Music.csv
Saved: Musical.csv
Saved: Mystery.csv
Saved: Romance.csv
Saved: Sci-Fi.csv
Saved: Short.csv
Saved: Sport.csv
Saved: Thriller.csv
