In [7]:
import os
from random import randint
from time import sleep
from warnings import warn

import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

In [8]:
def _fetch_plot(imdb_id, api_key, timeout=30):
    """Return the full plot OMDb reports for `imdb_id`, or None when it has none.

    Raises `requests.HTTPError` for non-2xx answers (via `raise_for_status`).
    """
    plot_url = f'https://www.omdbapi.com/?i={imdb_id}&apikey={api_key}&plot=full'
    response = requests.get(plot_url, timeout=timeout)
    response.raise_for_status()  # raises exception when not a 2xx response
    if response.status_code == 204:
        # No content: there is no JSON body to decode.
        return None
    # `.get` keeps a payload without a 'Plot' key (e.g. an API error payload)
    # from aborting the whole scrape with a KeyError.
    return response.json().get('Plot')


def scrapping(genre, pages=(1, 51, 101, 151, 201), api_key=None, delay_range=(8, 15)):
    """Scrape IMDb's advanced-search listing for `genre` and enrich it via OMDb.

    Parameters
    ----------
    genre : str
        Genre name interpolated into the IMDb search URL (e.g. "horror").
    pages : iterable of int, optional
        `start` offsets of the listing pages to request; the default matches
        the five offsets the notebook originally hard-coded.
    api_key : str, optional
        OMDb API key. Defaults to the `OMDB_API_KEY` environment variable and
        then to the key that was previously hard-coded here.
    delay_range : (int, int), optional
        Inclusive bounds, in seconds, of the random pause after each listing
        request.

    Returns
    -------
    pandas.DataFrame
        One row per movie with columns `movie`, `imdb_id`, `genre` (list of
        genre names), `plot` and `image_url`.

    Raises
    ------
    requests.HTTPError
        If an OMDb plot request returns a non-2xx status.
    """
    # NOTE(review): the literal fallback key is committed to the notebook; it
    # should be rotated and removed once OMDB_API_KEY is set everywhere.
    api_key = api_key or os.environ.get('OMDB_API_KEY', '7c8ba5e4')

    URL1 = f'https://www.imdb.com/search/title/?title_type=feature&num_votes=10000,&genres={genre}&languages=en&sort=user_rating,desc&start='
    URL2 = '&explore=genres&ref_=adv_nxt'
    headers = {'Accept-Language': 'en-US,en;q=0.8'}  # If this is not specified, the default language is Mandarin
    columns = ['movie', 'imdb_id', 'genre', 'plot', 'image_url']

    # One dict per movie keeps every column aligned, even if a field is missing.
    records = []
    for page in pages:
        list_url = URL1 + str(page) + URL2
        list_response = requests.get(list_url, headers=headers, timeout=30)
        # Pause between listing requests, as the original did.
        sleep(randint(*delay_range))
        # Warn about and skip pages that did not come back with status 200.
        if list_response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(list_url, list_response.status_code))
            continue

        page_html = BeautifulSoup(list_response.text, 'html.parser')
        movie_containers = page_html.find_all('div', class_='lister-item mode-advanced')
        for container in movie_containers:
            # The id is the second-to-last path segment of the container's first link.
            imdb_id = container.find('a')['href'].strip().split('/')[-2]
            # Strip each name so entries after a comma do not keep a leading space.
            genre_text = container.p.find('span', class_='genre').text.replace("\n", "")
            genre_names = [name.strip() for name in genre_text.split(',')]
            records.append({
                'movie': container.h3.a.text,
                'imdb_id': imdb_id,
                'genre': genre_names,
                'plot': _fetch_plot(imdb_id, api_key),
                'image_url': f'http://img.omdbapi.com/?i={imdb_id}&h=600&apikey={api_key}',
            })

    # Passing `columns` keeps the schema stable even when nothing was scraped.
    return pd.DataFrame(records, columns=columns)

In [14]:
import requests
import os

def create_dir(directory):
    """Ensure `directory` exists (creating missing parents) and return it.

    `exist_ok=True` replaces the previous exists-then-create check, which
    could raise FileExistsError if the directory appeared between the check
    and the creation.
    """
    os.makedirs(directory, exist_ok=True)
    return directory

def download_photos(genre, df):
    """Download each row's poster into ../raw_data/posters/<genre>/<imdb_id>.jpg.

    Parameters
    ----------
    genre : str
        Name of the sub-directory the posters are written to.
    df : pandas.DataFrame
        Must provide the `image_url` and `imdb_id` columns.

    Posters whose request does not succeed are skipped with a warning, so an
    HTTP error body is never saved under a .jpg name.
    """
    # Create the target directory once instead of once per row.
    poster_dir = create_dir(f"../raw_data/posters/{genre}")
    for _, row in df.iterrows():
        imdb_id = row['imdb_id']
        response = requests.get(row['image_url'], timeout=30)
        if not response.ok:
            warn(f"Poster for {imdb_id} not downloaded; status code: {response.status_code}")
            continue
        with open(os.path.join(poster_dir, f'{imdb_id}.jpg'), 'wb') as handler:
            handler.write(response.content)

In [21]:
# Scrape each genre, persist its metadata, then fetch the posters.
genres = ["horror", "music", "musical"]
for genre in genres:
    print(f"starting {genre}")
    df = scrapping(genre)
    print(f"done scrapping {genre}")
    # Write the CSV before downloading posters so a failed download does not
    # discard the scraped metadata; make sure the output directory exists
    # even when no poster directory has been created yet.
    create_dir("../raw_data")
    df.to_csv(f"../raw_data/{genre}.csv")
    download_photos(genre, df)
    print(f"done downloading photos of {genre}")

starting horror
done scrapping horror
done downloading photos of horror
starting music
done scrapping music
done downloading photos of music
starting musical
done scrapping musical
done downloading photos of musical
