In [5]:
import os
from random import randint
from time import sleep
from warnings import warn

import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from requests import get

In [14]:
def scrapping(genre, api_key=None):
    """Scrape IMDb's title search for one genre and look up each plot on OMDb.

    Requests the result pages at offsets 1, 51, ..., 501 of IMDb's title
    search (feature films, at least 10,000 votes, English, sorted by user
    rating, descending). For every movie container found it records the
    title, the IMDb id, the list of genres, an OMDb poster URL and the full
    plot fetched from the OMDb API.

    Parameters
    ----------
    genre : str
        Genre name inserted into the ``genres=`` query parameter.
    api_key : str, optional
        OMDb API key. When omitted, the ``OMDB_API_KEY`` environment variable
        is used, falling back to the key that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        One row per scraped movie with the columns ``movie``, ``imdb_id``,
        ``genre`` (list of str), ``plot`` (``None`` when OMDb supplied no
        plot) and ``image_url``.

    Raises
    ------
    requests.HTTPError
        If an OMDb plot request answers with a non-2xx status.

    Notes
    -----
    The function sleeps 8-15 seconds after every IMDb page request and makes
    one OMDb request per movie, so a single call is slow by design.
    """
    # NOTE(security): the literal fallback is the key this notebook previously
    # embedded in the URLs; prefer passing `api_key` or setting OMDB_API_KEY.
    if api_key is None:
        api_key = os.environ.get('OMDB_API_KEY', '7c8ba5e4')

    # `start` offsets of the result pages: 1, 51, ..., 501.
    pages = range(1, 502, 50)
    URL1 = f'https://www.imdb.com/search/title/?title_type=feature&num_votes=10000,&genres={genre}&languages=en&sort=user_rating,desc&start='
    URL2 = '&explore=genres&ref_=adv_nxt'
    # Original author's note: without this header the page came back in Mandarin.
    headers = {'Accept-Language': 'en-US,en;q=0.8'}
    # Without a timeout a stalled connection would block the scrape forever.
    request_timeout = 30

    records = []

    for page in pages:
        response = get(URL1 + str(page) + URL2,
                       headers=headers,
                       timeout=request_timeout)

        # Pause between page requests to avoid hammering the site.
        sleep(randint(8, 15))

        # Warn about (and skip) pages that did not come back with status 200.
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(page, response.status_code))
            continue

        page_html = BeautifulSoup(response.text, 'html.parser')
        movie_containers = page_html.find_all('div', class_='lister-item mode-advanced')

        for container in movie_containers:
            title = container.h3.a.text

            # The first link of a container looks like /title/<imdb_id>/...,
            # so the id is the second-to-last path component.
            imdb_id = container.find('a')['href'].strip().split('/')[-2]

            img = f'http://img.omdbapi.com/?i={imdb_id}&h=600&apikey={api_key}'

            # Split the comma-separated genre text and strip every entry, so
            # no name keeps a leading space or newline.
            genre_tag = container.p.find('span', class_='genre')
            if genre_tag is None:
                genre_names = []
            else:
                genre_names = [name.strip() for name in genre_tag.text.split(',')]

            # Use a separate name for the OMDb response so the IMDb page
            # response above is never clobbered.
            plot_url = f'https://www.omdbapi.com/?i={imdb_id}&apikey={api_key}&plot=full'
            plot_response = get(plot_url, timeout=request_timeout)
            plot_response.raise_for_status()  # raises exception when not a 2xx response
            plot = None
            if plot_response.status_code != 204:
                # .get(): a JSON answer without a 'Plot' key yields None
                # instead of aborting the whole scrape with a KeyError.
                plot = plot_response.json().get('Plot')

            records.append({'movie': title,
                            'imdb_id': imdb_id,
                            'genre': genre_names,
                            'plot': plot,
                            'image_url': img})

    # Explicit columns keep the frame's layout stable even when nothing was scraped.
    return pd.DataFrame(records,
                        columns=['movie', 'imdb_id', 'genre', 'plot', 'image_url'])

In [3]:
# Scrape the "action" genre into its own DataFrame.
action_df = scrapping("action")

In [4]:
# Scrape the "adventure" genre into its own DataFrame.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so
# adventure_df may not have been assigned in that run - confirm before use.
adventure_df = scrapping("adventure")

KeyboardInterrupt: 

In [None]:
# Scrape the "animation" genre into its own DataFrame.
animation_df = scrapping("animation")

In [None]:
# Scrape the "biography" genre into its own DataFrame.
biography_df = scrapping("biography")

In [None]:
# Scrape the "comedy" genre into its own DataFrame.
comedy_df = scrapping("comedy")

In [None]:
# Scrape the "crime" genre into its own DataFrame.
crime_df = scrapping("crime")

In [None]:
# Scrape the "documentary" genre into its own DataFrame.
documentary_df = scrapping("documentary")

In [None]:
# Persist each per-genre DataFrame to a CSV file in the working directory.
# to_csv is called with its defaults, so the row index is written as an
# unnamed first column. Every *_df name must already exist in the kernel;
# a missing one raises NameError after the earlier files have been written.
action_df.to_csv('action_df.csv')
adventure_df.to_csv('adventure_df.csv')
animation_df.to_csv('animation_df.csv')
biography_df.to_csv('biography_df.csv')
comedy_df.to_csv('comedy_df.csv')
crime_df.to_csv('crime_df.csv')
documentary_df.to_csv('documentary_df.csv')

## Scrape all genres (plot CSV)

In [16]:
# Every genre to scrape, in the order the scrape is run.
genre_list = [
    "Action", "Adventure", "Animation", "Biography", "Comedy", "Crime",
    "Documentary", "Drama", "Family", "Fantasy", "Film-Noir", "History",
    "Horror", "Music", "Musical", "Mystery", "Romance", "Sci-Fi", "Sport",
    "Thriller", "War", "Western",
]

# Scrape genre by genre, keeping one DataFrame per genre and reporting progress.
dataframes = []
for genre_name in genre_list:
    dataframes.append(scrapping(genre_name))
    print(f"done for {genre_name}")

# Stack the per-genre frames under a fresh 0..n-1 index and save the result.
merged_df = pd.concat(dataframes, ignore_index=True)
merged_df.to_csv('merged_df.csv')

done for Action
done for Adventure
done for Animation
done for Biography
done for Comedy
done for Crime
done for Documentary
done for Drama


KeyboardInterrupt: 

In [11]:
def savePoster(imdb_id, img_url):
    '''
    Function that fetches the poster image from the provided url and saves
    it as posters/<imdb_id>.<ext> (id corresponding with IMDb).
    Won't replace (or even fetch) if the file already exists.

    The extension is taken from the path part of the url only, so a query
    string cannot leak into the file name; when the path has no extension,
    "jpg" is used. The posters/ directory is created on demand and the
    downloaded bytes are written unchanged.

    INPUT:  id from imdb, url where to find image
    OUTPUT: boolean flag if saved or not.
    RAISES: requests.HTTPError when the server answers with an error status.
    '''
    import os
    from urllib.parse import urlparse

    # Get file extension from the URL path (ignores "?key=value" parts)
    ext = os.path.splitext(urlparse(img_url).path)[1].lstrip('.') or 'jpg'
    target = f'posters/{imdb_id}.{ext}'

    # Check to see if I already have it
    if os.path.isfile(target):
        return False

    # Get image data; fail loudly instead of saving an error page as an image
    response = requests.get(img_url, timeout=30)
    response.raise_for_status()

    # Save the raw bytes as imdb_id, creating the folder on first use
    os.makedirs('posters', exist_ok=True)
    with open(target, 'wb') as handler:
        handler.write(response.content)

    return True

In [12]:
import requests
import os

def create_dir(directory):
    """Make sure ``directory`` exists and return it unchanged.

    Missing parent directories are created as well. ``exist_ok=True`` makes
    the call safe when the directory already exists, without a separate
    existence check that could race with another process creating it.

    Raises ``FileExistsError`` if the path exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)
    return directory

def download_photos(genre, df):
    """Download the poster of every row in ``df`` into the directory ``genre``.

    Each image is fetched from the row's ``image_url`` and written to
    ``<genre>/<imdb_id>.jpg``, overwriting any file of that name. A poster
    whose request comes back with an error status is skipped with a warning
    rather than having the error body saved as a ``.jpg`` file.

    Parameters
    ----------
    genre : str
        Name of the target directory; it is created if missing.
    df : pandas.DataFrame
        Frame with the columns ``imdb_id`` and ``image_url``.
    """
    # Create the folder once up front rather than once per row.
    target_dir = create_dir(f"{genre}")

    for imdb_id, image_url in zip(df['imdb_id'], df['image_url']):
        # The timeout keeps one stalled download from hanging the whole batch.
        response = requests.get(image_url, timeout=30)
        if not response.ok:
            warn(f'Skipping poster for {imdb_id}: HTTP status {response.status_code}')
            continue
        with open(target_dir + "/" + f'{imdb_id}.jpg', 'wb') as handler:
            handler.write(response.content)

In [28]:
# Download the posters listed in animation_df into the "animation" directory.
download_photos("animation", animation_df)

In [29]:
# Download the posters listed in biography_df into the "biography" directory.
download_photos("biography", biography_df)

In [30]:
# Download the posters listed in comedy_df into the "comedy" directory.
download_photos("comedy", comedy_df)

In [31]:
# Download the posters listed in crime_df into the "crime" directory.
download_photos("crime", crime_df)

## Download all photos

In [None]:
# Download the posters for every row of merged_df into a directory that is
# literally named "all photo" (the name contains a space).
download_photos("all photo", merged_df)