# Part 1

Save the following code in a file named `imdb1.py`. ⬇️ 

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

# Name of the JSON file the scraped movies are exported to
# (used below to build the FEEDS output path)
filename = "imdb.json"

class IMDbSpider(scrapy.Spider):
    """Spider that scrapes the movies listed on the IMDb "Top" chart page.

    For every list entry it yields a dict with the keys ``rank``, ``title``,
    ``url``, ``rating`` and ``votes``. A field that cannot be found in the
    page is exported as ``None`` instead of aborting the whole crawl.
    """

    name = "imdb_movies"
    start_urls = ['https://www.imdb.com/chart/top/']

    def parse(self, response):
        """Yield one item per movie entry found in ``response``."""
        for movie in response.css(".ipc-metadata-list-summary-item"):
            # Title and rank: the heading text is expected to look like
            # "<rank>. <title>". partition() never raises, so a heading
            # without the ". " separator is kept as the title with no rank.
            title_and_rank = movie.css("h3.ipc-title__text::text").get()
            if title_and_rank:
                rank, separator, title = title_and_rank.partition(". ")
                if not separator:
                    rank, title = None, title_and_rank
            else:
                rank, title = None, None

            # Movie URL (the href is relative, so resolve it against the page)
            url = movie.css("a.ipc-title-link-wrapper::attr(href)").get()
            full_url = response.urljoin(url) if url else None

            # Rating
            rating = movie.xpath('.//span[contains(@class, "ipc-rating-star--rating")]/text()').get()

            # Votes: the count is taken from the second text node of the
            # vote-count span. Guard the index so an entry without that
            # node yields None rather than raising IndexError and losing
            # every remaining movie of the page.
            vote_parts = movie.xpath('.//span[@class="ipc-rating-star--voteCount"]//text()').getall()
            votes = vote_parts[1].strip('"()') if len(vote_parts) > 1 else None

            # Yield the result
            yield {
                "rank": rank,
                "title": title,
                "url": full_url,
                "rating": rating,
                "votes": votes,
            }


# Declare the crawler process with its settings
## USER_AGENT => Value sent in the User-Agent header of every request
## FEEDS => Where and in which format the scraped items are stored.
##          "overwrite": True replaces the export of a previous run; local
##          files are appended to by default, which would leave two JSON
##          arrays in the same file and make it invalid.
process = CrawlerProcess(settings = {
    'USER_AGENT': 'Chrome/97.0',
    "FEEDS": {
        '01-Become_a_movie_director/' + filename : {
            "format": "json",
            "overwrite": True,
        },
    },
})

# Start the crawling using the spider defined above
process.crawl(IMDbSpider)
process.start()

# Part 2 - Optional 💪💪

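Save the following code in a file named `imdb2.py`. It reads the movie URLs to crawl from `01-Become_a_movie_director/url_list.txt` (one URL per line). ⬇️ 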

In [None]:
import os 
import logging
import scrapy
from scrapy.crawler import CrawlerProcess

# Read the movie page URLs to crawl, one URL per line.
# The context manager closes the file as soon as it has been read.
with open("01-Become_a_movie_director/url_list.txt", "r") as my_file:
    content = my_file.read()

# Keep every non-blank line: this neither drops the last URL when the file
# has no trailing newline nor passes empty strings to Scrapy as start URLs.
content_list = [line.strip() for line in content.splitlines() if line.strip()]

class imdb_spider(scrapy.Spider):
    """Spider that requests every URL of ``content_list`` and extracts movie details."""

    # Name of your spider
    name = "imdb"

    # Urls to start your spider from
    start_urls = content_list

    def parse(self, response):
        """Return one item describing the page held by ``response``.

        The item has the keys ``title``, ``url``, ``cast`` (a list of
        ``{"actor", "role"}`` dicts), ``storyline`` and ``tags``.
        """
        # Locations of the scraped fields inside the page
        title_xpath = '/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/h1/text()'
        cast_xpath = '//*[@id="__next"]/main/div/section[1]/div/section/div/div[1]/section[4]/div[2]/div[2]/div'
        storyline_xpath = '/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/div[1]/p/span[1]/text()'
        tags_xpath = '//*[@id="__next"]/main/div/section[1]/div/section/div/div[1]/section/div[2]/div[2]/a/span/text()'

        title = response.xpath(title_xpath).get()
        page_url = response.request.url

        # One entry per node matched by cast_xpath; the actor and role
        # paths are relative to that node
        cast = []
        for member in response.xpath(cast_xpath):
            actor = member.xpath("div[2]/a/text()").get()
            role = member.xpath("div[2]/div/ul/li/a/span[1]/text()").get()
            cast.append({"actor": actor, "role": role})

        storyline = response.xpath(storyline_xpath).get()
        tags = response.xpath(tags_xpath).getall()

        return {
            "title": title,
            "url": page_url,
            "cast": cast,
            "storyline": storyline,
            "tags": tags,
        }
        

# Name of the file where the results will be saved
filename = "imdb3.json"

# If file already exists, delete it before crawling (because Scrapy will
# concatenate the last and new results otherwise).
# Removing it directly and ignoring FileNotFoundError also covers the case
# where the output directory itself does not exist yet, which os.listdir()
# would turn into a crash before the crawl even starts.
try:
    os.remove(os.path.join('01-Become_a_movie_director', filename))
except FileNotFoundError:
    pass

# Build the settings first, then hand them to a new CrawlerProcess
## USER_AGENT => Simulates a browser on an OS
## LOG_LEVEL => Minimal Level of Log
## FEEDS => Where the file will be stored
## AUTOTHROTTLE_ENABLED => Explicitly switched off here
## More info on built-in settings => https://docs.scrapy.org/en/latest/topics/settings.html?highlight=settings#settings
feed_path = '01-Become_a_movie_director/' + filename
crawler_settings = {
    'USER_AGENT': 'Chrome/97.0',
    'LOG_LEVEL': logging.INFO,
    "FEEDS": {feed_path: {"format": "json"}},
    "AUTOTHROTTLE_ENABLED": False,
}
process = CrawlerProcess(settings=crawler_settings)

# Register the spider defined above, then run the crawl
process.crawl(imdb_spider)
process.start()