In [1]:
import time
import requests
import numpy as np
import pandas as pd
from scrapy.http import TextResponse
import re

# Problem 1

In [2]:
def send_request(url):
    page = requests.get(url)
    response = TextResponse(body=page.text,url=url,encoding="utf-8")
    return response

In [3]:
def scraping_quotes(response):
    quotes = response.css("div.quote > span.text::text").extract()
    author = response.css("small.author::text").extract()
    tags_div =response.css("div.tags ")
    tags = [i.css("a.tag::text").extract() for i in tags_div]
    hyper = response.css("small.author ~ a::attr(href)").extract()
    base_link = 'http://quotes.toscrape.com'
    hyperlink = [base_link + i for i in hyper]
    return pd.DataFrame({"quotes":quotes,"author":author,"tags":tags,"hyperlink":hyperlink})
    

In [4]:
quotes = []
url = "http://quotes.toscrape.com/" 
while True:
    response = send_request(url)
    quotes.append(scraping_quotes(response))
    next_page_url = response.css("li.next > a::attr(href)").extract_first()
    if next_page_url:
        url = response.urljoin(next_page_url)
    else:
        break

In [5]:
quote = pd.concat(quotes)
quote

Unnamed: 0,quotes,author,tags,hyperlink
0,“The world as we have created it is a process ...,Albert Einstein,"[change, deep-thoughts, thinking, world]",http://quotes.toscrape.com/author/Albert-Einstein
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"[abilities, choices]",http://quotes.toscrape.com/author/J-K-Rowling
2,“There are only two ways to live your life. On...,Albert Einstein,"[inspirational, life, live, miracle, miracles]",http://quotes.toscrape.com/author/Albert-Einstein
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"[aliteracy, books, classic, humor]",http://quotes.toscrape.com/author/Jane-Austen
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"[be-yourself, inspirational]",http://quotes.toscrape.com/author/Marilyn-Monroe
...,...,...,...,...
5,“You never really understand a person until yo...,Harper Lee,[better-life-empathy],http://quotes.toscrape.com/author/Harper-Lee
6,“You have to write the book that wants to be w...,Madeleine L'Engle,"[books, children, difficult, grown-ups, write,...",http://quotes.toscrape.com/author/Madeleine-LE...
7,“Never tell the truth to people who are not wo...,Mark Twain,[truth],http://quotes.toscrape.com/author/Mark-Twain
8,"“A person's a person, no matter how small.”",Dr. Seuss,[inspirational],http://quotes.toscrape.com/author/Dr-Seuss


# Problem 2


In [6]:
def scraping_books(url,base_url="http://books.toscrape.com/"):
    page = requests.get(url)
    response = TextResponse(body=page.text,url=url,encoding="utf-8")
    title = response.css("h3 > a::attr(title)").extract()
    price1 = response.css("p.price_color::text").extract()
    price = [i.replace("Â", "") for i in price1]
    burl = response.css("h3 >a::attr(href)").extract()
    purl = response.css("img::attr(src)").extract()
    star = response.css("p[class^='star-rating']::attr(class)").extract()
    star_rating = []
    for i in star:
        star_rating.append(i.replace("star-rating", ""))
    stock  =response.css("p.price_color ~ p[class^='instock']::attr(class)").extract()
    instock = [i.replace("availability", " ") for i in stock]    
    base_url = "http://books.toscrape.com/catalogue/"
    bookurl = [base_url + i for i in burl]
    picurl = [base_url + i for i in purl]
    return pd.DataFrame({"title":title,  "price":price, "bookurl":bookurl, "picurl":picurl,"star_rating":star_rating,"instock":instock})

In [7]:
books = []
for i in range(1,1000):
    cureent_page =scraping_books(url = f"http://books.toscrape.com/catalogue/page-{i}.html")
    if cureent_page.shape[0] == 0:
        break
    else:
        books.append(cureent_page)

In [8]:
books = pd.concat(books)
books

Unnamed: 0,title,price,bookurl,picurl,star_rating,instock
0,A Light in the Attic,£51.77,http://books.toscrape.com/catalogue/a-light-in...,http://books.toscrape.com/catalogue/../media/c...,Three,instock
1,Tipping the Velvet,£53.74,http://books.toscrape.com/catalogue/tipping-th...,http://books.toscrape.com/catalogue/../media/c...,One,instock
2,Soumission,£50.10,http://books.toscrape.com/catalogue/soumission...,http://books.toscrape.com/catalogue/../media/c...,One,instock
3,Sharp Objects,£47.82,http://books.toscrape.com/catalogue/sharp-obje...,http://books.toscrape.com/catalogue/../media/c...,Four,instock
4,Sapiens: A Brief History of Humankind,£54.23,http://books.toscrape.com/catalogue/sapiens-a-...,http://books.toscrape.com/catalogue/../media/c...,Five,instock
...,...,...,...,...,...,...
15,Alice in Wonderland (Alice's Adventures in Won...,£55.53,http://books.toscrape.com/catalogue/alice-in-w...,http://books.toscrape.com/catalogue/../media/c...,One,instock
16,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",£57.06,http://books.toscrape.com/catalogue/ajin-demi-...,http://books.toscrape.com/catalogue/../media/c...,Four,instock
17,A Spy's Devotion (The Regency Spies of London #1),£16.97,http://books.toscrape.com/catalogue/a-spys-dev...,http://books.toscrape.com/catalogue/../media/c...,Five,instock
18,1st to Die (Women's Murder Club #1),£53.98,http://books.toscrape.com/catalogue/1st-to-die...,http://books.toscrape.com/catalogue/../media/c...,One,instock


# Problem 3

In [9]:
def imdb_scrap(url,base_url="https://www.imdb.com/chart/moviemeter/"):
    page = requests.get(url)
    response = TextResponse(body=page.text,url=url,encoding="utf-8")
    title = response.css("td.titleColumn >a ::text").extract()
    year_brack = response.css("td >span.secondaryInfo::text").extract()
    year = [i.replace("(" , "").replace(")" , "") for i in year_brack]
    ratings_td = response.css("td.ratingColumn.imdbRating")
    rating = [i.css('strong::text').extract_first() for i in ratings_td]
    murl = response.css("td.titleColumn ::attr(href)").extract()
    hyper = 'https://www.imdb.com/'
    movie_hyperlink = [hyper+i for i in murl]
    rank_div = response.css("div.velocity")
    rank_ext = [i.css("::text").extract_first() for i in rank_div]
    rank = [i.replace('\n',"").replace("(no change)","") for i in rank_ext]  
    return pd.DataFrame({"title":title,  "year":year, "rating":rating, "movie_hyperlink": movie_hyperlink , 'rank':rank,})

In [10]:
scrapedimdb =imdb_scrap(url = "https://www.imdb.com/chart/moviemeter/")
scrapedimdb

Unnamed: 0,title,year,rating,movie_hyperlink,rank
0,365 dni,2020,3.5,https://www.imdb.com//title/tt10886166/,1
1,Da 5 Bloods,2020,6.7,https://www.imdb.com//title/tt9777644/,2
2,Artemis Fowl,2020,4.1,https://www.imdb.com//title/tt3089630/,3
3,The King of Staten Island,2020,7.2,https://www.imdb.com//title/tt9686708/,4
4,Knives Out,2019,7.9,https://www.imdb.com//title/tt8946378/,5
...,...,...,...,...,...
95,The Rising Hawk,2019,6.2,https://www.imdb.com//title/tt7439064/,96
96,Bloodshot,2020,5.7,https://www.imdb.com//title/tt1634106/,97
97,The Room,2003,3.7,https://www.imdb.com//title/tt0368226/,98
98,Emma.,2020,6.8,https://www.imdb.com//title/tt9214832/,99
