# Web Crawler
A Web crawler, sometimes called a spider or spiderbot and often shortened to crawler, is an Internet bot that systematically browses the World Wide Web, typically for the purpose of Web indexing (web spidering). In these tests I use the **Magazine Luiza store** to collect product information and download some images. To help with that, I also use the libraries `BeautifulSoup` and `urllib`.

[source: **keeping the same file format when saving .xlsx file using python**](https://stackoverflow.com/questions/51411809/keeping-the-same-file-format-when-saving-xlsx-file-using-python)

## Import the necessary libraries

In [6]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.request import urlopen
from urllib.request import urlretrieve
import os
import pandas as pd
import requests
import urllib
import csv
import re
import xlsxwriter
from openpyxl.workbook import Workbook
from openpyxl import load_workbook

## Here is my web page that I want to crawl

In [2]:
# URL of the product page used as the starting point of the crawl
webpage = 'https://www.magazineluiza.com.br/smartphone-motorola-g7-play-32gb-indigo-4g-2gb-ram-tela-57-cam-13mp-cam-selfie-8mp/p/155549300/te/mtgp/'

# fetch the start page over HTTP (note: no timeout is passed to this request)
page = requests.get(webpage)

## Here are my functions

In [3]:
def extract_title(content):
    """Return the text of the document's ``<title>`` element.

    ``content`` is HTML markup, parsed here with the ``lxml`` parser.
    The title text is returned with surrounding whitespace removed, or
    ``None`` when no ``<title>`` element containing text is found.
    """
    title_tag = BeautifulSoup(content, 'lxml').find('title', text=True)
    return title_tag.string.strip() if title_tag else None

def extractMax(input):
    """Return the largest run of decimal digits found in ``input``.

    Every maximal run of digits is converted to ``int`` and the biggest
    value is returned. Separators split runs, so ``"R$ 1.299,00"`` yields
    the groups ``1``, ``299`` and ``0`` and the result is ``299``.

    Args:
        input: Text to scan. The name shadows the builtin, but it is kept
            so existing callers are unaffected.

    Returns:
        The largest integer found, or ``None`` when ``input`` contains no
        digits (previously this case raised ``ValueError`` from ``max``).
    """
    # Raw string: ``\d`` in a plain literal is an invalid escape sequence.
    digit_groups = re.findall(r'\d+', input)
    return max(map(int, digit_groups), default=None)

def extract_old_price(content):
    """Return the number extracted from the page's "from" price element.

    The HTML in ``content`` is parsed with ``lxml`` and the first element
    matching the CSS selector ``.price-template__from`` is looked up.

    Returns:
        ``None`` when no such element exists; otherwise whatever
        ``extractMax`` returns for the element's text.
    """
    soup = BeautifulSoup(content, 'lxml')
    tag = soup.select_one('.price-template__from')
    if not tag:
        return None
    # ``tag.string`` is None when the element has more than one child, which
    # made ``.strip()`` raise AttributeError; ``get_text()`` always returns a
    # string and is identical for the single-string case.
    return extractMax(tag.get_text().strip())


def extract_new_price(content):
    """Return the number extracted from the page's current price element.

    The HTML in ``content`` is parsed with ``lxml`` and the first element
    matching the CSS selector ``.price-template__text`` is looked up.

    Returns:
        ``None`` when no such element exists; otherwise whatever
        ``extractMax`` returns for the element's text.
    """
    soup = BeautifulSoup(content, 'lxml')
    tag = soup.select_one('.price-template__text')
    if not tag:
        return None
    # ``tag.string`` is None when the element has more than one child, which
    # made ``.strip()`` raise AttributeError; ``get_text()`` always returns a
    # string and is identical for the single-string case.
    return extractMax(tag.get_text().strip())

def extract_all_links(content):
    """Collect the store-internal links found in an HTML document.

    Returns the set of ``href`` values of all ``<a>`` tags in ``content``
    whose target starts with ``https://www.magazineluiza.com.br/``.
    """
    store_prefix = 'https://www.magazineluiza.com.br/'
    anchors = BeautifulSoup(content, 'lxml').find_all('a', href=True)
    return {
        anchor['href']
        for anchor in anchors
        if anchor['href'].startswith(store_prefix)
    }

def extract_img_links(content):
    """Return the set of ``src`` values of the product carousel images.

    Only ``<img>`` tags whose class attribute is
    ``carousel-product__item-img js-carousels-main-item-img`` are
    considered. A tag without a ``src`` attribute contributes ``None``.
    """
    carousel_class = "carousel-product__item-img js-carousels-main-item-img"
    parsed = BeautifulSoup(content, 'lxml')
    carousel_imgs = parsed.findAll("img", {"class": carousel_class})
    return {img.get('src') for img in carousel_imgs}

def extract_showcase_link(content):
    """Return the ``src`` of the first showcase image in ``content``.

    Looks for ``<img>`` tags with class ``showcase-product__big-img``.
    Returns ``None`` when there is no such tag, or when the first one has
    no ``src`` attribute.
    """
    parsed = BeautifulSoup(content, 'lxml')  # parse with the lxml parser
    showcase_imgs = parsed.findAll('img', {"class": "showcase-product__big-img"})
    if not showcase_imgs:
        return None
    return showcase_imgs[0].get('src')
    
def download_showcase_img(content, dest_dir="/Users/mattosoerick/Desktop/crawler/img/"):
    """Download every showcase image referenced in ``content``.

    Each ``<img>`` tag with class ``showcase-product__big-img`` is fetched
    and stored in ``dest_dir`` under the last path segment of its ``src``.
    Relative ``src`` values are resolved against the module-level
    ``webpage`` URL.

    Args:
        content: HTML markup of the page.
        dest_dir: Directory the images are written to. It is created when
            missing. Defaults to the path that was previously hard-coded.
    """
    soup = BeautifulSoup(content, 'lxml')
    for img in soup.find_all("img", {"class": "showcase-product__big-img"}):
        src = img.get('src')
        if not src:
            # A tag without ``src`` used to raise KeyError; nothing to fetch.
            continue
        file_name = src.split('/')[-1]
        if not file_name:
            # ``src`` ends with '/': no usable file name to save under.
            continue
        # Create the target directory on demand so urlretrieve does not fail
        # on a fresh machine.
        os.makedirs(dest_dir, exist_ok=True)
        img_url = urljoin(webpage, src)
        urlretrieve(img_url, os.path.join(dest_dir, file_name))

# Build the actual crawler

In [4]:
# def crawl(start_url):
#     seen_urls = set([start_url])
#     available_urls = set([start_url])
    
#     header = [
#         'titulo',
#         'preco antigo',
#         'preco novo',
#         'link da vitrine',
#         'link de todas as imagens do carrosel']
    
#     with open('myProducts.csv', 'w', encoding='UTF-8') as csvFile:
#         writer = csv.writer(csvFile,delimiter=',')
#         writer.writerow(header)   
#     csvFile.close()
    
#     while available_urls:
#         url = available_urls.pop()
#         try:
#             content = requests.get(url, timeout=3).text
#         except Exception:
#             continue

#         for link in extract_all_links(content):
#             if link not in seen_urls:
#                 seen_urls.add(link)
#                 available_urls.add(link)
        
#         if(extract_new_price(content)):
#             print(extract_title(content))
#             print(url)
#             print(extract_old_price(content)) 
#             print(extract_new_price(content))
#             print(extract_showcase_link(content))
#             download_showcase_img(content)
#             print()
#             print()
#             print()
            
#             lines = [
#                 (extract_title(content)),\
#                 (url),\
#                 (extract_old_price(content)),\
#                 (extract_new_price(content))]
            
#             with open('myProducts.csv', 'a', encoding='UTF-8') as csvFile:

#                 writer = csv.writer(csvFile,delimiter=',')
#                 writer.writerow(lines)
#             csvFile.close() 

# try:
#     crawl(webpage)
# except KeyboardInterrupt:
#     print('Bye!')

In [14]:
def crawl(start_url, workbook_name='uuuu.xlsx'):
    """Crawl store pages from ``start_url`` and record products in a workbook.

    Starting at ``start_url``, every page is fetched, its store-internal
    links are queued (each URL is visited at most once), and pages for which
    ``extract_new_price`` returns a truthy value are treated as product
    pages: their title and URL are printed and one row is appended to the
    spreadsheet.

    The previous version wrote five headers but only four values per row,
    in a different order, so the URL ended up under 'preco antigo' and the
    two link columns were never filled. Headers and rows now line up
    one-to-one, with an explicit 'url' column.

    Args:
        start_url: URL of the first page to visit.
        workbook_name: Path of the ``.xlsx`` file to (re)create. Defaults to
            the previously hard-coded ``'uuuu.xlsx'``.
    """
    seen_urls = {start_url}
    available_urls = {start_url}

    # One header per value appended for each product below, in the same order.
    headers = [
        'titulo',
        'url',
        'preco antigo',
        'preco novo',
        'link da vitrine',
        'link de todas as imagens do carrosel',
    ]

    wb = Workbook()
    sheet = wb.active
    sheet.title = 'companies'
    sheet.append(headers)
    wb.save(filename=workbook_name)

    while available_urls:
        url = available_urls.pop()
        try:
            content = requests.get(url, timeout=3).text
        except requests.RequestException:
            # Best-effort crawl: skip pages that cannot be fetched.
            continue

        for link in extract_all_links(content):
            if link not in seen_urls:
                seen_urls.add(link)
                available_urls.add(link)

        # Parse the price once; a falsy result means "not a product page".
        new_price = extract_new_price(content)
        if not new_price:
            continue

        title = extract_title(content)
        print(title)
        print(url)
        print()
        print()
        print()

        # A cell holds a single value, so the carousel links are joined into
        # one string; sorting makes the output deterministic.
        carousel_links = ', '.join(
            sorted(link for link in extract_img_links(content) if link)
        )
        sheet.append([
            title,
            url,
            extract_old_price(content),
            new_price,
            extract_showcase_link(content),
            carousel_links,
        ])
        # Save after every row so an interrupted crawl keeps what it found.
        # The workbook stays in memory, so it is not reloaded from disk.
        wb.save(filename=workbook_name)
            
# Start crawling from the seed product page. crawl() keeps going while it has
# unvisited links queued, so Ctrl-C is the practical way to stop it.
try:
    crawl(webpage)
    
except KeyboardInterrupt:
    # Exit with a short message instead of a traceback when interrupted.
    print('Bye!')

Smartphone Motorola G7 Play 32GB Indigo 4G - 2GB RAM Tela 5,7” Câm. 13MP + Câm. Selfie 8MP - Moto G7 Play - Magazine Luiza
https://www.magazineluiza.com.br/smartphone-motorola-g7-play-32gb-indigo-4g-2gb-ram-tela-57-cam-13mp-cam-selfie-8mp/p/155549300/te/mtgp/



iPhone 8 Apple 64GB Dourado 4G Tela 4,7” - Retina Câm. 12MP + Selfie 7MP iOS 11 - iPhone - Magazine Luiza
https://www.magazineluiza.com.br/iphone-8-apple-64gb-dourado-4g-tela-47-retina-cam-12mp-selfie-7mp-ios-11/p/155542800/te/teip/



Cortador de Cabelo Mallory Mithos Power - 1 Velocidade - Máquina de Cortar Cabelo - Magazine Luiza
https://www.magazineluiza.com.br/cortador-de-cabelo-mallory-mithos-power-1-velocidade/p/208569200/pf/macc/



Bye!
