In [8]:
# Librerías para automatización de navegadores web con Selenium
from selenium import webdriver  
from webdriver_manager.chrome import ChromeDriverManager 
 
from selenium.webdriver.common.keys import Keys  
from selenium.webdriver.support.ui import Select 
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException 

from time import sleep

# Librerías para captura de datos
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import pickle
import re
import json
import requests
from tqdm import tqdm

# Google sheets
import gspread
from google.auth import default
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials
from gspread_dataframe import set_with_dataframe
from googleapiclient.discovery import build

In [91]:
url = "https://minimalismbrand.com/products/camiseta-algodon-organico?variant=34948534108317"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}
response = requests.get(url, headers=headers)
if response.status_code == 200:
    html_content = response.text
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")
    html_content = None

if html_content:
    soup = BeautifulSoup(html_content, 'html.parser')
    

In [94]:
with open("datos/sopa.html", "w", encoding="utf-8") as file:
    file.write(soup.prettify())

### Conseguir URLs de Categorías

In [None]:
url = "https://minimalismbrand.com/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

href_links = [a["href"] for a in soup.find_all("a", class_="navigation__link") if "href" in a.attrs]

base_url = "https://minimalismbrand.com"

full_urls = []
for path in href_links:
    full_url = base_url + path
    full_urls.append(full_url)

print(full_urls)

['https://minimalismbrand.com/collections/ropa-minimalism', 'https://minimalismbrand.com/collections/ropa-minimalism-mujer', 'https://minimalismbrand.com/collections/camisetas-algodon-organico-nino-a', 'https://minimalismbrand.com/collections/mochilas', 'https://minimalismbrand.com/collections/hogar', 'https://minimalismbrand.com/pages/manifiesto-minimalism', 'https://minimalismbrand.com/collections/ropa-minimalism', 'https://minimalismbrand.com/collections/packs-minimalism', 'https://minimalismbrand.com/products/camiseta-algodon-organicox3', 'https://minimalismbrand.com/products/calzoncillos-organicos-x3-1', 'https://minimalismbrand.com/products/calcetines-organicos-tobilleros-x5', 'https://minimalismbrand.com/products/calcetines-organicos-x5-1', 'https://minimalismbrand.com/collections/hombre', 'https://minimalismbrand.com/collections/camisetas-algodon-organico', 'https://minimalismbrand.com/collections/calzoncillos', 'https://minimalismbrand.com/collections/calcetines-algodon-organi

### Conseguir URLs de Productos

In [81]:
def urls_productos(urls):
    price_pattern = re.compile(r"€\d+,\d{2}|€\d+\.\d{2}|\$\d+,\d{2}|\$\d+\.\d{2}")
    all_products_data = []
    all_product_links = [] 

    for url in urls:
        category = url.split("/")[-1]
        
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            
            product_links = [a["href"] for a in soup.find_all("a", href=True) if "product" in a["href"]]
            all_product_links.extend(product_links)

            product_sections = soup.find_all("div", class_="product-list")
            for product in product_sections:
                items = product.get_text(strip=True).split("ORGÁNICO")
                
                for item in items:
                    item = item.strip()
                    price_match = price_pattern.search(item)
                    price = price_match.group() if price_match else None
                    description = price_pattern.sub("", item).strip()
                    
                    if description and price:
                        all_products_data.append({
                            "description": description, 
                            "price": price, 
                            "url": url,
                            "category": category
                        })
        else:
            print(f"Failed to retrieve {url}")

    df = pd.DataFrame(all_products_data)
    base_url = "https://minimalismbrand.com"

    full_urls = []
    for path in all_product_links:
        full_url = base_url + path
        full_urls.append(full_url)
    
    return full_urls

In [82]:
links = urls_productos(full_urls)

In [None]:
set_links = list(set(links))
len(set_links)

149

In [103]:
set_links

['https://minimalismbrand.com/products/camiseta-algodon-organico-nino?variant=46638545633615',
 'https://minimalismbrand.com/products/champu-solido-uso-diario-x3',
 'https://minimalismbrand.com/products/camiseta-oversize-algodon-organico?variant=47929940246863',
 'https://minimalismbrand.com/products/calcetines-organicos-pinkies-x7',
 'https://minimalismbrand.com/products/pantalon-organico-semi-skinny-hombre',
 'https://minimalismbrand.com/products/camiseta-minimalism-mujer-algodon-organico?variant=35087937339549',
 'https://minimalismbrand.com/products/toilet-bag-minimalism',
 'https://minimalismbrand.com/products/camiseta-algodon-organico?variant=34948530765981',
 'https://minimalismbrand.com/products/bralette-minimalism-algodon-organico?variant=35088152756381',
 'https://minimalismbrand.com/products/toalla-ducha-algodon-organico?variant=43057217765575',
 'https://minimalismbrand.com/products/pantalon-relaxed-jogger-organico?variant=49870609514831',
 'https://minimalismbrand.com/prod

### Captura de Productos

In [159]:

def productos_minimalism(urls):
    all_product_data = []  # List to hold data for each product
    
    # Define patterns for size, color, and category extraction
    size_pattern = re.compile(r"^(XS|S|M|L|XL|XXL)$")
    color_pattern = re.compile(r"^[A-Za-z\s]+$")
    category_pattern = re.compile(r"products/([^/-]+)")

    for url in tqdm(urls):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raise an error for bad status codes
            soup = BeautifulSoup(response.text, "html.parser")
            
            # Extract the title
            title = soup.find("h1", class_="title").get_text(strip=True) if soup.find("h1", class_="title") else None
            
            # Extract the current price
            current_price = soup.find("span", class_="current-price theme-money").get_text(strip=True) if soup.find("span", class_="current-price theme-money") else None
            
            # Extract sizes and colors
            sizes_colors = [size.get_text(strip=True) for size in soup.find_all("span", class_="opt-label__text")]
            sizes = [item for item in sizes_colors if size_pattern.match(item)]
            colors = [item for item in sizes_colors if color_pattern.match(item) and not size_pattern.match(item)]
            
            # Extract cost breakdown
            cost_breakdown = []
            for li in soup.find_all("li"):
                if "—" in li.get_text() and "%" in li.get_text():
                    cost_breakdown.append(li.get_text(strip=True))
            
            # Extract production details
            production_details = []
            production_section = soup.find("div", id="block-id-template--19096866292047__2cecfbfa-137a-4ccc-97aa-6738b0cac11b-text_block-1")
            if production_section:
                production_details = [item.get_text(strip=True) for item in production_section.find_all("li")]
            
            # Extract country and manufacturing details
            country_section = soup.find("div", id="block-id-template--19096866292047__2cecfbfa-137a-4ccc-97aa-6738b0cac11b-text_block-2")
            material = material2 = None
            if country_section:
                text_items = [item.get_text(strip=True) for item in country_section.find_all(["p", "a"])]
                if len(text_items) > 2:
                    material = text_items[0] if country_section else None
                    factory = text_items[2] if country_section else None
            
            # Extract category from URL
            category_match = category_pattern.search(url)
            category = category_match.group(1) if category_match else None

            # Extract rating (valoración)
            opinion = soup.find("span", class_="stamped-badge-caption")
            count = opinion.get("data-reviews") if opinion else None
            rating = opinion.get("data-rating") if opinion else None
            
            # Append product data to the list
            all_product_data.append({
                "producto": title,
                "precio": current_price,
                "tallas": sizes,
                "colores": colors,
                "costes": cost_breakdown,
                "procedencia_algodon": production_details,
                "material": material,
                "fabrica": factory,
                "categoria": category,
                "valoracion": rating,
                "cant_valoraciones": count,
                "url": url
            })

        except requests.RequestException as e:
            print(f"Failed to retrieve {url}: {e}")
    
    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(all_product_data)
    df["precio"] = df["precio"].str.replace("€","").astype(float)
    df['colores'] = df['colores'].apply(lambda x: ', '.join(x) if isinstance(x, list) else "")
    df['tallas'] = df['tallas'].apply(lambda x: ', '.join(x) if isinstance(x, list) else "")
    df['costes'] = df['costes'].apply(lambda x: ', '.join(x) if isinstance(x, list) else "")
    df['procedencia_algodon'] = df['procedencia_algodon'].apply(lambda x: ', '.join(x) if isinstance(x, list) else "")
    df['fabrica'] = df['fabrica'].apply(lambda x: x.split(":", 1)[1] if ":" in x else "")
    return df

In [160]:
df_minimalism = productos_minimalism(set_links)

 28%|██▊       | 41/149 [00:09<00:27,  4.00it/s]

Failed to retrieve https://minimalismbrand.comhttps://minimalismbrand.com/collections/carteras/products/cartera-minimalism: HTTPSConnectionPool(host='minimalismbrand.comhttps', port=443): Max retries exceeded with url: /minimalismbrand.com/collections/carteras/products/cartera-minimalism (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x12b7219a0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))


 62%|██████▏   | 93/149 [00:21<00:09,  6.06it/s]

Failed to retrieve https://minimalismbrand.comhttps://minimalismbrand.com/collections/carteras/products/cartera-minimalism?variant=19279110832190: HTTPSConnectionPool(host='minimalismbrand.comhttps', port=443): Max retries exceeded with url: /minimalismbrand.com/collections/carteras/products/cartera-minimalism?variant=19279110832190 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x126f6ff70>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))


 83%|████████▎ | 124/149 [00:29<00:04,  6.17it/s]

Failed to retrieve https://minimalismbrand.comhttps://minimalismbrand.com/collections/carteras/products/cartera-minimalism?variant=19279110799422: HTTPSConnectionPool(host='minimalismbrand.comhttps', port=443): Max retries exceeded with url: /minimalismbrand.com/collections/carteras/products/cartera-minimalism?variant=19279110799422 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x128b18340>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))


100%|██████████| 149/149 [00:35<00:00,  4.16it/s]


In [161]:
df_minimalism

Unnamed: 0,producto,precio,tallas,colores,costes,procedencia_algodon,material,fabrica,categoria,valoracion,cant_valoraciones,url
0,Camiseta algodón orgánico niño/a,9.0,,"Negra, Blanca, Gris",44% — Fabricación (materia prima + confección ...,"50% Turquía, 50% Kazajistán, España, Tanzania",Algodón certificado:OCS100 (Organic Content St...,"Fábrica Ribeiro & Matos, en Guimarães (Portuga...",camiseta,48,4,https://minimalismbrand.com/products/camiseta-...
1,Champú sólido uso diario x3,20.0,,,44% — Fabricación (materia prima + confección ...,,,"Fábrica Ribeiro & Matos, en Guimarães (Portuga...",champu,,,https://minimalismbrand.com/products/champu-so...
2,Camiseta oversize algodón orgánico,25.0,"XS, S, M, L, XL, XXL","Negra, Blanca, Lavagrey, Navyblue",44% — Fabricación (materia prima + confección ...,"50% Turquía, 50% Kazajistán, España, Tanzania",Algodón certificado:OCS100 (Organic Content St...,"Fábrica Ribeiro & Matos, en Guimarães (Portuga...",camiseta,43,13,https://minimalismbrand.com/products/camiseta-...
3,Calcetines Pinkies Orgánicos - Pack 7 uds,28.0,,,44% — Fabricación (materia prima + confección ...,,,"Fábrica Ribeiro & Matos, en Guimarães (Portuga...",calcetines,,,https://minimalismbrand.com/products/calcetine...
4,Pantalón orgánico Slim Hombre,70.0,,,44% — Fabricación (materia prima + confección ...,"50% Turquía, 50% Kazajistán, España, Tanzania",Algodón certificado:OCS100 (Organic Content St...,"Fábrica Ribeiro & Matos, en Guimarães (Portuga...",pantalon,48,8,https://minimalismbrand.com/products/pantalon-...
...,...,...,...,...,...,...,...,...,...,...,...,...
141,Camiseta oversize algodón orgánico,25.0,"XS, S, M, L, XL, XXL","Negra, Blanca, Lavagrey, Navyblue",44% — Fabricación (materia prima + confección ...,"50% Turquía, 50% Kazajistán, España, Tanzania",Algodón certificado:OCS100 (Organic Content St...,"Fábrica Ribeiro & Matos, en Guimarães (Portuga...",camiseta,43,13,https://minimalismbrand.com/products/camiseta-...
142,Jersey de punto fino de lana merino reciclado,75.0,"S, M, L, XL, XXL","Gris Vigore, Beige, Azul Marino, Verde",44% — Fabricación (materia prima + confección ...,,,"Fábrica Ribeiro & Matos, en Guimarães (Portuga...",jersey,,,https://minimalismbrand.com/products/jersey-de...
143,Funda de portátil impermeable,35.0,,,44% — Fabricación (materia prima + confección ...,,,"Fábrica Ribeiro & Matos, en Guimarães (Portuga...",laptop,50,2,https://minimalismbrand.com/products/laptop-ca...
144,Tarjeta regalo · Gift card · Formato digital,25.0,,,44% — Fabricación (materia prima + confección ...,"50% Turquía, 50% Kazajistán, España, Tanzania",Algodón certificado:OCS100 (Organic Content St...,"Fábrica Ribeiro & Matos, en Guimarães (Portuga...",tarjeta,50,2,https://minimalismbrand.com/products/tarjeta-r...


In [162]:
df_minimalism.to_csv("datos/minimalism_final.csv")

In [163]:
df_minimalism["categoria"].value_counts()

categoria
camiseta        39
calcetines      12
jersey          11
sudadera        10
pantalon        10
calzoncillos     6
tanga            5
juego            5
culotte          5
polo             5
toalla           4
cartera          4
bralette         3
pack             3
banador          3
gorro            3
gel              3
champu           3
mochila          2
bufanda          2
guantes          2
pano             1
toilet           1
toallas          1
vaquero          1
laptop           1
tarjeta          1
Name: count, dtype: int64

In [None]:
# Define the scopes for Google Sheets and Google Drive
SCOPES = [
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive"
]

# Load the Service Account credentials with the correct scopes
creds = Credentials.from_service_account_file("datos/client_secret.json", scopes=SCOPES)
client = gspread.authorize(creds)

# Load your CSV file into a DataFrame
df = pd.read_csv("datos/minimalism_final.csv")  # Replace with your CSV file path


# Set up the Drive API client
drive_service = build('drive', 'v3', credentials=creds)

# Define metadata for the new Google Sheet, including the specific folder ID
folder_id = '1P8WTL5shxMqT3g92WbzCzyNJUvNj_li0'  # Replace with your actual folder ID
file_metadata = {
    'name': "Minimalism",  # The name of your Google Sheet
    'mimeType': 'application/vnd.google-apps.spreadsheet',
    'parents': [folder_id]  # Specify the folder ID here
}

# Create the new Google Sheet in the specified folder
file = drive_service.files().create(body=file_metadata, fields='id').execute()
spreadsheet_id = file.get('id')

# Open the newly created Google Sheet by ID
spreadsheet = client.open_by_key(spreadsheet_id)
worksheet = spreadsheet.get_worksheet(0)  # Access the first sheet

# Upload the DataFrame to the Google Sheet
set_with_dataframe(worksheet, df)

print("CSV data uploaded to Google Sheets successfully!")

CSV data uploaded to Google Sheets successfully!
