<a href="https://colab.research.google.com/github/diarrabell/fashion-recs/blob/main/fashion_crawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook contains a web scraper that collects product images from Forever21.com and organizes them into a DataFrame. That DataFrame is then used to generate recommendations from the website's products.

In [None]:
from bs4 import BeautifulSoup
import requests
import os
import pandas as pd

In [None]:
# Mount Google Drive into the Colab runtime, then make the Drive data folder
# the working directory for the rest of the notebook.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/data


In [None]:
# Start from an empty catalog that only defines the three columns;
# scraped products are appended to it in later cells.
product_catalog = pd.DataFrame(
    columns=[
        "img_name",
        "links",
        "aesthetics",
    ]
)

## Forever 21 Web Scraper

In [None]:
def _extract_product(product, baseurl):
    """Read one product-grid item into ``(pid, product_url, image_url)``.

    Returns ``None`` when any expected element or attribute is missing, so the
    caller can skip the tile instead of crashing on changed site markup.
    """
    wrapper = product.find("div", {"class": "product"})
    if wrapper is None:
        return None

    tile = wrapper.find("div", {"class": "product-tile product-tile--default"})
    anchor = wrapper.find(
        "a", {"class": "product-tile__anchor product-tile__anchor--product-info"}
    )
    if tile is None or anchor is None:
        return None

    # Walk tile -> media -> media container -> <picture> -> <source>,
    # bailing out as soon as one level is absent.
    node = tile
    for tag, css_class in (
        ("div", "product-tile__media product-tile__media--default"),
        ("div", "product-tile__media-container component-overlay component-overlay--center"),
        ("picture", None),
        ("source", None),
    ):
        node = node.find(tag, {"class": css_class} if css_class else {})
        if node is None:
            return None

    pid = tile.get("data-pid")
    href = anchor.get("href")
    # NOTE(review): the attribute value is used verbatim as the download URL;
    # confirm it always holds a single absolute URL.
    image_link = node.get("data-srcset")
    if not (pid and href and image_link):
        return None
    return pid, baseurl + href, image_link


def _download_image(img_link, dest_path, headers, timeout):
    """Download ``img_link`` to ``dest_path``; return True if a file was written.

    Failures are reported with ``print`` and never raised, so one bad image
    does not abort the whole scrape.
    """
    try:
        response = requests.get(img_link, headers=headers, timeout=timeout)
        response.raise_for_status()
        payload = response.content
    except requests.RequestException as err:
        print(f"download failed for {img_link}: {err}")
        return False

    # Heuristic kept from the original implementation: a body that decodes as
    # UTF-8 is text (e.g. an HTML error page), not image bytes, so it is not saved.
    try:
        payload.decode("utf-8")
    except UnicodeDecodeError:
        pass
    else:
        print(f"download failed for {img_link}: response was text, not image data")
        return False

    try:
        with open(dest_path, "wb") as image_file:
            image_file.write(payload)
    except OSError as err:
        print(f"download failed for {img_link}: {err}")
        return False
    return True


def scrape_download_label(baseurl, headers, url, labels,
                          folder_name="/content/drive/MyDrive/data/test-data",
                          timeout=30):
    """Scrape a product listing page, download its images and label the products.

    Args:
        baseurl: Site root that is prepended to each product's relative link.
        headers: HTTP headers (e.g. a User-Agent) sent with every request.
        url: URL of the listing page to scrape.
        labels: Label value assigned to every product found on the page.
        folder_name: Directory the images are saved to as ``<pid>.jpg``;
            created if it does not exist.
        timeout: Per-request timeout in seconds.

    Returns:
        A DataFrame with columns ``img_name`` (product id), ``links`` (product
        page URL) and ``aesthetics`` (``labels``), one row per parsed product.
        A product is listed even if its image download failed.

    Raises:
        requests.RequestException: If the listing page cannot be fetched or
            answers with an HTTP error status.
    """
    # Fetch the listing page; fail loudly rather than parse an error page
    # and silently return an empty result.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    productlist = soup.find_all("div", {"class": "product-grid__item"})

    records = []
    img_links = []
    skipped = 0
    for product in productlist:
        parsed = _extract_product(product, baseurl)
        if parsed is None:
            skipped += 1
            continue
        pid, product_link, image_link = parsed
        # For now every product on the page gets the same label.
        records.append((pid, product_link, labels))
        img_links.append(image_link)
    if skipped:
        print(f"skipped {skipped} product tile(s) with unexpected markup")

    # Download images (best effort).
    os.makedirs(folder_name, exist_ok=True)
    for (pid, _, _), image_link in zip(records, img_links):
        dest_path = os.path.join(folder_name, f"{pid}.jpg")
        _download_image(image_link, dest_path, headers, timeout)

    # Compile dataframe.
    product_df = pd.DataFrame(records, columns=['img_name', 'links', "aesthetics"])

    return product_df

## Create Testing Dataset

In [None]:
# Inputs for the scraper run that builds the test catalog.
baseurl = 'https://www.forever21.com'  # site root, joined with each product's href
headers = {
    # Browser-like User-Agent handed to the scraper function.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/89.0.4389.82 Safari/537.36'
    ),
}
# Listing page whose products are scraped.
url = "https://www.forever21.com/us/shop/catalog/category/f21/promo-barbie-collection"
# Single label assigned to every product scraped from `url`.
labels = '70s boho'

In [None]:
# Run the scraper for the configured listing page, then preview the last rows.
products = scrape_download_label(
    baseurl=baseurl,
    headers=headers,
    url=url,
    labels=labels,
)
products.tail()

Unnamed: 0,img_name,links,aesthetics
18,20004602220102,https://www.forever21.com/us/2000460222.html?d...,70s boho
19,1000460718011,https://www.forever21.com/us/1000460718.html?d...,70s boho
20,1000461383011,https://www.forever21.com/us/1000461383.html?d...,70s boho
21,1000460426011,https://www.forever21.com/us/1000460426.html?d...,70s boho
22,20004597390101,https://www.forever21.com/us/2000459739.html?d...,70s boho


In [None]:
# Add the newly scraped products to the catalog. Exact duplicate rows are
# dropped so that re-running this cell does not append the same products twice
# (assumes the cell values are hashable, e.g. string labels as used here).
product_catalog = (
    pd.concat([product_catalog, products], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Show the accumulated catalog as this cell's output.
product_catalog

Unnamed: 0,img_name,links,aesthetics
0,20004596280102,https://www.forever21.com/us/2000459628.html?d...,70s boho
1,20004595940101,https://www.forever21.com/us/2000459594.html?d...,70s boho
2,2000459965032,https://www.forever21.com/us/2000459965.html?d...,70s boho
3,20004602160303,https://www.forever21.com/us/2000460216.html?d...,70s boho
4,20004595990101,https://www.forever21.com/us/2000459599.html?d...,70s boho
5,2000459967031,https://www.forever21.com/us/2000459967.html?d...,70s boho
6,1000460710011,https://www.forever21.com/us/1000460710.html?d...,70s boho
7,20004601020101,https://www.forever21.com/us/2000460102.html?d...,70s boho
8,20004600970202,https://www.forever21.com/us/2000460097.html?d...,70s boho
9,20004596300201,https://www.forever21.com/us/2000459630.html?d...,70s boho


In [None]:
# Persist the catalog on Drive as a CSV file (without the row index)
# to preserve this data.
product_catalog.to_csv(
    path_or_buf="/content/drive/MyDrive/data/product_catalog.csv",
    index=False,
)