<a href="https://colab.research.google.com/github/ctar/DS_Portfolio/blob/master/pe_dealflow/fastener_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import json
from urllib.parse import quote_plus

# Please note: f-strings require Python 3.6+

# Base URL of the Common Crawl Index server. HTTPS is used so the index lookup
# goes over the same scheme as the data download in fetch_page_from_cc.
CC_INDEX_SERVER = 'https://index.commoncrawl.org/'

# The Common Crawl index (crawl identifier) to query
INDEX_NAME = 'CC-MAIN-2023-40'      # Replace with the latest index name

# Example URL to look up in the Common Crawl index
target_url = 'commoncrawl.org/faq'  # Replace with your target URL

# Function to search the Common Crawl Index
def search_cc_index(url):
    """Look up ``url`` in the Common Crawl index selected by ``INDEX_NAME``.

    The raw response body is printed so the notebook shows what the index
    server answered.

    Args:
        url: URL (or URL pattern) to look up. It is percent-encoded with
            ``quote_plus`` before being placed in the query string.

    Returns:
        A list of dicts, one per non-blank JSON line of the response body,
        when the server answers with status 200 (an empty list if the body
        contains no records). ``None`` for any other status code.
    """
    encoded_url = quote_plus(url)
    index_url = f'{CC_INDEX_SERVER}{INDEX_NAME}-index?url={encoded_url}&output=json'
    # A timeout keeps one stalled request from hanging the notebook forever.
    response = requests.get(index_url, timeout=30)
    print("Response from CCI:", response.text)  # Output the response from the server
    if response.status_code != 200:
        return None
    # The body is newline-delimited JSON. Blank lines (including a completely
    # empty body) are skipped because json.loads('') would raise.
    return [
        json.loads(line)
        for line in response.text.split('\n')
        if line.strip()
    ]

# Function to fetch the content from Common Crawl
def fetch_page_from_cc(records):
    """Download the archived bytes for the first index record that can be fetched.

    Each record is tried in order with an HTTP ranged request against
    ``data.commoncrawl.org``. A failed request is reported and the next record
    is tried, instead of giving up after the first one.

    Args:
        records: Iterable of index-record dicts carrying ``filename``,
            ``offset`` and ``length`` entries (the shape produced by
            ``search_cc_index``).

    Returns:
        The response body (``bytes``) of the first request answered with
        status 206, or ``None`` if ``records`` is empty or every request fails.
    """
    for record in records:
        offset, length = int(record['offset']), int(record['length'])
        s3_url = f'https://data.commoncrawl.org/{record["filename"]}'
        # HTTP byte ranges are inclusive on both ends, hence the -1.
        byte_range = f'bytes={offset}-{offset + length - 1}'
        response = requests.get(s3_url, headers={'Range': byte_range}, timeout=60)
        if response.status_code == 206:
            # Process the response content if necessary
            # For example, you can use warcio to parse the WARC record
            return response.content
        # Report the failure and fall through to the next record.
        print(f"Failed to fetch data: {response.status_code}")
    return None



In [2]:
# Rebind target_url from the Common Crawl FAQ example to a fastener company site.
target_url = 'www.atf-inc.com'

In [3]:
def scrape(target_url):
    """Search the index for ``target_url`` and fetch its archived content.

    Args:
        target_url: URL passed to ``search_cc_index``.

    Returns:
        Whatever ``fetch_page_from_cc`` returns for the index records when
        that value is truthy; ``None`` when no records were found or nothing
        could be fetched.
    """
    matches = search_cc_index(target_url)
    if not matches:
        print(f"No records found for {target_url}")
        return None

    print(f"Found {len(matches)} records for {target_url}")

    # Hand every index record to the fetcher and return what it retrieves.
    payload = fetch_page_from_cc(matches)
    if not payload:
        return None

    print(f"Successfully fetched content for {target_url}")
    # The caller can now process the returned content as needed
    return payload

In [4]:
import gdown

# Google Drive link for the saved fasteners.html page
url = 'https://drive.google.com/uc?id=1thWydc9rLFDzQTitjqm_ug9uxVFMuIyf'
# Download into the current directory; gdown picks the file name itself
output = './'
gdown.download(url=url, output=output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1thWydc9rLFDzQTitjqm_ug9uxVFMuIyf
To: /content/fasteners.html
100%|██████████| 338k/338k [00:00<00:00, 454MB/s]


'./fasteners.html'

In [5]:
!pip install beautifulsoup4
from bs4 import BeautifulSoup
import pandas as pd



In [6]:
# Load the downloaded HTML file
with open('./fasteners.html', 'r', encoding='utf-8') as file:
    latest_html_content = file.read()

# Parse the HTML content
latest_soup = BeautifulSoup(latest_html_content, 'html.parser')

# Collect (company name, URL) pairs. The URL comes from the first <a href>
# inside each <p class="company-link">; the name is the text of the <p>
# sibling immediately preceding it.
latest_company_info = []

for para in latest_soup.find_all('p', class_='company-link'):
    company_name_tag = para.find_previous_sibling('p')
    next_a_tag = para.find('a', href=True)
    # Both lookups return None when nothing matches. Skip such paragraphs
    # rather than crashing on `.text` of a missing name tag.
    if company_name_tag is None or next_a_tag is None:
        continue
    company_name = company_name_tag.text.strip()
    latest_company_info.append((company_name, next_a_tag['href']))

# Display the extracted information
#import ace_tools as tools; tools.display_dataframe_to_user(name="Company Information", dataframe=pd.DataFrame(latest_company_info, columns=["Company Name", "URL"]))

#latest_company_info


In [7]:
# Display the extracted (company name, URL) pairs as this cell's output.
latest_company_info

[('Acument Global Technologies', 'http://www.acument.com/'),
 ('AFI Industries, Inc.', 'http://www.afiindustries.com/'),
 ('Agrati, Inc.', 'http://www.agrati.com/'),
 ('Arnold Fastening Systems, Inc.', 'http://www.arnold-fastening.com/'),
 ('Auto Bolt', 'http://www.autobolt.net/'),
 ('B & B Specialties, Inc.', 'http://www.bbspecialties.com/'),
 ('Birmingham Fastener, Inc.', 'http://www.bhamfast.com/'),
 ('Böllhoff, Inc.', 'http://www.bollhoff.com/'),
 ('Brico Industries, Inc.', 'http://www.bricoind.com/'),
 ('Celo USA – Trident Fasteners', 'http://www.celofasteners.com/'),
 ('Chicago Fastener Manufacturing', 'http://www.chicagofastener.com/'),
 ('Chicago Rivet & Machine Co.', 'http://www.chicagorivet.com/'),
 ('Click Bond, Inc.', 'http://www.clickbond.com/'),
 ('CMC Anchoring Systems', 'http://www.cmc.com/'),
 ('Cold Heading Company', 'http://www.coldheading.com/'),
 ('CSM Fastener Products', 'http://www.csmfastenerproducts.com/'),
 ('Dexter Fastener Technologies, Inc.', 'http://www.de

In [8]:
# Keep only the URL (second element) of every (company name, URL) entry.
urls_to_fetch = [entry[1] for entry in latest_company_info]

In [9]:
import time

In [10]:
# Map each company name to whatever scrape() returns for its URL.
content_list = {}
for company_name, company_url in latest_company_info:
    content_list[company_name] = scrape(company_url)
    # Pause 20 s after every lookup (presumably to go easy on the Common
    # Crawl servers; the reason is not stated in the notebook).
    time.sleep(20)

Response from CCI: {"urlkey": "com,acument)/", "timestamp": "20230928203447", "url": "http://www.acument.com/", "mime": "text/html", "mime-detected": "text/html", "status": "406", "digest": "DXI2LRHAQOJGQSZF3D3M7V6WOC25TW4Z", "length": "651", "offset": "2065790", "filename": "crawl-data/CC-MAIN-2023-40/segments/1695233510454.60/crawldiagnostics/CC-MAIN-20230928194838-20230928224838-00739.warc.gz"}

Found 1 records for http://www.acument.com/
Successfully fetched content for http://www.acument.com/
Response from CCI: {"urlkey": "com,afiindustries)/", "timestamp": "20230925004122", "url": "http://www.afiindustries.com/", "mime": "text/plain", "mime-detected": "text/plain", "status": "301", "digest": "NJOGTVI7R5XR5ZCAUWHDBZKT36NOGPDW", "length": "564", "offset": "1713906", "filename": "crawl-data/CC-MAIN-2023-40/segments/1695233506669.96/crawldiagnostics/CC-MAIN-20230924223409-20230925013409-00844.warc.gz", "redirect": "https://www.afiindustries.com/"}
{"urlkey": "com,afiindustries)/", "t

In [11]:
# Save the scraped dict to disk, then load it back to confirm the round trip.
# (The values are thought to be raw bytes of WARC-type data; not verified here.)
import pickle

with open('fastener_scrape_warc_dict.pickle', 'wb') as sink:
    pickle.dump(content_list, sink, protocol=pickle.HIGHEST_PROTOCOL)

with open('fastener_scrape_warc_dict.pickle', 'rb') as source:
    content_list_from_disk = pickle.load(source)

# Prints True when the reloaded dict equals the in-memory one.
print(content_list_from_disk == content_list)

True


In [None]:
# to restore saved pickle from Drive

import gdown

# Google Drive link for the previously saved fastener_scrape_warc_dict.pickle
url = 'https://drive.google.com/uc?id=1IOdj79rt0hcFEU83vSpPg6XOH1h2R0U0'
# Download into the current directory; gdown picks the file name itself
output = './'
gdown.download(url=url, output=output, quiet=False)



Downloading...
From: https://drive.google.com/uc?id=1IOdj79rt0hcFEU83vSpPg6XOH1h2R0U0
To: /content/fastener_scrape_warc_dict.pickle
100%|██████████| 2.06k/2.06k [00:00<00:00, 3.71MB/s]


'./fastener_scrape_warc_dict.pickle'