In [6]:
import os

def file_read(fpath = r"LICENSE"):
    """
    Reads a text file and returns its whole content as a single string.

    Prints "File not found" or "Not a file" and returns None when fpath
    does not exist or is not a regular file.

    Args:
    fpath (str): path of the file to read; defaults to "LICENSE"
    """
    # Work out which complaint applies (if any) before touching the file.
    if not os.path.exists(fpath):
        problem = "File not found"
    elif not os.path.isfile(fpath):
        problem = "Not a file"
    else:
        problem = None

    if problem is not None:
        print(problem)
        return None

    with open(fpath) as handle:
        return handle.read()
# Quick check: read the default "LICENSE" file; the returned string is displayed as the cell output.
file_read()

'MIT License\n\nCopyright (c) 2022 AkulS1008\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the "Software"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOU

In [None]:
import requests #used to collect the page from the web
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

In [None]:
# Fetch one documentation page with a randomised User-Agent and parse it when the request succeeds.
url = "https://python-poetry.org/docs/basic-usage/"
ua = UserAgent()
headers = {'User-Agent' : ua.random}
# A timeout is passed explicitly: without one, requests waits indefinitely on an unresponsive server.
page = requests.get(url, headers = headers, cookies = {"session-id" : "", "session-id-time" : "", "session-token" : ""}, timeout = 10)
print(page)
if page.status_code == 404:
    print("Page not found")
elif page.status_code == 503:
    print("Page unavailable")
elif page.status_code == 200:
    print("Page found")
    soup = BeautifulSoup(page.content, 'html.parser')
    print(soup)
else:
    # Any other status used to pass silently, leaving `soup` undefined with no hint why.
    print(f"Unexpected status code: {page.status_code}")

In [None]:
# requests.get needs at least the target URL; reuse the `url` and `headers` defined in the cell above.
requests.get(url, headers = headers, timeout = 10)

In [8]:
from fpdf import FPDF

# Minimal fpdf smoke test: build a one-page document holding a single line of text.
pdf = FPDF()
pdf.add_page()
# The font is selected before any text is placed on the page.
pdf.set_font('helvetica', size =12)
pdf.cell(txt="Sample text")
# The output path is relative, so the file lands in the current working directory.
pdf.output("sample.pdf")

In [2]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import re

def __validate_url__(url):
    """
    Tells whether url starts with an http:// or https:// scheme.
    The scheme is matched case-insensitively; nothing after the scheme is checked.

    Args:
    url (str): address to check
    """
    scheme_pattern = re.compile(r'^(?:http)s?://', re.IGNORECASE)
    # A match object is always truthy, so bool() gives True on a match and False on None.
    return bool(scheme_pattern.match(url))

def __clean_url__(url):
    """
    Removes the query string from url: everything from the first '?' onward is dropped.
    A url without '?' is returned unchanged.

    Args:
    url (str): address to clean
    """
    return url.partition('?')[0] if '?' in url else url

def get_webpage_data(url, headers = None, cookies = None, timeout = 10) -> BeautifulSoup:
    """
    Downloads a web page and returns it parsed as a BeautifulSoup object.

    The url is first passed through __clean_url__ and then __validate_url__, so the
    request is only made for addresses those helpers accept.
    Returns None (after printing the reason) for an invalid url or for any response
    status other than 200.

    Args:
    url (str): address of the page to download
    headers (dict): request headers; a random User-Agent is generated when omitted
    cookies (dict): request cookies; empty session cookies are sent when omitted
    timeout (float): seconds to wait for the server before requests gives up

    Raises:
    Any exception raised by requests.get (connection errors, timeouts, ...) is propagated unchanged.
    """
    url = __clean_url__(url)
    if not __validate_url__(url):
        print("Invalid url")
        return None
    if headers is None:
        headers = {'User-Agent' : UserAgent().random}
    if cookies is None:
        cookies = {"session-id" : "", "session-id-time" : "", "session-token" : ""}

    # Without a timeout a stalled server would block this call forever.
    page = requests.get(url, headers = headers, cookies = cookies, timeout = timeout)

    if page.status_code == 200:
        print("Page found")
        return BeautifulSoup(page.content, 'html.parser')
    if page.status_code == 404:
        print("Page not found")
    elif page.status_code == 503:
        print("Page unavailable")
    else:
        # Previously every other status returned None without any explanation.
        print(f"Unexpected status code: {page.status_code}")
    return None

def extract_one(soup : BeautifulSoup, **selectors) -> dict:
    """
    Extracts single values from a parsed page, one per keyword selector.

    Args:
    soup (BeautifulSoup): parsed page to search
    selectors: keyword arguments; each value is a dict with the keys
        'tag' (defaults to 'div'), 'attrs' (optional attribute filter) and
        'output' ('text', 'href' or 'src'; defaults to 'text')

    Returns a dict mapping each keyword to the extracted value, or None when soup
    is not a BeautifulSoup object. A selector with an unknown 'output' is reported
    and left out of the result.

    Raises:
    AttributeError when no element matches a selector; other errors are re-raised
    after printing "Could not extract data".
    """
    if not isinstance(soup, BeautifulSoup):
        print("Not a BeautifulSoup object")
        return None
    data = {}
    try:
        for key, info in selectors.items():
            tag = info.get('tag', 'div')
            # Fall back to find()'s own default ({}) instead of handing it None.
            attrs = info.get('attrs') or {}
            output = info.get('output', 'text')
            if output not in ('text', 'href', 'src'):
                print('Not suitable output')
                continue
            element = soup.find(tag, attrs = attrs)
            if element is None:
                # Same exception type as the old "'NoneType' has no attribute" failure, but it names the selector.
                raise AttributeError(f"No <{tag}> element matching {attrs} found for '{key}'")
            if output == 'text':
                data[key] = element.text.strip()
            else:
                # 'href' and 'src' are read straight from the element's attributes.
                data[key] = element.attrs.get(output)
        return data
    except Exception:
        print("Could not extract data")
        raise

def extract_many(soup : BeautifulSoup, **selectors) -> list:
    """
    Extracts the same set of fields from every repeated item of a page.

    Args:
    soup (BeautifulSoup): parsed page to search
    selectors: keyword arguments, each one a dict:
        target (optional): {'tag': ..., 'attrs': {...}} section to search inside;
            the whole soup is searched when it is omitted
        items (required): {'tag': ..., 'attrs': {...}} selector matching each repeated item
        any other keyword: {'tag': ..., 'attrs': {...}, 'output': ...} field read from
            each item; 'tag' defaults to 'div' and 'output' ('text', 'href' or 'src') to 'text'

    Returns a list with one dict per item whose fields could all be read; items where
    a lookup fails are reported and skipped. Returns an empty list when no item matches,
    and None when the selectors are invalid or the target section is not found.
    """
    # Both special keys are optional at this point, so pop them with a default:
    # popping 'target' unconditionally raised KeyError whenever it was omitted.
    target_selector = selectors.pop('target', None)
    items_selector = selectors.pop('items', None)

    if target_selector is not None:
        tag = target_selector.get('tag')
        attrs = target_selector.get('attrs')
        if tag is None:
            print("Please give valid selectors")
            print("Example: target = {'tag' : 'div', 'attrs' : {...}")
            return None
        target = soup.find(tag, attrs or {})
        if target is None:
            print(f"Could not find target section with this {tag} and {attrs}")
            return None
    else:
        target = soup

    if items_selector is None:
        print("items is required as a parameter containing dict containing tag, attrs as keys")
        print("Example: items = {'tag' : 'div', 'attrs' : {...}")
        return None

    data_list = []
    items = target.find_all(items_selector.get('tag'), attrs = items_selector.get('attrs') or {})
    if len(items) == 0:
        print("No data found")
        return data_list

    print(f"{len(items)} items found")
    for idx, item in enumerate(items):
        data = {}
        try:
            for key, info in selectors.items():
                tag = info.get('tag', 'div')
                attrs = info.get('attrs') or {}
                output = info.get('output', 'text')
                if output not in ('text', 'href', 'src'):
                    print('Not suitable output')
                    continue
                element = item.find(tag, attrs = attrs)
                if output == 'text':
                    data[key] = element.text.strip()
                else:
                    data[key] = element.attrs.get(output)
            data_list.append(data)
        except Exception:
            # Best effort: an item missing any requested field is dropped as a whole.
            print("Item skipped at index:", idx)
    print("All items extracted")
    return data_list

# Important: run `source $HOME/.poetry/env` first.
# To publish: cd dutils, then run `poetry publish`.
# To add a dependency: poetry add 'module'
import os
from xmlrpc.client import Boolean
import docx2txt
from pdfminer.high_level import extract_text
from fpdf import FPDF
from docx import Document

def get_data(path : str, output = 's', encoding = 'utf-8') -> str:
    """
    Obtains data from files of any extension (supports text files, binary files, pdf, doc for now; more coming!)
    Returns a string or binary data depending on the output arg

    Args:
    path (str): path of the file to be read ex:"sample.txt" or "sample.pdf", or "sample.doc", etc.
    output (str): 's' is passed for the data to be stored as string; 'b' to obtain binary data
    encoding (str): existing encoding of file (used for plain-text files only)

    Returns None, after printing the reason, when output is 's' and the path does not
    exist, is not a file, has an unrecognised extension, or the text file cannot be
    read with the given encoding. Also returns None for an unknown output value.
    """
    if output == 'b':
        return __binary_read__(path)
    if output != 's':
        # Previously an unknown mode fell through and returned None without a word.
        print("Output must be 's' or 'b'")
        return None

    if not os.path.exists(path):
        print("File not found")
        return None
    if not os.path.isfile(path):
        print("Not a file")
        return None

    file_type = __file_type__(path)
    if file_type == 1:
        return __doc_read__(path)
    if file_type == 2:
        return __pdf_read__(path)
    if file_type == 3:
        try:
            return __text_read__(path, encoding = encoding)
        except Exception:
            # The old code left `data` unassigned here and crashed with
            # UnboundLocalError on the following return.
            print("Encoding not supported")
            return None

    print("File type could not be understood")
    return None

def __file_type__(path : str) -> int:
    """
    Classifies a file by the extension of its path.

    Returns 1 for Word documents (.doc, .docx), 2 for .pdf, 3 for the listed
    plain-text extensions and None for anything else.
    The extension is compared case-insensitively, so "REPORT.PDF" is classified
    like "report.pdf".

    Args:
    path (str): path or file name to classify
    """
    ext = os.path.splitext(path)[1].lower()
    if ext in (".doc", ".docx"):
        return 1
    if ext == ".pdf":
        return 2
    if ext in (".txt", ".css", ".html", ".py", ".java", ".cpp", ".ipynb", ".md", ".lock", ".toml", ".rst"):
        return 3
    return None

def __text_read__(path : str, encoding) -> str:
    """
    Reads a text file with the given encoding and returns its content.
    Prints "File could not be read" and re-raises when opening or reading fails.

    Args:
    path (str): path of the text file
    encoding (str): encoding used to decode the file
    """
    try:
        with open(path, encoding = encoding) as source:
            contents = source.read()
    except Exception as error:
        print("File could not be read")
        raise error
    return contents

def __doc_read__(path : str) -> str:
    """
    Returns whatever docx2txt.process produces for the document at path.
    Prints "Doc/Docx file could not be read" and re-raises when that call fails.

    Args:
    path (str): path of the Word document
    """
    try:
        extracted = docx2txt.process(path)
    except Exception as error:
        print("Doc/Docx file could not be read")
        raise error
    else:
        return extracted

def __pdf_read__(path : str) -> str:
    """
    Returns whatever pdfminer's extract_text produces for the pdf at path.
    Prints "Pdf file could not be read" and re-raises when that call fails.

    Args:
    path (str): path of the pdf file
    """
    try:
        extracted = extract_text(path)
    except Exception as error:
        print("Pdf file could not be read")
        raise error
    else:
        return extracted

def __binary_read__(path : str):
    """
    Reads the file at path in binary mode and returns its bytes.
    Prints "Binary file could not be read" and re-raises when opening or reading fails.

    Args:
    path (str): path of the file to read
    """
    try:
        with open(path, mode = 'rb') as source:
            raw = source.read()
    except Exception as error:
        print("Binary file could not be read")
        raise error
    return raw

def save_data(path : str, data : str) -> bool:
    """
    Writes and saves data into a file of any extension
    Returns True if file is successfully accessed and modified. Otherwise False.

    The writer is chosen from the code returned by __file_type__:
    1 (.doc/.docx) -> __doc_write__, 2 (.pdf) -> __pdf_write__, 3 (plain text) -> __txt_file_write__.
    An unrecognised extension writes nothing and returns False.

    Args:
    path (str): path of the file to be modified ex:"sample.txt" or "sample.pdf", or "sample.doc", etc.
    data (str): data to be stored and saved into the given file
    """
    file_type = __file_type__(path)
    # Codes 1 and 3 were swapped before: Word paths went to the text writer and
    # text paths to the docx writer, the opposite of how get_data reads them.
    if file_type == 1:
        return __doc_write__(path, data)
    if file_type == 2:
        return __pdf_write__(path, data)
    if file_type == 3:
        return __txt_file_write__(path, data)
    return False

def __txt_file_write__(path : str, data : str, encoding : str = 'utf-8') -> bool:
    """
    Overwrites the text file at path with data.
    Returns True on success; prints "File could not be modified" and re-raises on failure.

    Args:
    path (str): path of the file to write (created if missing, truncated otherwise)
    data (str): text to store
    encoding (str): encoding used to write the file; defaults to 'utf-8' so that a file
        saved here can be read back with get_data's default encoding on every platform
    """
    try:
        with open(path, 'w', encoding = encoding) as file:
            file.write(data)
        return True
    except Exception as e:
        print("File could not be modified")
        raise e

def __pdf_write__(path : str, data : str) -> bool:
    """
    Creates a one-font pdf containing data and writes it to path.
    Returns True on success; prints "Pdf file could not be modified" and re-raises on failure.

    Args:
    path (str): path of the pdf file to create
    data (str): text to place in the document
    """
    try:
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font('helvetica', size =12)
        # multi_cell breaks the text at the right margin (w = 0) and at "\n";
        # a single cell() keeps everything on one line that runs off the page.
        pdf.multi_cell(w = 0, txt = data)
        pdf.output(path)
        return True
    except Exception as e:
        print("Pdf file could not be modified")
        raise e

def __doc_write__(path : str, data : str) -> bool:
    """
    Stores data in the Word document at path.
    An existing document has the text of its last paragraph replaced by data; when the
    file does not exist yet, or the document has no paragraph, data is added as a new paragraph.
    Returns True on success; prints "Doc/Docx file could not be modified" and re-raises on failure.

    Args:
    path (str): path of the .docx file to modify or create
    data (str): text to store
    """
    try:
        # Document(path) requires an existing file; Document() starts a blank one.
        document = Document(path) if os.path.exists(path) else Document()
        paragraphs = document.paragraphs
        if paragraphs:
            paragraphs[-1].text = data
        else:
            # Indexing the last paragraph of an empty document raised IndexError.
            document.add_paragraph(data)
        document.save(path)
        return True
    except Exception as e:
        print("Doc/Docx file could not be modified")
        raise e

# TODO: find out how to generate GitHub Pages

In [17]:
# Example: download a Wikipedia article and read the text of its <h1 id="firstHeading"> element under the key 'title'.
extract_one(get_webpage_data("https://en.wikipedia.org/wiki/Hurricane_Leslie_(2018)"), title = {'tag' : 'h1', 'attrs' : {'id' : 'firstHeading'}, 'output' : 'text'})

Page found


{'title': 'Hurricane Leslie (2018)'}

In [4]:

# NOTE(review): get_webpage_data passes the url through __clean_url__, which drops everything
# from the first '?' onward, so the search parameters written here are not part of the request.
soup = get_webpage_data("https://www.amazon.com/s?k=headphones&crid=1DUUWW6PEVAJ1&sprefix=headphones%2Caps%2C161&ref=nb_sb_noss_1")

Page found


In [5]:
soup

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-us"><!-- sp:feature:head-start -->
<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>
<!-- sp:end-feature:head-start -->
<script type="text/javascript">var ue_t0=ue_t0||+new Date();</script>
<!-- sp:feature:cs-optimization -->
<meta content="on" http-equiv="x-dns-prefetch-control"/>
<link href="https://images-na.ssl-images-amazon.com" rel="dns-prefetch"/>
<link href="https://m.media-amazon.com" rel="dns-prefetch"/>
<link href="https://completion.amazon.com" rel="dns-prefetch"/>
<!-- sp:end-feature:cs-optimization -->
<script type="text/javascript">
window.ue_ihb = (window.ue_ihb || window.ueinit || 0) + 1;
if (window.ue_ihb === 1) {

var ue_csm = window,
    ue_hob = +new Date();
(function(d){var e=d.ue=d.ue||{},f=Date.now||function(){return+new Date};e.d=function(b){return f()-(b?0:d.ue_t0)};e.stub=function(b,a){if(!b[a]){var c=[];b[a]=function(){c.push([c.slice.call(arguments),e.

In [8]:
def extract_urls(soup : BeautifulSoup, items : dict, target = None) -> list:
    """
    Collects the href of every element matched by the items selector.

    Args:
    soup (BeautifulSoup): parsed page to search
    items (dict): {'tag': ..., 'attrs': {...}} selector of the elements carrying the links
    target (dict): optional {'tag': ..., 'attrs': {...}} section to search inside;
        the whole soup is searched when it is omitted

    Returns a list of one-entry dicts {index: href}, where index is the position of the
    element among the matches. Returns an empty list when nothing matches, and None when
    the target selector has no 'tag' or the target section is not found.
    """
    if target is not None:
        tag = target.get('tag')
        attrs = target.get('attrs')
        if tag is None:
            print("Please give valid selectors")
            print("Example: target = {'tag' : 'div', 'attrs' : {...}")
            return None
        target = soup.find(tag, attrs or {})
        if target is None:
            print(f"Could not find target section with this {tag} and {attrs}")
            return None
    else:
        target = soup

    # Everything below used to be nested under the `else` above, so a target that
    # was found made the function fall off the end and return None.
    url_list = []
    matches = target.find_all(items.get('tag'), attrs = items.get('attrs') or {})
    if len(matches) == 0:
        print("No data found")
        return url_list

    print(f"{len(matches)} items found")
    inner_tag = items.get('tag', 'a')
    inner_attrs = items.get('attrs') or {}
    for idx, match in enumerate(matches):
        try:
            # Keep the original nested lookup, but fall back to the matched element itself:
            # when `items` selects the anchors directly there is no nested anchor to find.
            nested = match.find(inner_tag, attrs = inner_attrs)
            source = nested if nested is not None else match
            url_list.append({idx: source.attrs.get('href')})
        except Exception:
            print("Item skipped at index:", idx)
    print("All items extracted")
    return url_list
        
# Try extract_urls on an Amazon search page: look inside the <div> with the given classes
# for anchors carrying the given link classes, and print what comes back.
print(extract_urls(get_webpage_data("https://www.amazon.com/s?k=headphones&crid=1DUUWW6PEVAJ1&sprefix=headphones%2Caps%2C161&ref=nb_sb_noss_1"), 
    target = {'tag' : 'div', 'attrs' : {'class':'s-matching-dir sg-col-16-of-20 sg-col sg-col-8-of-12 sg-col-12-of-16'}},
    items = {'tag' : 'a', 'attrs' : {'class':'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'}}))

Page found
Could not find target section with this div and {'class': 's-matching-dir sg-col-16-of-20 sg-col sg-col-8-of-12 sg-col-12-of-16'}
False


In [13]:
def extract_urls(soup : BeautifulSoup, target = None) -> list:
    """
    Collects the distinct link addresses found in a page or in one section of it.

    Args:
    soup (BeautifulSoup): parsed page to search
    target (dict): optional {'tag': ..., 'attrs': {...}} section to search inside;
        the whole soup is searched when it is omitted

    Returns the href of every <a> element, without duplicates, empty values or "#",
    in the order the links first appear. Returns None when the target selector has
    no 'tag' or the target section is not found.
    """
    if target is not None:
        tag = target.get('tag')
        attrs = target.get('attrs')
        if tag is None:
            print("Please give valid selectors")
            print("Example: target = {'tag' : 'div', 'attrs' : {...}")
            return None
        target = soup.find(tag, attrs or {})
        if target is None:
            print(f"Could not find target section with this {tag} and {attrs}")
            return None
    else:
        target = soup

    anchors = target.find_all('a')
    # A dict keeps insertion order, so duplicates are dropped without the arbitrary,
    # run-to-run ordering that list(set(...)) produced.
    links = {}
    try:
        for link in anchors:
            url = link.attrs.get('href')
            if url and url != "#":
                links[url] = None
    except Exception:
        # Best effort: keep whatever was collected before the failure.
        print("Could not filter links")
    return list(links)





In [1]:
from dputils.files import get_data, save_data
from dputils.scrape import get_webpage_data, extract_one, extract_many, extract_urls

In [13]:
# Same search page, this time through get_webpage_data and extract_urls as imported from dputils.scrape; prints the returned list.
print(extract_urls(get_webpage_data("https://www.amazon.com/s?k=headphones&crid=1DUUWW6PEVAJ1&sprefix=headphones%2Caps%2C161&ref=nb_sb_noss_1")))

['/s?k=headphones&rh=n%3A172541%2Cp_n_feature_twenty_browse-bin%3A2972991011&dc&crid=1DUUWW6PEVAJ1&qid=1655704355&rnid=2972980011&sprefix=headphones%2Caps%2C161&ref=sr_nr_p_n_feature_twenty_browse-bin_8&ds=v1%3A%2FuZYBjjk1UWwUZyje367nbYiqlg3dHb%2BfZ%2FcS9Pe0Tg', '/Sony-Wired-Headphones-MDRZX110-WHI/dp/B00NJ2M43M/ref=ice_ac_b_dpb?crid=1DUUWW6PEVAJ1&keywords=headphones&qid=1655704355&sprefix=headphones%2Caps%2C161&sr=8-4', '/s?k=headphones&rh=n%3A172541%2Cp_n_feature_twenty-four_browse-bin%3A11636675011&dc&crid=1DUUWW6PEVAJ1&qid=1655704355&rnid=11636674011&sprefix=headphones%2Caps%2C161&ref=sr_nr_p_n_feature_twenty-four_browse-bin_2&ds=v1%3Ap6ovXxJ%2F4EkZhYmESyb5xovwvC1tCvCDRDv2GsUU1lk', '/s?k=headphones&rh=n%3A172282%2Cp_89%3Abeyerdynamic&dc&crid=1DUUWW6PEVAJ1&qid=1655704355&rnid=2528832011&sprefix=headphones%2Caps%2C161&ref=sr_nr_p_89_27&ds=v1%3AElEXMIxTRXiYZTXCfMNIxzM2g%2Fn2j319boQ7mlpD0RA', 'https://www.amazon.com/gp/css/order-history?ref_=footer_yo', '/s?k=headphones&rh=n%3A172541%2