In [6]:
import os

def file_read(fpath = r"LICENSE"):
    """Return the entire text of the file at ``fpath``.

    Args:
        fpath: Path of the file to read. Defaults to ``LICENSE`` in the
            current working directory.

    Returns:
        The file contents as a single string, or ``None`` (after printing a
        short message) when the path does not exist or is not a regular file.
    """
    # Each entry pairs a path predicate with the message printed when it fails;
    # the checks run in order and the first failure ends the call.
    guards = (
        (os.path.exists, "File not found"),
        (os.path.isfile, "Not a file"),
    )
    for passes, complaint in guards:
        if not passes(fpath):
            print(complaint)
            return None

    # Opened in text mode with the platform's default encoding.
    with open(fpath) as handle:
        return handle.read()
# Call with the default path ("LICENSE"); as the last expression of the cell,
# the returned text is what the notebook displays as the cell output.
file_read()

'MIT License\n\nCopyright (c) 2022 AkulS1008\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the "Software"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOU

In [None]:
import requests #used to collect the page from the web
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

In [None]:
# Fetch one documentation page and, when the request succeeds, parse it.
url = "https://python-poetry.org/docs/basic-usage/"
ua = UserAgent()
# Send a randomly chosen browser User-Agent string instead of the requests default.
headers = {'User-Agent' : ua.random}
# The cookie values are empty placeholders. A timeout is set because requests
# waits indefinitely by default, which would leave the cell hanging if the
# server never answers.
page = requests.get(
    url,
    headers = headers,
    cookies = {"session-id" : "", "session-id-time" : "", "session-token" : ""},
    timeout = 10,
)
print(page)
if page.status_code == 404:
    print("Page not found")
elif page.status_code == 503:
    print("Page unavailable")
elif page.status_code == 200:
    print("Page found")
    soup = BeautifulSoup(page.content, 'html.parser')
    print(soup)
else:
    # Any other status (redirect loops, 403, 500, ...) used to be ignored silently.
    print(f"Unexpected status code: {page.status_code}")

In [None]:
# NOTE(review): this looks like a leftover scratch cell. The call passes no URL,
# and `url` is a required argument of requests.get, so running the cell as written
# raises TypeError. Supply a URL or remove the cell -- confirm the intent.
requests.get()

In [8]:
from fpdf import FPDF

# Minimal FPDF example: build a one-page document containing a single line of
# text and save it as "sample.pdf" (a relative path, so it lands in the
# current working directory).
pdf = FPDF()
pdf.add_page()
# A font has to be selected before any text is written.
pdf.set_font('helvetica', size =12)
# NOTE(review): recent fpdf2 releases appear to rename the `txt` keyword to
# `text` and deprecate `txt` -- confirm against the installed version.
pdf.cell(txt="Sample text")
pdf.output("sample.pdf")

In [16]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import re

def __validate_url__(url):
    """Tell whether ``url`` starts with an ``http://`` or ``https://`` scheme.

    Only the scheme prefix is checked (case-insensitively); nothing after
    ``://`` is inspected.

    Returns:
        ``True`` when the prefix matches, ``False`` otherwise.
    """
    scheme_prefix = re.match(r'^(?:http)s?://', url, flags = re.IGNORECASE)
    return scheme_prefix is not None

def __clean_url__(url):
    """Drop the query string from ``url``.

    Everything from the first ``?`` onward is discarded; a URL without a
    ``?`` is returned unchanged.
    """
    return url.split('?')[0] if '?' in url else url

def get_webpage_data(url, headers = None, cookies = None, timeout = 10) -> BeautifulSoup:
    """Download a web page and parse it with BeautifulSoup.

    The query string is stripped from ``url`` before the request, and the
    result must start with ``http://`` or ``https://``.

    Args:
        url: Address of the page to fetch.
        headers: Request headers. When omitted, a header set containing only a
            randomly chosen ``User-Agent`` is used.
        cookies: Request cookies. When omitted, three empty placeholder
            session cookies are sent.
        timeout: Seconds to wait for the server, passed straight to
            ``requests.get``. Without a timeout requests can block forever.

    Returns:
        The parsed ``BeautifulSoup`` document for a 200 response. ``None`` (after
        printing a short message) for an invalid URL or any other status code.

    Raises:
        Any exception raised by ``requests.get`` (connection failures,
        timeouts, ...) propagates unchanged to the caller.
    """
    url = __clean_url__(url)
    if not __validate_url__(url):
        print("Invalid url")
        return None

    if headers is None:
        headers = {'User-Agent' : UserAgent().random}
    if cookies is None:
        cookies = {"session-id" : "", "session-id-time" : "", "session-token" : ""}

    page = requests.get(url, headers = headers, cookies = cookies, timeout = timeout)

    if page.status_code == 200:
        print("Page found")
        return BeautifulSoup(page.content, 'html.parser')

    # Every non-200 outcome yields None; only the printed message differs.
    if page.status_code == 404:
        print("Page not found")
    elif page.status_code == 503:
        print("Page unavailable")
    else:
        # Previously these statuses returned None without any explanation.
        print(f"Unexpected status code: {page.status_code}")
    return None

def extract_one(soup : BeautifulSoup, **selectors) -> dict:
    """Extract one value per selector from a parsed page.

    Args:
        soup: Parsed document to search.
        **selectors: Maps an output key to a dict describing what to extract:
            ``'tag'`` (element name, default ``'div'``), ``'attrs'``
            (attribute filter handed to ``soup.find``, default ``None``) and
            ``'output'`` (one of ``'text'``, ``'href'`` or ``'src'``;
            default ``'text'``).

    Returns:
        A dict from selector key to the extracted value: stripped text for
        ``'text'``, otherwise the value of the ``href``/``src`` attribute
        (``None`` if the element lacks that attribute). Selectors with an
        unsupported ``'output'`` are reported and left out of the result.
        Returns ``None`` when ``soup`` is not a ``BeautifulSoup`` object.

    Raises:
        AttributeError: When no element matches a selector. Like any other
            failure during extraction, it is re-raised after printing
            "Could not extract data".
    """
    if not isinstance(soup, BeautifulSoup):
        print("Not a BeautifulSoup object")
        return None
    data = {}
    try:
        for key, info in selectors.items():
            tag = info.get('tag', 'div')
            attrs = info.get('attrs', None)  # None is passed through to soup.find as-is
            output = info.get('output', 'text')
            if output not in ('text', 'href', 'src'):
                print('Not suitable output')
                continue

            element = soup.find(tag, attrs = attrs)
            if element is None:
                # find() returns None when nothing matches. Raise the same exception
                # type the attribute access on None used to produce, but say which
                # selector failed.
                raise AttributeError(
                    f"No <{tag}> element matching attrs={attrs!r} found for selector {key!r}"
                )

            if output == 'text':
                data[key] = element.text.strip()
            else:
                # 'href' (links) and 'src' (images) are both plain attribute lookups.
                data[key] = element.attrs.get(output)
        return data
    except Exception:
        print("Could not extract data")
        raise


In [17]:
# Demo: fetch a Wikipedia article and pull out the text of its main heading.
extract_one(
    get_webpage_data("https://en.wikipedia.org/wiki/Hurricane_Leslie_(2018)"),
    title = dict(tag = 'h1', attrs = dict(id = 'firstHeading'), output = 'text'),
)

Page found


{'title': 'Hurricane Leslie (2018)'}