Merge pull request #2090 from yashwantaditya009/master
Web scraping using python
fineanmol committed Oct 3, 2022
2 parents 74292d4 + fe7021d commit 1bf1414
Showing 1 changed file with 152 additions and 0 deletions.
@@ -0,0 +1,152 @@
# import required modules
import json
import requests
from datetime import datetime
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from beautifultable import BeautifulTable
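# Third-party dependencies (assumed to be installed):
#   pip install requests beautifulsoup4 beautifultable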



def load_json(database_json_file="scraped_data.json"):
    """
    Load JSON data from scraped_data.json if the file exists;
    otherwise return an empty dict.
    """
    try:
        with open(database_json_file, "r") as read_it:
            return json.loads(read_it.read())
    except (FileNotFoundError, json.JSONDecodeError):
        return dict()


def save_scraped_data_in_json(data, database_json_file="scraped_data.json"):
    """
    Save the scraped data as JSON, creating scraped_data.json if it does
    not exist. Previously scraped data stays viewable because the caller
    merges it into `data` before saving.
    """
    with open(database_json_file, "w") as file_obj:
        file_obj.write(json.dumps(data))


def existing_scraped_data_init(json_db):
    """
    Initialise the 'scraped_data' key in the JSON database if it is
    missing, so later lookups never fail.
    """
    if json_db.get("scraped_data") is None:
        json_db['scraped_data'] = dict()

    return None


def scraped_time_is():
    """
    Create a timestamp to keep each scrape record trackable.
    """
    now = datetime.now()
    return now.strftime("%d/%m/%Y %H:%M:%S")
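# Example output (illustrative): scraped_time_is() -> "03/10/2022 14:05:59"
# i.e. dd/mm/yyyy HH:MM:SS.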

def process_url_request(website_url):
    """
    Fetch the provided URL with the requests module and construct
    soup data with BeautifulSoup for scraping.
    """
    request_data = requests.get(website_url)
    if request_data.status_code == 200:
        return BeautifulSoup(request_data.text, 'html.parser')
    return None
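# Note: requests.get() raises requests.exceptions.RequestException on network
# failures (DNS errors, timeouts, etc.); only non-200 HTTP responses are
# handled above.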

def process_beautiful_soup_data(soup):
    """
    Extract the page title, anchors, images, headings, and paragraphs
    from the parsed soup into a flat dict.
    """
    return {
        'title': soup.title.text if soup.title else '',
        'all_anchor_href': [i['href'] for i in soup.find_all('a', href=True)],
        'all_anchors': [str(i) for i in soup.find_all('a')],
        'all_images_data': [str(i) for i in soup.find_all('img')],
        'all_images_source_data': [i['src'] for i in soup.find_all('img', src=True)],
        'all_h1_data': [i.text for i in soup.find_all('h1')],
        'all_h2_data': [i.text for i in soup.find_all('h2')],
        'all_h3_data': [i.text for i in soup.find_all('h3')],
        'all_p_data': [i.text for i in soup.find_all('p')]
    }
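# Illustrative shape of scraped_data.json after one site is saved under the
# hypothetical alias "example" (keys match the packet built in choice 2 below):
# {
#     "scraped_data": {
#         "example": {
#             "title": "Example Domain",
#             "all_anchor_href": ["..."],
#             "url": "https://example.com",
#             "name": "example",
#             "scraped_at": "03/10/2022 14:05:59",
#             "alias": "example",
#             "status": true,
#             "domain": "example.com"
#         }
#     }
# }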



# Infinite loop so the menu keeps running until the user chooses to exit.
while True:

    print(""" ================ Welcome to this scraping program =============
    ==>> press 1 to view existing scraped websites
    ==>> press 2 to scrape a single website
    ==>> press 3 to exit
    """)

    try:
        choice = int(input("==>> Please enter your choice: "))
    except ValueError:
        print("Please enter a number.")
        continue

    # Fetch existing data from the JSON file (or create it) and make sure
    # the 'scraped_data' key exists.
    local_json_db = load_json()
    existing_scraped_data_init(local_json_db)

    if choice == 1:
        # BeautifulTable presents the scraped data in a readable table.
        # Docs: https://beautifultable.readthedocs.io/en/latest/index.html
        scraped_websites_table = BeautifulTable()
        scraped_websites_table.columns.header = ["Sr no.", "Alias name", "Website domain", "Title", "Scraped at", "Status"]
        scraped_websites_table.set_style(BeautifulTable.STYLE_BOX_DOUBLED)

        local_json_db = load_json()
        for count, data in enumerate(local_json_db['scraped_data']):
            scraped_websites_table.rows.append([count + 1,
                                                local_json_db['scraped_data'][data]['alias'],
                                                local_json_db['scraped_data'][data]['domain'],
                                                local_json_db['scraped_data'][data]['title'],
                                                local_json_db['scraped_data'][data]['scraped_at'],
                                                local_json_db['scraped_data'][data]['status']])
        # all_scraped_websites = [websites['name'] for websites in local_json_db['scraped_data']]
        if not local_json_db['scraped_data']:
            print('===> No existing data found !!!')
        print(scraped_websites_table)

    elif choice == 2:
        print()
        url_for_scrap = input("===> Please enter the URL you want to scrape: ")
        is_accessible = process_url_request(url_for_scrap)
        if is_accessible:
            scraped_data_packet = process_beautiful_soup_data(is_accessible)
            print()
            print(' =====> Data scraped successfully !!!')
            key_for_storing_data = input("Enter an alias name for saving the scraped data: ")
            scraped_data_packet['url'] = url_for_scrap
            scraped_data_packet['name'] = key_for_storing_data
            scraped_data_packet['scraped_at'] = scraped_time_is()
            if key_for_storing_data in local_json_db['scraped_data']:
                key_for_storing_data = key_for_storing_data + str(scraped_time_is())
                print("Provided key already exists, so data stored as: {}".format(key_for_storing_data))
            scraped_data_packet['alias'] = key_for_storing_data
            scraped_data_packet['status'] = True
            scraped_data_packet['domain'] = urlparse(url_for_scrap).netloc

            local_json_db['scraped_data'][key_for_storing_data] = scraped_data_packet
            print(
                'scraped data is:', local_json_db['scraped_data'][key_for_storing_data]
            )
            save_scraped_data_in_json(local_json_db)
            # Reload so the in-memory copy matches what was written to disk.
            local_json_db = load_json()
            print(' =====> Data saved successfully !!!')
            print()
        else:
            print('===> Could not reach the provided URL !!!')
    elif choice == 3:
        print('Thank you for using !!!')
        break

    else:
        print("Please enter a valid choice.")
