Merge pull request #2090 from yashwantaditya009/master
Web scraping using python
fineanmol committed Oct 3, 2022
2 parents 74292d4 + fe7021d commit 1bf1414
Showing 1 changed file with 152 additions and 0 deletions.
@@ -0,0 +1,152 @@
# import required modules
import json
import requests
from datetime import datetime
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from beautifultable import BeautifulTable
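# Third-party dependencies (assumed to be installed):
#   pip install requests beautifulsoup4 beautifultable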



def load_json(database_json_file="scraped_data.json"):
    """
    Load JSON data from scraped_data.json if the file exists;
    otherwise return an empty dict.
    """
    try:
        with open(database_json_file, "r") as read_it:
            return json.loads(read_it.read())
    except (FileNotFoundError, json.JSONDecodeError):
        return dict()


def save_scraped_data_in_json(data, database_json_file="scraped_data.json"):
    """
    Save the scraped data as JSON, creating scraped_data.json if it does
    not exist. Previously scraped data stays viewable because the caller
    merges it into `data` before saving.
    """
    with open(database_json_file, "w") as file_obj:
        file_obj.write(json.dumps(data))


def existing_scraped_data_init(json_db):
    """
    Initialise the 'scraped_data' key in the JSON database if it is
    missing, so later lookups never fail.
    """
    if json_db.get("scraped_data") is None:
        json_db['scraped_data'] = dict()

    return None


def scraped_time_is():
    """
    Create a timestamp to keep each scrape record trackable.
    """
    now = datetime.now()
    return now.strftime("%d/%m/%Y %H:%M:%S")
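# Example output (illustrative): scraped_time_is() -> "03/10/2022 14:05:59"
# i.e. dd/mm/yyyy HH:MM:SS.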

def process_url_request(website_url):
    """
    Fetch the provided URL with the requests module and construct
    soup data with BeautifulSoup for scraping.
    """
    request_data = requests.get(website_url)
    if request_data.status_code == 200:
        return BeautifulSoup(request_data.text, 'html.parser')
    return None
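# Note: requests.get() raises requests.exceptions.RequestException on network
# failures (DNS errors, timeouts, etc.); only non-200 HTTP responses are
# handled above.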

def process_beautiful_soup_data(soup):
    """
    Extract the page title, anchors, images, headings, and paragraphs
    from the parsed soup into a flat dict.
    """
    return {
        'title': soup.title.text if soup.title else '',
        'all_anchor_href': [i['href'] for i in soup.find_all('a', href=True)],
        'all_anchors': [str(i) for i in soup.find_all('a')],
        'all_images_data': [str(i) for i in soup.find_all('img')],
        'all_images_source_data': [i['src'] for i in soup.find_all('img', src=True)],
        'all_h1_data': [i.text for i in soup.find_all('h1')],
        'all_h2_data': [i.text for i in soup.find_all('h2')],
        'all_h3_data': [i.text for i in soup.find_all('h3')],
        'all_p_data': [i.text for i in soup.find_all('p')]
    }
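# Illustrative shape of scraped_data.json after one site is saved under the
# hypothetical alias "example" (keys match the packet built in choice 2 below):
# {
#     "scraped_data": {
#         "example": {
#             "title": "Example Domain",
#             "all_anchor_href": ["..."],
#             "url": "https://example.com",
#             "name": "example",
#             "scraped_at": "03/10/2022 14:05:59",
#             "alias": "example",
#             "status": true,
#             "domain": "example.com"
#         }
#     }
# }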



# Infinite loop so the menu keeps running until the user chooses to exit.
while True:

    print(""" ================ Welcome to this scraping program =============
    ==>> press 1 to view existing scraped websites
    ==>> press 2 to scrape a single website
    ==>> press 3 to exit
    """)

    try:
        choice = int(input("==>> Please enter your choice: "))
    except ValueError:
        print("Please enter a number.")
        continue

    # Fetch existing data from the JSON file (or create it) and make sure
    # the 'scraped_data' key exists.
    local_json_db = load_json()
    existing_scraped_data_init(local_json_db)

    if choice == 1:
        # BeautifulTable presents the scraped data in a readable table.
        # Docs: https://beautifultable.readthedocs.io/en/latest/index.html
        scraped_websites_table = BeautifulTable()
        scraped_websites_table.columns.header = ["Sr no.", "Alias name", "Website domain", "Title", "Scraped at", "Status"]
        scraped_websites_table.set_style(BeautifulTable.STYLE_BOX_DOUBLED)

        local_json_db = load_json()
        for count, data in enumerate(local_json_db['scraped_data']):
            scraped_websites_table.rows.append([count + 1,
                                                local_json_db['scraped_data'][data]['alias'],
                                                local_json_db['scraped_data'][data]['domain'],
                                                local_json_db['scraped_data'][data]['title'],
                                                local_json_db['scraped_data'][data]['scraped_at'],
                                                local_json_db['scraped_data'][data]['status']])
        # all_scraped_websites = [websites['name'] for websites in local_json_db['scraped_data']]
        if not local_json_db['scraped_data']:
            print('===> No existing data found !!!')
        print(scraped_websites_table)

    elif choice == 2:
        print()
        url_for_scrap = input("===> Please enter the URL you want to scrape: ")
        is_accessible = process_url_request(url_for_scrap)
        if is_accessible:
            scraped_data_packet = process_beautiful_soup_data(is_accessible)
            print()
            print(' =====> Data scraped successfully !!!')
            key_for_storing_data = input("Enter an alias name for saving the scraped data: ")
            scraped_data_packet['url'] = url_for_scrap
            scraped_data_packet['name'] = key_for_storing_data
            scraped_data_packet['scraped_at'] = scraped_time_is()
            if key_for_storing_data in local_json_db['scraped_data']:
                key_for_storing_data = key_for_storing_data + str(scraped_time_is())
                print("Provided key already exists, so data stored as: {}".format(key_for_storing_data))
            scraped_data_packet['alias'] = key_for_storing_data
            scraped_data_packet['status'] = True
            scraped_data_packet['domain'] = urlparse(url_for_scrap).netloc

            local_json_db['scraped_data'][key_for_storing_data] = scraped_data_packet
            print(
                'scraped data is:', local_json_db['scraped_data'][key_for_storing_data]
            )
            save_scraped_data_in_json(local_json_db)
            # Reload so the in-memory copy matches what was written to disk.
            local_json_db = load_json()
            print(' =====> Data saved successfully !!!')
            print()
        else:
            print('===> Could not reach the provided URL !!!')
    elif choice == 3:
        print('Thank you for using !!!')
        break

    else:
        print("Please enter a valid choice.")
