# Automating websearch for the query term

Using the python file `screenshot_automation.py` for automated screenshot of the query results in Chrome for query term: "Childhood cancer early diagnosis methods". selenium.webdriver is used inorder to automate the query and the final screenshot if saved as screenshot.png.

In [1]:
%run screenshot_automation.py

Enter the folder path to save the screenshots: /Users/abhinapremachandran/PycharmProjects/DSEI2400_Final_Project/Web_Photos


 Now, the obtained screenshot is used to scrape info using pytesseract library 

In [2]:
pip install mysql-connector-python

Note: you may need to restart the kernel to use updated packages.


In [3]:
import cv2
import pytesseract
import matplotlib.pyplot as plt
import os
import re
import pandas as pd
import mysql.connector

In [4]:
# specify folder where photos are
pic_folder = 'Web_Photos'

# list all files in the folder
pic_names = os.listdir(pic_folder)

# specify folder for text
text_folder = 'Extracted_Text'

## using a gray image of the image files to extract text

In [5]:
# need tesseract as environment variable in path
for pic_name in pic_names:
    image_path = os.path.join(pic_folder, pic_name)
    image = cv2.imread(image_path)
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply thresholding
    _, thresh_image = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    text = pytesseract.image_to_string(thresh_image)
    txt_path = os.path.join(text_folder, pic_name.split('.')[0] + '.txt')
    
    with open(txt_path, 'w') as text_file:
        text_file.write(text)
    

In [6]:
# list all text files in the folder
text_names = os.listdir(text_folder)
print(text_names)

['yahoo.txt', 'google.txt', 'bing.txt', 'duckgo.txt', 'duckduckgo.txt']


In [7]:
# regex pattern: starts with h, however many letters in between, ":/", howevermany letters in between up until there is a space
url_dict = {}

for text_name in text_names:
    txt_path = os.path.join(text_folder, text_name.split('.')[0] + '.txt')
    
    with open(txt_path, 'r') as text_file:
        file_contents = text_file.read()

        if 'yahoo' in text_name:
            urls = re.findall(r'www.[a-zA-Z]*[^\s]*', file_contents)

        else:
            urls = re.findall(r'h[a-zA-Z]*:/[^\s]*', file_contents)
        #print(urls)

    url_dict[text_name] = urls

In [8]:

for key, value in url_dict.items():
    print(key, value)
    print("\n")

yahoo.txt ['www.iccp-portal.org', 'www.paho.org', 'www.ncbi.nim.nih.gov', 'www.nature.com', 'www.thelancet.com', 'www.cancer.org']


google.txt ['https://together.stjude.org', 'https:/Awww.cancer.org', 'https:/Awww.cancer.net', 'https:/Awww.ncbi.nim.nih.gov', 'https://iris.paho.org', 'https:/Awww.acco.org', 'https:/Awww.cancercenter.com', 'https:/Awww.ncbi.nim.nih.gov', 'https://my.clevelandclinic.org', 'https://Awww.paho.org', 'https:/Awww.advocatechildrenshospital.com', 'https://Awww.who.int', 'https://obmjopen.bmj.com', 'https:/Awww.cancer.org', 'https:/Awww.yalemedicine.org', 'https://omchealthservres.biomedcentral.com', 'https:/Awww.cancer.gov', 'https:/Awww.choc.org']


bing.txt ['https:/Avww.nyp.org/pediatrics', 'https:/Avww.texaschildrens.org/cancer/treatment', 'https:/Avww.mskcc.org/cancer-care/experts', 'https:/Avww.azioinpractice.com', 'https:/Avww.ncbi.nim.nih.gov/pme/articles/PMC7584', 'https:/Avww.ncbi.nim.nih.gov/pme/articles/PMC8651632', 'https:/Avww.iccp-portal.org/res

In [9]:
for k, v in url_dict.items():
    for i in range(len(v)):
        v[i] = re.sub(r'^h[a-z]*:', 'https:', v[i])
        v[i] = re.sub(r':/A\w\w\w\.', '://www.', v[i])


for key, value in url_dict.items():
    print(key, value)
    print("\n")

yahoo.txt ['www.iccp-portal.org', 'www.paho.org', 'www.ncbi.nim.nih.gov', 'www.nature.com', 'www.thelancet.com', 'www.cancer.org']


google.txt ['https://together.stjude.org', 'https://www.cancer.org', 'https://www.cancer.net', 'https://www.ncbi.nim.nih.gov', 'https://iris.paho.org', 'https://www.acco.org', 'https://www.cancercenter.com', 'https://www.ncbi.nim.nih.gov', 'https://my.clevelandclinic.org', 'https://Awww.paho.org', 'https://www.advocatechildrenshospital.com', 'https://Awww.who.int', 'https://obmjopen.bmj.com', 'https://www.cancer.org', 'https://www.yalemedicine.org', 'https://omchealthservres.biomedcentral.com', 'https://www.cancer.gov', 'https://www.choc.org']


bing.txt ['https://www.nyp.org/pediatrics', 'https://www.texaschildrens.org/cancer/treatment', 'https://www.mskcc.org/cancer-care/experts', 'https://www.azioinpractice.com', 'https://www.ncbi.nim.nih.gov/pme/articles/PMC7584', 'https://www.ncbi.nim.nih.gov/pme/articles/PMC8651632', 'https://www.iccp-portal.org/res

In [10]:
# create an empty df similar to database "search" table

df_search = pd.DataFrame(columns=["search_term", "search_output", "url_results", "term_in_url"])


In [11]:
df_search.head()

Unnamed: 0,search_term,search_output,url_results,term_in_url



Add the frequency search term count to dataframe.

In [12]:
# add dictionary of urls into data frame


Count the frequency of each search term in the URL

Connecting to the Mysql database

In [13]:
import mysql.connector

mydb = mysql.connector.connect(
  host="localhost",
  user=input("Enter your database username: "),
  password=input("Enter your database password: ")
)

print(mydb)

Enter your database username: root
Enter your database password: Astro2498!
<mysql.connector.connection_cext.CMySQLConnection object at 0x1761e6110>
