# Automating websearch for the query term

Using the python file `screenshot_automation.py` for automated screenshot of the query results in Chrome for query term: "Childhood cancer early diagnosis methods". selenium.webdriver is used inorder to automate the query and the final screenshot if saved as screenshot.png.

In [1]:
%run screenshot_automation.py

 Now, the obtained screenshot is used to scrape info using pytesseract library 

In [2]:
#pip install mysql-connector-python

In [3]:
import cv2
import pytesseract
import matplotlib.pyplot as plt
import os
import re
import pandas as pd
import mysql.connector

In [4]:
# specify folder where photos are
pic_folder = 'Web_Photos'

# list all files in the folder
pic_names = os.listdir(pic_folder)

# specify folder for text
text_folder = 'Extracted_Text'

### Extracting Text From Picture and Adding it to Text File

In [5]:
# need tesseract as environment variable in path before running this

for pic_name in pic_names:
    image_path = os.path.join(pic_folder, pic_name)
    # read in image
    image = cv2.imread(image_path)

    # turning image into grayscale
    #gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    '''
    # Apply thresholding
    _, thresh_image = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    '''
    # extract text from image
    text = pytesseract.image_to_string(image)
    txt_path = os.path.join(text_folder, pic_name.split('.')[0] + '.txt')
    
    with open(txt_path, 'w') as text_file:
        text_file.write(text)
    

In [6]:
# list all text files in the folder
text_names = os.listdir(text_folder)
print(text_names)

['bing.txt', 'duckduckgo.txt', 'google.txt', 'yahoo.txt']


### Finding all URLs from each search engine and adding them to a dictionary.

In [7]:
# regex pattern: starts with h, however many letters in between, ":/", howevermany letters in between up until there is a space
url_dict = {}

for text_name in text_names:
    txt_path = os.path.join(text_folder, text_name.split('.')[0] + '.txt')
    
    with open(txt_path, 'r') as text_file:
        file_contents = text_file.read()

        if 'yahoo' in text_name:
            urls = re.findall(r'www.[a-zA-Z]*[^\s]*', file_contents)

        else:
            urls = re.findall(r'h[a-zA-Z]*:/[^\s]*', file_contents)
        #print(urls)

    url_dict[text_name] = urls

In [8]:
# viewing the dictionary
for key, value in url_dict.items():
    print(key, value)
    print("\n")

bing.txt ['https:/www.nyp.orgipediatrics', 'https:/Awww', 'https:/Awww.ncbi.nim.nih.gov/pmc/articles/PMC', 'https:/www.mskcc.org/cancer-care/experts', 'https:/www.texaschildrens.org/canceritreatment', 'https://cancer.montefioreeinstein.org/chiara_story/cancer', 'https:/www.nyp.orgipediatrics']


duckduckgo.txt ['hitps://www.iccp-portal.org', 'hups:/www.cancerorg', 'heps:/www.nebinim.nih.gov', 'https:/www.uptodate.com', 'hutps:/pmchealthservres.biomedcentral.com', 'hutps://www.paho.org', 'https://www.nature.com', 'hups://iris', 'hups://iris', 'hutps:/www.cancerorg', 'hutps:/www.2afp.org', 'hutps://qa.cancerorg', 'hutps://pubmed.ncbi.nim.nih.gov', 'heps:/www.nebi.nim.nih.gov', 'hups://www.researchgate.net', 'hups:/www.ncbi.nim.nih.gov', 'hitps://journals.', 'https:/bmjopen.bmj.com', 'hups://pubmed.ncbi.nim.nih.gov', 'https://www.ncbinim.nih.gov', 'hutps:/jeccrbiomedcentral.com', 'hups://ink.springer.com']


google.txt ['https:/together.stjude.org', 'hitps:/iwwnv.cancercenter.com', 'hitps

### Cleaning up the incorrectly spelled URLs.

In [9]:
# cleaning up the URLs
for k, v in url_dict.items():
    for i in range(len(v)):
        v[i] = re.sub(r'^h[a-z]*:', 'https:', v[i])
        v[i] = re.sub(r':/A\w\w\w\.', '://www.', v[i])


# viewing them
for key, value in url_dict.items():
    print(key, value)
    print("\n")

bing.txt ['https:/www.nyp.orgipediatrics', 'https:/Awww', 'https://www.ncbi.nim.nih.gov/pmc/articles/PMC', 'https:/www.mskcc.org/cancer-care/experts', 'https:/www.texaschildrens.org/canceritreatment', 'https://cancer.montefioreeinstein.org/chiara_story/cancer', 'https:/www.nyp.orgipediatrics']


duckduckgo.txt ['https://www.iccp-portal.org', 'https:/www.cancerorg', 'https:/www.nebinim.nih.gov', 'https:/www.uptodate.com', 'https:/pmchealthservres.biomedcentral.com', 'https://www.paho.org', 'https://www.nature.com', 'https://iris', 'https://iris', 'https:/www.cancerorg', 'https:/www.2afp.org', 'https://qa.cancerorg', 'https://pubmed.ncbi.nim.nih.gov', 'https:/www.nebi.nim.nih.gov', 'https://www.researchgate.net', 'https:/www.ncbi.nim.nih.gov', 'https://journals.', 'https:/bmjopen.bmj.com', 'https://pubmed.ncbi.nim.nih.gov', 'https://www.ncbinim.nih.gov', 'https:/jeccrbiomedcentral.com', 'https://ink.springer.com']


google.txt ['https:/together.stjude.org', 'https:/iwwnv.cancercenter.com

### Run the separating URLs .py file to create a Data Frame similar to the MySQL table

In [15]:
from separating_urls import separate_url
df_search = separate_url(url_dict)

In [20]:
# checking if the .py file ran correctly
df_search.head(10)

Unnamed: 0,search_engine,urls
0,bing.txt,https:/www.nyp.orgipediatrics
1,bing.txt,https:/Awww
2,bing.txt,https://www.ncbi.nim.nih.gov/pmc/articles/PMC
3,bing.txt,https:/www.mskcc.org/cancer-care/experts
4,bing.txt,https:/www.texaschildrens.org/canceritreatment
5,bing.txt,https://cancer.montefioreeinstein.org/chiara_s...
6,bing.txt,https:/www.nyp.orgipediatrics
7,duckduckgo.txt,https://www.iccp-portal.org
8,duckduckgo.txt,https:/www.cancerorg
9,duckduckgo.txt,https:/www.nebinim.nih.gov


In [19]:
df_search.shape

(46, 2)

In [48]:
# remove .txt from search engine column
df_search['search_engine'] = df_search['search_engine'].str.replace('.txt', '')
df_search.head()

Unnamed: 0,search_engine,urls,url_results,term_in_url
0,bing,https:/www.nyp.orgipediatrics,https:/www.nyp.orgipediatrics,0
1,bing,https:/Awww,https:/Awww,0
2,bing,https://www.ncbi.nim.nih.gov/pmc/articles/PMC,https://www.ncbi.nim.nih.gov/pmc/articles/PMC,0
3,bing,https:/www.mskcc.org/cancer-care/experts,https:/www.mskcc.org/cancer-care/experts,1
4,bing,https:/www.texaschildrens.org/canceritreatment,https:/www.texaschildrens.org/canceritreatment,3


### Add 'search_term', 'search_output', 'url_results', 'term_in_url' columns to df

In [49]:
df_search['url_results'] = df_search['urls']

In [50]:
# List of terms to check for
terms_to_check = ['child', 'cancer', 'early', 'treatment']

# Count occurrences of terms in each row of 'url_results' column
df_search['term_in_url'] = df_search['url_results'].apply(lambda x: sum(x.count(term) for term in terms_to_check))


In [39]:
# 
#df_search['search_term'] = 
#df_search['search_output'] = 


KeyError: 'https:/www.nyp.orgipediatrics'

In [45]:
df_search.head()

Unnamed: 0,search_engine,urls,url_results,term_in_url
0,bing.txt,https:/www.nyp.orgipediatrics,https:/www.nyp.orgipediatrics,0
1,bing.txt,https:/Awww,https:/Awww,0
2,bing.txt,https://www.ncbi.nim.nih.gov/pmc/articles/PMC,https://www.ncbi.nim.nih.gov/pmc/articles/PMC,0
3,bing.txt,https:/www.mskcc.org/cancer-care/experts,https:/www.mskcc.org/cancer-care/experts,1
4,bing.txt,https:/www.texaschildrens.org/canceritreatment,https:/www.texaschildrens.org/canceritreatment,3


Count the frequency of each search term in the URL


Add the frequency search term count to dataframe.

In [None]:
# add dictionary of urls into data frame


Connecting to the Mysql database

In [None]:
import mysql.connector

In [24]:
# create connection
conn = mysql.connector.connect(
  host="localhost",
  user= "esther",
  password= ""
)


<mysql.connector.connection.MySQLConnection object at 0x000001759FAD8880>


In [25]:
# preparing the cursor object
cursor = conn.cursor()

In [None]:
# creating the database
cursor.execute('CREATE DATABASE MY_CUSTOM_BOT')

In [None]:
# creating table
cursor.execute('CREATE TABLE MY_CUSTOM_BOT.search (search_term VARCHAR(255),search_output VARCHAR(255),url_results VARCHAR(255),term_in_url VARCHAR(255));')

In [34]:
# convert df to sql
df_search.to_sql(con = conn, if_exists='append', name='search')

  df_search.to_sql(con = conn, if_exists='append', name='search')


DatabaseError: Execution failed on sql '
        SELECT
            name
        FROM
            sqlite_master
        WHERE
            type IN ('table', 'view')
            AND name=?;
        ': Not all parameters were used in the SQL statement