-
Notifications
You must be signed in to change notification settings - Fork 0
/
keys.py
189 lines (152 loc) · 6.65 KB
/
keys.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import os
import subprocess
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from openpyxl import load_workbook
import csv
import time
import re
def prevent_sleep():
    """Start a ``caffeinate`` child process to keep the system from sleeping.

    Returns:
        The ``subprocess.Popen`` handle of the spawned ``caffeinate`` command.
        Hand it to ``allow_sleep`` once the work is finished.
    """
    caffeinate_command = ['caffeinate']
    keep_awake_process = subprocess.Popen(caffeinate_command)
    return keep_awake_process
def allow_sleep(p):
    """Stop the keep-awake process so the system may sleep again.

    Args:
        p: The ``subprocess.Popen`` handle returned by ``prevent_sleep``.

    The child is terminated and then waited on, so that it is reaped and
    ``p.returncode`` is populated by the time this function returns.
    """
    p.terminate()
    try:
        # Reap the child; without a wait it can linger as a zombie until the
        # interpreter exits.
        p.wait(timeout=5)
    except subprocess.TimeoutExpired:
        # The process ignored the polite request; force it down.
        p.kill()
        p.wait()
# Prevent system sleep for the duration of the run. The returned process
# handle is kept in `p` so it can be passed to allow_sleep() at the end.
p = prevent_sleep()
def update_progress_bar(total, progress):
    """Redraw a single-line text progress bar on standard output.

    The line starts with a carriage return and has no trailing newline, so
    successive calls overwrite each other in the terminal.

    Args:
        total: Number of URLs expected overall. A value of 0 or less is
            rendered as zero progress.
        progress: Number of URLs handled so far.
    """
    bar_length = 40  # Width of the bar between the brackets
    fraction_completed = progress / total if total > 0 else 0
    # One slot is reserved for the ">" head, hence the "- 1".
    filled_slots = int(fraction_completed * bar_length - 1)
    arrow = "=" * filled_slots + ">"
    padding = ' ' * (bar_length - len(arrow))
    print(f"\rProgress: [{arrow}{padding}] {progress}/{total} URLs processed", end='', flush=True)
def colored(text, color):
    """Wrap *text* in ANSI escape codes so terminals render it in *color*.

    Args:
        text: The value to print; it is formatted into the result string.
        color: One of "red", "green", "yellow", "blue", "magenta", "cyan"
            or "white". Any other value falls back to the reset code, so
            the text is left in the terminal's default color.

    Returns:
        The text prefixed with the color code and followed by the reset code.
    """
    reset = "\033[0m"  # Resets the color to default
    ansi_codes = {
        "red": "\033[31m",
        "green": "\033[32m",
        "yellow": "\033[33m",
        "blue": "\033[34m",
        "magenta": "\033[35m",
        "cyan": "\033[36m",
        "white": "\033[37m",
    }
    prefix = ansi_codes.get(color, reset)
    return f"{prefix}{text}{reset}"
def convert_to_hyphenated(keyword):
    """Return *keyword* as a lowercase slug with spaces replaced by hyphens.

    Args:
        keyword: The keyword to convert. Non-string values are passed
            through ``str()`` first.

    Returns:
        The lowercase string with every space character turned into "-".
    """
    words = str(keyword).split(' ')
    return '-'.join(words).lower()
def keyword_in_paragraph(keyword, driver):
    """Report whether *keyword* appears in any ``<p>`` element of the page.

    The comparison is case-insensitive.

    Args:
        keyword: The keyword to look for. Non-string values (for example a
            numeric spreadsheet cell) are converted with ``str()``, matching
            what ``convert_to_hyphenated`` does.
        driver: The Selenium WebDriver holding the loaded page.

    Returns:
        'Pass' if at least one paragraph's text contains the keyword,
        otherwise 'Fail'.
    """
    # Lowercase once instead of on every loop iteration.
    needle = str(keyword).lower()
    paragraphs = driver.find_elements(By.TAG_NAME, 'p')
    if any(needle in paragraph.text.lower() for paragraph in paragraphs):
        return 'Pass'
    return 'Fail'
def count_total_words(driver):
    """Count the whitespace-separated words in the page's ``<body>`` text.

    Args:
        driver: The Selenium WebDriver holding the loaded page.

    Returns:
        The number of words in the body text.
    """
    body_text = driver.find_element(By.TAG_NAME, 'body').text
    return len(body_text.split())
def count_keyword_occurrences(keyword, driver):
    """Count case-insensitive occurrences of *keyword* in the page body text.

    Args:
        keyword: The keyword to count. Non-string values are converted with
            ``str()``, matching what ``convert_to_hyphenated`` does.
        driver: The Selenium WebDriver holding the loaded page.

    Returns:
        The number of non-overlapping occurrences of the keyword in the
        lowercased ``<body>`` text, or 0 for an empty keyword.
    """
    needle = str(keyword).lower()
    if not needle:
        # str.count('') returns len(text) + 1, which is meaningless here.
        return 0
    body_text = driver.find_element(By.TAG_NAME, 'body').text.lower()
    return body_text.count(needle)
def is_document(url):
    """Return True if *url* points at a PDF, DOC or DOCX file.

    The check is case-insensitive and looks for the extension at the end of
    the URL, optionally followed by a query string (``?...``) or a fragment
    (``#...``), so links such as ``report.pdf#page=2`` are recognized too.

    Args:
        url: The URL string to inspect.

    Returns:
        True when the URL ends in ``.pdf``, ``.doc`` or ``.docx`` (ignoring
        any trailing query string or fragment), otherwise False.
    """
    return re.search(r'\.(pdf|docx?)([?#].*)?$', url.lower()) is not None
def next_available_filename(base_name, extension):
    """Return the first ``{base_name}-{n}{extension}`` path that does not exist.

    Numbering starts at 1 and increases until an unused name is found.

    Args:
        base_name: Path or file name without the numeric suffix and extension.
        extension: File extension including the leading dot, e.g. ".csv".

    Returns:
        The first candidate name for which ``os.path.exists`` is False.
    """
    suffix = 1
    candidate = f"{base_name}-{suffix}{extension}"
    while os.path.exists(candidate):
        suffix += 1
        candidate = f"{base_name}-{suffix}{extension}"
    return candidate
# ---------------------------------------------------------------------------
# Main script: read (keyword, URL) pairs from the workbook, load every URL in
# headless Chrome, run the on-page keyword checks and write one CSV row per
# successfully analysed URL.
# ---------------------------------------------------------------------------

# File path for the input workbook. The first two columns of each row are read
# as keyword and URL; row 1 is skipped (presumably a header row).
input_file_path = 'rank.xlsx'
# Base name and extension for output file
base_output_file_name = 'output'
output_file_extension = '.csv'
# Find the next available file name (output-1.csv, output-2.csv, ...) so an
# existing report is never overwritten
output_file_path = next_available_filename(base_output_file_name, output_file_extension)

# Initialize a list to store results
results = []
processed_urls = 0
driver = None
try:
    # Selenium WebDriver configuration for headless Chrome
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    service = Service(executable_path='/opt/homebrew/bin/chromedriver')
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Load the Excel workbook and select the active sheet
    wb = load_workbook(filename=input_file_path)
    ws = wb.active

    # Collect the rows to process in a single pass. Rows with an empty keyword
    # or URL are skipped. Values are converted to str so that numeric cells do
    # not break the .lower() calls below.
    url_rows = [
        (str(row[0]), str(row[1]))
        for row in ws.iter_rows(min_row=2, values_only=True)
        if len(row) >= 2 and row[0] and row[1]
    ]
    total_urls = len(url_rows)

    # Process each URL in the Excel file
    for keyword, url in url_rows:
        # Convert keyword to hyphenated format (used for URL / image checks)
        hyphenated_keyword = convert_to_hyphenated(keyword)
        keyword_lower = keyword.lower()
        try:
            # Open the URL with Selenium WebDriver
            driver.get(url)
            time.sleep(2)  # Wait for the page to load

            # Check if the keyword is in the page title
            page_title = driver.title
            keyword_in_title = 'Pass' if keyword_lower in page_title.lower() else 'Fail'

            # Check if the keyword is in the H1 tag of the page
            try:
                h1_tag = driver.find_element(By.TAG_NAME, 'h1').text
                keyword_in_h1 = 'Pass' if keyword_lower in h1_tag.lower() else 'Fail'
            except Exception:
                # Best effort: a missing H1 (or any other lookup error) counts
                # as a failed check. Exception, unlike a bare except, still
                # lets Ctrl-C through.
                keyword_in_h1 = 'Fail'

            # Check if the hyphenated keyword is in the URL
            keyword_in_url = 'Pass' if hyphenated_keyword in url.lower() else 'Fail'

            # Check if the keyword is in at least one paragraph tag
            keyword_in_paragraph_tag = keyword_in_paragraph(keyword, driver)

            # Check if the hyphenated keyword is in image URLs and alt attributes
            keyword_in_image_url = 'Fail'
            keyword_in_alt_attribute = 'Fail'
            for image in driver.find_elements(By.TAG_NAME, 'img'):
                # get_attribute() returns None for a missing attribute, so both
                # lookups fall back to an empty string.
                if hyphenated_keyword in (image.get_attribute('src') or '').lower():
                    keyword_in_image_url = 'Pass'
                if hyphenated_keyword in (image.get_attribute('alt') or '').lower():
                    keyword_in_alt_attribute = 'Pass'

            # Count total words and keyword occurrences
            total_words = count_total_words(driver)
            keyword_occurrences = count_keyword_occurrences(keyword, driver)

            # Calculate keyword density as a percentage of all words
            keyword_density = (keyword_occurrences / total_words) * 100 if total_words > 0 else 0

            results.append((url, keyword, keyword_in_title, keyword_in_h1, keyword_in_url, keyword_in_paragraph_tag, keyword_in_image_url, keyword_in_alt_attribute, f"{keyword_density:.2f}%"))
        except Exception as e:
            # One broken page must not abort the whole run; report and move on.
            print(f"Error processing URL {url}: {e}")

        # Count failed URLs as well, so the bar reaches total_urls at the end.
        processed_urls += 1
        update_progress_bar(total_urls, processed_urls)  # Update the progress bar

    # Write results to the new CSV file
    with open(output_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['URL', 'Keyword', 'Keyword in Title', 'Keyword In H1', 'Keyword in URL', 'Keyword in Paragraph', 'Keyword in Image URL', 'Keyword in Alt Attribute', 'Keyword Density'])
        writer.writerows(results)

    # Print the final message only after the file has actually been written
    output_file_full_path = os.path.join(os.getcwd(), output_file_path)
    completion_message = colored("Processing Complete! ", "magenta")
    path_message = colored(f"Your document is saved at {output_file_full_path}", "white")
    print('\n' + completion_message + path_message)
finally:
    # Always release external resources, even when the run fails part-way.
    # Quit the WebDriver
    if driver is not None:
        driver.quit()
    # Allow the system to sleep when done
    allow_sleep(p)