In [1]:
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import requests

# Workbook that supplies the access passwords; results are written back to
# this same path by the row-processing code.
file_path = 'Government website Scraping.xlsx'
df = pd.read_excel(file_path)

# User-agent string handed to Chrome via --user-agent below.
test_ua = 'Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'

# Build the Chrome options, adding the command-line switches in a fixed order.
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in (
    "--window-size=1920,1080",
    f'--user-agent={test_ua}',
    '--no-sandbox',
    "--disable-extensions",
):
    chrome_options.add_argument(_chrome_flag)

# Start Chrome, using the driver path returned by ChromeDriverManager.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)

# Page that is loaded for every lookup.
url = "https://nacionalidade.justica.gov.pt/"

# 2Captcha API key (placeholder; replace with a real key before running).
API_KEY = 'Get Your API key'

# Function to solve reCAPTCHA with 2Captcha
def solve_recaptcha_2captcha(site_key, url, poll_interval=5, max_wait=180,
                             request_timeout=30):
    """Submit a reCAPTCHA job to 2Captcha and return the solved token.

    Uses the module-level ``API_KEY`` for authentication.

    Args:
        site_key: The reCAPTCHA site key of the target page.
        url: URL of the page that hosts the reCAPTCHA.
        poll_interval: Seconds to sleep between "not ready" polls.
        max_wait: Maximum seconds to keep polling (counted after the initial
            20 second wait) before giving up.
        request_timeout: Timeout in seconds for each HTTP request.

    Returns:
        The token string that follows ``OK|`` in the 2Captcha result reply.

    Raises:
        RuntimeError: If 2Captcha does not answer ``OK|...`` to either the
            submit or the result request, or if no answer is ready within
            ``max_wait`` seconds.
        requests.RequestException: If an HTTP request fails or times out.
    """
    # Submit the job. HTTPS keeps the API key out of plaintext traffic.
    submit_reply = requests.post(
        "https://2captcha.com/in.php",
        data={
            "key": API_KEY,
            "method": "userrecaptcha",
            "googlekey": site_key,
            "pageurl": url,
        },
        timeout=request_timeout,
    ).text.strip()

    # A successful reply is "OK|<id>"; anything else is an error code, which
    # previously surfaced only as an opaque IndexError.
    status, _, captcha_id = submit_reply.partition('|')
    if status != 'OK' or not captcha_id:
        raise RuntimeError(f"2Captcha rejected the captcha request: {submit_reply!r}")

    # Wait for the CAPTCHA to be solved before the first poll
    time.sleep(20)

    # Poll for the answer, but never loop forever if the job stalls.
    deadline = time.monotonic() + max_wait
    while True:
        poll_reply = requests.get(
            "https://2captcha.com/res.php",
            params={"key": API_KEY, "action": "get", "id": captcha_id},
            timeout=request_timeout,
        ).text.strip()
        if 'CAPCHA_NOT_READY' not in poll_reply:
            break
        if time.monotonic() >= deadline:
            raise RuntimeError(
                f"2Captcha did not solve captcha {captcha_id} within {max_wait} seconds"
            )
        time.sleep(poll_interval)

    status, _, recaptcha_answer = poll_reply.partition('|')
    if status != 'OK' or not recaptcha_answer:
        raise RuntimeError(f"2Captcha failed to solve the captcha: {poll_reply!r}")
    return recaptcha_answer

# Helper: wait for one element on the current page
def _wait_for_element(by, value, timeout=20):
    """Return the element located by (by, value) once it is present in the DOM."""
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((by, value)))


# Function to process a row
def process_row(index, row):
    """Look up one spreadsheet row on the site and store the scraped result.

    Relies on the module-level ``driver``, ``url``, ``df`` and ``file_path``.
    The page is loaded, the row's 'Access Password' is typed in, the reCAPTCHA
    is solved through ``solve_recaptcha_2captcha`` and the three result texts
    are written into ``df`` and saved to ``file_path``. Any exception during
    an attempt is printed and the whole attempt is retried, up to 3 times.

    Args:
        index: Row label in ``df``; also used (plus one) in log messages.
        row: The row data; must provide an 'Access Password' entry.

    Returns:
        None. On success ``df`` and the Excel file are updated; on failure a
        message is printed and the row is left unchanged.
    """
    from urllib.parse import parse_qs, urlparse

    # A blank cell would otherwise be typed as "nan" and burn three captcha
    # solves on a lookup that cannot succeed.
    if pd.isna(row['Access Password']):
        print(f"Row {index + 1}: Failed to process after multiple retries. Skipping this row.")
        return
    # Excel cells may come back as numpy scalars; send_keys needs text.
    access_password = str(row['Access Password'])

    for retry in range(3):  # Retry up to 3 times
        driver.get(url)

        try:
            # Wait for the reCAPTCHA iframe to be present
            recaptcha_iframe = _wait_for_element(By.XPATH, '//iframe[@title="reCAPTCHA"]')
            print(f"Row {index + 1}: reCAPTCHA iframe found.")

            senha_acesso = _wait_for_element(By.ID, "SenhaAcesso")
            print(f"Row {index + 1}: Senha Acesso field found.")

            btn_pesquisa = _wait_for_element(By.ID, "btnPesquisa")
            print(f"Row {index + 1}: Pesquisa button found.")

            # Enter the key from the current row
            senha_acesso.clear()
            senha_acesso.send_keys(access_password)
            time.sleep(3)
            print(f"Row {index + 1}: Entered Access Password.")

            # Get the site key from the iframe URL's "k" query parameter.
            # Parsing the query avoids matching another parameter that merely
            # ends in "k".
            iframe_src = recaptcha_iframe.get_attribute("src") or ""
            site_key_values = parse_qs(urlparse(iframe_src).query).get("k")
            if not site_key_values:
                raise ValueError(f"No site key found in reCAPTCHA iframe src: {iframe_src!r}")
            site_key = site_key_values[0]

            # Solve the reCAPTCHA using 2Captcha
            recaptcha_response = solve_recaptcha_2captcha(site_key, url)

            # Inject the reCAPTCHA response into the page. The token is passed
            # as a script argument rather than spliced into the JS source, so
            # no character in it can break or alter the script.
            driver.execute_script(
                'document.getElementById("g-recaptcha-response").innerHTML = arguments[0];',
                recaptcha_response,
            )
            time.sleep(3)
            print(f"Row {index + 1}: Injected reCAPTCHA response.")

            # Click on the search button after solving the CAPTCHA
            btn_pesquisa.click()
            print(f"Row {index + 1}: Clicked Pesquisa button after solving reCAPTCHA.")

            # Wait for the result elements to be present and scrape the required data
            application_number_text = _wait_for_element(By.XPATH, '//*[@id="bloc1"]').text
            print(f"Row {index + 1}: Application Number Text: {application_number_text}")

            name = _wait_for_element(
                By.XPATH, '//*[@id="search-results"]/div/div[3]').text
            print(f"Row {index + 1}: Name found: {name}")

            status_text = _wait_for_element(
                By.XPATH, '//*[@id="search-results"]/div/div[7]').text
            print(f"Row {index + 1}: Status Text found: {status_text}")

            # Debug print statements to verify data
            print(f"Row {index + 1}:")
            print(f"Access Password: {row['Access Password']}")
            print(f"Application Number: {application_number_text}")
            print(f"Name: {name}")
            print(f"Status Text: {status_text}")
            print("-" * 40)

            # Update the DataFrame
            df.at[index, 'Application Number'] = application_number_text
            df.at[index, 'Name'] = name
            df.at[index, 'Status Text'] = status_text

            # Save after every row so progress survives an interrupted run
            df.to_excel(file_path, index=False)

            return  # Exit the retry loop if successful

        except Exception as e:
            # Deliberate best-effort: any failure (timeout, captcha error,
            # stale element, ...) triggers a fresh attempt for this row.
            print(f"Error processing row {index + 1} on attempt {retry + 1}: {e}")
            time.sleep(10)  # Wait before retrying the entire row

    print(f"Row {index + 1}: Failed to process after multiple retries. Skipping this row.")

# Iterate through each row in the Excel file and process those that still
# lack any of the three result values. A result column that does not exist
# yet is treated the same as an empty one.
try:
    for index, row in df.iterrows():
        if any(pd.isna(row.get(column))
               for column in ('Application Number', 'Name', 'Status Text')):
            process_row(index, row)
finally:
    # Always close the browser, even when the run is interrupted
    # (e.g. KeyboardInterrupt) or a row raises unexpectedly; otherwise the
    # Chrome process is left running.
    driver.quit()

print("Done")


KeyboardInterrupt: 