In [1]:
import spacy
from collections import Counter, defaultdict
from itertools import permutations
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service as ChromeService
from urllib.parse import urlparse
import csv
import os
import random
import time
from keybert import KeyBERT

  from tqdm.autonotebook import tqdm, trange


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_keywords(text, top_n=2):
    """Return up to ``top_n`` single-word keywords ranked by TF-IDF score.

    The vectorizer is fitted on ``text`` alone (a one-document corpus), so the
    inverse-document-frequency factor is identical for every term and the
    ranking effectively reflects how often each non-stop-word occurs in
    ``text``.

    Args:
        text: The string to analyse. ``None`` and empty/whitespace-only
            strings are accepted and produce an empty result.
        top_n: Maximum number of keywords to return.

    Returns:
        A list of at most ``top_n`` words, highest score first. Words with
        equal scores keep the order in which the vectorizer reports them.
        An empty list is returned when there is nothing to score.
    """
    # A missing headline arrives as None; there is nothing to score in blank text.
    if not text or not text.strip():
        return []

    # Single words only, with English stop words removed.
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))

    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when every token
        # is a stop word or is otherwise filtered out. Treat that as "no
        # keywords" instead of letting it abort the caller's whole run.
        return []

    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray().flatten()

    # sorted() is stable, so ties stay in the vectorizer's feature order.
    ranked = sorted(
        zip(feature_names, tfidf_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

    return [word for word, _ in ranked[:top_n]]

def generate_combinations(keywords):
    """Build space-joined search terms from ordered pairs and triples.

    Every ordered selection of 2 keywords comes first, followed by every
    ordered selection of 3 keywords; each selection is joined with a single
    space.

    Args:
        keywords: Iterable of strings to combine.

    Returns:
        A list of search-term strings (empty when fewer than two keywords
        are supplied).
    """
    search_terms = []
    for size in (2, 3):
        # permutations() yields tuples in a deterministic order for a given input.
        for ordering in permutations(keywords, size):
            search_terms.append(' '.join(ordering))
    return search_terms

def search_by_image(image_path, chrome_driver_path, max_results=10):
    """Upload an image to Google Images and tally keywords from the result headlines.

    A Chrome session is started, the image at ``image_path`` is uploaded via
    the search-by-image file input, and the ``aria-label`` of each result link
    is passed through ``extract_keywords``. The browser is always closed
    before returning.

    Args:
        image_path: Path of the image file sent to the upload input.
        chrome_driver_path: Path to the ChromeDriver executable.
        max_results: Maximum number of result links to inspect.

    Returns:
        ``(keyword_counter, most_common_keywords)`` where ``keyword_counter``
        is a ``Counter`` of every extracted keyword and
        ``most_common_keywords`` lists the (up to) two most frequent ones.
        ``(None, None)`` is returned when no result links are found or when
        any exception is raised while driving the browser.
    """
    service = ChromeService(executable_path=chrome_driver_path)
    driver = webdriver.Chrome(service=service)
    try:
        driver.get('https://images.google.com/')
        # NOTE(review): 'Gdd5U', 'aah4tc' and the link selector below look like
        # generated class names; presumably they change when the page is
        # redeployed - confirm they are still current before relying on this.
        search_by_image_button = WebDriverWait(driver, 3).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'Gdd5U'))
        )
        search_by_image_button.click()

        upload_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'input[type="file"]'),
            )
        )
        # Sending the path to a file input performs the upload.
        upload_input.send_keys(image_path)

        results_div = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'aah4tc'))
        )

        all_links = results_div.find_elements(
            By.CSS_SELECTOR, '.G19kAf.ENn9pd > .Vd9M6 > a'
        )
        if len(all_links) == 0:
            print("Cannot get the search results.")
            return None, None
        time.sleep(3)
        keyword_list = []
        for link in all_links[:max_results]:  # Limit to the first max_results links
            headline = link.get_attribute('aria-label')
            url = link.get_attribute('href')
            if not headline:
                # get_attribute() returns None when the attribute is absent;
                # skip such links rather than failing the whole search.
                continue
            keywords = extract_keywords(headline)
            print(f"Headline: {headline}")
            print(f"URL: {url}")
            print(f"Keywords: {', '.join(keywords)}")
            print()
            keyword_list.extend(keywords)

        # Count the frequency of each keyword
        keyword_counter = Counter(keyword_list)
        # Compute the top two once and reuse them for both the return value
        # and the printed summary.
        top_two = keyword_counter.most_common(2)
        most_common_keywords = [keyword for keyword, _ in top_two]

        print("Top 2 most repeated keywords:")
        for keyword, count in top_two:
            print(f"{keyword}: {count} times")

        return keyword_counter, most_common_keywords

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip this image.
        print(f"An error occurred: {e}")
        return None, None

    finally:
        driver.quit()


In [3]:
# Path to your ChromeDriver executable.
# Set the CHROME_DRIVER_PATH environment variable to override it on another
# machine; the original local path remains the default.
chrome_driver_path = os.environ.get(
    "CHROME_DRIVER_PATH",
    r"E:\Projects\JK\d1\chromedriver-win64\chromedriver.exe",
)

# CSV file path (read for input rows, then rewritten with the results).
# Set the OUTPUT_CSV_PATH environment variable to override it.
csv_file_path = os.environ.get(
    "OUTPUT_CSV_PATH",
    r"E:\Projects\JK\Data\output_data.csv",
)


In [4]:
def perform_google_search(search_terms, chrome_driver_path):
    """Search Google for each term and return the domain seen most often.

    For every term a new tab is opened, the term is submitted on
    google.com, and the ``href`` values of up to 10 result anchors are
    collected. The network location (``netloc``) of each URL is tallied
    across all terms. The browser is always closed before returning.

    Args:
        search_terms: Iterable of query strings.
        chrome_driver_path: Path to the ChromeDriver executable.

    Returns:
        The domain with the highest count (the first one reached on a tie),
        or ``None`` when no domain could be collected or an exception was
        raised while driving the browser.
    """
    domain_counts = defaultdict(int)

    # Initialize Chrome driver with the service object
    service = ChromeService(executable_path=chrome_driver_path)
    driver = webdriver.Chrome(service=service)

    try:
        for term in search_terms:
            # Open a new tab for each search term
            driver.execute_script("window.open('about:blank', 'tab');")
            driver.switch_to.window(driver.window_handles[-1])

            # Navigate to Google
            driver.get("https://www.google.com")

            # Find the search box
            search_box = driver.find_element(By.NAME, "q")

            # Enter the search term and submit the form
            search_box.send_keys(term)
            search_box.send_keys(Keys.RETURN)

            # Wait for the results to load
            time.sleep(random.uniform(10, 20))  # Adjust waiting time as needed

            # Find the search result links
            results = driver.find_elements(By.XPATH, "//div[@class='g']//a")

            # Extract and print the URLs of the first 10 results
            urls = [result.get_attribute("href") for result in results[:10]]
            time.sleep(3)
            print(f"Search term: {term}")
            for index, url in enumerate(urls, start=1):
                print(f"Result {index}: {url}")
                if not url:
                    # Anchor without an href: nothing to attribute to a domain.
                    continue

                # Parse the URL and extract the domain
                domain = urlparse(url).netloc
                if not domain:
                    # e.g. "javascript:" links; an empty domain must never
                    # be able to win the vote below.
                    continue

                # Count occurrences of each domain
                domain_counts[domain] += 1

            # Close the current tab
            driver.close()

            # Switch back to the main tab
            driver.switch_to.window(driver.window_handles[0])

        if not domain_counts:
            # Handle "nothing collected" explicitly instead of relying on
            # max() raising on an empty mapping.
            print("No search results were collected.")
            return None

        # Find the most repetitive domain
        most_common_domain = max(domain_counts, key=domain_counts.get)
        print(f"\nMost repetitive domain: {most_common_domain} ({domain_counts[most_common_domain]} occurrences)")

        return most_common_domain

    except Exception as e:
        # Best-effort scraping: report the failure and signal "no answer".
        print(f"An error occurred: {str(e)}")
        return None

    finally:
        # Close the browser
        driver.quit()


In [5]:
# Read the CSV file and remember its header so no input column is lost on rewrite.
with open(csv_file_path, mode='r', newline='') as file:
    reader = csv.DictReader(file)
    rows = list(reader)
    input_fieldnames = list(reader.fieldnames or [])

# The three expected columns come first, in their original order; any extra
# columns from the input follow. With a fixed three-column header,
# DictWriter raises ValueError on rows carrying other keys - and it would do
# so only after the file had already been truncated for writing.
output_fieldnames = ["Image_Path", "Input_Domain_Name", "Output_Domain_Name"]
output_fieldnames += [name for name in input_fieldnames if name not in output_fieldnames]

for row in rows:
    image_path = row['Image_Path']
    input_domain_name = row['Input_Domain_Name']

    # Perform image search and extract keywords
    keyword_counter, most_common_keywords = search_by_image(image_path, chrome_driver_path)

    # None or an empty list both mean there is nothing to search with;
    # the row is written back without an output domain.
    if not most_common_keywords:
        continue

    # Use the third keyword from the CSV file's Input_Domain_Name column
    third_keyword = input_domain_name
    most_common_keywords.append(third_keyword)

    # Generate all combinations of the top 3 keywords
    search_terms = generate_combinations(most_common_keywords)
    # Also search for the input domain name on its own.
    search_terms.append(input_domain_name)

    # Perform Google search with generated search terms
    most_common_domain = perform_google_search(search_terms, chrome_driver_path)

    # Update the CSV file with the most common domain for each image path
    row['Output_Domain_Name'] = most_common_domain

# Write the updated rows back to the CSV file
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=output_fieldnames)
    writer.writeheader()
    writer.writerows(rows)


Headline: File:Cricbuzz Logo.png - Wikimedia Commons from Wikimedia
URL: https://commons.wikimedia.org/wiki/File:Cricbuzz_Logo.png
Keywords: cricbuzz, png, logo, wikimedia, file

Headline: Cricbuzz TV on the App Store from Apple
URL: https://apps.apple.com/us/app/cricbuzz-tv/id6448011464
Keywords: cricbuzz, app, tv, apple, store

Headline: Crickbuzz - IPL 2024 from Govt. Hrangbana College
URL: https://ghbc.edu.in/college/menu/cricket/crickbuzz/
Keywords: ipl, crickbuzz, hrangbana, 2024, govt

Headline: Home - Cricket Exchange in india from cricbuzzid.com
URL: https://cricbuzzid.com/
Keywords: cricket, exchange, india, cricbuzzid, home

Headline: cricbuzz app - 9Apps from 9Apps
URL: https://www.9apps.com/query/cricbuzz-app/
Keywords: 9apps, cricbuzz, app

Headline: Eapp24 – All In One Apps from Eapp24
URL: https://easyshop64.com/
Keywords: eapp24, apps

Headline: so sad #viral 5 time winner team but now omg can't blieved yaar#for... | TikTok from TikTok - Make Your Day
URL: https://www.