In [7]:
pip install selenium pandas webdriver-manager

Collecting selenium
  Downloading selenium-4.29.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing_extensions~=4.9 (from selenium)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting websocket-client~=1.8 (from selenium)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-25.1.0-py3-none-any.whl.metadata (10 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting sniffio>=1.3.0 (from trio~=0.17->selenium)
  Downloading sniffio-1.3.1-py3-no

In [8]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd

def remove_date(text):
    """Remove date from the Title text."""
    return ' '.join(text.split()[1:])

def extract_news_details(base_url, max_pages):
    """Extract news details from the given base URL up to the specified number of pages."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    page_url = base_url

    # Initialize lists to store the details
    titles = []
    kickers = []
    images = []
    links = []
    dates = []

    # Initialize page counter
    page_counter = 0

    while page_counter < max_pages:
        # Open the URL
        driver.get(page_url)

        # Select all div elements with the class "item_noticias"
        items = driver.find_elements(By.CLASS_NAME, 'item_noticias')

        # Iterate over each element and extract the necessary details
        for item in items:
            title = item.find_element(By.CLASS_NAME, 'fuente_roboto_slab').text
            kicker = item.find_element(By.TAG_NAME, 'a').get_attribute('title')
            image = item.find_element(By.TAG_NAME, 'img').get_attribute('src')
            link = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
            date = item.find_element(By.CLASS_NAME, 'fecha_item_listado_noticias').text

            titles.append(title)
            kickers.append(kicker)
            images.append(image)
            links.append(link)
            dates.append(date)

        # Check if there is a "Next" button to go to the next page
        try:
            next_button = driver.find_element(By.CLASS_NAME, 'boton_paginador siguiente')
            page_number = int(page_url.split('=')[-1]) if '=' in page_url else 1
            page_url = f"{base_url}?buscar=&pagina={page_number + 1}"
            page_counter += 1
        except:
            break

    # Close the WebDriver
    driver.quit()

    # Create a DataFrame to store the details
    data = {
        'Date': dates,
        'Title': titles,
        'Kicker': kickers,
        'Image': images,
        'Link': links
    }
    df = pd.DataFrame(data)

    # Apply the remove_date function to the 'Title' column
    df['Title'] = df['Title'].apply(remove_date)

    return df

def get_category_links():
    """Get category links from the main page."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    url = "https://www.yogonet.com/international/"

    # Open the URL
    driver.get(url)

    # Move cursor over the "Categories" tab
    categories_tab = driver.find_element(By.CSS_SELECTOR, '.item_menu.transition_02.tiene_hijos.categorias')
    ActionChains(driver).move_to_element(categories_tab).perform()

    # Select all elements with the class "item_menu hijo"
    items = driver.find_elements(By.CSS_SELECTOR, '.contenedor_items_hijos .item_menu.hijo')

    # Initialize list to store the links
    links = []

    # Iterate over each element and extract the necessary details
    for item in items:
        link = item.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
        links.append(link)

    # Close the WebDriver
    driver.quit()

    # Create a DataFrame to store the details
    data = {
        'Link': links
    }
    df = pd.DataFrame(data)

    # Convert the 'Link' column to a list of URLs
    urls = df['Link'].tolist()

    return urls[:4]

# Call the function and display the list of URLs
urls = get_category_links()

# Initialize an empty DataFrame to store combined results
combined_df = pd.DataFrame()

for url in urls:
    df = extract_news_details(url, max_pages=1)
    combined_df = pd.concat([combined_df, df], ignore_index=True)

# Display the combined DataFrame
combined_df

Unnamed: 0,Date,Title,Kicker,Image,Link
0,2025-03-07,GOLDSTEIN TO BECOME SENIOR ADVISER,Las Vegas Sands CEO Robert Goldstein to step d...,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...
1,2025-03-07,HAS OVER 250 ROOMS,Michigan’s Gun Lake Casino opens new 16-story ...,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...
2,2025-03-07,"TO DRIVE GROWTH, ENHANCE GUEST EXPERIENCE",Virgin Las Vegas appoints former Venetian exec...,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...
3,2025-03-07,EBITDA DROPS SHARPLY TO $1 MILLION,Resorts World Las Vegas reports weak Q4 2024 e...,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...
4,2025-03-06,EXCEEDS GOVERNMENT PROJECTIONS BY 5.4%,Macau’s gaming tax revenue up 35% to $11 billi...,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...
...,...,...,...,...,...
75,2024-10-08,ITS 11TH STATE IN THE US,"Lotto.com enters Massachusetts, bringing first...",https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...
76,2024-09-30,SUSTAINABLE PRACTICES ADOPTED,The European Lotteries announces Greenhouse Ga...,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...
77,2024-09-30,EIGHT ADDITIONAL CATEGORIES,Texas Lottery wins two awards at NASPL Conference,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...
78,2024-09-27,AIMS TO ENHANCE PLAYER ENGAGEMENT,Florida Lottery unveils four new scratch-off g...,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...


In [9]:
from collections import Counter
import pandas as pd
import re
from textblob import TextBlob

def extract_keywords(titles, num_keywords=10):
    """Extract the most frequent keywords from the titles."""
    # Combine all titles into a single string
    combined_text = ' '.join(titles)

    # Remove punctuation and convert to lowercase
    combined_text = re.sub(r'[^\w\s]', '', combined_text).lower()

    # Split the text into words
    words = combined_text.split()

    # Count the frequency of each word
    word_counts = Counter(words)

    # Get the most common keywords
    common_keywords = word_counts.most_common(num_keywords)

    return [keyword for keyword, count in common_keywords]

def post_process_data(df):
    """Perform post-processing on the scraped data."""

    def calculate_readability(text):
        """Calculate readability score using Flesch-Kincaid readability tests."""
        words = text.split()
        num_words = len(words)
        num_sentences = text.count('.') + text.count('!') + text.count('?')
        num_syllables = sum([len(re.findall(r'[aeiouy]+', word.lower())) for word in words])

        if num_words == 0 or num_sentences == 0:
            return 0

        flesch_kincaid_score = 206.835 - 1.015 * (num_words / num_sentences) - 84.6 * (num_syllables / num_words)
        return flesch_kincaid_score

    def calculate_complexity(text):
        """Calculate title complexity based on average word length and sentence length."""
        words = text.split()
        num_words = len(words)

        if num_words == 0:
            return 0

        avg_word_length = sum(len(word) for word in words) / num_words
        return avg_word_length

    def sentiment_analysis(text):
        """Perform sentiment analysis on the text."""
        analysis = TextBlob(text)
        return analysis.sentiment.polarity

    # Extract keywords from all titles
    keywords_to_check = extract_keywords(df['Title'])

    def keyword_frequency(text, keywords):
        """Count frequency of specific keywords in the text."""
        word_list = text.lower().split()
        keyword_count = {keyword: word_list.count(keyword) for keyword in keywords}
        return keyword_count

    # Word count in Title
    df['Word_Count_Title'] = df['Title'].apply(lambda x: len(x.split()))

    # Word count in Kicker
    df['Word_Count_Kicker'] = df['Kicker'].apply(lambda x: len(x.split()))

    # Character count in Title
    df['Character_Count_Title'] = df['Title'].apply(lambda x: len(x))

    # Character count in Kicker
    df['Character_Count_Kicker'] = df['Kicker'].apply(lambda x: len(x))

    # List of words that start with a capital letter in Title
    df['Capital_Words_Kicker'] = df['Kicker'].apply(lambda x: [word for word in x.split() if word.istitle()])

    # Sentiment analysis on Title
    df['Sentiment_Title'] = df['Title'].apply(sentiment_analysis)

    # Sentiment analysis on Kicker
    df['Sentiment_Kicker'] = df['Kicker'].apply(sentiment_analysis)

    # Keyword frequency count in Title
    df['Keyword_Frequency_Title'] = df['Title'].apply(lambda x: keyword_frequency(x, keywords_to_check))

    # Keyword frequency count in Kicker
    df['Keyword_Frequency_Kicker'] = df['Kicker'].apply(lambda x: keyword_frequency(x, keywords_to_check))

    # Readability Score of Title
    df['Readability_Score_Title'] = df['Title'].apply(calculate_readability)

    # Readability Score of Kicker
    df['Readability_Score_Kicker'] = df['Kicker'].apply(calculate_readability)

    # Title complexity based on average word length and sentence length
    df['Title_Complexity'] = df['Title'].apply(calculate_complexity)

    return df

combined_df = post_process_data(combined_df)

# Display the processed DataFrame
combined_df

Unnamed: 0,Date,Title,Kicker,Image,Link,Word_Count_Title,Word_Count_Kicker,Character_Count_Title,Character_Count_Kicker,Capital_Words_Kicker,Sentiment_Title,Sentiment_Kicker,Keyword_Frequency_Title,Keyword_Frequency_Kicker,Readability_Score_Title,Readability_Score_Kicker,Title_Complexity
0,2025-03-07,GOLDSTEIN TO BECOME SENIOR ADVISER,Las Vegas Sands CEO Robert Goldstein to step d...,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...,5,15,34,84,"[Las, Vegas, Sands, Robert, Goldstein, Patrick...",0.000,-0.155556,"{'to': 1, 'for': 0, 'in': 0, 'million': 0, 'up...","{'to': 2, 'for': 0, 'in': 1, 'million': 0, 'up...",0.00,0.000000,6.000000
1,2025-03-07,HAS OVER 250 ROOMS,Michigan’s Gun Lake Casino opens new 16-story ...,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...,4,11,18,70,"[Gun, Lake, Casino, $300M]",0.000,0.136364,"{'to': 0, 'for': 0, 'in': 0, 'million': 0, 'up...","{'to': 0, 'for': 0, 'in': 1, 'million': 0, 'up...",0.00,0.000000,3.750000
2,2025-03-07,"TO DRIVE GROWTH, ENHANCE GUEST EXPERIENCE",Virgin Las Vegas appoints former Venetian exec...,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...,6,15,41,101,"[Virgin, Las, Vegas, Venetian, John, Fechik, S...",0.000,0.000000,"{'to': 1, 'for': 0, 'in': 0, 'million': 0, 'up...","{'to': 0, 'for': 0, 'in': 0, 'million': 0, 'up...",0.00,0.000000,6.000000
3,2025-03-07,EBITDA DROPS SHARPLY TO $1 MILLION,Resorts World Las Vegas reports weak Q4 2024 e...,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...,6,14,34,84,"[Resorts, World, Las, Vegas, Q4]",-0.125,-0.375000,"{'to': 1, 'for': 0, 'in': 0, 'million': 1, 'up...","{'to': 1, 'for': 0, 'in': 0, 'million': 1, 'up...",0.00,95.939286,4.833333
4,2025-03-06,EXCEEDS GOVERNMENT PROJECTIONS BY 5.4%,Macau’s gaming tax revenue up 35% to $11 billi...,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...,5,11,38,56,[],0.000,0.000000,"{'to': 0, 'for': 0, 'in': 0, 'million': 0, 'up...","{'to': 1, 'for': 0, 'in': 1, 'million': 0, 'up...",49.48,0.000000,6.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,2024-10-08,ITS 11TH STATE IN THE US,"Lotto.com enters Massachusetts, bringing first...",https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...,6,9,24,83,"[Massachusetts,]",0.000,0.000000,"{'to': 0, 'for': 0, 'in': 1, 'million': 0, 'up...","{'to': 0, 'for': 0, 'in': 0, 'million': 0, 'up...",0.00,-18.500000,3.166667
76,2024-09-30,SUSTAINABLE PRACTICES ADOPTED,The European Lotteries announces Greenhouse Ga...,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...,3,13,29,103,"[The, European, Lotteries, Greenhouse, Gas, Em...",0.000,0.000000,"{'to': 0, 'for': 0, 'in': 0, 'million': 0, 'up...","{'to': 1, 'for': 0, 'in': 0, 'million': 0, 'up...",0.00,0.000000,9.000000
77,2024-09-30,EIGHT ADDITIONAL CATEGORIES,Texas Lottery wins two awards at NASPL Conference,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...,3,8,27,49,"[Texas, Lottery, Conference]",0.000,0.300000,"{'to': 0, 'for': 0, 'in': 0, 'million': 0, 'up...","{'to': 0, 'for': 0, 'in': 0, 'million': 0, 'up...",0.00,0.000000,8.333333
78,2024-09-27,AIMS TO ENHANCE PLAYER ENGAGEMENT,Florida Lottery unveils four new scratch-off g...,https://imagenesyogonet.b-cdn.net/data/imagene...,https://www.yogonet.com/international/news/202...,5,14,33,88,"[Florida, Lottery]",0.000,0.136364,"{'to': 1, 'for': 0, 'in': 0, 'million': 0, 'up...","{'to': 0, 'for': 0, 'in': 1, 'million': 1, 'up...",0.00,0.000000,5.800000


In [11]:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
from collections import Counter
import re
import os

# Install necessary libraries
!pip install selenium pandas webdriver-manager textblob

def remove_date(text):
    """Remove date from the Title text."""
    return ' '.join(text.split()[1:])

def extract_news_details(base_url, max_pages):
    """Extract news details from the given base URL up to the specified number of pages."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    page_url = base_url

    # Initialize lists to store the details
    titles = []
    kickers = []
    images = []
    links = []
    dates = []

    # Initialize page counter
    page_counter = 0

    while page_counter < max_pages:
        # Open the URL
        driver.get(page_url)

        # Select all div elements with the class "item_noticias"
        items = driver.find_elements(By.CLASS_NAME, 'item_noticias')

        # Iterate over each element and extract the necessary details
        for item in items:
            title = item.find_element(By.CLASS_NAME, 'fuente_roboto_slab').text
            kicker = item.find_element(By.TAG_NAME, 'a').get_attribute('title')
            image = item.find_element(By.TAG_NAME, 'img').get_attribute('src')
            link = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
            date = item.find_element(By.CLASS_NAME, 'fecha_item_listado_noticias').text

            titles.append(title)
            kickers.append(kicker)
            images.append(image)
            links.append(link)
            dates.append(date)

        # Check if there is a "Next" button to go to the next page
        try:
            next_button = driver.find_element(By.CLASS_NAME, 'boton_paginador siguiente')
            page_number = int(page_url.split('=')[-1]) if '=' in page_url else 1
            page_url = f"{base_url}?buscar=&pagina={page_number + 1}"
            page_counter += 1
        except:
            break

    # Close the WebDriver
    driver.quit()

    # Create a DataFrame to store the details
    data = {
        'Date': dates,
        'Title': titles,
        'Kicker': kickers,
        'Image': images,
        'Link': links
    }
    df = pd.DataFrame(data)

    # Apply the remove_date function to the 'Title' column
    df['Title'] = df['Title'].apply(remove_date)

    return df


def get_category_links():
    """Get category links from the main page."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    url = "https://www.yogonet.com/international/"

    # Open the URL
    driver.get(url)

    # Move cursor over the "Categories" tab
    categories_tab = driver.find_element(By.CSS_SELECTOR, '.item_menu.transition_02.tiene_hijos.categorias')
    ActionChains(driver).move_to_element(categories_tab).perform()

    # Select all elements with the class "item_menu hijo"
    items = driver.find_elements(By.CSS_SELECTOR, '.contenedor_items_hijos .item_menu.hijo')

    # Initialize list to store the links
    links = []

    # Iterate over each element and extract the necessary details
    for item in items:
        link = item.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
        links.append(link)

    # Close the WebDriver
    driver.quit()

    # Create a DataFrame to store the details
    data = {
        'Link': links
    }
    df = pd.DataFrame(data)

    # Convert the 'Link' column to a list of URLs
    urls = df['Link'].tolist()

    return urls[:4]


# Call the function and display the list of URLs
urls = get_category_links()

# Initialize an empty DataFrame to store combined results
combined_df = pd.DataFrame()

for url in urls:
    df = extract_news_details(url, max_pages=1)
    combined_df = pd.concat([combined_df, df], ignore_index=True)


def extract_keywords(titles):
    """Extract the most frequent keywords from the titles."""
    # Combine all titles into a single string
    combined_text = ' '.join(titles)

    # Remove punctuation and convert to lowercase
    combined_text = re.sub(r'[^\w\s]', '', combined_text).lower()

    # Split the text into words
    words = combined_text.split()

    # Count the frequency of each word
    word_counts = Counter(words)

    # Find the maximum frequency
    max_frequency = max(word_counts.values()) if word_counts else 0

    # Get keywords with maximum frequency
    keywords = [word for word, count in word_counts.items() if count == max_frequency]
    
    return keywords

def post_process_data(df):
    """Perform post-processing on the scraped data."""

    def calculate_readability(text):
        """Calculate readability score using Flesch-Kincaid readability tests."""
        words = text.split()
        num_words = len(words)
        num_sentences = text.count('.') + text.count('!') + text.count('?')
        num_syllables = sum([len(re.findall(r'[aeiouy]+', word.lower())) for word in words])

        if num_words == 0 or num_sentences == 0:
            return 0

        flesch_kincaid_score = 206.835 - 1.015 * (num_words / num_sentences) - 84.6 * (num_syllables / num_words)
        return flesch_kincaid_score

    def calculate_complexity(text):
        """Calculate title complexity based on average word length and sentence length."""
        words = text.split()
        num_words = len(words)

        if num_words == 0:
            return 0

        avg_word_length = sum(len(word) for word in words) / num_words
        return avg_word_length

    def sentiment_analysis(text):
        """Perform sentiment analysis on the text."""
        analysis = TextBlob(text)
        return analysis.sentiment.polarity

    # Extract keywords from all titles
    keywords_to_check = extract_keywords(df['Title'])

    def keyword_frequency(text, keywords):
        """Count frequency of specific keywords in the text."""
        word_list = text.lower().split()
        keyword_count = {keyword: word_list.count(keyword) for keyword in keywords}
        return keyword_count

    # Word count in Title
    df['Word_Count_Title'] = df['Title'].apply(lambda x: len(x.split()))

    # Word count in Kicker
    df['Word_Count_Kicker'] = df['Kicker'].apply(lambda x: len(x.split()))

    # Character count in Title
    df['Character_Count_Title'] = df['Title'].apply(lambda x: len(x))

    # Character count in Kicker
    df['Character_Count_Kicker'] = df['Kicker'].apply(lambda x: len(x))

    # List of words that start with a capital letter in Title
    df['Capital_Words_Kicker'] = df['Kicker'].apply(lambda x: [word for word in x.split() if word.istitle()])

    # Sentiment analysis on Title
    df['Sentiment_Title'] = df['Title'].apply(sentiment_analysis)

    # Sentiment analysis on Kicker
    df['Sentiment_Kicker'] = df['Kicker'].apply(sentiment_analysis)

    # Keyword frequency count in Title
    df['Keyword_Frequency_Title'] = df['Title'].apply(lambda x: keyword_frequency(x, keywords_to_check))

    # Keyword frequency count in Kicker
    df['Keyword_Frequency_Kicker'] = df['Kicker'].apply(lambda x: keyword_frequency(x, keywords_to_check))

    # Readability Score of Title
    df['Readability_Score_Title'] = df['Title'].apply(calculate_readability)

    # Readability Score of Kicker
    df['Readability_Score_Kicker'] = df['Kicker'].apply(calculate_readability)

    # Title complexity based on average word length and sentence length
    df['Title_Complexity'] = df['Title'].apply(calculate_complexity)

    return df

combined_df = post_process_data(combined_df)

# Save the DataFrame to a CSV file
combined_df.to_csv('combined_news_data.csv', encoding='utf-8-sig', index=False)

# Save the DataFrame to a CSV file in the current directory
current_directory = os.getcwd()
file_path = os.path.join(current_directory, 'combined_news_data.csv')
combined_df.to_csv(file_path, encoding='utf-8-sig', index=False)

print(f"File saved to: {file_path}")



KeyboardInterrupt: 

In [10]:
import os
import re
from collections import Counter

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from textblob import TextBlob

def remove_date(text):
    """Remove date from the Title text."""
    return ' '.join(text.split()[1:])

def extract_news_details(base_url, max_pages):
    """Extract news details from the given base URL up to the specified number of pages."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    page_url = base_url

    # Initialize lists to store the details
    titles, kickers, images, links, dates = [], [], [], [], []

    # Initialize page counter
    page_counter = 0

    while page_counter < max_pages:
        # Open the URL
        driver.get(page_url)

        # Select all div elements with the class "item_noticias"
        items = driver.find_elements(By.CLASS_NAME, 'item_noticias')

        # Iterate over each element and extract the necessary details
        for item in items:
            title = item.find_element(By.CLASS_NAME, 'fuente_roboto_slab').text
            kicker = item.find_element(By.TAG_NAME, 'a').get_attribute('title')
            image = item.find_element(By.TAG_NAME, 'img').get_attribute('src')
            link = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
            date = item.find_element(By.CLASS_NAME, 'fecha_item_listado_noticias').text

            titles.append(title)
            kickers.append(kicker)
            images.append(image)
            links.append(link)
            dates.append(date)

        # Check if there is a "Next" button to go to the next page
        try:
            next_button = driver.find_element(By.CLASS_NAME, 'boton_paginador siguiente')
            page_number = int(page_url.split('=')[-1]) if '=' in page_url else 1
            page_url = f"{base_url}?buscar=&pagina={page_number + 1}"
            page_counter += 1
        except:
            break

    # Close the WebDriver
    driver.quit()

    # Create a DataFrame to store the details
    data = {
        'Date': dates,
        'Title': titles,
        'Kicker': kickers,
        'Image': images,
        'Link': links
    }
    df = pd.DataFrame(data)

    # Apply the remove_date function to the 'Title' column
    df['Title'] = df['Title'].apply(remove_date)

    return df

def get_category_links():
    """Get category links from the main page."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)

    url = "https://www.yogonet.com/international/"

    # Open the URL
    driver.get(url)

    # Move cursor over the "Categories" tab
    categories_tab = driver.find_element(By.CSS_SELECTOR, '.item_menu.transition_02.tiene_hijos.categorias')
    ActionChains(driver).move_to_element(categories_tab).perform()

    # Select all elements with the class "item_menu hijo"
    items = driver.find_elements(By.CSS_SELECTOR, '.contenedor_items_hijos .item_menu.hijo')

    # Initialize list to store the links
    links = [item.find_element(By.CSS_SELECTOR, 'a').get_attribute('href') for item in items]

    # Close the WebDriver
    driver.quit()

    return links[:4]

def extract_keywords(text, num_keywords=10):
    """Extract the most frequent keywords from a given text."""
    combined_text = re.sub(r'[^\w\s]', '', text).lower()
    
    words = combined_text.split()
    
    word_counts = Counter(words)
    
    common_keywords = word_counts.most_common(num_keywords)
    
    return [keyword for keyword, count in common_keywords]

def post_process_data(df):
    """Perform post-processing on the scraped data."""

    def calculate_readability(text):
        """Calculate readability score using Flesch-Kincaid readability tests."""
        words = text.split()
        num_words = len(words)
        num_sentences = text.count('.') + text.count('!') + text.count('?')
        num_syllables = sum([len(re.findall(r'[aeiouy]+', word.lower())) for word in words])

        if num_words == 0 or num_sentences == 0:
            return 0

        flesch_kincaid_score = 206.835 - 1.015 * (num_words / num_sentences) - 84.6 * (num_syllables / num_words)
        return flesch_kincaid_score

    def calculate_complexity(text):
        """Calculate title complexity based on average word length and sentence length."""
        words = text.split()
        num_words = len(words)

        if num_words == 0:
            return 0

        avg_word_length = sum(len(word) for word in words) / num_words
        return avg_word_length

    def sentiment_analysis(text):
        """Perform sentiment analysis on the text."""
        analysis = TextBlob(text)
        return analysis.sentiment.polarity

    # Extract keywords from all titles (maximum 10)
    keywords_to_check_title = extract_keywords(' '.join(df['Title']))
    keywords_to_check_kicker = extract_keywords(' '.join(df['Kicker']))

    def keyword_frequency(text, keywords):
        """Count frequency of specific keywords in the text."""
        word_list = text.lower().split()
        keyword_count = {keyword: word_list.count(keyword) for keyword in keywords}
        return keyword_count

    # Word count in Title
    df['Word_Count_Title'] = df['Title'].apply(lambda x: len(x.split()))

    # Word count in Kicker
    df['Word_Count_Kicker'] = df['Kicker'].apply(lambda x: len(x.split()))

    # Character count in Title
    df['Character_Count_Title'] = df['Title'].apply(lambda x: len(x))

    # Character count in Kicker
    df['Character_Count_Kicker'] = df['Kicker'].apply(lambda x: len(x))

    # List of words that start with a capital letter in Title
    df['Capital_Words_Kicker'] = df['Kicker'].apply(lambda x: [word for word in x.split() if word.istitle()])

    # Sentiment analysis on Title
    df['Sentiment_Title'] = df['Title'].apply(sentiment_analysis)

    # Sentiment analysis on Kicker
    df['Sentiment_Kicker'] = df['Kicker'].apply(sentiment_analysis)

    # Keyword frequency count in Title
    df['Keyword_Frequency_Title'] = df['Title'].apply(lambda x: keyword_frequency(x, keywords_to_check_title))

    # Keyword frequency count in Kicker
    df['Keyword_Frequency_Kicker'] = df['Kicker'].apply(lambda x: keyword_frequency(x, keywords_to_check_kicker))

    # Readability Score of Title
    df['Readability_Score_Title'] = df['Title'].apply(calculate_readability)

    # Readability Score of Kicker
    df['Readability_Score_Kicker'] = df['Kicker'].apply(calculate_readability)

    # Title complexity based on average word length and sentence length
    df['Title_Complexity'] = df['Title'].apply(calculate_complexity)

    return df

def main():
    """Main function to run the pipeline."""
    # Call the function and display the list of URLs
    urls = get_category_links()

    # Initialize an empty DataFrame to store combined results
    combined_df = pd.DataFrame()

    for url in urls:
        df = extract_news_details(url, max_pages=1)
        combined_df = pd.concat([combined_df, df], ignore_index=True)

    combined_df = post_process_data(combined_df)

    # Save the DataFrame to a CSV file
    file_path = os.path.join(os.getcwd(), 'combined_news_data.csv')
    combined_df.to_csv(file_path, encoding='utf-8-sig', index=False)

    print(f"File saved to: {file_path}")

if __name__ == "__main__":
    main()

File saved to: c:\Users\56937\OneDrive\data_engineer\scraper_news_yogonet\combined_news_data.csv
