In [None]:
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

def get_driver(link):
    """Start an undetected-chromedriver Chrome session and load *link*.

    After navigating, blocks for up to 30 seconds until an element with class
    ``views-field-name`` is visible on the page.

    Args:
        link: URL to open in the new browser session.

    Returns:
        Tuple ``(driver, wait)``: the Chrome driver and the 30-second
        ``WebDriverWait`` bound to it.

    Raises:
        Whatever ``driver.get`` or the wait raises (e.g. a Selenium timeout).
        The browser is quit before the exception propagates, because the
        caller never receives a handle it could use to close it.
    """
    options = uc.ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_argument("--disable-popup-blocking")

    # Popup-related content-setting prefs; presumably set so that tabs opened
    # through window.open() are not blocked -- TODO confirm the value meaning.
    prefs = {
        "profile.default_content_setting_values.popups": 0,
        "profile.default_content_settings.popups": 0,
        "profile.managed_default_content_settings.popups": 0,
    }
    options.add_experimental_option("prefs", prefs)

    driver = uc.Chrome(options=options)
    try:
        wait = WebDriverWait(driver, 30)
        driver.get(link)
        wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'views-field-name')))
    except BaseException:
        # Do not leave an orphaned Chrome process behind on failure.
        driver.quit()
        raise

    return driver, wait

def get_elements1(driver):
    """Collect story URL, topic slug and date from the listing table.

    Parses ``driver.page_source`` and walks every ``<tr>`` of the table with
    class ``views-table cols-3``. A row contributes only when all three of
    its values (URL, topic, date) are found and non-empty.

    Args:
        driver: object exposing the current page HTML as ``page_source``.

    Returns:
        Three parallel lists ``(urls, topics, dates)``. URLs are prefixed
        with ``https://www.allsides.com``; topics are the last path segment
        of the topic link's ``href``; dates are the text of the
        ``date-display-single`` span.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    rows = soup.find('table', class_='views-table cols-3').find_all('tr')

    urls, topics, dates = [], [], []

    for row in rows:
        # Story link (relative href, made absolute below).
        name_cell = row.find('td', class_='views-field views-field-name')
        story_anchor = name_cell.find('a') if name_cell else None
        url = None
        if story_anchor:
            url = story_anchor['href']
        if url:
            url = 'https://www.allsides.com' + url

        # Topic slug: last path component of the topic link.
        topic_cell = row.find('td', class_='views-field views-field-field-story-topic')
        topic_anchor = topic_cell.find('a') if topic_cell else None
        topic = None
        if topic_anchor:
            topic = topic_anchor['href']
        if topic:
            topic = topic.split('/')[-1]
        else:
            topic = None

        # Publication date text.
        date_cell = row.find('td', class_='views-field views-field-field-story-date')
        date_span = None
        if date_cell:
            date_span = date_cell.find('span', class_='date-display-single')
        date = date_span.text if date_span else None

        # Rows missing any value (e.g. header rows) are skipped.
        if not (url and topic and date):
            continue

        urls.append(url)
        topics.append(topic)
        dates.append(date)

    return urls, topics, dates

def get_sides(driver, url):
    """Open *url* in a temporary tab and return its left/center/right items.

    A new tab is opened with ``window.open``, the page is loaded there, and
    the tab is always closed again (focus returns to the first window handle)
    whether loading succeeds, times out, or raises.

    Args:
        driver: Selenium driver currently focused on the listing page.
        url: story page to inspect.

    Returns:
        Tuple ``(left, center, right)`` with the ``div`` elements of class
        ``news-item left`` / ``news-item center`` / ``news-item right`` found
        inside ``div.featured-coverage``; an entry is None when that div is
        absent. ``(None, None, None)`` is returned when the
        ``featured-coverage`` element does not show up within 5 seconds.

    Raises:
        Any exception raised by ``driver.get`` or ``driver.page_source``
        (after the temporary tab has been closed).
    """
    driver.execute_script("window.open('');")
    tabs = driver.window_handles
    driver.switch_to.window(tabs[-1])

    try:
        driver.get(url)
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'featured-coverage'))
            )
        except Exception:
            # Page has no featured coverage (or was too slow): nothing to scrape.
            return None, None, None
        html = driver.page_source
    finally:
        # Runs on every exit path so the temporary tab never accumulates.
        driver.close()
        driver.switch_to.window(tabs[0])

    soup = BeautifulSoup(html, 'html.parser')

    content = soup.find('div', class_='featured-coverage')
    if not content:
        return None, None, None

    left = content.find('div', class_='news-item left')
    center = content.find('div', class_='news-item center')
    right = content.find('div', class_='news-item right')

    return left, center, right


def get_elements2(elemento):
    """Extract article link, title, source name and bias label from a news item.

    Args:
        elemento: parsed ``news-item`` element; it must contain an
            ``a.news-title`` link and an ``a.source-area`` link holding a
            ``div.news-source`` and an ``img``.

    Returns:
        Tuple ``(new_url, new_title, site, bias)``. ``bias`` is derived from
        the image file name: the last path segment of ``src`` with ``bias-``
        and ``.png`` removed.

    Raises:
        AttributeError / TypeError / KeyError: if *elemento* is None or any
            expected sub-element or attribute is missing. The caller relies
            on this to skip incomplete items.
    """
    title_link = elemento.find('a', class_='news-title')
    new_url = title_link['href']
    new_title = title_link.text

    source_link = elemento.find('a', class_='source-area')
    site = source_link.find('div', class_='news-source').text

    icon_src = source_link.find('img')['src']
    bias = icon_src.split('/')[-1].replace('bias-', '').replace('.png', '')

    if 'x' in bias:
        # Drop dash-separated segments containing 'x' (presumably size
        # suffixes such as '100x20' -- TODO confirm), but keep the genuine
        # label 'mixed', which the plain 'x' test used to wipe out.
        bias = '-'.join(
            part for part in bias.split('-')
            if 'x' not in part or part == 'mixed'
        )

    return new_url, new_title, site, bias

def action(driver):
    """Scrape every story on the currently loaded listing page.

    Reads the story list with ``get_elements1``, visits each story through
    ``get_sides`` and extracts one row per side (left, center, right) with
    ``get_elements2``. Failures are reported on stdout and skipped so a
    single broken story or side does not abort the page.

    Args:
        driver: Selenium driver positioned on a listing page.

    Returns:
        ``pandas.DataFrame`` with columns ``url``, ``topic``, ``date``,
        ``title``, ``site`` and ``bias``; ``topic`` and ``date`` come from
        the listing row, the rest from the individual news item.
    """
    columns = ['url', 'topic', 'date', 'title', 'site', 'bias']
    rows = []

    urls, topics, dates = get_elements1(driver)
    total = len(urls)

    for lleva, (url, topic, date) in enumerate(zip(urls, topics, dates)):
        print(f'URL {lleva} de {total}')

        # `except Exception` (not a bare except) so Ctrl-C still stops the run.
        try:
            left, center, right = get_sides(driver, url)
            time.sleep(0.1)
        except Exception:
            print('Una noticia fallida')
            continue

        for side in (left, center, right):
            try:
                new_url, new_title, site, bias = get_elements2(side)
            except Exception:
                # Also reached when the side is None (not present on the page).
                print(f'Ha fallado en {side}')
                continue
            rows.append((new_url, topic, date, new_title, site, bias))

    return pd.DataFrame(rows, columns=columns)


def main(start_page=115, max_errors=20,
         partial_csv='/Users/pablogandia/Desktop/allsides_links_moment.csv',
         final_csv='/Users/pablogandia/Desktop/allsides_links_def.csv'):
    """Scrape consecutive headline-roundup pages until too many pages fail.

    For each page a fresh browser is started, the page is scraped with
    ``action`` and the accumulated results are written to *partial_csv*. The
    loop stops once *max_errors* pages have failed in total; the accumulated
    data is then written to *final_csv*.

    Args:
        start_page: first value used for the ``page`` query parameter.
        max_errors: cumulative number of failed pages that ends the run.
        partial_csv: path rewritten after every successfully scraped page.
        final_csv: path written once when the loop ends.
    """
    page = start_page
    errores = 0
    scrolls = 0

    df = pd.DataFrame()

    while errores < max_errors:

        link = f'https://www.allsides.com/headline-roundups?page={page}'
        page += 1

        driver = None
        try:
            driver, _wait = get_driver(link)
            print(f'\nINICIANDO SCROLL {scrolls}')
            print('-'*60)
            print()

            df2 = action(driver)
            df = pd.concat([df, df2], ignore_index=True)
            df.to_csv(partial_csv, index=False)
            scrolls += 1

        except Exception:
            # `Exception` rather than a bare except so Ctrl-C can end the run.
            print()
            print('-'*60)
            print('ERROR FUERTE DETECTADO')
            print('-'*60)

            errores += 1

        finally:
            # Close this page's browser on success and on failure alike;
            # nothing is left to quit after the loop.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as exc:
                    print(f'No se pudo cerrar el navegador: {exc}')
            time.sleep(1)

    df.to_csv(final_csv, index=False)

# Run the scraper when this cell is executed.
main()

In [1]:
# Load the final CSV (same path main() writes at the end) and count its rows.
df = pd.read_csv('/Users/pablogandia/Desktop/allsides_links_def.csv')
len(df)

22287

In [3]:
# Number of distinct values in the 'url' column.
df['url'].nunique()

22264

In [5]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,url,topic,date,title,site,bias
0,https://www.npr.org/2021/01/01/952336030/what-...,general-news,2021-01-01,"What Got Us Through 2020? For Many, It Was Hob...",NPR (Online News),leaning-left
1,https://www.washingtonpost.com/opinions/2020/1...,general-news,2021-01-01,"Goodbye, 2020. You were awful, but some lights...",Guest Writer - Right,leaning-right
2,https://www.cbsnews.com/news/new-years-eve-gat...,general-news,2020-12-31,New Year's Eve gatherings could accelerate COV...,CBS News (Online),leaning-left
3,https://www.ibtimes.com/nye-traditions-you-can...,general-news,2020-12-31,NYE Traditions That You Can Still Participate ...,International Business Times,center
4,https://nypost.com/2020/12/31/heres-how-countr...,general-news,2020-12-31,Here’s how countries around the world are ring...,New York Post (News),leaning-right
