In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time

# For handling request errors
from requests.exceptions import RequestException

In [2]:
def get_headers():
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }

In [None]:
def scrape_techinasia(page_number=1):
    base_url = f'https://www.techinasia.com/news'
    articles = []

    try:
        # Add delay to be respectful to the website
        time.sleep(2)

        response = requests.get(base_url, headers=get_headers())
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all article elements (you'll need to inspect the website to get the correct CSS selectors)
        article_elements = soup.find_all('article', class_='jsx-1786728215')

        for article in article_elements:
            try:
                title = article.find('h2').text.strip()
                link = article.find('a')['href']
                date = article.find('time')['datetime']

                articles.append({
                    'title': title,
                    'link': f'https://www.techinasia.com{link}',
                    'date': date,
                    'scraped_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                })
            except (AttributeError, KeyError) as e:
                print(f"Error parsing article: {e}")
                continue

        return articles

    except RequestException as e:
        print(f"Error fetching page {page_number}: {e}")
        return []

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time

def get_headers():
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }

def scrape_techinasia(page_number=1):
    base_url = f'https://www.techinasia.com/news?page={page_number}'
    articles = []

    try:
        time.sleep(2)  # Respectful delay

        response = requests.get(base_url, headers=get_headers())
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all article elements with the correct class
        article_elements = soup.find_all('article', class_='jsx-1678928787 post-card')

        for article in article_elements:
            try:
                # Extract title
                title = article.find('h3', class_='post-title').text.strip()

                # Extract source
                source_element = article.find('span', class_='post-source-name')
                source = source_element.text if source_element else 'N/A'

                # Extract time
                time_element = article.find('time')
                posted_time = time_element['datetime'] if time_element else 'N/A'

                # Extract categories/tags
                tags = []
                tag_elements = article.find_all('a', class_='post-taxonomy-link')
                for tag in tag_elements:
                    tags.append(tag.text.strip('· '))

                # Extract article URL
                link = article.find('h3', class_='post-title').find_parent('a')['href']
                full_link = f'https://www.techinasia.com{link}'

                articles.append({
                    'title': title,
                    'source': source,
                    'posted_time': posted_time,
                    'tags': ', '.join(tags),
                    'url': full_link,
                    'scraped_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                })

            except (AttributeError, KeyError) as e:
                print(f"Error parsing article: {e}")
                continue

        return articles

    except requests.RequestException as e:
        print(f"Error fetching page {page_number}: {e}")
        return []

# Function to scrape multiple pages
def scrape_multiple_pages(start_page=1, end_page=5):
    all_articles = []

    for page in range(start_page, end_page + 1):
        print(f"Scraping page {page}...")
        articles = scrape_techinasia(page)
        all_articles.extend(articles)

    return pd.DataFrame(all_articles)

In [4]:
# Scrape first 3 pages
df = scrape_multiple_pages(1, 3)

# Save to CSV
df.to_csv('techinasia_news.csv', index=False)

# Display first few rows
print(df.head())

Scraping page 1...
Scraping page 2...
Scraping page 3...
Empty DataFrame
Columns: []
Index: []


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
import json

def get_headers():
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.techinasia.com/'
    }

def scrape_techinasia_ai_news(num_articles=20):
    # Initial URL for AI category
    url = 'https://www.techinasia.com/news?category=artificial-intelligence'
    articles = []

    try:
        response = requests.get(url, headers=get_headers())
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all article elements
        article_elements = soup.find_all('article', class_='jsx-1678928787 post-card')

        for article in article_elements:
            try:
                # Extract source information
                source_element = article.find('span', class_='post-source-name')
                source = source_element.text.strip() if source_element else 'N/A'

                # Extract source URL
                source_link = article.find('a', class_='post-source')['href'] if article.find('a', class_='post-source') else None

                # Extract title
                title_element = article.find('h3', class_='post-title')
                title = title_element.text.strip() if title_element else 'N/A'

                # Extract TechInAsia article URL
                article_url = None
                title_link = article.find('h3', class_='post-title').find_parent('a')
                if title_link:
                    article_url = f"https://www.techinasia.com{title_link['href']}"

                # Extract time
                time_element = article.find('time')
                posted_time = time_element['datetime'] if time_element else 'N/A'
                relative_time = time_element.text.strip() if time_element else 'N/A'

                # Extract categories and tags
                tags = []
                category_elements = article.find_all('a', class_='post-taxonomy-link')
                for tag in category_elements:
                    tags.append(tag.text.strip('· '))

                articles.append({
                    'title': title,
                    'source': source,
                    'source_url': source_link,
                    'article_url': article_url,
                    'posted_time': posted_time,
                    'relative_time': relative_time,
                    'tags': ', '.join(tags),
                    'scraped_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                })

            except Exception as e:
                print(f"Error parsing article: {e}")
                continue

            if len(articles) >= num_articles:
                break

    except requests.RequestException as e:
        print(f"Error fetching page: {e}")
        return []

    return pd.DataFrame(articles)

# Function to save the results
def save_results(df):
    # Save to CSV
    filename = f'techinasia_ai_news_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"Saved {len(df)} articles to {filename}")

    # Display first few articles
    print("\nFirst few articles:")
    print(df[['title', 'source', 'posted_time']].head())

    return filename

In [6]:
# Scrape the latest 20 AI news articles
df = scrape_techinasia_ai_news(20)

# Save and display results
filename = save_results(df)

Saved 0 articles to techinasia_ai_news_20250114_203500.csv

First few articles:


KeyError: "None of [Index(['title', 'source', 'posted_time'], dtype='object')] are in the [columns]"

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time

def get_headers():
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.techinasia.com/'
    }

def scrape_techinasia_ai_news(num_articles=20):
    url = 'https://www.techinasia.com/news?category=artificial-intelligence'
    articles = []

    try:
        print("Fetching URL:", url)
        response = requests.get(url, headers=get_headers())
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the main container
        main_container = soup.find('section', class_='jsx-2115179455 news-section')
        if not main_container:
            print("Could not find main news container")
            return pd.DataFrame()

        # Find all article elements
        article_elements = main_container.find_all('article', class_='jsx-1678928787 post-card')
        print(f"Found {len(article_elements)} articles")

        for i, article in enumerate(article_elements, 1):
            try:
                # Extract title and its link
                title_element = article.find('h3', class_='post-title')
                title = title_element.text.strip() if title_element else 'N/A'

                # Get the complete article URL
                article_link = article.find('h3', class_='post-title').find_parent('a')
                article_url = f"https://www.techinasia.com{article_link['href']}" if article_link else None

                # Get source information
                source_element = article.find('span', class_='post-source-name')
                source = source_element.text.strip() if source_element else 'N/A'

                source_link = article.find('a', class_='post-source')
                source_url = source_link['href'] if source_link else None

                # Get time information
                time_element = article.find('time', class_='post-time')
                posted_time = time_element['datetime'] if time_element else 'N/A'
                relative_time = time_element.text.strip() if time_element else 'N/A'

                # Get categories and tags
                tags = []
                tag_elements = article.find_all('a', class_='post-taxonomy-link')
                for tag in tag_elements:
                    tag_text = tag.text.strip('· ')
                    if tag_text:
                        tags.append(tag_text)

                article_data = {
                    'title': title,
                    'article_url': article_url,
                    'source': source,
                    'source_url': source_url,
                    'posted_time': posted_time,
                    'relative_time': relative_time,
                    'tags': ', '.join(tags) if tags else 'N/A',
                    'scraped_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }

                print(f"Processing article {i}: {title[:50]}...")
                articles.append(article_data)

            except Exception as e:
                print(f"Error parsing article {i}: {str(e)}")
                continue

            if len(articles) >= num_articles:
                break

    except requests.RequestException as e:
        print(f"Error fetching page: {e}")
        return pd.DataFrame()

    if not articles:
        print("No articles were collected")
        return pd.DataFrame()

    df = pd.DataFrame(articles)
    print(f"\nSuccessfully collected {len(df)} articles")
    return df

def save_results(df):
    if df.empty:
        print("No data to save")
        return None

    # Save to CSV
    filename = f'techinasia_ai_news_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"\nSaved {len(df)} articles to {filename}")

    # Display first few articles
    print("\nFirst few articles:")
    print(df[['title', 'source', 'posted_time']].head())

    return filename

# Test the scraper
if __name__ == "__main__":
    print("Starting scraper...")
    df = scrape_techinasia_ai_news(20)
    if not df.empty:
        save_results(df)

Starting scraper...
Fetching URL: https://www.techinasia.com/news?category=artificial-intelligence
Could not find main news container


In [8]:
df = scrape_techinasia_ai_news(20)
save_results(df)

Fetching URL: https://www.techinasia.com/news?category=artificial-intelligence
Could not find main news container
No data to save


In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time

def get_headers():
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.techinasia.com/'
    }

def scrape_techinasia_ai_news(num_articles=20):
    url = 'https://www.techinasia.com/news?category=artificial-intelligence'
    articles = []

    try:
        print("Fetching URL:", url)
        response = requests.get(url, headers=get_headers())
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Debug: Print the HTML structure
        print("\nSearching for news container...")

        # First find the post-list-wrapper
        post_list = soup.find('div', class_='jsx-4205048050 post-list-wrapper')
        if not post_list:
            print("Could not find post list wrapper")
            return pd.DataFrame()

        # Find all article elements
        article_elements = post_list.find_all('article', class_='jsx-1678928787')
        print(f"Found {len(article_elements)} articles")

        for i, article in enumerate(article_elements, 1):
            try:
                # Get post content div
                content_div = article.find('div', class_='jsx-1678928787 post-content')

                # Extract title and its link
                title_element = content_div.find('h3', class_='jsx-1678928787 post-title')
                title = title_element.text.strip() if title_element else 'N/A'

                # Get the complete article URL
                article_link = content_div.find('a', href=True)
                article_url = f"https://www.techinasia.com{article_link['href']}" if article_link and not article_link['href'].startswith('http') else article_link['href']

                # Get source information
                source_element = content_div.find('span', class_='jsx-1678928787 post-source-name')
                source = source_element.text.strip() if source_element else 'N/A'

                source_link = content_div.find('a', class_='jsx-1678928787 post-source')
                source_url = source_link['href'] if source_link else None

                # Get time information from post footer
                footer = article.find('div', class_='jsx-1678928787 post-footer')
                time_element = footer.find('time') if footer else None
                posted_time = time_element['datetime'] if time_element else 'N/A'
                relative_time = time_element.text.strip() if time_element else 'N/A'

                # Get categories and tags from footer
                tags = []
                if footer:
                    tag_elements = footer.find_all('a', class_='post-taxonomy-link')
                    for tag in tag_elements:
                        tag_text = tag.text.strip('· ')
                        if tag_text:
                            tags.append(tag_text)

                article_data = {
                    'title': title,
                    'article_url': article_url,
                    'source': source,
                    'source_url': source_url,
                    'posted_time': posted_time,
                    'relative_time': relative_time,
                    'tags': ', '.join(tags) if tags else 'N/A',
                    'scraped_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }

                print(f"Processing article {i}: {title[:50]}...")
                articles.append(article_data)

            except Exception as e:
                print(f"Error parsing article {i}: {str(e)}")
                continue

            if len(articles) >= num_articles:
                break

    except requests.RequestException as e:
        print(f"Error fetching page: {e}")
        return pd.DataFrame()

    if not articles:
        print("No articles were collected")
        return pd.DataFrame()

    df = pd.DataFrame(articles)
    print(f"\nSuccessfully collected {len(df)} articles")
    return df

def save_results(df):
    if df.empty:
        print("No data to save")
        return None

    # Save to CSV
    filename = f'techinasia_ai_news_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"\nSaved {len(df)} articles to {filename}")

    # Display first few articles
    print("\nFirst few articles:")
    print(df[['title', 'source', 'posted_time']].head())

    return filename

# Test the scraper
df = scrape_techinasia_ai_news(20)
save_results(df)

Fetching URL: https://www.techinasia.com/news?category=artificial-intelligence

Searching for news container...
Could not find post list wrapper
No data to save


In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time

def get_headers():
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.techinasia.com/'
    }

def scrape_techinasia_ai_news(num_articles=20):
    url = 'https://www.techinasia.com/news?category=artificial-intelligence'
    articles = []

    try:
        print("Fetching URL:", url)
        response = requests.get(url, headers=get_headers())
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        print("\nSearching for news container...")

        # Debug: Print initial structure
        main_section = soup.find('main')
        if not main_section:
            print("Could not find main section")
            return pd.DataFrame()

        # First find the infinite scroll container
        infinite_scroll = soup.find('div', class_='infinite-scroll')
        if not infinite_scroll:
            print("Could not find infinite scroll container")
            return pd.DataFrame()

        # Find post list wrapper (using partial class match)
        post_list = infinite_scroll.find('div', class_=lambda x: x and 'post-list-wrapper' in x)
        if not post_list:
            print("Could not find post list wrapper")
            return pd.DataFrame()

        # Find all article elements (using partial class match)
        article_elements = post_list.find_all('article', class_=lambda x: x and 'post-card' in x)
        print(f"Found {len(article_elements)} articles")

        for i, article in enumerate(article_elements, 1):
            try:
                # Get post content div
                content_div = article.find('div', class_=lambda x: x and 'post-content' in x)
                if not content_div:
                    print(f"No content div found for article {i}")
                    continue

                # Extract title and its link
                title_element = content_div.find('h3', class_=lambda x: x and 'post-title' in x)
                title = title_element.text.strip() if title_element else 'N/A'

                # Get article URL
                article_link = content_div.find_all('a')[-1] if content_div.find_all('a') else None
                article_url = f"https://www.techinasia.com{article_link['href']}" if article_link and not article_link['href'].startswith('http') else article_link['href'] if article_link else None

                # Get source information
                source_element = content_div.find('span', class_=lambda x: x and 'post-source-name' in x)
                source = source_element.text.strip() if source_element else 'N/A'

                source_link = content_div.find('a', class_=lambda x: x and 'post-source' in x)
                source_url = source_link['href'] if source_link else None

                # Get time information from post footer
                footer = article.find('div', class_=lambda x: x and 'post-footer' in x)
                time_element = footer.find('time') if footer else None
                posted_time = time_element['datetime'] if time_element else 'N/A'
                relative_time = time_element.text.strip() if time_element else 'N/A'

                # Get categories and tags
                tags = []
                if footer:
                    tag_elements = footer.find_all('a', class_='post-taxonomy-link')
                    for tag in tag_elements:
                        tag_text = tag.text.strip('· ')
                        if tag_text:
                            tags.append(tag_text)

                article_data = {
                    'title': title,
                    'article_url': article_url,
                    'source': source,
                    'source_url': source_url,
                    'posted_time': posted_time,
                    'relative_time': relative_time,
                    'tags': ', '.join(tags) if tags else 'N/A',
                    'scraped_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }

                print(f"Processing article {i}: {title[:50]}...")
                articles.append(article_data)

            except Exception as e:
                print(f"Error parsing article {i}: {str(e)}")
                continue

            if len(articles) >= num_articles:
                break

    except requests.RequestException as e:
        print(f"Error fetching page: {e}")
        return pd.DataFrame()

    if not articles:
        print("No articles were collected")
        return pd.DataFrame()

    df = pd.DataFrame(articles)
    print(f"\nSuccessfully collected {len(df)} articles")
    return df

def save_results(df):
    if df.empty:
        print("No data to save")
        return None

    # Save to CSV
    filename = f'techinasia_ai_news_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"\nSaved {len(df)} articles to {filename}")

    # Display first few articles
    print("\nFirst few articles:")
    print(df[['title', 'source', 'posted_time']].head())

    return filename

# Debug: Print the actual HTML
df = scrape_techinasia_ai_news(20)
save_results(df)

Fetching URL: https://www.techinasia.com/news?category=artificial-intelligence

Searching for news container...
Could not find main section
No data to save


In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time

def get_headers():
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.techinasia.com/'
    }

def scrape_techinasia_ai_news(num_articles=20):
    url = 'https://www.techinasia.com/news?category=artificial-intelligence'
    articles = []

    try:
        print("Fetching URL:", url)
        response = requests.get(url, headers=get_headers())
        response.raise_for_status()

        # Debug: Print the first part of the response
        print("\nFirst 1000 characters of response:")
        print(response.text[:1000])

        soup = BeautifulSoup(response.text, 'html.parser')

        # Debug: Print all main tags found
        print("\nAll main tags found:")
        main_tags = soup.find_all('main')
        for i, main in enumerate(main_tags):
            print(f"Main tag {i+1} classes:", main.get('class', 'No class'))

        # Debug: Print all div tags with 'infinite-scroll' in their class
        print("\nAll infinite-scroll divs found:")
        infinite_divs = soup.find_all('div', class_=lambda x: x and 'infinite-scroll' in x)
        for i, div in enumerate(infinite_divs):
            print(f"Infinite scroll div {i+1} classes:", div.get('class', 'No class'))

        # Debug: Print all article tags
        print("\nAll article tags found:")
        articles_found = soup.find_all('article')
        print(f"Found {len(articles_found)} article tags")

        if len(articles_found) == 0:
            print("\nLooks like the page content is loaded via JavaScript. We need to use Selenium.")
            return pd.DataFrame()

        # Rest of the code...

    except requests.RequestException as e:
        print(f"Error fetching page: {e}")
        return pd.DataFrame()

    if not articles:
        print("No articles were collected")
        return pd.DataFrame()

    df = pd.DataFrame(articles)
    print(f"\nSuccessfully collected {len(df)} articles")
    return df

def save_results(df):
    if df.empty:
        print("No data to save")
        return None

    filename = f'techinasia_ai_news_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"\nSaved {len(df)} articles to {filename}")

    print("\nFirst few articles:")
    print(df[['title', 'source', 'posted_time']].head())

    return filename

# Run the scraper with debug output
df = scrape_techinasia_ai_news(20)
save_results(df)

Fetching URL: https://www.techinasia.com/news?category=artificial-intelligence

First 1000 characters of response:
<!DOCTYPE html><html lang="en-US" itemscope itemtype="http://schema.org/WebSite" prefix="og: http://ogp.me/ns#"><head><style>#zmmtg-root {
        display: none;
      }</style><title>Tech in Asia - Connecting Asia's startup ecosystem</title><meta charset="utf-8"><meta http-equiv="x-dns-prefetch-control" content="on"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width,initial-scale=1,viewport-fit=cover"><meta name="msapplication-config" content="https://static.techinasia.com/assets/browserconfig.xml"><meta name="theme-color" content="#ffffff"><meta name="twitter:widgets:csp" content="on"><meta name="fragment" content="!"><meta property="fb:app_id" content="206930646126140"><meta property="fb:pages" content="175755689129519"><meta property="og:locale" content="en_US"><link rel="dns-prefetch" href="//static.techinasia.com"><

In [13]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def scrape_techinasia_ai_news(num_articles=20):
    articles = []

    # Setup Chrome options
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in headless mode
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    # Add user agent
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

    try:
        print("Starting Chrome webdriver...")
        # Use webdriver_manager to handle driver installation
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        url = 'https://www.techinasia.com/news?category=artificial-intelligence'
        print(f"Fetching URL: {url}")

        driver.get(url)
        print("Waiting for content to load...")

        # Wait for articles to load (maximum 20 seconds)
        wait = WebDriverWait(driver, 20)
        article_present = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'post-card'))
        )

        # Give extra time for all articles to load
        time.sleep(3)

        print("Page loaded, parsing content...")

        # Parse with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        article_elements = soup.find_all('article', class_='post-card')
        print(f"Found {len(article_elements)} articles")

        for i, article in enumerate(article_elements, 1):
            try:
                # Extract article information
                content_div = article.find('div', class_='post-content')

                # Get title
                title_element = content_div.find('h3', class_='post-title')
                title = title_element.text.strip() if title_element else 'N/A'

                # Get source info
                source_element = content_div.find('span', class_='post-source-name')
                source = source_element.text.strip() if source_element else 'N/A'

                source_link = content_div.find('a', class_='post-source')
                source_url = source_link['href'] if source_link else None

                # Get article URL
                article_links = [a for a in content_div.find_all('a') if not 'post-source' in a.get('class', [])]
                article_url = None
                if article_links:
                    href = article_links[0]['href']
                    article_url = f"https://www.techinasia.com{href}" if not href.startswith('http') else href

                # Get time and tags from footer
                footer = article.find('div', class_='post-footer')
                time_element = footer.find('time') if footer else None
                posted_time = time_element['datetime'] if time_element else 'N/A'
                relative_time = time_element.text.strip() if time_element else 'N/A'

                # Get tags
                tags = []
                if footer:
                    tag_elements = footer.find_all('a', class_='post-taxonomy-link')
                    tags = [tag.text.strip('· ') for tag in tag_elements if tag.text.strip('· ')]

                article_data = {
                    'title': title,
                    'article_url': article_url,
                    'source': source,
                    'source_url': source_url,
                    'posted_time': posted_time,
                    'relative_time': relative_time,
                    'tags': ', '.join(tags) if tags else 'N/A',
                    'scraped_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }

                print(f"Processing article {i}: {title[:50]}...")
                articles.append(article_data)

                if len(articles) >= num_articles:
                    break

            except Exception as e:
                print(f"Error parsing article {i}: {str(e)}")
                continue

    except Exception as e:
        print(f"Error during scraping: {str(e)}")
    finally:
        print("Closing browser...")
        driver.quit()

    if not articles:
        print("No articles were collected")
        return pd.DataFrame()

    df = pd.DataFrame(articles)
    print(f"\nSuccessfully collected {len(df)} articles")
    return df

def save_results(df):
    if df.empty:
        print("No data to save")
        return None

    filename = f'techinasia_ai_news_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"\nSaved {len(df)} articles to {filename}")

    print("\nFirst few articles:")
    print(df[['title', 'source', 'posted_time']].head())

    return filename

# Run the scraper
print("Starting scraper...")
df = scrape_techinasia_ai_news(20)
save_results(df)

Starting scraper...
Starting Chrome webdriver...
Fetching URL: https://www.techinasia.com/news?category=artificial-intelligence
Waiting for content to load...
Page loaded, parsing content...
Found 15 articles
Processing article 1: AWS, General Catalyst advance AI tools to support ...
Processing article 2: Trump likely to keep US AI chip export restriction...
Processing article 3: OpenAI urges action to maintain US AI leadership...
Processing article 4: TSMC Q4 profit set to surge 58% on strong AI chip ...
Processing article 5: PUBG maker Krafton to invest $136m in game studios...
Processing article 6: Microsoft forms new engineering group to boost AI ...
Processing article 7: US tighten AI chip exports, targets China...
Processing article 8: Indonesian AI character generator nets $5m seed...
Processing article 9: Google’s AI agent enhances voice assist in Mercede...
Processing article 10: Adobe launches new AI tool for bulk image editing...
Processing article 11: Chinese AI startup Dee

'techinasia_ai_news_20250114_204009.csv'

enhanced version


In [14]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys

def scroll_page(driver, pause_time=1.5):
    """Scroll the page and return True if new content was loaded"""
    last_height = driver.execute_script("return document.body.scrollHeight")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause_time)  # Wait for content to load
    new_height = driver.execute_script("return document.body.scrollHeight")
    return new_height > last_height

def scrape_techinasia_ai_news(num_articles=50, max_scrolls=10):
    articles = []

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

    try:
        print("Starting Chrome webdriver...")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        url = 'https://www.techinasia.com/news?category=artificial-intelligence'
        print(f"Fetching URL: {url}")

        driver.get(url)
        print("Waiting for initial content to load...")

        wait = WebDriverWait(driver, 20)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'post-card')))

        # Scroll to load more articles
        scroll_count = 0
        while len(articles) < num_articles and scroll_count < max_scrolls:
            if scroll_page(driver):
                scroll_count += 1
                print(f"Scrolled page {scroll_count} times, loading more articles...")
            else:
                print("No more new content loaded")
                break

        print("Parsing content...")
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        article_elements = soup.find_all('article', class_='post-card')
        print(f"Found {len(article_elements)} articles")

        for i, article in enumerate(article_elements, 1):
            try:
                content_div = article.find('div', class_='post-content')

                # Get title and its link
                title_element = content_div.find('h3', class_='post-title')
                title = title_element.text.strip() if title_element else 'N/A'

                # Get article URL and extract article ID
                article_links = [a for a in content_div.find_all('a') if not 'post-source' in a.get('class', [])]
                article_url = None
                article_id = None
                if article_links:
                    href = article_links[0]['href']
                    article_url = f"https://www.techinasia.com{href}" if not href.startswith('http') else href
                    # Extract article ID from URL
                    article_id = href.split('/')[-1] if href else None

                # Get source information
                source_element = content_div.find('span', class_='post-source-name')
                source = source_element.text.strip() if source_element else 'N/A'

                source_link = content_div.find('a', class_='post-source')
                source_url = source_link['href'] if source_link else None

                # Get image information
                image_div = article.find('div', class_='post-image')
                image_url = None
                if image_div:
                    img_tag = image_div.find('img')
                    if img_tag:
                        image_url = img_tag.get('src')

                # Get time and tags
                footer = article.find('div', class_='post-footer')
                time_element = footer.find('time') if footer else None
                posted_time = time_element['datetime'] if time_element else 'N/A'
                relative_time = time_element.text.strip() if time_element else 'N/A'

                # Get categories and tags
                categories = []
                tags = []
                if footer:
                    tag_elements = footer.find_all('a', class_='post-taxonomy-link')
                    for tag in tag_elements:
                        tag_text = tag.text.strip('· ')
                        if tag_text:
                            if tag.get('href', '').startswith('/category/'):
                                categories.append(tag_text)
                            elif tag.get('href', '').startswith('/tag/'):
                                tags.append(tag_text)

                article_data = {
                    'article_id': article_id,
                    'title': title,
                    'article_url': article_url,
                    'source': source,
                    'source_url': source_url,
                    'image_url': image_url,
                    'posted_time': posted_time,
                    'relative_time': relative_time,
                    'categories': ', '.join(categories) if categories else 'N/A',
                    'tags': ', '.join(tags) if tags else 'N/A',
                    'scraped_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }

                print(f"Processing article {i}: {title[:50]}...")
                articles.append(article_data)

                if len(articles) >= num_articles:
                    break

            except Exception as e:
                print(f"Error parsing article {i}: {str(e)}")
                continue

    except Exception as e:
        print(f"Error during scraping: {str(e)}")
    finally:
        print("Closing browser...")
        driver.quit()

    if not articles:
        print("No articles were collected")
        return pd.DataFrame()

    df = pd.DataFrame(articles)
    print(f"\nSuccessfully collected {len(df)} articles")
    return df

def save_results(df):
    if df.empty:
        print("No data to save")
        return None

    filename = f'techinasia_ai_news_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"\nSaved {len(df)} articles to {filename}")

    print("\nSample of collected data:")
    display_columns = ['title', 'source', 'posted_time', 'categories', 'tags']
    print(df[display_columns].head())

    return filename

# Run the scraper with increased article count
print("Starting enhanced scraper...")
df = scrape_techinasia_ai_news(num_articles=50, max_scrolls=10)
save_results(df)

Starting enhanced scraper...
Starting Chrome webdriver...
Fetching URL: https://www.techinasia.com/news?category=artificial-intelligence
Waiting for initial content to load...
Scrolled page 1 times, loading more articles...
Scrolled page 2 times, loading more articles...
Scrolled page 3 times, loading more articles...
Scrolled page 4 times, loading more articles...
Scrolled page 5 times, loading more articles...
Scrolled page 6 times, loading more articles...
Scrolled page 7 times, loading more articles...
Scrolled page 8 times, loading more articles...
Scrolled page 9 times, loading more articles...
Scrolled page 10 times, loading more articles...
Parsing content...
Found 165 articles
Processing article 1: AWS, General Catalyst advance AI tools to support ...
Processing article 2: Trump likely to keep US AI chip export restriction...
Processing article 3: OpenAI urges action to maintain US AI leadership...
Processing article 4: TSMC Q4 profit set to surge 58% on strong AI chip ...
Pro

'techinasia_ai_news_20250114_204315.csv'

In [None]:
# For fewer articles:
df = scrape_techinasia_ai_news(num_articles=20, max_scrolls=5)

# For more articles:
df = scrape_techinasia_ai_news(num_articles=100, max_scrolls=20)

In [15]:
df = scrape_techinasia_ai_news(num_articles=20, max_scrolls=5)

Starting Chrome webdriver...
Fetching URL: https://www.techinasia.com/news?category=artificial-intelligence
Waiting for initial content to load...
Scrolled page 1 times, loading more articles...
Scrolled page 2 times, loading more articles...
Scrolled page 3 times, loading more articles...
Scrolled page 4 times, loading more articles...
Scrolled page 5 times, loading more articles...
Parsing content...
Found 90 articles
Processing article 1: AWS, General Catalyst advance AI tools to support ...
Processing article 2: Trump likely to keep US AI chip export restriction...
Processing article 3: OpenAI urges action to maintain US AI leadership...
Processing article 4: TSMC Q4 profit set to surge 58% on strong AI chip ...
Processing article 5: PUBG maker Krafton to invest $136m in game studios...
Processing article 6: Microsoft forms new engineering group to boost AI ...
Processing article 7: US tighten AI chip exports, targets China...
Processing article 8: Indonesian AI character generator

v

In [None]:
## v1