# Web Scraping News Articles

### Working with Post-Courier Page

#### Scraping each page

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
# opening a browser window
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import re

# --- 1. User Settings (FILL THESE IN) ---
# Account page that hosts the login form.
LOGIN_URL = "https://www.postcourier.com.pg/my-account/"
# Placeholder credentials: fill these in locally and never commit real values.
YOUR_USERNAME = "username"
YOUR_PASSWORD = "password"

# Listing page where the pagination loop starts.
STARTING_STORIES_URL = "https://www.postcourier.com.pg/top-stories/"
MAX_PAGES_TO_SCRAPE = 3  # Set to 1 to only scrape the first page

# --- 2. Set up Selenium Driver ---
print("Connecting to browser...")
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)  # explicit waits give up after 10 seconds

# --- 3. Perform Login ---
try:
    print(f"Opening login page: {LOGIN_URL}")
    driver.get(LOGIN_URL)

    # The username field is waited for; the other two controls are looked up
    # immediately afterwards on the same page.
    username_field = wait.until(EC.visibility_of_element_located((By.ID, "username")))
    password_field = driver.find_element(By.ID, "password")
    login_button = driver.find_element(By.NAME, "login")

    print("Logging in...")
    username_field.send_keys(YOUR_USERNAME)
    password_field.send_keys(YOUR_PASSWORD)
    login_button.click()

    # The account content panel becoming visible is treated as proof that the
    # login went through.
    wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "woocommerce-MyAccount-content")))
    print("Login Successful!")

except Exception as e:
    # Nothing later in the run works without a session, so close the browser
    # and stop here.
    print(f"Login failed: {e}")
    driver.quit()
    # `exit()` is an interactive helper added by the `site` module and reports
    # success; SystemExit with a non-zero status signals the failure properly.
    raise SystemExit(1)

# --- 4. NEW: PAGINATION LOOP (Phase 1) ---
# Walks the listing pages and records title, URL and date for every article.
print("\n--- Phase 1: Collecting all article links ---")
driver.get(STARTING_STORIES_URL)

# This is our master list that will hold all articles from all pages
all_article_data = []
pages_scraped = 0

while pages_scraped < MAX_PAGES_TO_SCRAPE:
    pages_scraped += 1
    print(f"\nScraping page {pages_scraped}...")

    # Wait for the main content to load; keep the element so that we can later
    # detect when the browser has navigated away from this page.
    try:
        main_element = wait.until(EC.visibility_of_element_located((By.ID, "main")))
    except TimeoutException:
        print("  Error: Page timed out. Stopping.")
        break

    # Pass the current page source to Beautiful Soup
    soup = BeautifulSoup(driver.page_source, "html.parser")

    main_content = soup.find("main", id="main")
    articles = main_content.find_all("article") if main_content else []

    articles_found_on_page = 0
    for article in articles:
        title_tag = article.find("h2", class_="entry-title")
        date_tag = article.find("time", class_="entry-date published")

        # Entries missing a linked title or a publication date are skipped.
        if title_tag and title_tag.a and date_tag:
            all_article_data.append({
                "title": title_tag.a.text,
                "url": title_tag.a['href'],
                "date": date_tag.text,
                "full_text": ""  # Placeholder, filled in by Phase 2
            })
            articles_found_on_page += 1

    print(f"  Found {articles_found_on_page} articles on this page.")

    # The requested number of pages has been read: do not load another page
    # that would never be parsed.
    if pages_scraped >= MAX_PAGES_TO_SCRAPE:
        break

    # --- Find and click the 'Next' button ---
    try:
        # Find the 'Older posts' link
        next_button = driver.find_element(By.CSS_SELECTOR, "a.next.page-numbers")

        print("  Clicking 'Older posts' button...")
        driver.execute_script("arguments[0].scrollIntoView(true);", next_button)  # Scroll to button
        time.sleep(0.5)  # let the scroll settle before clicking
        next_button.click()

        # Wait until the old page's main element is detached from the DOM.
        # A fixed sleep could let the loop re-read the previous page if the
        # navigation is slow.
        try:
            wait.until(EC.staleness_of(main_element))
        except TimeoutException:
            print("  Warning: page did not appear to change after clicking; continuing anyway.")

    except NoSuchElementException:
        print("  No 'Older posts' button found. This is the last page.")
        break
    except Exception as e:
        print(f"  Error clicking next button: {e}")
        break

print(f"\n--- Phase 1 Complete: Collected {len(all_article_data)} total articles ---")


# --- 5. SCRAPING LOOP (Phase 2) ---
# Visits every collected URL and stores the article body in 'full_text'.
print("\n--- Phase 2: Scraping full text for each article ---")

try:
    for article in all_article_data:
        try:
            print(f"Fetching: {article['title']}")

            driver.get(article['url'])
            wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "entry-content")))

            article_soup = BeautifulSoup(driver.page_source, "html.parser")
            content_div = article_soup.find("div", class_="entry-content")

            if content_div:
                # The article body is the text of every <p> inside the content
                # div, one paragraph per line.
                paragraphs = content_div.find_all("p")
                article['full_text'] = "\n".join([p.text for p in paragraphs])
                print("  ...Success! Scraped p-tag content.")
            else:
                print("  ...Could not find 'entry-content' div.")

        except Exception as e:
            # One bad article must not abort the run; its 'full_text' keeps the
            # empty placeholder.
            print(f"  Error fetching {article['url']}: {e}")
finally:
    # --- 6. Clean up Selenium ---
    # Runs even if the loop is interrupted (e.g. Ctrl+C), so the browser
    # process is never left behind.
    driver.quit()
    print("\nSelenium driver closed.")

# --- 7. Create and Display the Table ---
print("\n--- Scraping Complete. Creating Table ---")

# One row per collected article dictionary.
df = pd.DataFrame(all_article_data)

# Show every row, and up to 100 characters per cell, when printing.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_colwidth', 100),
):
    pd.set_option(option_name, option_value)

print(df)

# --- 8. Save the table to a file ---
output_filename = "scraped_articles_all_pages.csv"
try:
    df.to_csv(output_filename, index=False, encoding='utf-8')
except Exception as e:
    print(f"\nError saving to CSV: {e}")
else:
    print(f"\nSuccessfully saved data to {output_filename}")

#### Scraping with CSV data

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

# --- 1. User Settings ---
import os

# Account page that hosts the login form.
LOGIN_URL = "https://www.postcourier.com.pg/my-account/"

# Credentials come from the environment so that real account details are
# never stored in the notebook or committed to version control.
# In your terminal:
#   export POSTCOURIER_USERNAME="you@example.com"
#   export POSTCOURIER_PASSWORD="your-password"
# NOTE(review): credentials were previously hard-coded here; treat them as
# leaked and rotate the password.
YOUR_USERNAME = os.environ.get("POSTCOURIER_USERNAME", "")
YOUR_PASSWORD = os.environ.get("POSTCOURIER_PASSWORD", "")
if not (YOUR_USERNAME and YOUR_PASSWORD):
    print("Warning: POSTCOURIER_USERNAME / POSTCOURIER_PASSWORD are not set; login will fail.")

# *** SET YOUR CSV FILENAME HERE ***
INPUT_CSV_FILE = "national_articles_pc.csv"  # e.g., "articles_to_scrape.csv"
OUTPUT_CSV_FILE = "articles_with_full_text.csv"

# --- 2. Load the CSV File ---
# Reads the list of article URLs to visit. Every failure path stops the run
# with a non-zero status; `exit()` is an interactive `site` helper that
# reports success and is not meant for programs.
print(f"Loading data from {INPUT_CSV_FILE}...")
try:
    # Read the CSV into a Pandas DataFrame
    df = pd.read_csv(INPUT_CSV_FILE)
except FileNotFoundError:
    print("Error: File not found.")
    print(f"Please make sure '{INPUT_CSV_FILE}' is in the same folder as this script.")
    raise SystemExit(1)
except Exception as e:
    print(f"Error reading CSV: {e}")
    raise SystemExit(1)

# Check for the required 'URL' column, which the scraping loop reads per row.
if 'URL' not in df.columns:
    print("Error: Your CSV must have a column named 'URL'.")
    print(f"Found columns: {df.columns.to_list()}")
    raise SystemExit(1)

print(f"Found {len(df)} URLs to scrape.")

# --- 3. Set up Selenium Driver ---
print("Connecting to browser...")
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)  # explicit waits give up after 10 seconds

# --- 4. Perform Login ---
try:
    print(f"Opening login page: {LOGIN_URL}")
    driver.get(LOGIN_URL)

    # The username field is waited for; the other two controls are looked up
    # immediately afterwards on the same page.
    username_field = wait.until(EC.visibility_of_element_located((By.ID, "username")))
    password_field = driver.find_element(By.ID, "password")
    login_button = driver.find_element(By.NAME, "login")

    print("Logging in...")
    username_field.send_keys(YOUR_USERNAME)
    password_field.send_keys(YOUR_PASSWORD)
    login_button.click()

    # The account content panel becoming visible is treated as proof that the
    # login went through.
    wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "woocommerce-MyAccount-content")))
    print("Login Successful!")

except Exception as e:
    # Nothing later in the run works without a session, so close the browser
    # and stop here.
    print(f"Login failed: {e}")
    driver.quit()
    # `exit()` is an interactive helper added by the `site` module and reports
    # success; SystemExit with a non-zero status signals the failure properly.
    raise SystemExit(1)

# --- 5. SCRAPING LOOP (Using the CSV) ---
# Visits each URL from the CSV and collects one text entry per row, so the
# result list always has the same length as the DataFrame.
print("\n--- Starting to Scrape Full Articles ---")

# This list will store the scraped text for each row
scraped_texts = []

try:
    # Loop through each row in the DataFrame
    for index, row in df.iterrows():
        url = row['URL']

        # Use 'Top.Stories' for the print message, or the URL if it's not there
        title = row.get('Top.Stories', url)

        try:
            print(f"Fetching ({index + 1}/{len(df)}): {title}")

            driver.get(url)
            # Wait for the main article content to load
            wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "entry-content")))

            article_soup = BeautifulSoup(driver.page_source, "html.parser")

            # Find the content div
            content_div = article_soup.find("div", class_="entry-content")

            if content_div:
                # Get text from <p> tags, one paragraph per line
                paragraphs = content_div.find_all("p")
                full_text = "\n".join([p.text for p in paragraphs])
                scraped_texts.append(full_text)
                print("  ...Success! Scraped p-tag content.")
            else:
                print("  ...Could not find 'entry-content' div.")
                scraped_texts.append("[Scrape Error: No content div found]")

        except Exception as e:
            # Record a placeholder so the row keeps its position in the list.
            print(f"  Error fetching {url}: {e}")
            scraped_texts.append(f"[Scrape Error: {e}]")
finally:
    # --- 6. Clean up Selenium ---
    # Runs even if the loop is interrupted (e.g. Ctrl+C), so the browser
    # process is never left behind.
    driver.quit()
    print("\nSelenium driver closed.")

# --- 7. Add Scraped Data to DataFrame ---
print("\n--- Scraping Complete. Adding new column to table ---")

# Attach the collected texts (one entry per row) as a new column.
df['full_text'] = scraped_texts

# Show every row, and up to 100 characters per cell, when printing.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_colwidth', 100),
):
    pd.set_option(option_name, option_value)

print(df)

# --- 8. Save the new table to a file ---
try:
    df.to_csv(OUTPUT_CSV_FILE, index=False, encoding='utf-8')
except Exception as e:
    print(f"\nError saving to CSV: {e}")
else:
    print(f"\nSuccessfully saved new data to {OUTPUT_CSV_FILE}")

### Working with TVWAN Page

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd

# Scrapes title, URL and publication date for each TVWAN news teaser.
# NOTE(review): this cell reuses a `driver` created elsewhere, and the earlier
# cells quit theirs. A fresh driver that is already on the TVWAN news page is
# needed before running, or re-enable the commented navigation lines below.
wait = WebDriverWait(driver, 10)

# Lists to store our data
titles = []
urls = []
article_dates = []
page_count = 0

try:
    #driver.get("https://tvwan.com.pg/")
    #driver.maximize_window()

    # Wait for the "News" link and click it
    #wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "menu-item-link"))).click()

    # Wait for the news articles to be present
    #wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "feature-news-content")))
    links = driver.find_elements(By.CLASS_NAME, "feature-news-content")

    print(f"Found {len(links)} links. Gathering info...")

    # --- LOOP 1: GATHER ALL TITLES AND URLS ---
    # Done before any navigation, because the elements stop being usable once
    # the browser leaves this page.
    for link in links:
        # Get Article Title
        titles.append(link.text)

        # Get URL
        link_tag = link.find_element(By.TAG_NAME, 'a')
        urls.append(link_tag.get_attribute('href'))

    # Count the pages only now that the links exist; computing this before
    # `links` was assigned raised NameError on a fresh kernel.
    total_page = len(urls)

    print("Finished gathering links. Now visiting each page...")

    # --- LOOP 2: VISIT EACH URL TO SCRAPE THE DATE ---
    for page_count, url in enumerate(urls, start=1):
        print(f"Visiting: {url} Page: {page_count} of {total_page}.")
        driver.get(url)

        try:
            # Wait for the date element to load on the article page
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".author-info li")))

            # Get page source *after* waiting
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Select the date element
            article_date_element = soup.select_one(".author-info li")

            if article_date_element:
                article_dates.append(article_date_element.text.strip())
            else:
                article_dates.append("Date not found")  # Append a placeholder

        except Exception as e:
            # Keep the three lists the same length even when a page fails.
            print(f"Could not get date from {url}. Error: {e}")
            article_dates.append("Error")  # Append error placeholder

    print("--- Scraping Complete ---")
    print("Titles:", titles)
    print("URLs:", urls)
    print("Dates:", article_dates)

finally:
    # --- QUIT ONCE AT THE VERY END ---
    print("All done. Closing browser.")
    driver.quit()

# Assemble the three parallel lists into one table, one row per article.
article_columns = {
    "Title": titles,
    "Date": article_dates,
    "URL": urls,
}
scraped_articles_df = pd.DataFrame(article_columns)

# See datatable
scraped_articles_df

# Export DataFrame to CSV (without the row index)
scraped_articles_df.to_csv('tvwan_local.csv', index=False)

## Analysis with GEMINI API

In [10]:
import google.generativeai as genai
import os
import pandas as pd
# The API key is read from the environment so it never has to be pasted into
# the notebook.
# In your terminal: export GEMINI_API_KEY="YOUR_API_KEY"
# The "apikey" placeholder is only a fallback for quick local testing; do not
# replace it with a real key in a file that gets shared or committed.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY", "apikey"))

# Select the model you want to use.
# 'gemini-2.5-pro' is selected here; 'gemini-2.5-flash' is the fast and
# cost-effective alternative.
model = genai.GenerativeModel('gemini-2.5-pro')


In [11]:
# Load CSV
INPUT_CSV_FILE = "articles_with_full_text.csv"

# Read the file into a DataFrame and keep only its first three rows.
df = pd.read_csv(INPUT_CSV_FILE).head(3)

In [12]:
#model = genai.GenerativeModel('gemini-2.5-pro')
# Instruction sent ahead of every article.
base_prompt = (
    "You are going to tag the article with the relevant category that it talks about. "
    "You response must not explain or give details. "
    "You must just return the word for the tag , followed it's sentiment. "
    "Therefore i only expect one word for the tag and one for the sentiment"
)

# Collects one model response object per article, in DataFrame order.
sentiment_list = []
count = 0
for count, article in enumerate(df['full_text'], start=1):
    # Combine your instructions with the article
    full_prompt = [base_prompt, "Here is the article:", article]
    response = model.generate_content(full_prompt)
    sentiment_list.append(response)
    print(f"Sentiment {count} appended")
print('done')

Sentiment 1 appended
Sentiment 2 appended
Sentiment 3 appended


NameError: name 'done' is not defined

In [14]:
# Print the text of each stored model response, one per line.
for sentiment_response in sentiment_list:
    print(sentiment_response.text)

Education Positive
Government Positive
Politics Neutral
