In [1]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta
import re
import os

In [2]:
# Function to get the HTML content of a page
def get_html(url, timeout=10):
    """Download a page and return its raw body.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The response body as bytes when the server answers with HTTP 200.
        None when the request fails or any other status code is returned
        (a diagnostic message is printed in both cases).
    """
    # The session carries the 'over18' cookie so the age verification is
    # bypassed; the context manager releases the connection afterwards.
    with requests.Session() as session:
        session.cookies.set('over18', '1')  # Bypass age verification

        try:
            response = session.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Network-level failures are reported the same way as bad status
            # codes so callers only have to deal with one failure signal.
            print(f'Failed to retrieve the page. Error: {exc}')
            return None

        if response.status_code == 200:
            return response.content

        print(f'Failed to retrieve the page. Status code: {response.status_code}')
        return None

# Function to parse the given board page and extract all post titles, dates, and links
def parse_board_page(html):
    """Parse a board index page into a list of post records.

    Args:
        html: Markup of the board page, e.g. the value returned by
            ``get_html``.

    Returns:
        A ``(post_data, soup)`` tuple. ``post_data`` is a list of
        ``(title, link, date)`` tuples, one for every ``r-ent`` entry that has
        a title link and a date; ``soup`` is the parsed document, returned so
        the caller can look up the paging buttons.
    """
    soup = BeautifulSoup(html, 'html.parser')

    post_data = []
    # Every post on the board page lives in its own 'r-ent' div
    for post in soup.find_all('div', class_='r-ent'):
        title_div = post.find('div', class_='title')
        date_div = post.find('div', class_='date')

        # Entries without a title link (presumably deleted posts) or with
        # incomplete markup are skipped instead of aborting the whole page.
        title_tag = title_div.a if title_div is not None else None
        if title_tag is None or date_div is None:
            continue

        href = title_tag.get('href')
        if not href:
            continue

        title = title_tag.text.strip()
        link = 'https://www.ptt.cc' + href
        post_data.append((title, link, date_div.text.strip()))
    return post_data, soup

# Function to get the link to the previous page
def get_previous_page_link(soup):
    """Return the absolute URL of the previous board page.

    Args:
        soup: Parsed board page, as returned by ``parse_board_page``.

    Returns:
        The URL as a string, or None when the paging bar is missing, has
        fewer than two links, or its second link carries no ``href``.
    """
    btn_group = soup.find('div', class_='btn-group btn-group-paging')
    if btn_group is None:
        return None

    # The second anchor of the paging bar is taken as the "previous page"
    # button.
    paging_links = btn_group.find_all('a')
    if len(paging_links) < 2:
        return None

    prev_link = paging_links[1].get('href')
    if not prev_link:
        # No target to follow (presumably the oldest page has a disabled button)
        return None
    return 'https://www.ptt.cc' + prev_link

# Function to check if the post is within the last 7 days
def is_recent(date_str, current_year):
    """Tell whether a board date falls within the last 7 days.

    The board only shows month and day, so the year has to be guessed. A date
    that would lie in the future when combined with ``current_year`` is taken
    to belong to the previous year; otherwise, in early January, every date of
    the previous year would be treated as recent.

    Args:
        date_str: Date in ``month/day`` form, e.g. ``'12/30'``.
        current_year: Year to combine with ``date_str``.

    Returns:
        True when the resulting date is no older than 7 days, else False.

    Raises:
        ValueError: If ``date_str`` cannot be parsed as ``month/day`` for
            ``current_year``.
    """
    now = datetime.now()
    post_date = datetime.strptime(f"{current_year}/{date_str}", '%Y/%m/%d')

    # One day of slack so that a post dated "tomorrow" (possible when the local
    # clock is in an earlier timezone than the board's) is not pushed back a year.
    if post_date > now + timedelta(days=1):
        try:
            post_date = post_date.replace(year=post_date.year - 1)
        except ValueError:
            # Feb 29 has no counterpart in the previous year, so the post
            # must be older than that and cannot be recent.
            return False

    return post_date >= now - timedelta(days=7)

# Function to save the data to a CSV file
def save_to_csv(data, filename):
    """Write the title and URL of every post to a UTF-8 CSV file.

    Args:
        data: Iterable of 3-item records ``(title, url, date)``; the third
            item is not written.
        filename: Path of the CSV file to create (overwritten if present).
    """
    header = ['標題', 'URL']
    with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(header)

        # Collect all rows first, then hand them to the writer in one call
        rows = []
        for post_title, post_url, _unused_date in data:
            rows.append((post_title, post_url))
        out.writerows(rows)


# Function to read the CSV file and extract links
def read_csv_and_get_links(filename):
    """Read the post URLs back from a CSV file.

    The first row is treated as a header and skipped; the URL is taken from
    the second column of every following row.

    Args:
        filename: Path of a UTF-8 CSV file.

    Returns:
        List of URL strings. Rows with fewer than two columns are ignored,
        and an empty file yields an empty list.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        # Default of None keeps an empty file from raising StopIteration
        next(reader, None)
        return [row[1] for row in reader if len(row) > 1]

#----------------Functions for crawling each post page-------------------------#

# Cleans up post content
def clean_text(text):
    """Strip the footer from a post body and collapse blank lines.

    Everything from the first footer marker (a line consisting of ``--``, or
    the sequence ``--※``) onwards is dropped, runs of newlines are reduced to
    one, and surrounding whitespace is trimmed.

    Args:
        text: Raw post text.

    Returns:
        The cleaned text.
    """
    footer = re.search(r'\n--\n|--※', text)
    body = text if footer is None else text[:footer.start()]

    # Squeeze consecutive newlines, then trim both ends
    return re.sub(r'\n+', '\n', body).strip()

#Extracts post content (text only)
def get_post_content(main_content):
    """Return the cleaned body text of a post.

    Metadata spans and comment blocks are removed from ``main_content`` (the
    element is modified in place), then the remaining text is passed through
    ``clean_text``.

    Args:
        main_content: The parsed 'main-content' element of a post page.

    Returns:
        The post text as a string.
    """
    # (tag name, CSS class) of everything that is not post text, in removal
    # order: meta values, meta labels, then comment blocks.
    unwanted = (
        ('span', 'article-meta-value'),
        ('span', 'article-meta-tag'),
        ('div', 'push'),
    )
    for tag_name, css_class in unwanted:
        for node in main_content.find_all(tag_name, class_=css_class):
            node.decompose()

    # Whatever text is left inside the element is the post itself
    remaining_text = main_content.text.strip()
    return clean_text(remaining_text)

#----Extract specific content from each comment -------#

#Get comment content
def get_content(comment):
    """Extract the text of a comment.

    The text of the 'f3 push-content' span is taken and everything up to and
    including its first ``": "`` is dropped. Text of following sibling spans
    carrying the 'f3' class is appended until the next element whose class
    list is exactly ``['push']`` is reached.

    Args:
        comment: Parsed element of one comment block.

    Returns:
        The comment text, or an empty string when it cannot be extracted.
    """
    try:
        first_line = comment.find('span', class_='f3 push-content').text
        # maxsplit=1: only the leading ": " marker is removed, so a comment
        # that itself contains ": " is kept in full.
        content = first_line.split(": ", 1)[1]

        # Check subsequent siblings for additional comment lines
        sibling = comment.find_next_sibling()
        while sibling is not None and sibling.get('class') != ['push']:
            if sibling.name == 'span' and 'f3' in sibling.get('class', []):
                content += sibling.text.strip()
            sibling = sibling.find_next_sibling()
    except Exception:
        # Best effort: malformed markup yields an empty comment, while
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        content = ""

    return content

#Get comment author
def get_user(comment):
    """Extract the author id of a comment.

    Args:
        comment: Parsed element of one comment block.

    Returns:
        Text of the 'f3 hl push-userid' span, or an empty string when it
        cannot be read.
    """
    try:
        return comment.find('span', class_='f3 hl push-userid').text
    except Exception:
        # Best effort for malformed comments; unlike a bare ``except`` this
        # lets KeyboardInterrupt/SystemExit through.
        return ""

#Get comment date time
def get_datetime(comment):
        """Collect the IP/date/time text of a comment as a list of tokens.

        The 'push-ipdatetime' span inside ``comment`` is looked up. When the
        span has non-blank text, the stripped text of the name-less nodes that
        follow it is appended to that text. When the span's text is blank, the
        nodes following the span are walked instead and the last one visited
        replaces the value.

        Args:
            comment: Parsed element of one comment block.

        Returns:
            List of the non-empty, space-separated pieces of the collected
            text. ``get_comments`` reads index 1 as the date and index 2 as
            the time; an empty list is returned when nothing was collected.

        NOTE(review): presumably raises AttributeError when the comment has no
        'push-ipdatetime' span, because ``.text`` is read from the lookup
        result without a None check -- confirm.
        """
        # The span is looked up twice: once to keep the element for
        # traversal, once to read its stripped text.
        ip_date_time_sec = comment.find('span', class_='push-ipdatetime')
        ip_date_time = comment.find('span', class_='push-ipdatetime').text.strip()
        
        if ip_date_time:
            # Start two steps after the span (presumably skipping the span's
            # own text node -- TODO confirm) and append following nodes for as
            # long as they have no tag name.
            next_sibling = ip_date_time_sec.next_element.next_element
            while next_sibling and next_sibling.name is None:
                ip_date_time += next_sibling.strip()
                next_sibling = next_sibling.next_element
                
        # Blank span text: the date and time are expected outside the span
        # (on a new line)
        else:
            next_sibling = ip_date_time_sec.next_element

            # Walk over name-less nodes and spans whose class list is exactly
            # ['f3']; the value is overwritten on every step, so only the last
            # node visited is kept.
            # NOTE(review): if that last node is a span rather than a text
            # node, the ``.split`` below is called on a tag object -- confirm
            # this branch behaves as intended.
            while (next_sibling and ( (next_sibling.name is None) or (next_sibling.name == 'span' and next_sibling.get('class') == ['f3']))):
                ip_date_time = next_sibling
                next_sibling = next_sibling.next_element
        
        # Split on single spaces and drop the empty pieces produced by
        # consecutive spaces
        ip_date_time = [item for item in ip_date_time.split(" ") if item!=""]

        return ip_date_time

#-----------------------------------------#

# Function to extract comments (author, time, content)
def get_comments(main_content):
    """Extract every comment of a post.

    The post metadata lines and the first 'f2' span are removed from
    ``main_content`` (the element is modified in place) before the comment
    blocks are read.

    Args:
        main_content: The parsed 'main-content' element of a post page.

    Returns:
        List of ``(user, date_time, content)`` tuples, one per comment block.
        ``date_time`` is ``"<date> <time>"`` or an empty string when it could
        not be determined.
    """
    # Remove the post metadata (post author, title, etc.)
    for css_class in ('article-metaline', 'article-metaline-right'):
        for meta in main_content.find_all('div', class_=css_class):
            meta.extract()

    # Remove the IP address information
    ip_info = main_content.find('span', class_='f2')
    if ip_info is not None:
        ip_info.extract()

    comments_data = []
    for comment in main_content.find_all('div', class_='push'):
        # Only elements whose class list is exactly ['push'] are comments;
        # elements carrying additional classes are skipped.
        if comment.get("class") != ['push']:
            continue

        user = get_user(comment)
        content = get_content(comment)

        try:
            # Kept inside the guard so that one comment with unexpected
            # markup costs only its timestamp, not the whole post.
            ip_date_time = get_datetime(comment)
            date = ip_date_time[1]
            time_part = ip_date_time[2].split("\n")[0]
            date_time = " ".join([date, time_part])
        except Exception:
            print("No Date time")
            date_time = ""
        comments_data.append((user, date_time, content))

    return comments_data

# Function to get post data and all comment data
def get_post(url):
    """Download one post and split it into post fields and comments.

    Args:
        url: Absolute URL of the post page.

    Returns:
        A ``(post_data, comment_data)`` tuple. ``post_data`` is a dict with
        the four keys 作者 (author), 標題 (title), 時間 (time) and 內文 (body
        text). ``comment_data`` is a list with one ``(author, time, content)``
        tuple per comment.
    """
    soup = BeautifulSoup(get_html(url), 'html.parser')

    # The page must carry exactly four meta values: author, board category,
    # title and time. The category is read but not kept.
    meta_values = [
        span.text.strip()
        for span in soup.find_all("span", class_="article-meta-value")
    ]
    author, _category, title, posted_at = meta_values
    post_data = {"作者": author, "標題": title, "時間": posted_at}

    main_content = soup.find('div', id='main-content')

    # Comments are read first; extracting the post text afterwards removes
    # the comment blocks from main_content.
    comment_data = get_comments(main_content)
    post_data["內文"] = get_post_content(main_content)

    return post_data, comment_data


#Save post data to txt file
def save_to_txt(post_content, comment_content, index, output_dir='gossip_posts'):
    """Write one post and its comments to ``gossip_post_<index>.txt``.

    Args:
        post_content: Dict with the keys 作者, 標題, 時間 and 內文.
        comment_content: Iterable of ``(author, time, content)`` records.
        index: Number used in the output file name.
        output_dir: Directory the file is written to; created when missing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_dir, exist_ok=True)

    # Post header and body first, then the comment section
    parts = [
        f"作者: {post_content['作者']}\n"
        f"標題: {post_content['標題']}\n"
        f"時間: {post_content['時間']}\n\n"
        f"{post_content['內文']}\n",
        "\n\n留言:\n\n",
    ]
    parts.extend(
        f"作者: {comment[0]}\n時間: {comment[1]}\n內容: {comment[2]}\n\n"
        for comment in comment_content
    )

    # Joined once instead of growing a string inside the loop
    target = os.path.join(output_dir, f'gossip_post_{index}.txt')
    with open(target, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

    print(f"gossip_post_{index}.txt")

In [None]:
if __name__ == '__main__':

    # URL of the PTT Gossiping board (i.e where the newest gossiping posts are)
    base_url = 'https://www.ptt.cc/bbs/Gossiping/index.html'
    csv_filename = 'ptt_gossiping_recent_posts.csv'

    all_posts = []
    current_url = base_url

    # Board dates carry no year, so the current one is supplied to is_recent
    current_year = datetime.now().year

    # Walk backwards through the board pages until a page has no recent post,
    # a page cannot be fetched, or there is no previous page left.
    while current_url:
        html_content = get_html(current_url)
        if html_content is None:
            # get_html has already printed the reason; keep what was collected
            break

        # Parse the board page and extract post data
        post_data, soup = parse_board_page(html_content)

        # Filter posts from the last 7 days
        recent_posts = [post for post in post_data if is_recent(post[2], current_year)]

        # If no recent posts are found, there is no need to continue to the previous page
        if not recent_posts:
            break

        all_posts.extend(recent_posts)
        current_url = get_previous_page_link(soup)

    # Save the data to a CSV file
    save_to_csv(all_posts, csv_filename)

    print(f"Data saved to {csv_filename}")

    #--------------------------------------------------------------#

    # Get links of posts from last 7 days to crawl
    links = read_csv_and_get_links(csv_filename)

    problem_links = []
    link_index = 0

    # Crawl each post and save data to txt file
    for link in links:
        try:
            post, comment = get_post(link)
            print (f"Retrieved {link_index}")
            save_to_txt(post, comment, link_index)
        except Exception as exc:
            # A failing post is recorded and skipped; the reason is shown so
            # the entries in problem_links can be diagnosed afterwards.
            print(f"Problem with {link_index}")
            print(f"  {type(exc).__name__}: {exc}")
            problem_links.append(link)
        link_index += 1
        