In [1]:
import csv
import os
import time

import numpy as np
import pandas as pd
import praw  # Python Reddit API Wrapper
import requests
from bs4 import BeautifulSoup

# Data crawling for books using requests & BeautifulSoup

In [2]:
# Root URL of the site being scraped; page URLs are built by appending to it.
BASE_URL = "https://books.toscrape.com/"
# Address of the first page of the paginated catalogue.
START_URL = f"{BASE_URL}catalogue/page-1.html"

def get_soup(url, timeout=10):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A ``BeautifulSoup`` document built with ``html.parser``, or ``None``
        when the request fails or the status code is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures (DNS, refused connection, timeout) are reported
        # like a bad status code so callers only need the ``None`` check.
        print(f"Failed to retrieve: {url} ({exc})")
        return None
    if response.status_code != 200:
        print(f"Failed to retrieve: {url}")
        return None
    return BeautifulSoup(response.content, 'html.parser')

def extract_book_info(book):
    """Pull the title, price and availability out of one book element.

    Args:
        book: Parsed element for a single book; it must expose ``h3.a['title']``
            and ``find('p', class_=...)`` for the price and availability
            paragraphs.

    Returns:
        A dict with the keys ``title``, ``price`` and ``availability``; the
        price and availability texts have surrounding whitespace stripped.
    """
    def _paragraph_text(css_class):
        # Text of the <p> carrying the given class string, trimmed.
        return book.find('p', class_=css_class).text.strip()

    return {
        'title': book.h3.a['title'],
        'price': _paragraph_text('price_color'),
        'availability': _paragraph_text('instock availability'),
    }

def scrape_books(first_pages=5):
    """Collect book records from the first ``first_pages`` catalogue pages.

    Pages are requested in order starting at page 1. Crawling stops early when
    ``get_soup`` returns nothing for a page or the page contains no
    ``product_pod`` articles. A one-second pause follows every scraped page.

    Args:
        first_pages: Highest catalogue page number to request.

    Returns:
        A list with one dict per book, as produced by ``extract_book_info``.
    """
    collected = []
    current_page = 1
    while current_page <= first_pages:
        soup = get_soup(f"{BASE_URL}catalogue/page-{current_page}.html")
        if not soup:
            break

        pods = soup.find_all('article', class_='product_pod')
        if not pods:
            break  # nothing listed on this page, so stop crawling

        collected.extend(extract_book_info(pod) for pod in pods)

        print(f"Scraped page {current_page}")
        current_page += 1
        time.sleep(1)  # pause between requests to go easy on the server

    return collected

# Crawl five catalogue pages, preview the first five records, then report the total
all_books = scrape_books(first_pages=5)

for record in all_books[:5]:
    print(record)

total_scraped = len(all_books)
print(f"\nTotal books scraped: {total_scraped}")


Scraped page 1
Scraped page 2
Scraped page 3
Scraped page 4
Scraped page 5
{'title': 'A Light in the Attic', 'price': '£51.77', 'availability': 'In stock'}
{'title': 'Tipping the Velvet', 'price': '£53.74', 'availability': 'In stock'}
{'title': 'Soumission', 'price': '£50.10', 'availability': 'In stock'}
{'title': 'Sharp Objects', 'price': '£47.82', 'availability': 'In stock'}
{'title': 'Sapiens: A Brief History of Humankind', 'price': '£54.23', 'availability': 'In stock'}

Total books scraped: 100


In [3]:
# Build the table with an explicit column list so the schema (and later the CSV
# header) is still present when the scrape returned no records.
df = pd.DataFrame(all_books, columns=['title', 'price', 'availability'])
df

Unnamed: 0,title,price,availability
0,A Light in the Attic,£51.77,In stock
1,Tipping the Velvet,£53.74,In stock
2,Soumission,£50.10,In stock
3,Sharp Objects,£47.82,In stock
4,Sapiens: A Brief History of Humankind,£54.23,In stock
...,...,...,...
95,Lumberjanes Vol. 3: A Terrible Plan (Lumberjan...,£19.92,In stock
96,"Layered: Baking, Building, and Styling Spectac...",£40.11,In stock
97,Judo: Seven Steps to Black Belt (an Introducto...,£53.90,In stock
98,Join,£35.67,In stock


In [4]:
# Write the book table to disk as CSV; index=False leaves out the row-index column
df.to_csv(path_or_buf='scraped_books_info.csv', index=False)

# Data crawling for Reddit posts using API & praw

First, create a Reddit app to obtain the API credentials (client ID and secret): https://www.reddit.com/prefs/apps

In [5]:

# Authenticate
# Credentials are read from the environment so they are never stored in the
# notebook. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before running this
# cell; a missing variable raises KeyError here.
# NOTE(review): the client id/secret previously hardcoded in this cell have been
# exposed and should be revoked at https://www.reddit.com/prefs/apps.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    # The user agent is not a secret; it can be overridden but keeps its old default.
    user_agent=os.environ.get("REDDIT_USER_AGENT", "SpamDetectionBot/0.1 by hflsjl8swq"),
)

# Test read-only mode
print(reddit.read_only)  # Should print: True

True


In [6]:
# Fetch the single newest submission from r/technology and list its attribute
# names to see which fields are available on a post object.
subreddit = reddit.subreddit("technology")
for post in subreddit.new(limit=1):
    attribute_names = dir(post)
    print(attribute_names)

['STR_FIELD', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_additional_fetch_params', '_chunk', '_comments_by_id', '_edit_experimental', '_fetch', '_fetch_data', '_fetch_info', '_fetched', '_kind', '_reddit', '_replace_richtext_links', '_reset_attributes', '_safely_add_arguments', '_url_parts', '_vote', 'add_fetch_param', 'all_awardings', 'allow_live_comments', 'approved_at_utc', 'approved_by', 'archived', 'author', 'author_flair_background_color', 'author_flair_css_class', 'author_flair_richtext', 'author_flair_template_id', 'author_flair_text', 'author_flair_text_color', 'author_flair_type', 'author_fullname', 'author_is_blocked', 'author_patreon_flair', 'aut

In [7]:
# Fetch the 500 newest posts from r/technology once, preview the first 5,
# and save all of them to CSV. Fetching a single time keeps the preview and
# the file consistent and avoids a second round of API requests.
subreddit = reddit.subreddit("technology")

# Column order of the CSV; the same keys are used for each row dict below.
post_fields = ["title", "author", "score", "num_comments", "url", "is_original_content"]

post_rows = [
    {
        "title": post.title,
        "author": str(post.author),  # store the author as plain text
        "score": post.score,
        "num_comments": post.num_comments,
        "url": post.url,
        "is_original_content": post.is_original_content,
    }
    for post in subreddit.new(limit=500)
]

# Preview the first 5 records
for row in post_rows[:5]:
    print(row)

# save to CSV
with open("reddit_new_posts.csv", "w", newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=post_fields)
    writer.writeheader()
    writer.writerows(post_rows)


{'title': 'Autonomous AI systems can help tackle global food insecurity', 'author': 'upyoars', 'score': 0, 'num_comments': 0, 'url': 'https://phys.org/news/2025-06-autonomous-ai-tackle-global-food.html', 'is_original_content': False}
{'title': 'Deadline for Getting Payment on Azure Power $23M Settlement Is in a Few Weeks', 'author': '11thestate', 'score': 2, 'num_comments': 0, 'url': 'https://11th.com/cases/azure-investor-settlement', 'is_original_content': False}
{'title': '‘Fortnite’ Players to Receive More Than $126 Million in Refunds From FTC', 'author': 'a_Ninja_b0y', 'score': 10, 'num_comments': 3, 'url': 'https://variety.com/2025/gaming/news/how-to-get-fortnite-ftc-refund-1236441827/', 'is_original_content': False}
{'title': 'How Cops Can Get Your Private Online Data', 'author': 'SaveDnet-FRed0', 'score': 1, 'num_comments': 4, 'url': 'https://www.eff.org/deeplinks/2025/06/how-cops-can-get-your-private-online-data', 'is_original_content': False}
{'title': 'Xiaomi YU7 over 200,000

In [8]:
# Fetch the 500 newest comments from `subreddit` once, preview the first 5,
# and save all of them to CSV.
# The CSV rows must come from subreddit.comments(...) and use comment.body:
# iterating subreddit.new(...) yields submissions, which would put post titles
# under the "comment_body" header.

# Column order of the CSV; the same keys are used for each row dict below.
comment_fields = ["comment_body", "author", "score", "permalink"]

comment_rows = [
    {
        "comment_body": comment.body,
        "author": str(comment.author),  # store the author as plain text
        "score": comment.score,
        "permalink": comment.permalink,
    }
    for comment in subreddit.comments(limit=500)
]

# Preview the first 5 records
for row in comment_rows[:5]:
    print(row)

# Save comments to CSV
with open("reddit_new_comments.csv", "w", newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=comment_fields)
    writer.writeheader()
    writer.writerows(comment_rows)

{'comment_body': 'still a BSOD, black screen of death........ok', 'author': 'sixbone', 'score': 1, 'permalink': '/r/technology/comments/1ll4fin/windows_is_getting_rid_of_the_blue_screen_of/mzxf9ck/'}
{'comment_body': "That's simultaneously the most and least believable thing in the world", 'author': 'Accurate_Koala_4698', 'score': 1, 'permalink': '/r/technology/comments/1ll6a2j/salesforce_ceo_claims_half_of_the_companys_work/mzxf90r/'}
{'comment_body': "if he will become a hobo due to it at some point, the society is going to suffer. also right now he isn't as productive as he could be. the society is already suffering. it's selfish", 'author': 'polacy_do_pracy', 'score': 1, 'permalink': '/r/technology/comments/1lka83a/bernie_sanders_says_that_if_ai_makes_us_so/mzxf8z4/'}
{'comment_body': 'This country is run by morons, and sleezebags.', 'author': 'FoggyGanj', 'score': 1, 'permalink': '/r/technology/comments/1ll51e9/critical_hurricane_forecast_tool_abruptly/mzxf8qb/'}
{'comment_body': 