In [24]:
import requests
from bs4 import BeautifulSoup

import json
from tqdm import tqdm

def scrape_text_from_url(url, timeout=10):
    """Download one listing page and extract the subreddit cards found in it.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled
        connection would block the whole scraping loop indefinitely.

    Returns
    -------
    list of dict or None
        One dict per card with the keys "Subreddit", "Community Category"
        and "Member Count". "Member Count" is the value of the ``number``
        attribute of the card's ``<faceplate-number>`` element, or the
        string "Unknown" when that element or attribute is absent.
        ``None`` is returned when the response status is not 200 or when
        any exception occurs; a message is printed in both cases.
    """
    try:
        # Send an HTTP request to the URL (bounded by `timeout`)
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout,
        )

        # Bail out early if the request was unsuccessful
        if response.status_code != 200:
            print("Error: Unable to retrieve data from URL.")
            return None

        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Each matching <div> is treated as one subreddit card
        elements = soup.find_all("div", class_="flex flex-col flex-grow items-start")

        subreddit_data = []
        for element in elements:
            subreddit_name = element.find("a", class_="m-0 font-bold text-12 text-current truncate max-w-[11rem]")
            community_type = element.find("h6", class_="flex-grow h-md text-12 truncate py-[0.125rem] w-[11rem] m-0")
            member_count_elem = element.find("h6", class_="text-12 text-neutral-content-weak m-0 truncate w-[11rem]")

            # Skip cards that lack any of the three expected parts
            if not (subreddit_name and community_type and member_count_elem):
                continue

            # .get() instead of ['number']: a missing attribute must not raise
            # and throw away every card already collected from this page.
            member_count_tag = member_count_elem.find("faceplate-number")
            if member_count_tag:
                member_count = member_count_tag.get('number', "Unknown")
            else:
                member_count = "Unknown"

            subreddit_data.append({
                "Subreddit": subreddit_name.text.strip(),
                "Community Category": community_type.text.strip(),
                "Member Count": member_count
            })

        return subreddit_data
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller skip this page
        print("An error occurred:", e)
        return None


In [25]:
def write_to_json(data, filename):
    """Serialize ``data`` to ``filename`` as JSON indented by four spaces.

    A confirmation line is printed on success. Any exception raised while
    opening or writing the file is caught and reported with ``print``
    instead of propagating, so the function returns ``None`` either way.
    """
    try:
        with open(filename, mode='w') as handle:
            json.dump(data, handle, indent=4)
        print(f"Data has been written to {filename} successfully.")
    except Exception as err:
        print(f"An error occurred while writing to {filename}: {err}")

In [26]:
# Listing URL prefix; scrape_all_tops appends the page number directly to it
base_url = "https://www.reddit.com/best/communities/"

# Number of pages to request; scrape_all_tops iterates pages 1 through num_pages inclusive
num_pages = 150

def scrape_all_tops(base_url, num_pages):
    """Scrape pages 1..num_pages and save each non-empty result to its own file.

    The page number is appended to ``base_url`` to form each URL. Pages for
    which ``scrape_text_from_url`` returns ``None`` or an empty list are
    skipped; every other page is written to ``top_<page>.json`` in the
    current working directory via ``write_to_json``.
    """
    page_numbers = range(1, num_pages + 1)
    for page_num in tqdm(page_numbers, desc="Scraping Pages"):
        page_records = scrape_text_from_url(f"{base_url}{page_num}")

        # Nothing usable came back for this page
        if not page_records:
            continue

        write_to_json(page_records, f"top_{page_num}.json")

In [30]:
import os

def combine_json_files(folder_path):
    """Concatenate the contents of every ``.json`` file in ``folder_path``.

    File names are processed in plain string-sorted order, and the parsed
    content of each file is appended to a single list, which is returned.
    A tqdm progress bar tracks the files as they are read.
    """
    # Only names ending in '.json' are considered, in lexicographic order
    json_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.json')
    )

    progress = tqdm(json_names, desc="Combining JSON Files", unit="file")

    merged = []
    for name in progress:
        with open(os.path.join(folder_path, name), 'r') as handle:
            merged += json.load(handle)

    return merged

# Folder path containing the per-page JSON files
folder_path = "top_bare"

# Merge every page file into a single list of records
combined_data = combine_json_files(folder_path)


def _member_count_as_int(record):
    """Return a record's 'Member Count' as an int, or 0 when absent or non-numeric.

    scrape_text_from_url stores the placeholder "Unknown" when a card has no
    <faceplate-number> element; calling int() on that placeholder directly
    would raise ValueError and abort the whole sort.
    """
    try:
        return int(record.get('Member Count', '0'))
    except (TypeError, ValueError):
        return 0


# Sort the combined data by 'Member Count', largest communities first
combined_data.sort(key=_member_count_as_int, reverse=True)

# File name to write combined data
combined_json_filename = "combined_top_bare.json"

# Write the combined data to a JSON file
with open(combined_json_filename, 'w') as json_file:
    json.dump(combined_data, json_file)

print(f"Combined data has been written to {combined_json_filename}")


Combining JSON Files: 100%|██████████| 150/150 [00:00<00:00, 2416.71file/s]

Combined data has been written to combined_top_bare.json





In [31]:
def filter_and_write_filtered_data(input_filename, output_filename, keywords=("politic", "activis")):
    """Copy the records whose 'Community Category' mentions a keyword to a new file.

    Parameters
    ----------
    input_filename : str
        JSON file holding a list of record dicts.
    output_filename : str
        Destination JSON file; written with an indent of four spaces.
    keywords : str or iterable of str, optional
        Case-insensitive substrings to look for in each record's
        'Community Category'. Defaults to ("politic", "activis").

    Records whose 'Community Category' is missing or not a string are left
    out rather than aborting the whole run. Any other exception is caught
    and reported with ``print``; the function always returns ``None``.
    """
    try:
        with open(input_filename, 'r') as json_file:
            data = json.load(json_file)

        # A lone string would otherwise be iterated character by character
        if isinstance(keywords, str):
            keywords = (keywords,)
        needles = [keyword.lower() for keyword in keywords]

        # Keep records whose 'Community Category' contains any keyword
        filtered_data = []
        for item in data:
            category = item.get('Community Category', '')
            if not isinstance(category, str):
                # e.g. a JSON null: cannot match, and must not raise
                continue
            category = category.lower()  # lowercase once, compare many times
            if any(needle in category for needle in needles):
                filtered_data.append(item)

        # Write the filtered data to a new JSON file
        with open(output_filename, 'w') as output_file:
            json.dump(filtered_data, output_file, indent=4)

        print(f"Filtered data has been written to {output_filename}")
    except Exception as e:
        print(f"An error occurred: {e}")


# Destination file for the politics/activism subset of the combined data
output_politics_filter_filename = "politics_activism_top.json"

# Read the combined file, keep the matching communities, and save them
filter_and_write_filtered_data(
    input_filename=combined_json_filename,
    output_filename=output_politics_filter_filename,
)

Filtered data has been written to politics_activism_top.json


In [32]:
# Reload the filtered file and report how many communities it contains
with open(output_politics_filter_filename, mode='r') as json_file:
    data = json.load(json_file)
print(len(data))

613
