In [1]:
import requests
import uuid
import random
import string
from datetime import datetime

# Helper: generate random short code
def random_index_code(length=8):
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

# Transform Reddit post into YikYak-style schema
def reddit_to_yikyak(post, group_id=None):
    return {
        "id": post["id"],
        "text": f'{post.get("title","")} {post.get("selftext","")}'.strip(),
        "created_at": datetime.utcfromtimestamp(post["created_utc"]).isoformat() + "Z",
        "vote_total": post.get("ups", 0),
        "comment_count": post.get("num_comments", 0),
        "alias": "Anonymous",
        "group_id": group_id if group_id else str(uuid.uuid4()),
        "index_code": random_index_code()
    }

# Fetch posts from a subreddit listing
def fetch_reddit_posts(subreddit, listing="controversial", limit=50, time="day"):
    url = f"https://www.reddit.com/r/{subreddit}/{listing}.json"
    params = {"limit": limit, "t": time}
    headers = {"User-Agent": "Mozilla/5.0 (compatible; DataScraper/1.0)"}
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()
    data = response.json()
    return [child["data"] for child in data["data"]["children"]]

# Example usage
if __name__ == "__main__":
    subreddit = "worldnews"

    # Fetch controversial and top posts
    controversial_posts = fetch_reddit_posts(subreddit, "controversial", limit=20, time="day")
    top_posts = fetch_reddit_posts(subreddit, "top", limit=20, time="day")

    # Clean them into YikYak schema
    group_id = str(uuid.uuid4())  # one group ID for this batch
    dataset = [reddit_to_yikyak(p, group_id) for p in controversial_posts + top_posts]

    # Print or save
    for entry in dataset:
        print(entry)


{'id': '1pkm0v2', 'text': 'Majority of journalists killed in Gaza linked to terror organizations, study says', 'created_at': '2025-12-12T07:33:45Z', 'vote_total': 0, 'comment_count': 48, 'alias': 'Anonymous', 'group_id': 'cceeb7e5-97ef-46d3-b8d9-83860bf41387', 'index_code': 'YR2Fuc7h'}
{'id': '1pkqfa5', 'text': 'Israel to review reports that troops killed three-year-old in Gaza', 'created_at': '2025-12-12T12:15:23Z', 'vote_total': 2, 'comment_count': 34, 'alias': 'Anonymous', 'group_id': 'cceeb7e5-97ef-46d3-b8d9-83860bf41387', 'index_code': 'CuzDBV8F'}
{'id': '1pkkct8', 'text': 'Flood misery for Gazans awaiting next stage of peace plan', 'created_at': '2025-12-12T05:53:41Z', 'vote_total': 0, 'comment_count': 14, 'alias': 'Anonymous', 'group_id': 'cceeb7e5-97ef-46d3-b8d9-83860bf41387', 'index_code': 'SxYD3jP4'}
{'id': '1pkuibp', 'text': 'EU in ‘chaos’: Nawrocki says Poland’s security strategy should align with US', 'created_at': '2025-12-12T15:19:56Z', 'vote_total': 0, 'comment_count': 

In [6]:
import requests
import uuid
import random
import string
import json
from datetime import datetime

# Helper: generate random short code
def random_index_code(length=8):
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

# Helper: check if post is text-only (skip images/videos)
def is_text_post(post):
    if post.get("is_video"):
        return False
    if post.get("post_hint") in ["image", "video"]:
        return False
    if "preview" in post:  # images usually have a preview
        return False
    if str(post.get("url_overridden_by_dest", "")).lower().endswith((".jpg", ".jpeg", ".png", ".gif")):
        return False
    return True

# Transform Reddit post into YikYak-style schema + controversial flag
def reddit_to_yikyak(post, listing, group_id=None):
    if not is_text_post(post):
        return None  # skip non-text posts
    return {
        "id": post["id"],
        "text": f'{post.get("title","")} {post.get("selftext","")}'.strip(),
        "created_at": datetime.utcfromtimestamp(post["created_utc"]).isoformat() + "Z",
        "vote_total": post.get("ups", 0),
        "comment_count": post.get("num_comments", 0),
        "alias": "Anonymous",
        "group_id": group_id if group_id else str(uuid.uuid4()),
        "index_code": random_index_code(),
        "controversial_flag": 1 if listing == "controversial" else 0
    }

# Fetch posts from a subreddit listing
def fetch_reddit_posts(subreddit, listing="controversial", limit=50, time="day"):
    url = f"https://www.reddit.com/r/{subreddit}/{listing}.json"
    params = {"limit": limit, "t": time}
    headers = {"User-Agent": "Mozilla/5.0 (compatible; DataScraper/1.0)"}
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()
    data = response.json()
    return [child["data"] for child in data["data"]["children"]]

if __name__ == "__main__":
    # Pick ~20 top subreddits (you can adjust this list)
    subreddits = [
        "worldnews", "news", "politics", "technology", "science",
        "gaming", "movies", "music", "sports", "books",
        "askreddit", "funny", "pics", "todayilearned", "dataisbeautiful",
        "art", "history", "space", "explainlikeimfive", "economics",
        "popculturechat", "mildlyinfuriating", "interesting"
    ]

    listings = ["controversial", "top", "hot", "new"]
    dataset = []

    for subreddit in subreddits:
        group_id = str(uuid.uuid4())  # one group ID per subreddit batch
        for listing in listings:
            try:
                posts = fetch_reddit_posts(subreddit, listing, limit=50, time="day")
                for p in posts:
                    transformed = reddit_to_yikyak(p, listing, group_id)
                    if transformed:  # only add text posts
                        dataset.append(transformed)
            except Exception as e:
                print(f"Error fetching {listing} from {subreddit}: {e}")

    # Save to JSON file
    with open("reddit_yikyak_dataset.json", "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=2, ensure_ascii=False)

    print(f"Saved {len(dataset)} text-only posts to reddit_yikyak_dataset.json")


Error fetching controversial from worldnews: 429 Client Error: Too Many Requests for url: https://www.reddit.com/r/worldnews/controversial.json?limit=50&t=day
Error fetching top from worldnews: 429 Client Error: Too Many Requests for url: https://www.reddit.com/r/worldnews/top.json?limit=50&t=day
Error fetching hot from worldnews: 429 Client Error: Too Many Requests for url: https://www.reddit.com/r/worldnews/hot.json?limit=50&t=day
Error fetching new from worldnews: 429 Client Error: Too Many Requests for url: https://www.reddit.com/r/worldnews/new.json?limit=50&t=day
Error fetching controversial from news: 429 Client Error: Too Many Requests for url: https://www.reddit.com/r/news/controversial.json?limit=50&t=day
Error fetching top from news: 429 Client Error: Too Many Requests for url: https://www.reddit.com/r/news/top.json?limit=50&t=day
Error fetching hot from news: 429 Client Error: Too Many Requests for url: https://www.reddit.com/r/news/hot.json?limit=50&t=day
Saved 1544 text-o

In [10]:
A = ["AskReddit", "OutOfTheLoop", "ExplainItLikeImFive", "AskUK", "AskMen"]
B = ["worldnews", "UnderReportedNews", "politics"]
C = ["todayilearned", "science", "technology", "interesting"]
D = ["movies", "television", "Music", "popculturechat", "Fauxmoi"]
E = ["AITAH", "AmItheAsshole", "tifu", "antiwork", "recruitinghell", "jobs", "complaints"]
F = ["Piracy", "KitchenConfidential", "buildapc", "nfl", "formula1", "sports", "travel"]


In [11]:
import requests, time, uuid, random, string, json
from datetime import datetime
from pathlib import Path

def random_index_code(length=8):
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

def is_text_post(post):
    if post.get("is_video"):
        return False
    if post.get("post_hint") in ["image", "video", "hosted:video", "rich:video"]:
        return False
    if "preview" in post:  # common signal for images
        return False
    url = str(post.get("url_overridden_by_dest", "")).lower()
    if url.endswith((".jpg", ".jpeg", ".png", ".gif", ".webp")):
        return False
    return True

def reddit_to_yikyak(post, listing, group_id=None):
    if not is_text_post(post):
        return None
    return {
        "id": post["id"],
        "text": f'{post.get("title","")} {post.get("selftext","")}'.strip(),
        "created_at": datetime.utcfromtimestamp(post["created_utc"]).isoformat() + "Z",
        "vote_total": post.get("ups", 0),
        "comment_count": post.get("num_comments", 0),
        "alias": "Anonymous",
        "group_id": group_id if group_id else str(uuid.uuid4()),
        "index_code": random_index_code(),
        "controversial_flag": 1 if listing == "controversial" else 0
    }

def get_with_backoff(session, url, params, headers, max_retries=6, base_sleep=2.0):
    """
    - Sleeps a bit between requests (base_sleep)
    - On 429, respects Retry-After if present, otherwise exponential backoff
    """
    attempt = 0
    while True:
        time.sleep(base_sleep)  # steady throttle for every request
        resp = session.get(url, params=params, headers=headers, timeout=30)

        if resp.status_code != 429:
            resp.raise_for_status()
            return resp

        # 429
        attempt += 1
        if attempt > max_retries:
            resp.raise_for_status()

        retry_after = resp.headers.get("Retry-After")
        if retry_after is not None:
            sleep_s = float(retry_after) + 1.0
        else:
            sleep_s = (2 ** attempt) + random.random()  # exp backoff + jitter

        print(f"429 hit. Sleeping {sleep_s:.1f}s then retrying...")
        time.sleep(sleep_s)

def fetch_reddit_posts(session, subreddit, listing="hot", limit=25, time_filter="day", after=None):
    url = f"https://www.reddit.com/r/{subreddit}/{listing}.json"
    params = {"limit": limit}
    if listing in ("top", "controversial"):
        params["t"] = time_filter
    if after:
        params["after"] = after

    headers = {"User-Agent": "yikyak-dataset-bot/0.1 (by u/yourusername)"}
    resp = get_with_backoff(session, url, params=params, headers=headers)
    data = resp.json()
    children = [c["data"] for c in data["data"]["children"]]
    after = data["data"].get("after")
    return children, after

def scrape_group(
    subreddits,
    listings=("controversial", "top", "hot", "new"),
    per_request_limit=25,
    max_posts_per_listing=50,   # paginate up to this many per listing
    time_filter="day",
    base_sleep=2.0,
    out_dir="reddit_batches",
    out_name="batch.json",
):
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    dataset = []
    session = requests.Session()

    for subreddit in subreddits:
        group_id = str(uuid.uuid4())
        for listing in listings:
            collected = 0
            after = None
            while collected < max_posts_per_listing:
                posts, after = fetch_reddit_posts(
                    session,
                    subreddit=subreddit,
                    listing=listing,
                    limit=per_request_limit,
                    time_filter=time_filter,
                    after=after,
                )

                for p in posts:
                    item = reddit_to_yikyak(p, listing, group_id)
                    if item:
                        dataset.append(item)

                collected += len(posts)
                if not after:
                    break  # no more pages

            # extra pause between listings (helps a lot)
            time.sleep(base_sleep)

    out_path = Path(out_dir) / out_name
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=2, ensure_ascii=False)

    print(f"Saved {len(dataset)} text-only posts to {out_path}")
    return dataset


In [12]:
dataA = scrape_group(
    A,
    per_request_limit=25,
    max_posts_per_listing=50,
    base_sleep=2.5,
    out_name="batch_A.json",
)

429 hit. Sleeping 1.0s then retrying...
429 hit. Sleeping 1.0s then retrying...
429 hit. Sleeping 1.0s then retrying...
429 hit. Sleeping 1.0s then retrying...
429 hit. Sleeping 1.0s then retrying...
429 hit. Sleeping 1.0s then retrying...


HTTPError: 429 Client Error: Too Many Requests for url: https://www.reddit.com/r/AskReddit/controversial.json?limit=25&t=day

In [None]:
dataB = scrape_group(
    B,
    per_request_limit=25,
    max_posts_per_listing=50,
    base_sleep=2.5,
    out_name="batch_B.json",
)

429 hit. Sleeping 1.0s then retrying...
429 hit. Sleeping 1.0s then retrying...
429 hit. Sleeping 1.0s then retrying...
429 hit. Sleeping 1.0s then retrying...
429 hit. Sleeping 1.0s then retrying...
429 hit. Sleeping 1.0s then retrying...


In [None]:
dataC = scrape_group(
    C,
    per_request_limit=25,
    max_posts_per_listing=50,
    base_sleep=2.5,
    out_name="batch_C.json",
)

In [None]:
dataD = scrape_group(
    D,
    per_request_limit=25,
    max_posts_per_listing=50,
    base_sleep=2.5,
    out_name="batch_D.json",
)

In [None]:
dataE = scrape_group(
    E,
    per_request_limit=25,
    max_posts_per_listing=50,
    base_sleep=2.5,
    out_name="batch_E.json",
)

In [None]:
dataF = scrape_group(
    F,
    per_request_limit=25,
    max_posts_per_listing=50,
    base_sleep=2.5,
    out_name="batch_F.json",
)

In [30]:


import requests
import uuid
import random
import string
import json
import os
from datetime import datetime

MERGED_PATH = "merged_file.json"

def random_index_code(length=8):
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

def is_text_post(post):
    if post.get("is_video"):
        return False
    if post.get("post_hint") in ["image", "video", "hosted:video", "rich:video"]:
        return False
    if "preview" in post:
        return False
    if str(post.get("url_overridden_by_dest", "")).lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp")):
        return False
    return True

def reddit_to_yikyak(post, listing, group_id=None):
    if not is_text_post(post):
        return None
    return {
        "id": post["id"],
        "text": f'{post.get("title","")} {post.get("selftext","")}'.strip(),
        "created_at": datetime.utcfromtimestamp(post["created_utc"]).isoformat() + "Z",
        "vote_total": post.get("ups", 0),
        "comment_count": post.get("num_comments", 0),
        "alias": "Anonymous",
        "group_id": group_id if group_id else str(uuid.uuid4()),
        "index_code": random_index_code(),
        "controversial_flag": 1 if listing == "controversial" else 0
    }

def fetch_reddit_posts(subreddit, listing="controversial", limit=50, time_filter="day"):
    url = f"https://www.reddit.com/r/{subreddit}/{listing}.json"
    params = {"limit": limit}
    if listing in ("controversial", "top"):
        params["t"] = time_filter
    headers = {"User-Agent": "yikyak-dataset-bot/0.1 (by u/yourusername)"}
    r = requests.get(url, params=params, headers=headers, timeout=30)
    r.raise_for_status()
    data = r.json()
    return [child["data"] for child in data["data"]["children"]]

def load_merged(path):
    if not os.path.exists(path):
        return []
    with open(path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
            return data if isinstance(data, list) else []
        except json.JSONDecodeError:
            return []

def save_merged(path, items):
    tmp = path + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(items, f, indent=2, ensure_ascii=False)
    os.replace(tmp, path)

if __name__ == "__main__":
    subreddits = ["askscience", "cfb", "dating_advice", "overheard", "mac", "changemyview", "rareinsults", "snorkblot"]    
    listings = ["controversial", "top", "hot", "new"]

    merged = load_merged(MERGED_PATH)
    print(f"Loaded {len(merged)} existing items from {MERGED_PATH}")

    new_items = []
    for subreddit in subreddits:
        group_id = str(uuid.uuid4())
        for listing in listings:
            try:
                posts = fetch_reddit_posts(subreddit, listing, limit=50, time_filter="day")
                for p in posts:
                    transformed = reddit_to_yikyak(p, listing, group_id)
                    if transformed:
                        new_items.append(transformed)
            except Exception as e:
                print(f"Error fetching {listing} from {subreddit}: {e}")

    merged.extend(new_items)
    save_merged(MERGED_PATH, merged)

    with open("new_items_this_run.json", "w", encoding="utf-8") as f:
        json.dump(new_items, f, indent=2, ensure_ascii=False)

    print(f"Added {len(new_items)} items (no dedupe).")
    print(f"Merged file now has {len(merged)} items: {MERGED_PATH}")




Loaded 4191 existing items from merged_file.json
Added 828 items (no dedupe).
Merged file now has 5019 items: merged_file.json


In [31]:


import requests
import uuid
import random
import string
import json
import os
from datetime import datetime

MERGED_PATH = "merged_file.json"

def random_index_code(length=8):
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

def is_text_post(post):
    if post.get("is_video"):
        return False
    if post.get("post_hint") in ["image", "video", "hosted:video", "rich:video"]:
        return False
    if "preview" in post:
        return False
    if str(post.get("url_overridden_by_dest", "")).lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp")):
        return False
    return True

def reddit_to_yikyak(post, listing, group_id=None):
    if not is_text_post(post):
        return None
    return {
        "id": post["id"],
        "text": f'{post.get("title","")} {post.get("selftext","")}'.strip(),
        "created_at": datetime.utcfromtimestamp(post["created_utc"]).isoformat() + "Z",
        "vote_total": post.get("ups", 0),
        "comment_count": post.get("num_comments", 0),
        "alias": "Anonymous",
        "group_id": group_id if group_id else str(uuid.uuid4()),
        "index_code": random_index_code(),
        "controversial_flag": 1 if listing == "controversial" else 0
    }

def fetch_reddit_posts(subreddit, listing="controversial", limit=50, time_filter="day"):
    url = f"https://www.reddit.com/r/{subreddit}/{listing}.json"
    params = {"limit": limit}
    if listing in ("controversial", "top"):
        params["t"] = time_filter
    headers = {"User-Agent": "yikyak-dataset-bot/0.1 (by u/yourusername)"}
    r = requests.get(url, params=params, headers=headers, timeout=30)
    r.raise_for_status()
    data = r.json()
    return [child["data"] for child in data["data"]["children"]]

def load_merged(path):
    if not os.path.exists(path):
        return []
    with open(path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
            return data if isinstance(data, list) else []
        except json.JSONDecodeError:
            return []

def save_merged(path, items):
    tmp = path + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(items, f, indent=2, ensure_ascii=False)
    os.replace(tmp, path)

if __name__ == "__main__":
    subreddits = ["privacy", "mildlyinteresting", "dating", "askanamerican", "moviesuggestions", "northcarolina", "openai", "webdev", "iama"]
    listings = ["controversial", "top", "hot", "new"]

    merged = load_merged(MERGED_PATH)
    print(f"Loaded {len(merged)} existing items from {MERGED_PATH}")

    new_items = []
    for subreddit in subreddits:
        group_id = str(uuid.uuid4())
        for listing in listings:
            try:
                posts = fetch_reddit_posts(subreddit, listing, limit=50, time_filter="day")
                for p in posts:
                    transformed = reddit_to_yikyak(p, listing, group_id)
                    if transformed:
                        new_items.append(transformed)
            except Exception as e:
                print(f"Error fetching {listing} from {subreddit}: {e}")

    merged.extend(new_items)
    save_merged(MERGED_PATH, merged)

    with open("new_items_this_run.json", "w", encoding="utf-8") as f:
        json.dump(new_items, f, indent=2, ensure_ascii=False)

    print(f"Added {len(new_items)} items (no dedupe).")
    print(f"Merged file now has {len(merged)} items: {MERGED_PATH}")



Loaded 5019 existing items from merged_file.json
Added 922 items (no dedupe).
Merged file now has 5941 items: merged_file.json


In [32]:
import requests
import uuid
import random
import string
import json
import os
from datetime import datetime

MERGED_PATH = "merged_file.json"

def random_index_code(length=8):
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

def is_text_post(post):
    if post.get("is_video"):
        return False
    if post.get("post_hint") in ["image", "video", "hosted:video", "rich:video"]:
        return False
    if "preview" in post:
        return False
    if str(post.get("url_overridden_by_dest", "")).lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp")):
        return False
    return True

def reddit_to_yikyak(post, listing, group_id=None):
    if not is_text_post(post):
        return None
    return {
        "id": post["id"],
        "text": f'{post.get("title","")} {post.get("selftext","")}'.strip(),
        "created_at": datetime.utcfromtimestamp(post["created_utc"]).isoformat() + "Z",
        "vote_total": post.get("ups", 0),
        "comment_count": post.get("num_comments", 0),
        "alias": "Anonymous",
        "group_id": group_id if group_id else str(uuid.uuid4()),
        "index_code": random_index_code(),
        "controversial_flag": 1 if listing == "controversial" else 0
    }

def fetch_reddit_posts(subreddit, listing="controversial", limit=50, time_filter="day"):
    url = f"https://www.reddit.com/r/{subreddit}/{listing}.json"
    params = {"limit": limit}
    if listing in ("controversial", "top"):
        params["t"] = time_filter
    headers = {"User-Agent": "yikyak-dataset-bot/0.1 (by u/yourusername)"}
    r = requests.get(url, params=params, headers=headers, timeout=30)
    r.raise_for_status()
    data = r.json()
    return [child["data"] for child in data["data"]["children"]]

def load_merged(path):
    if not os.path.exists(path):
        return []
    with open(path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
            return data if isinstance(data, list) else []
        except json.JSONDecodeError:
            return []

def save_merged(path, items):
    tmp = path + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(items, f, indent=2, ensure_ascii=False)
    os.replace(tmp, path)

if __name__ == "__main__":
    subreddits = [
        "applyingtocollege", "csmajors", "btechtards", "recruitinghell",
        "sysadmin", "careerguidance", "strangerthings", "unpopularopinion",
        "ama", "casualconversation", "twohottakes"
    ]
    listings = ["controversial", "top", "hot", "new"]

    merged = load_merged(MERGED_PATH)
    print(f"Loaded {len(merged)} existing items from {MERGED_PATH}")

    new_items = []
    for subreddit in subreddits:
        group_id = str(uuid.uuid4())
        for listing in listings:
            try:
                posts = fetch_reddit_posts(subreddit, listing, limit=50, time_filter="day")
                for p in posts:
                    transformed = reddit_to_yikyak(p, listing, group_id)
                    if transformed:
                        new_items.append(transformed)
            except Exception as e:
                print(f"Error fetching {listing} from {subreddit}: {e}")

    merged.extend(new_items)
    save_merged(MERGED_PATH, merged)

    with open("new_items_this_run.json", "w", encoding="utf-8") as f:
        json.dump(new_items, f, indent=2, ensure_ascii=False)

    print(f"Added {len(new_items)} items (no dedupe).")
    print(f"Merged file now has {len(merged)} items: {MERGED_PATH}")


Loaded 5941 existing items from merged_file.json
Error fetching controversial from ama: 429 Client Error: Too Many Requests for url: https://www.reddit.com/r/ama/controversial.json?limit=50&t=day
Error fetching top from ama: 429 Client Error: Too Many Requests for url: https://www.reddit.com/r/ama/top.json?limit=50&t=day
Error fetching hot from ama: 429 Client Error: Too Many Requests for url: https://www.reddit.com/r/ama/hot.json?limit=50
Error fetching new from ama: 429 Client Error: Too Many Requests for url: https://www.reddit.com/r/ama/new.json?limit=50
Error fetching controversial from casualconversation: 429 Client Error: Too Many Requests for url: https://www.reddit.com/r/casualconversation/controversial.json?limit=50&t=day
Error fetching top from casualconversation: 429 Client Error: Too Many Requests for url: https://www.reddit.com/r/casualconversation/top.json?limit=50&t=day
Error fetching hot from casualconversation: 429 Client Error: Too Many Requests for url: https://www.