# Scrolller Scraper

[Scrolller](https://scrolller.com) allows subreddits to be viewed using a scrollable, Instagram like interface.

### Imports

In [7]:
import os
import requests
import json
import concurrent.futures

### Configs

In [8]:
SUBREDDITS = [
    '/r/audi',
    '/r/bmw',
    '/r/porsche',
    '/r/spotted'
]

### Helper Functions

In [9]:
def fetch_subreddit_data(subreddit_url):
    url = 'https://api.scrolller.com/api/v2/graphql'

    headers = {
        'authority': 'api.scrolller.com',
        'accept': '*/*',
        'accept-language': 'en-US,en;q=0.9',
        'content-type': 'text/plain;charset=UTF-8',
        'origin': 'https://scrolller.com',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    }

    data = {
        "query": " query SubredditQuery( $url: String! $filter: SubredditPostFilter $iterator: String ) { getSubreddit(url: $url) { children( limit: $limit iterator: $iterator filter: $filter disabledHosts: null ) { iterator items { __typename id url title subredditId subredditTitle subredditUrl redditPath albumUrl hasAudio fullLengthSource mediaSources { url width height } } } } } ",
        "variables": {
            "url": subreddit_url,
            "limit": 500
        },
    }

    try:
        response = requests.post(url, headers=headers, json=data)
        response_data = json.loads(response.text)
    except Exception as err:
        print(f"Error getting request data: {err}")

    return response_data['data']['getSubreddit']['children']['items']

In [10]:
def prioritize_mp4(media_sources):
    for source in media_sources:
        if source['url'].endswith('.mp4'):
            return source['url']
    return media_sources[0]['url']

In [11]:
def download_media(post, folder_path):    
    # Sort mediaSources by image dimensions in descending order
    media_sources = sorted(post['mediaSources'], key=lambda x: x['width'] * x['height'], reverse=True)
    largest_media_url = prioritize_mp4(media_sources)
    
    file_name = largest_media_url.split('/')[-1]
    file_path = os.path.join(folder_path, file_name)
    
    if not os.path.exists(file_path):
        media_response = requests.get(largest_media_url)
        with open(file_path, 'wb') as f:
            f.write(media_response.content)
        print(f"'{post['title']}' saved:", file_path)
        print()
    else:
        print(f"Skipping '{post['title']}' ... file already exists: {file_path}")
        print()

### Create Folders & Download Media

In [12]:
for subreddit in SUBREDDITS:
    subreddit_name = subreddit.split('/')[-1]
    folder_path = f"./scrolller/{subreddit_name}"
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    posts = fetch_subreddit_data(subreddit)
    
    # Multi-threaded execution
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(download_media, posts, [folder_path] * len(posts))

print("Downloads Complete!")

KeyError: 'data'