In [1]:
import requests
from bs4 import BeautifulSoup
from os import path as osPath
import time
import html_to_json as htmlToJson
import json
from inputimeout import inputimeout
import random
from enum import Enum
from validations import ValidationService
from validations import ValidationUtils
from constants import Constants
import time

In [80]:
import requests
from bs4 import BeautifulSoup
from enum import Enum
import time

class RedditTimeFilterType(Enum):
    """Time window used to filter a subreddit listing.

    Each member's value is the upper-case string spelling of its name.
    """

    ALL = 'ALL'
    YEAR = 'YEAR'
    WEEK = 'WEEK'
    MONTH = 'MONTH'
    HOUR = 'HOUR'

    # NOTE: the former ``_filterMapPerValue`` dict was removed on purpose.
    # A plain assignment inside an Enum body becomes an enum *member*, so the
    # dict was never usable as a lookup table (``x in <member>`` raises
    # TypeError) and it also showed up when iterating the enum. Enum already
    # supports lookup by value, which is what the helper below relies on.

    @staticmethod
    def getFilterByString(filter: str):
        """Return the member whose value equals ``filter``.

        Args:
            filter: Exact (case-sensitive) member value, e.g. ``'WEEK'``.

        Returns:
            The matching member, or ``RedditTimeFilterType.ALL`` when
            ``filter`` does not match any member value.
        """
        try:
            return RedditTimeFilterType(filter)
        except ValueError:
            # Unknown (or None) input falls back to the widest window.
            return RedditTimeFilterType.ALL

class RedditSortType(Enum):
    """Sort order of a subreddit listing.

    Each member's value is the lower-case string placed in the listing URL.
    """

    HOT = 'hot'
    NEW = 'new'
    # NOTE(review): 'type' may be a typo for Reddit's 'top' sort - confirm
    # before relying on this member.
    TYPE = 'type'
    RISING = 'rising'

    # NOTE: the former ``_filterMapPerValue`` dict was removed on purpose.
    # A plain assignment inside an Enum body becomes an enum *member*, so the
    # dict was never usable as a lookup table, and the old helper additionally
    # consulted RedditTimeFilterType instead of this class. Enum already
    # supports lookup by value, which is what the helper below relies on.

    @staticmethod
    def getFilterByString(sortType: str):
        """Return the member whose value equals ``sortType``.

        Args:
            sortType: Exact (case-sensitive) member value, e.g. ``'hot'``.

        Returns:
            The matching member, or ``RedditSortType.NEW`` when ``sortType``
            does not match any member value.
        """
        try:
            return RedditSortType(sortType)
        except ValueError:
            # Unknown (or None) input falls back to the default sort.
            return RedditSortType.NEW

class Readit:
    """Scrape the post listing of one subreddit from reddit.com HTML pages.

    Pages are fetched with ``requests`` and parsed with BeautifulSoup; posts
    are the ``<shreddit-post>`` elements found in each page.
    """

    def __init__(self, subreddit: str = None, url: str = None) -> None:
        """Create a scraper from a subreddit name and/or a reddit.com URL.

        Args:
            subreddit: Subreddit name, with or without the ``r/`` prefix.
            url: A reddit.com URL. Its path is stored, and it is used to
                derive the subreddit name when ``subreddit`` is empty.
        """
        self._path = None  # part of the URL that follows "reddit.com"
        # Endpoint path used when requesting follow-up pages.
        self.pathForMore = "/svc/shreddit/community-more-posts"
        self._url = None  # last URL that was set or built
        self.subreddit = None  # stored as "r/<name>"
        self.sort = RedditSortType.NEW
        self.timeFilter = RedditTimeFilterType.ALL

        self.setRedditUrl(url)
        self.setSubreddit(subreddit, url)

    def setRedditUrl(self, url: str = None):
        """Store ``url`` and refresh the cached path derived from it."""
        # ValidationService.isUrlValid(url)
        self._url = url
        self.setPathFromURL(url)

    def setSubreddit(self, subreddit: str = None, url: str = None):
        """Set ``self.subreddit`` (always stored as ``r/<name>``).

        ``subreddit`` wins when it is non-empty; otherwise the name is taken
        from ``url``. When both are empty the current value is left untouched.

        Raises:
            IndexError: ``url`` is used but contains no ``<domain>/r/`` part.
        """
        if subreddit:
            # or ValidationUtils.isNotEmptyString(subreddit):
            self.subreddit = subreddit if subreddit.startswith('r/') else f'r/{subreddit}'
        elif url:
            # ValidationService.isUrlValid(url)
            # Everything after "<domain>/r/" looks like NAME/SORT/?after=...;
            # the first path segment is the subreddit name.
            # NOTE(review): Constants.domain[-10:] is presumably "reddit.com"
            # - confirm against the constants module.
            self.subreddit = 'r/' + url.split(f'{Constants.domain[-10:]}/r/')[1].split("/")[0]

    def setPathFromURL(self, url: str) -> None:
        """Cache the part of ``url`` after ``reddit.com`` (``None`` if no URL)."""
        if url is None:
            self._path = None
            return
        self._path = "".join(url.split("reddit.com")[1:])

    def getRedditUrlForPostsAfter(self, after: str = None, feedlength: int = 100):
        """Build, store in ``self._url`` and return a listing URL.

        Args:
            after: Cursor of the last page seen. When empty, the URL of the
                first listing page is built instead.
            feedlength: Value sent as the ``feedLength`` query parameter.

        Returns:
            The URL string.

        Raises:
            ValueError: No subreddit has been set on this instance.
        """
        if not self.subreddit:
            raise ValueError("subreddit is not set; pass a subreddit name or a subreddit URL first")

        if after:
            # ValidationUtils.isNotEmptyString(after):
            # pathForMore carries its own leading slash; strip it so the URL
            # does not contain "reddit.com//svc/...".
            morePath = self.pathForMore.strip('/')
            self._url = f'https://www.reddit.com/{morePath}/{self.sort.value}/?after={after}%3D%3D&name={self.subreddit[2:]}&feedLength={feedlength}'
        else:
            self._url = f'https://www.reddit.com/{self.subreddit}/{self.sort.value}'
        return self._url

    # def getAllSubmissions(self, sort: str = None, timeFilter: str = None, limit: int = 1000):

    #     if ValidationUtils.isNotEmptyString(sort):
    #         self.sort = RedditSortType.getFilterByString(sort)

    #     if ValidationUtils.isNotEmptyString(timeFilter):
    #         self.timeFilter = RedditTimeFilterType.getFilterByString(timeFilter)

    #     redditUrl = self.getRedditUrlForPostsAfter()
    #     return self.getSubmissions(redditUrl=redditUrl, limit=limit)

    def getSubmissions(self, redditUrl: str = None, size: int = 3, limit: int = 1000):
        """Fetch ``redditUrl`` and, recursively, the pages that follow it.

        Each response body is also written to
        ``lastResponsePages<timestamp>.html`` in the working directory.

        Args:
            redditUrl: Listing URL to fetch.
            size: Current feed length; 25 is added for every follow-up page.
            limit: Remaining post budget. It is only checked before a request
                is made, so the result may hold more than ``limit`` posts.

        Returns:
            A set of ``<shreddit-post>`` elements, or ``None`` when ``limit``
            is exhausted or the response status is not 200.
        """
        if limit <= 0:
            return None

        timeOfRequest = time.time_ns()
        print(f"url ({timeOfRequest}): {redditUrl}")
        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(redditUrl, verify=True, timeout=30)

        # Explicit encoding: page text may not fit the platform default codec.
        with open(f"lastResponsePages{timeOfRequest}.html", "w", encoding="utf-8") as page:
            page.write(response.text)

        if response.status_code != 200:
            return None

        redditSoup = BeautifulSoup(response.text, "html.parser")
        posts = redditSoup.find_all("shreddit-post")
        nextToken, jsonPosts = self.processPosts(posts)
        print(f'loaded: {len(jsonPosts)}')

        # else:
        #     nextPageMetaData= redditSoup.find('faceplate-partial', {'slot': 'load-after'})
        #     if nextPageMetaData is not None and nextPageMetaData.attrs is not None and ValidationUtils.isNotEmptyString(nextPageMetaData.attrs['src']):
        #         loadMoreUrl = f'{Constants.domain}{nextPageMetaData.attrs["src"]}'

        if not nextToken:
            # No cursor on this page: nothing further to load.
            return jsonPosts

        nextSize = size + 25
        loadMoreUrl = self.getRedditUrlForPostsAfter(nextToken, feedlength=nextSize)
        nextJsonPosts = self.getSubmissions(
            redditUrl=loadMoreUrl, limit=limit - len(jsonPosts), size=nextSize
        )
        if nextJsonPosts:
            # set.union() returns a new set; update() merges in place.
            jsonPosts.update(nextJsonPosts)
        return jsonPosts

    def processPosts(self, postsContainer):
        """Collect post elements and the pagination cursor from a page.

        Args:
            postsContainer: Iterable of parsed elements; ``None`` entries and
                entries whose ``name`` is ``None`` are skipped.

        Returns:
            ``(nextToken, posts)``: the last ``more-posts-cursor`` attribute
            seen (``None`` if there was none) and the set of kept elements.
        """
        posts = set()
        nextToken = None
        for post in postsContainer:
            if post is None or post.name is None:
                continue
            # convert to json
            posts.add(post)
            if post.attrs and 'more-posts-cursor' in post.attrs:
                nextToken = post.attrs['more-posts-cursor']
        return nextToken, posts

    def getFullPostLinks(self, posts):
        """Map each post's link to the post element.

        For a post with a ``full-post-link`` anchor the key is the anchor's
        ``href``, falling back to the post's ``permalink`` attribute. For a
        post without that anchor the key is its ``content-href`` attribute.
        Posts for which no non-empty link is found are reported and skipped.

        Args:
            posts: Iterable of post elements, or ``None``.

        Returns:
            Dict of link string to post element (empty for empty input).
        """
        urlToPostMap = {}
        if not posts:
            return urlToPostMap

        for post in posts:
            postAttrs = post.attrs or {}
            fullPostTag = post.find("a", {"slot": 'full-post-link'})
            if fullPostTag is not None and fullPostTag.attrs is not None:
                link = fullPostTag.attrs.get('href') or postAttrs.get("permalink")
            else:
                link = postAttrs.get("content-href")

            if link:
                urlToPostMap[link] = post
            else:
                print("No post url found at all :(")
        return urlToPostMap





In [81]:
# Build the scraper for r/AskReddit. The "/hot/" in the URL does not select
# the sort order: Readit.__init__ always starts with RedditSortType.NEW.
readit = Readit(subreddit="r/AskReddit", url= "https://www.reddit.com/r/AskReddit/hot/?")

In [82]:
# No cursor is passed, so this builds the URL of the first listing page.
redditUrl = readit.getRedditUrlForPostsAfter()

In [83]:
# limit is only checked before each request, so one page may return more
# posts than limit (3 posts were loaded in this run).
posts = readit.getSubmissions(redditUrl=redditUrl, limit=1)

url (1726421800644766000): https://www.reddit.com/r/AskReddit/new
loaded: 3


In [84]:
# Pick one post to inspect. getSubmissions returns a set, so which post comes
# out first is not defined.
post = None
for i in posts:
    post = i
    break

In [85]:
# Locate the <a slot="full-post-link"> anchor inside the post element.
fullPostUrl = post.find("a", {"slot": 'full-post-link'})

In [86]:
# href of the anchor: a site-relative path to the full post.
fullPostUrl.attrs['href']

'/r/AskReddit/comments/1fhi7zs/whats_the_funniest_thing_youve_seen_someone/'

In [87]:
# The post element also carries the path in its own "permalink" attribute.
post.attrs["permalink"]

'/r/AskReddit/comments/1fhi7zs/whats_the_funniest_thing_youve_seen_someone/'

In [88]:
# Check whether the anchor href and the permalink attribute agree for this post.
fullPostUrl.attrs['href'] == post.attrs["permalink"]

True

In [89]:
# Map every fetched post's link to its element.
urlPostMap = readit.getFullPostLinks(posts)

In [90]:
# Show the collected links (the keys of the map).
urlPostMap.keys()

dict_keys(['/r/AskReddit/comments/1fhi7zs/whats_the_funniest_thing_youve_seen_someone/', '/r/AskReddit/comments/1fhi8ad/how_do_you_overcome_fear_of_failure/', '/r/AskReddit/comments/1fhi7dy/controversial_question_but_who_do_you_think_is/'])

In [91]:
# Print the absolute URL of every collected post; iterating the dict
# directly yields its keys.
for path in urlPostMap:
    print(f'https://www.reddit.com{path}')

https://www.reddit.com/r/AskReddit/comments/1fhi7zs/whats_the_funniest_thing_youve_seen_someone/
https://www.reddit.com/r/AskReddit/comments/1fhi8ad/how_do_you_overcome_fear_of_failure/
https://www.reddit.com/r/AskReddit/comments/1fhi7dy/controversial_question_but_who_do_you_think_is/


In [92]:
# "content-href" holds the absolute URL of the post (see the output below).
post.attrs["content-href"]

'https://www.reddit.com/r/AskReddit/comments/1fhi7zs/whats_the_funniest_thing_youve_seen_someone/'

In [96]:
# Fetch the full page of one post; the URL is hard-coded to the post
# inspected in the cells above.
fullPostResponse = requests.get("https://www.reddit.com/r/AskReddit/comments/1fhi7zs/whats_the_funniest_thing_youve_seen_someone/")

In [100]:
# Parse the post page with the standard-library HTML parser backend.
fullPost = BeautifulSoup(fullPostResponse.text, "html.parser")

In [104]:
# The post content is looked up under <main id="main-content">.
mainPost = fullPost.find("main", {"id": "main-content"})

In [105]:
# Display the element to inspect its markup.
mainPost

<main class="main w-full flex-grid--main-container-card right-sidebar-xs" id="main-content"><shreddit-title title="What's the funniest thing you've seen someone trying to eat using a knife and fork? : r/AskReddit"></shreddit-title>
<shreddit-post app="" author="trojan_leon" author-id="t2_13srq4djk0" class="block xs:mt-xs xs:-mx-xs xs:px-xs xs:rounded-[16px] pt-xs nd:pt-xs bg-[color:var(--shreddit-content-background)] box-border mb-xs nd:visible nd:pb-2xl" comment-count="0" content-href="https://www.reddit.com/r/AskReddit/comments/1fhi7zs/whats_the_funniest_thing_youve_seen_someone/" created-timestamp="2024-09-15T17:36:17.694000+0000" domain="self.AskReddit" icon="https://preview.redd.it/snoovatar/avatars/1839fd21-a8e6-4334-b05f-2b05de27a8cd-headshot.png?width=64&amp;height=64&amp;crop=smart&amp;auto=webp&amp;s=ee506ba6e5dbf097d9444c85de29b326ceb4d7c3" id="t3_1fhi7zs" is-awardable="" is-desktop-viewport="" is-embeddable="" moderation-verdict="" permalink="/r/AskReddit/comments/1fhi7zs/w

In [110]:
# Scan the direct children of the main element for <faceplate-partial> and
# keep its "src"; if several match, the last one wins. Children without a
# tag name (name is None) can never equal the tag string, so a separate
# None check is unnecessary.
load_more_comment_url = None
for tag in mainPost.children:
    if tag.name == "faceplate-partial":
        load_more_comment_url = tag.attrs['src']

In [111]:
# Show the site-relative path taken from the faceplate-partial "src".
load_more_comment_url

'/svc/shreddit/comments/r/askreddit/t3_1fhi7zs?render-mode=partial&is_lit_ssr=false'

In [112]:
# Prefix the site origin to turn the relative path into an absolute URL.
f'https://www.reddit.com{load_more_comment_url}'

'https://www.reddit.com/svc/shreddit/comments/r/askreddit/t3_1fhi7zs?render-mode=partial&is_lit_ssr=false'