In [None]:
import requests, boto3, pandas as pd, os, sys, pprint
from pathlib import Path
pp = pprint.PrettyPrinter(indent=2, compact=True, width=80)

# from entities.RedditAccount import RedditAccount
# from entities.DailyUpload import DailyUpload

#TODO: Change path accordingly in handler


# Make the directory containing this file importable by Python.
# Commented out here because __file__ is not defined in a Jupyter notebook; it does work in Lambda.
#sys.path.append(os.path.join(os.path.dirname(__file__)))

# Below should be used only in jupyter notebook
sys.path.append('./')


# Reddit endpoints. "placeholder_value" is replaced with the subreddit name before use.
REDDIT_AUTH_URL = "https://www.reddit.com/api/v1/access_token"
REDDIT_API_URL_TOP = "https://oauth.reddit.com/r/placeholder_value/top"
REDDIT_API_URL_SORT = "https://oauth.reddit.com/r/placeholder_value/sort"

# DynamoDB table names used by the notebook (both carry a "-dev" suffix).
REDDIT_ACCOUNTS_TABLE_NAME = "RedditAccountsTable-dev"
DAILY_UPLOADS_TABLE = "DailyUploadsTable-dev"

ddb = boto3.client("dynamodb", region_name="ap-south-1")

# Daily Upload Class

In [None]:
from datetime import datetime
import ddb_helpers


class DailyUpload:
    """Eligible video posts collected from one subreddit on one day.

    Raw posts from Reddit listing responses are filtered, reduced to the
    fields in ``post_keys_to_keep`` and accumulated together with their total
    video duration. The instance can then be serialized into a DynamoDB item
    with the date as partition key and the subreddit as sort key.
    """

    # Fields copied from every accepted raw post into the stored record.
    post_keys_to_keep = [
        "title",
        "url",
        "upvote_ratio",
        "ups",
        "author",
        "name",
        "total_awards_received",
    ]

    def __init__(self, subreddit) -> None:
        """Start an empty collection for ``subreddit`` dated today (local time).

        Args:
            subreddit (str): Name of the subreddit being collected.
        """
        self.subreddit = subreddit
        self.date = str(datetime.today().date())  # Of the format yyyy-mm-dd
        self.total_duration = 0  # summed video seconds of all eligible posts
        self.urls = []
        self.latest_post = None  # last raw post seen, eligible or not
        self.eligible_posts = []

    # Renamed from date_subreddit_key()
    def key(self) -> dict:
        """Returns a dictionary with date as PK, subreddit as SK.

        Returns:
            Dict: Containing serialized subreddit and date
        """
        return {
            "PK": DailyUpload.__serialize_date(self.date),
            "SK": DailyUpload.__serialize_subreddit(self.subreddit),
        }

    # Renamed from serialize_date_subreddit()
    def serialize_to_item(self):
        """Serializes member variable data of this object for the access pattern:
        date-Partition Key
        subreddit- Sort Key

        Returns:
            Dict: Ready to be used by boto3 to insert item into DynamoDB.
        """
        item = self.key()
        item["posts"] = DailyUpload.__serialize_posts(self.eligible_posts)
        return item

    @staticmethod
    def __removed_post_is_worthy(post):
        """Decide whether a post should be kept with respect to removal.

        Posts that were not removed always pass. A removed post passes only
        when it has more than 5 comments and a score above 10.
        """
        if post["removed_by"] or post["removal_reason"]:
            return post["num_comments"] > 5 and post["score"] > 10

        return True

    @staticmethod
    def __is_eligible(post):
        """Return True for upvoted, commented video posts that are neither NSFW nor stickied."""
        return bool(
            post["is_video"]
            and post["ups"] > 0
            and post["num_comments"] > 0
            and not post["over_18"]
            and not post["stickied"]
        )

    @staticmethod
    def __video_duration(post):
        """Return the post's video length in whole seconds, or None when it is unavailable.

        Covers a missing/None ``media`` entry, a missing ``reddit_video`` or
        ``duration`` key, and a duration that cannot be converted to int.
        """
        try:
            return int(post["media"]["reddit_video"]["duration"])
        except (KeyError, TypeError, ValueError):
            return None

    def parse_posts(self, posts):
        """Filter a Reddit listing response and accumulate the eligible posts.

        Every post seen (eligible or not) becomes ``latest_post``. Eligible
        posts are trimmed to ``post_keys_to_keep``, appended to
        ``eligible_posts`` and their duration is added to ``total_duration``.
        Posts without usable video duration metadata are skipped, so a single
        malformed post cannot leave the object half-updated.

        Args:
            posts (dict): Listing response from the reddit API; the posts are
                read from ``posts["data"]["children"]``.
        """
        for child in posts["data"]["children"]:
            post = child["data"]
            self.latest_post = post

            if not (
                DailyUpload.__is_eligible(post)
                and DailyUpload.__removed_post_is_worthy(post)
            ):
                continue

            # Resolve everything that can fail before touching any state.
            duration = DailyUpload.__video_duration(post)
            if duration is None:
                continue
            trimmed = {key: post[key] for key in DailyUpload.post_keys_to_keep}

            self.eligible_posts.append(trimmed)
            self.total_duration += duration

    @staticmethod
    def __serialize_posts(posts):
        """Wrap the serialized posts in a DynamoDB list ("L") attribute."""
        return {"L": [DailyUpload.__serialize_post(post) for post in posts]}

    @staticmethod
    def __serialize_post(post):
        """Serialize one trimmed post into a DynamoDB map ("M") attribute.

        Each value is stringified and tagged with whatever
        ``ddb_helpers.get_datatype`` returns for it (presumably the DynamoDB
        type code such as "S" or "N" -- TODO confirm against ddb_helpers).
        """
        serialized_post = {"M": {}}

        for key in DailyUpload.post_keys_to_keep:
            serialized_post["M"][key] = {
                ddb_helpers.get_datatype(post[key]): str(post[key])
            }

        return serialized_post

    @staticmethod
    def __serialize_subreddit(subreddit):
        """Serialize the subreddit name as a DynamoDB string attribute."""
        return {"S": subreddit}

    @staticmethod
    def __serialize_date(date):
        """Serialize the date string as a DynamoDB string attribute."""
        return {"S": date}

    @staticmethod
    def deserialize_PK_SK_count(item):
        """Flatten a DynamoDB item of the form ``{attr: {type: value}}`` to ``{attr: value}``.

        Values are returned exactly as stored; no type conversion is applied.
        """
        deserialized_item = {}
        for key, value in item.items():
            for _key, _value in value.items():
                deserialized_item[key] = _value
        return deserialized_item



# Reddit Account Class

In [None]:
import requests

class RedditAccount:
    """Reddit API credentials and request state for one subreddit.

    Credentials are loaded from a DynamoDB table keyed by subreddit, exchanged
    for an OAuth access token, and the resulting bearer header is reused for
    listing requests.
    """

    # Seconds to wait for Reddit before giving up; requests applies no timeout by default.
    REQUEST_TIMEOUT = 30

    def __init__(self, subreddit, ddb):
        """
        Args:
            subreddit (str): Subreddit whose account record is used.
            ddb: DynamoDB client exposing ``get_item`` (a boto3 client in this notebook).
        """
        self.subreddit = subreddit
        self.client_id = None
        self.secret_key = None
        self.username = None
        self.password = None
        self.auth = None
        self.headers = {"User-Agent": f"{subreddit}API/0.0.1"}
        self.data = {"grant_type": "password", "username": None, "password": None}
        self.access_token = None
        self.ddb = ddb

    def key(self):
        """Return the DynamoDB key of this subreddit's account item."""
        return {"PK": {"S": self.subreddit}}

    def fetch_and_update_account_details(self, REDDIT_ACCOUNTS_TABLE_NAME):
        """Load client id, secret, username and password from DynamoDB.

        Best-effort: any failure (including a missing item) is printed and the
        credentials stay ``None``. The token request payload is refreshed with
        the username/password either way.

        Args:
            REDDIT_ACCOUNTS_TABLE_NAME (str): Table holding the account items.
        """
        try:
            response = self.ddb.get_item(
                TableName=REDDIT_ACCOUNTS_TABLE_NAME, Key=self.key()
            )
            item = RedditAccount.deserialize_item(response["Item"])
            self.client_id = item["personal_use_script"]
            self.secret_key = item["secret_key"]
            self.username = item["username"]
            self.password = item["password"]

        except Exception as e:
            print(f"Failed with exception: {e}")

        self.data["username"] = self.username
        self.data["password"] = self.password

    @staticmethod
    def deserialize_item(item):
        """Convert a DynamoDB item into a plain ``{attribute: value}`` dict."""
        return {key: RedditAccount.extract_value(item[key]) for key in item}

    @staticmethod
    def extract_value(dictionary):
        """Return the value of a DynamoDB string ("S") attribute.

        Only the first type/value pair is inspected; any non-string attribute
        yields ``None``.
        """
        data_type, value = list(dictionary.items())[0]

        if data_type == "S":
            return value
        return None

    def authenticate_with_api(self):
        """Build the HTTP basic-auth object from the client id and secret."""
        self.auth = requests.auth.HTTPBasicAuth(self.client_id, self.secret_key)

    def fetch_and_update_access_token(self, REDDIT_AUTH_URL):
        """Request an access token and store it in the Authorization header.

        Args:
            REDDIT_AUTH_URL (str): Reddit token endpoint.

        Raises:
            KeyError: If the response carries no ``access_token`` (for example
                when the credentials were rejected).
        """
        # Authorise and request for access token from Reddit API
        res = requests.post(
            REDDIT_AUTH_URL,
            auth=self.auth,
            data=self.data,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        payload = res.json()
        if not isinstance(payload, dict) or "access_token" not in payload:
            # Same exception type as the plain lookup, but with enough context to debug.
            raise KeyError(
                f"access_token missing from Reddit auth response (HTTP {res.status_code})"
            )
        self.access_token = payload["access_token"]
        self.headers["Authorization"] = f"bearer {self.access_token}"

    def fetch_posts_as_json(self, url, params=None):
        """GET ``url`` with the stored headers and return the decoded JSON body.

        Args:
            url (str): Listing endpoint to query.
            params (dict | None): Optional query-string parameters.
        """
        res = requests.get(
            url, headers=self.headers, params=params, timeout=self.REQUEST_TIMEOUT
        )
        return res.json()

# Event handler code

In [None]:
import boto3, os, sys
from pathlib import Path

# Making the current directory in which this file is in discoverable to python
# sys.path.append(os.path.join(os.path.dirname(__file__)))

# from entities.DailyUpload import DailyUpload
# from entities.RedditAccount import RedditAccount
from subreddit_groups import subreddit_groups

# NOTE(review): this re-creates the `ddb` client that the first cell of the notebook already builds.
ddb = boto3.client("dynamodb", region_name="ap-south-1")
# NOTE(review): unlike `ddb`, no region_name is passed here, so the region is taken from boto3's
# default configuration -- confirm that this is intended.
sqs = boto3.client("sqs")

# REDDIT_AUTH_URL = os.getenv("REDDIT_AUTH_URL")
# REDDIT_ACCOUNTS_TABLE_NAME = os.getenv("REDDIT_ACCOUNTS_TABLE_NAME")
# DAILY_UPLOADS_TABLE = os.getenv("DAILY_UPLOADS_TABLE_NAME")
# PROCESS_URLS_FOR_SUBREDDIT_GROUP_QUEUE_URL = os.getenv(
#     "PROCESS_URLS_FOR_SUBREDDIT_GROUP_QUEUE_URL"
# )


def run(event, context):
    """Collect one subreddit's top video posts and record them in DynamoDB.

    The subreddit name is read from ``event["Records"][0]["body"]``. Top posts
    are fetched page by page until at least 601 seconds of eligible video have
    been collected or the listing is exhausted. The posts are then written and
    today's processed-subreddit counter is incremented in one transaction.
    When that counter matches the total counter, the subreddit groups are
    pushed to the processing queue.

    Args:
        event (dict): Queue-style event with the subreddit name as the first record body.
        context: Unused.

    Returns:
        dict: ``{"error": message}`` when a DynamoDB call fails or a counter
        item is missing, ``{"success": message}`` when every subreddit has been
        processed for the day, otherwise ``{subreddit: message}``.
    """
    # TODO: Hardcoding subreddit value for now. In production, should extract from queue:
    # subreddit = "funny"
    subreddit = str(event["Records"][0]["body"])

    # Rebuild the URL from the template on every invocation: in a warm container a
    # module-level value would still hold the previously processed subreddit.
    # TODO: read the template from os.getenv("REDDIT_API_URL_TOP") instead of hardcoding it.
    top_posts_url = "https://oauth.reddit.com/r/placeholder_value/top".replace(
        "placeholder_value", subreddit
    )
    accounts_table_name = "RedditAccountsTable-dev"

    daily_upload = DailyUpload(subreddit=subreddit)
    reddit_account = RedditAccount(subreddit=subreddit, ddb=ddb)

    reddit_account.fetch_and_update_account_details(accounts_table_name)
    reddit_account.authenticate_with_api()
    reddit_account.fetch_and_update_access_token(REDDIT_AUTH_URL)

    print(f"Subreddit : {subreddit} is being processed")

    # Keep fetching and parsing posts from reddit api till daily_upload.total_duration
    # is more than 600 seconds. Will use the 'after' param to keep going backwards.
    after = None
    while daily_upload.total_duration < 601:
        print(f"Fetching {subreddit} posts after {after}")
        posts = reddit_account.fetch_posts_as_json(
            top_posts_url, params={"limit": "100", "after": after}
        )
        daily_upload.parse_posts(posts)
        print(" total duration ", daily_upload.total_duration)
        print(" eligible posts len:  ", len(daily_upload.eligible_posts))

        # An empty page leaves latest_post untouched (or None on the first page).
        # Without this check the same page would be requested forever.
        latest_post = daily_upload.latest_post
        next_after = latest_post["name"] if latest_post else None
        if next_after is None or next_after == after:
            print(
                f"No more posts to fetch for {subreddit}; "
                f"stopping at {daily_upload.total_duration} seconds"
            )
            break
        after = next_after

    # After uploading this subreddits' urls, update the count of todays_subreddits_count
    # doing this as a transaction.
    try:
        res = ddb.transact_write_items(
            TransactItems=[
                {
                    "Put": {
                        "TableName": DAILY_UPLOADS_TABLE,
                        "Item": daily_upload.serialize_to_item(),
                    }
                },
                {
                    "Update": {
                        "TableName": DAILY_UPLOADS_TABLE,
                        "Key": {
                            "PK": {"S": daily_upload.date},
                            "SK": {"S": "todays_subreddits_count"},
                        },
                        "ConditionExpression": "attribute_exists(PK) and attribute_exists(SK)",
                        "UpdateExpression": "SET #count = #count + :inc",
                        "ExpressionAttributeNames": {"#count": "count"},
                        "ExpressionAttributeValues": {":inc": {"N": "1"}},
                    }
                },
            ]
        )

        if res["ResponseMetadata"]["HTTPStatusCode"] != 200:
            raise Exception(
                f"Failed to write transaction for {subreddit} on {daily_upload.date}"
            )

        print(f"Successfully updated DB with posts for {subreddit} subreddit")

    except Exception as e:
        print(e)
        # Return the message, not the exception object, so the result stays JSON-serializable.
        return {"error": str(e)}

    # Prepping up for fetching todays_subreddits_count and total_subreddits_count
    # from DailyUploads table. Both share today's PK and differ only in SK.
    total_subreddits_key = daily_upload.key()
    todays_subreddits_key = daily_upload.key()

    total_subreddits_key["SK"] = {"S": "total_subreddits_count"}
    todays_subreddits_key["SK"] = {"S": "todays_subreddits_count"}

    try:
        res = ddb.transact_get_items(
            TransactItems=[
                {
                    "Get": {
                        "Key": total_subreddits_key,
                        "TableName": DAILY_UPLOADS_TABLE,
                    },
                },
                {
                    "Get": {
                        "Key": todays_subreddits_key,
                        "TableName": DAILY_UPLOADS_TABLE,
                    },
                },
            ]
        )

    except Exception as e:
        print(e)
        return {"error": str(e)}

    print(f"{subreddit} subreddit has updated todays_subreddit_count ")
    # Extract items from response
    print( "Response is : ", res)
    responses = res["Responses"]
    if any("Item" not in response for response in responses):
        # A response entry without "Item" means that counter row was not returned.
        message = (
            f"Counter items missing from {DAILY_UPLOADS_TABLE} for {daily_upload.date}"
        )
        print(message)
        return {"error": message}
    items = [response["Item"] for response in responses]

    # Deserialize the items and extract total_subreddits_count and todays_subreddits_count items.
    total_subreddits_item_deserialized, todays_subreddit_item_deserialized = [
        DailyUpload.deserialize_PK_SK_count(item) for item in items
    ]

    # If evaluates to true, then push subreddit groups to ProcessUrlsQueue
    if (
        total_subreddits_item_deserialized["count"]
        == todays_subreddit_item_deserialized["count"]
    ):
        push_subreddit_groups_to_queue()

        # and then send custom response to show today's urls
        # of all subreddits have been processed.
        queue_url = os.getenv("PROCESS_URLS_FOR_SUBREDDIT_GROUP_QUEUE_URL")
        return {
            "success": f"All subreddits have been processed and uploaded urls for date {daily_upload.date}.\nPushed subreddit_groups to: {queue_url}"
        }

    return {
        subreddit: f"successfully processed {subreddit} for date: {daily_upload.date}"
    }


def push_subreddit_groups_to_queue(queue_url=None):
    """Send every entry of ``subreddit_groups`` to the processing queue as its own message.

    Args:
        queue_url (str | None): Target SQS queue URL. When omitted, the
            PROCESS_URLS_FOR_SUBREDDIT_GROUP_QUEUE_URL environment variable is used.

    Raises:
        RuntimeError: If no queue URL is passed and the environment variable is unset.
    """
    queue_url = queue_url or os.getenv("PROCESS_URLS_FOR_SUBREDDIT_GROUP_QUEUE_URL")
    if not queue_url:
        raise RuntimeError(
            "PROCESS_URLS_FOR_SUBREDDIT_GROUP_QUEUE_URL is not set; cannot push subreddit groups"
        )

    # Each group is passed as the message body unchanged.
    for group in subreddit_groups:
        sqs.send_message(QueueUrl=queue_url, MessageBody=group)


In [None]:
# Hand-built test event: run() reads the subreddit name from Records[0]["body"].
event = {"Records": [ {"body": "funny"} ]}

In [None]:
# Invoke the handler from the notebook. run() never reads its context argument,
# so an empty dict is enough. This performs real Reddit and DynamoDB calls.
run(event, {})

# Pushshift api tryout

In [None]:
# NOTE(review): PushshiftAPI is not imported in any cell shown here -- presumably it comes
# from the `psaw` package (TODO confirm). As written this cell raises NameError on a fresh kernel.
api = PushshiftAPI()

In [None]:
from datetime import datetime, timedelta

In [None]:
# Naive local datetimes for the current moment, one day earlier and two days earlier.
today =  datetime.today()
yesterday = datetime.today() - timedelta(days=1)
day_before_yesterday = yesterday - timedelta(days=1)

In [None]:
# Epoch timestamps (floats) for local midnight today, yesterday and the day before.
# Derived from a fresh datetime rather than from the names being rebound, so the
# cell can be re-run without failing on `float.year`.
midnight_today = datetime.today().replace(hour=0, minute=0, second=0, microsecond=0)
today = midnight_today.timestamp()
yesterday = (midnight_today - timedelta(days=1)).timestamp()
day_before_yesterday = (midnight_today - timedelta(days=2)).timestamp()

In [None]:
 # Query r/funny submissions between day_before_yesterday and today through the `api` client and
 # materialize them as a list (limit=10 and filter presumably cap the count and fields -- TODO confirm).
 a = list(api.search_submissions(after=day_before_yesterday, before=today, subreddit='funny', filter=['url', 'title'], limit = 10))

In [None]:
int(day_before_yesterday)

In [None]:
def get_pushshift_data(data_type, **kwargs):
    """
    Gets data from the pushshift api.

    data_type can be 'comment' or 'submission'
    The rest of the args are interpreted as payload.

    Without extra arguments the query keeps the notebook's default filters:
    r/funny video posts with at least one comment, between the notebook-level
    ``day_before_yesterday`` and ``yesterday`` timestamps, sorted by score.
    Keyword arguments override these defaults; passing ``None`` for a key
    drops that filter from the request.

    Read more: https://github.com/pushshift/api

    Returns:
        The decoded JSON body of the response.
    """
    payload = {
        "subreddit": "funny",
        "num_comments": ">0",
        "is_video": "true",
        "sort_type": "score",
        "sort": "score:asc",
        "size": 100,
        "aggs": "subreddit",
    }
    payload.update(kwargs)

    # Fall back to the notebook-level time window only for bounds the caller left out.
    if "after" not in payload:
        payload["after"] = int(day_before_yesterday)
    if "before" not in payload:
        payload["before"] = int(yesterday)

    payload = {key: value for key, value in payload.items() if value is not None}

    base_url = f"https://api.pushshift.io/reddit/search/{data_type}/"
    request = requests.get(base_url, params=payload)
    return request.json()

In [None]:
# Pushshift query settings.
data_type = "submission"  # endpoint kind: "comment" or "submission"
query = "funny"  # search term
duration = "1d"  # time window: an epoch value, or an integer followed by s/m/h/d
size = 1000  # number of results requested (the original note gives 1000 as the maximum)
sort_type = "score"  # sort field; accepted: "score", "num_comments", "created_utc"
sort = "desc"  # descending order
aggs = "subreddit"  # aggregation field: "author", "link_id", "created_utc" or "subreddit"

In [None]:
# Query string for yesterday's r/funny video submissions, highest score first.
# The parts are joined verbatim, so the URL is sent exactly as spelled here.
_query_parts = [
    "subreddit=funny",
    "num_comments=>0",
    "over_18=false",
    f"after={int(day_before_yesterday)}",
    f"before={int(yesterday)}",
    "is_video=true",
    "sort_type=score",
    "sort=score:desc",
    "size=100",
    "aggs=subreddit",
]
base_url = "https://api.pushshift.io/reddit/search/submission/?" + "&".join(_query_parts)

request = requests.get(base_url)
b = request.json()

In [None]:
# Inspect one entry of the response. NOTE(review): index 59 is hard-coded and
# raises IndexError when fewer than 60 results come back.
b['data'][59]

In [None]:
# Pretty-print only the first submission as a sample of the response.
# `a` is a plain list (built with list(...) earlier), so a['data'] raises TypeError;
# the JSON response holding a "data" list is `b`.
for post in b['data']:
    pp.pprint(post)
    break

   
    
        


In [None]:
post