# Overview
* Scrape a single Reddit submission (by post ID) with PRAW and append its fields to a CSV file

# Dependencies

In [None]:
# jupyter
from IPython.display import clear_output, display

# general
import datetime
import os
import re
import time

# data
import csv
import requests
import json
import pandas as pd
import praw

# Constants

In [None]:
# Run timestamp, captured once so every output file of this run shares the same prefix.
gbl_YYYYMMDDHHMM = f"{datetime.datetime.now():%Y-%m-%d %H%M}"

# Output files (all under ./data, prefixed with the run timestamp).
path_post_id_data = "./data/" + gbl_YYYYMMDDHHMM + "_post_ids.csv"
path_post_id_data_error = "./data/" + gbl_YYYYMMDDHHMM + "__post_ids_error.csv"
path_reddit_data = "./data/" + gbl_YYYYMMDDHHMM + "_reddit_data.csv"

# JSON file holding the PRAW credentials.
path_praw_config = "./config/praw.json"

# A post is scraped only when its score is at least this value.
threshold_score_minimum = 3

# Funcs

In [None]:
def getValueForCSV(val):
    """Return `val` in a form that is safe to write to a cp1252-encoded CSV.

    For strings:
      * tab, newline and carriage-return characters are removed, as are
        their literal two-character escaped spellings (a backslash followed
        by ``t``, ``n`` or ``r``);
      * every character that cannot be represented in cp1252 is dropped.

    Any non-string value (numbers, ``None``, booleans, ...) is returned
    unchanged.
    """
    if not isinstance(val, str):
        return val

    # remove the real control characters first, then the literal escapes
    without_controls = re.sub("\t|\n|\r", "", val)
    val_clean = re.sub(r"\\t|\\n|\\r", "", without_controls)

    # Round-trip through cp1252 to drop unencodable characters.  Decoding the
    # bytes (instead of taking their repr and stripping "b", "'" and '"')
    # keeps legitimate leading/trailing b/quote characters and does not leak
    # repr escapes such as "\\xe9" or doubled backslashes into the data.
    return val_clean.encode("cp1252", "ignore").decode("cp1252")

# Get PRAW Config

In [None]:
# Load the Reddit API credentials from the JSON config file into `keys`.
# The file is decoded as UTF-8 explicitly so that parsing does not depend on
# the platform's default text encoding.
with open(path_praw_config, encoding="utf-8") as config_file:
    keys = json.load(config_file)

# Get Reddit API Client

In [None]:
# Create the PRAW client from the three credential fields loaded into `keys`.
reddit = praw.Reddit(
    **{
        field: keys[field]
        for field in ("client_id", "client_secret", "user_agent")
    }
)

In [None]:
# test: print the client's `read_only` flag as a quick check that the client
# was created (presumably True here, since only app credentials are passed
# and no user login — confirm against the PRAW documentation)
print(reddit.read_only)

# Get Post IDs

In [None]:
# all
# df_post_id_data = pd.read_csv(path_post_id_data)
# ls_post_ids = df_post_id_data['id'].tolist()

In [None]:
# just one: hard-coded list holding the single post id to scrape
ls_post_ids = ["mmp4t"]

# Get Reddit Data

In [None]:
# Scrape every post id in `ls_post_ids` and append one row per qualifying post
# to the CSV at `path_reddit_data`.  Posts that fail while being requested or
# written are recorded in `path_post_id_data_error` for later debugging.
start_time = time.time()

# append mode does not create missing directories, so make sure they exist
for _out_path in (path_reddit_data, path_post_id_data_error):
    os.makedirs(os.path.dirname(_out_path) or ".", exist_ok=True)

# only write the header row when the output file is being created
bool_header = not os.path.exists(path_reddit_data)

# NOTE(review): `ls_ignore_selftext` is not defined anywhere in the visible
# notebook.  Use it when it exists; otherwise ignore nothing instead of
# failing with a NameError on the first post that meets the score threshold.
ignorable_selftext = globals().get("ls_ignore_selftext", ())


def _new_reddit_client():
    """Build a fresh PRAW client from the credentials loaded into `keys`."""
    return praw.Reddit(client_id=keys['client_id'],
                       client_secret=keys['client_secret'],
                       user_agent=keys['user_agent'])


def _log_error(stage, post_id, title, url, error):
    """Append one row describing a failed post to the error file.

    The csv module handles quoting, so commas or quotes inside the title or
    the exception message cannot break the row.  Characters that cp1252
    cannot represent are replaced rather than raising while logging.
    """
    with open(path_post_id_data_error, "a", newline='', encoding="cp1252",
              errors="replace") as f_errors:
        csv.writer(f_errors, quoting=csv.QUOTE_ALL).writerow(
            [stage, str(post_id), title or "", url or "", str(error)]
        )


with open(path_reddit_data, "a", newline='', encoding="cp1252") as f:

    # set up output file
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    if bool_header:
        header = ["id", "url", "timestamp", "title", "selftext", "edited", "link_flair_text", "score", "num_comments", "upvote_ratio"]
        writer.writerow(header)

    # get data
    counter = 0             # total post count
    counter_success = 0     # count of posts successfully written
    counter_min = 0         # count of posts not meeting minimum score
    counter_ignore = 0      # count of posts with ignorable text
    counter_errors = 0      # count of posts that ran into an error
    for idx in ls_post_ids:

        # counter
        counter += 1  # i.e., counter is 1-indexed

        # Throttle between requests (0.1 hit rate limits).  Sleeping at the
        # top of the loop guarantees the pause also happens after iterations
        # that ended early via `continue` (errors, ignorable selftext).
        if counter > 1:
            time.sleep(0.2)

        # clean idx
        idx = idx.replace("\n", "")

        # log
        print(f"Total Post Count: {counter}")
        print(f"Time Elapsed (min): {round((time.time() - start_time)/60, 2)}")
        print(f"Target Post ID: {idx}")

        # get reddit instance: rebuild the client when the reported remaining
        # request budget is zero, or when the limits cannot be read at all
        try:
            needs_new_client = reddit.auth.limits['remaining'] == 0
        except Exception:
            needs_new_client = True
        if needs_new_client:
            reddit = _new_reddit_client()

        # get submission
        url = ""  # reset per post so the error log never shows a stale URL
        try:
            post = reddit.submission(idx)
            score = post.score
            url = getValueForCSV(post.url)

            # log
            print(f"Target Post Link: {url}")

        except Exception as e:
            # save errors for debugging
            counter_errors += 1
            _log_error("requesting", idx, "", url, e)

            # log
            if url:
                print(f"Target Post Link: {url}")
            else:
                print(f"Target Post Link: none found")
            print()  # newline for log
            continue

        # log
        print(reddit.auth.limits)

        # skip posts below the minimum score
        if score < threshold_score_minimum:
            counter_min += 1
            print("Result: minimum score threshold not met")
            print()  # newline for log
            continue

        selftext = getValueForCSV(post.selftext)

        # check selftext
        if selftext in ignorable_selftext:
            counter_ignore += 1
            print("Result: ignorable selftext")
            print()
            continue

        title = getValueForCSV(post.title)
        edited = post.edited
        num_comments = post.num_comments
        # naive UTC datetime, same value the deprecated utcfromtimestamp() gave
        created_utc = datetime.datetime.fromtimestamp(
            post.created_utc, tz=datetime.timezone.utc
        ).replace(tzinfo=None)
        upvote_ratio = post.upvote_ratio

        link_flair_text = getValueForCSV(post.link_flair_text)
        if not link_flair_text:
            link_flair_text = None

        line_stuff = [idx, url, created_utc, title, selftext, edited, link_flair_text, score, num_comments, upvote_ratio]

        # write; only count the post as a success once the row was written
        try:
            writer.writerow(line_stuff)
        except Exception as e:
            # save errors for debugging
            counter_errors += 1
            _log_error("writing", idx, title, url, e)
            print("Result: error in writing")
        else:
            counter_success += 1
            print("Result: successfully written")

        print()  # newline for log

# summary of the run
print(
    f"Done. total={counter} written={counter_success} "
    f"below_min_score={counter_min} ignored={counter_ignore} "
    f"errors={counter_errors}"
)