# Overview
* Get the list of post IDs still to scrape, excluding IDs that were already processed ("scrubbed") in earlier runs (written data, error files, logs)
* Split the remaining IDs into chunk files for multiprocessing

# Dependencies

In [None]:
# general
import glob
import math

# data
import pandas as pd

# Get All Post IDs

In [None]:
# load the post table; its 'selftext' and 'id' columns are used in the cells below
df_data_raw = pd.read_csv("./data/post_ids.csv")

In [None]:
# Remove posts whose selftext carries no usable content.
# pandas.read_csv turns empty fields into NaN by default, so the "" entry in
# gone_list never matches on its own; missing values are dropped explicitly.
gone_list = ["[deleted]", "[removed]", ""]
is_gone = df_data_raw['selftext'].isna() | df_data_raw['selftext'].isin(gone_list)
df_data_raw = df_data_raw[~is_gone]

In [None]:
# remove low scores
# NOTE: intentionally not done here, because a post's score can change over time

In [None]:
# pull the 'id' column of the filtered table out as a plain Python list
ls_all = df_data_raw['id'].tolist()

In [None]:
# number of candidate IDs before de-duplication
len(ls_all)

# Remove Dupes

In [None]:
# drop duplicate IDs (going through a set does not preserve the original order)
ls_all = list(set(ls_all))

In [None]:
# number of unique candidate IDs
len(ls_all)

# Get IDs from Successfully Written

In [None]:
# get all successfully written data: every CSV under ./data whose name
# contains "reddit_data"
ls_reddit_data = glob.glob("./data/*reddit_data*.csv")
ls_reddit_data  # display the matched paths

In [None]:
# Gather the 'id' column of every written-data file into one flat list.
# Duplicates are kept here; the unique-ing step at the bottom stays disabled.
ls_reddit_data_ids = []
for str_path_reddit_data in ls_reddit_data:
    df_temp = pd.read_csv(str_path_reddit_data, usecols=['id'], encoding="cp1252")
    ls_temp_ids = df_temp['id'].tolist()
    # extend in place: `a = a + b` would re-copy the whole accumulated list
    # once per file
    ls_reddit_data_ids.extend(ls_temp_ids)

    # log: "<path>, <number of ids read from it>"
    print(", ".join([str_path_reddit_data, str(len(ls_temp_ids))]))

# # keep unique ids
# ls_reddit_data_ids = list(set(ls_reddit_data_ids))

In [None]:
# total IDs read from the written-data files (duplicates included)
len(ls_reddit_data_ids)

# Get IDs from Errors

In [None]:
# get all error files: every CSV under ./data whose name contains "post_ids_error"
ls_errors = glob.glob("./data/*post_ids_error*.csv")
ls_errors  # display the matched paths

In [None]:
# Gather the IDs recorded in every error file into one flat list.
# The files are read without a header row; the second column (index 1) is
# taken as the ID. Duplicates are kept; the unique-ing step stays disabled.
ls_error_ids = []
for str_path_error in ls_errors:
    try:
        df_temp = pd.read_csv(str_path_error, usecols=[1], encoding="cp1252", header=None)
    except pd.errors.EmptyDataError:
        # a zero-byte error file has no columns to parse; treat it as
        # "no IDs" instead of aborting the whole run
        ls_temp_ids = []
    else:
        df_temp.columns = ['id']
        ls_temp_ids = df_temp['id'].tolist()
    # extend in place: `a = a + b` would re-copy the whole accumulated list
    # once per file
    ls_error_ids.extend(ls_temp_ids)

    # log: "<path>, <number of ids read from it>"
    print(", ".join([str_path_error, str(len(ls_temp_ids))]))

# # keep unique ids
# ls_error_ids = list(set(ls_error_ids))  # get unique ids

In [None]:
# total IDs read from the error files (duplicates included)
len(ls_error_ids)

# Get IDs from Logs

In [None]:
# get all logs: text files under ./data whose name starts with "log"
ls_logs = glob.glob("./data/log*.txt")
ls_logs  # display the matched paths

In [None]:
# Prefix that marks a log line carrying a post ID.
str_root_id_line = "Target Post ID: "

counter = 0

# Scan every log file and keep the text that follows the prefix on each
# matching line (with the prefix and the newline removed).
ls_log_ids = []
for str_path_log in ls_logs:
    with open(str_path_log, "r") as f_log:
        ids_in_this_log = [
            raw_line.replace(str_root_id_line, "").replace("\n", "")
            for raw_line in f_log
            if raw_line.startswith(str_root_id_line)
        ]

    ls_log_ids.extend(ids_in_this_log)
    counter_log = len(ids_in_this_log)

    # log: "<path>, <number of ids found in it>"
    print(f"{str_path_log}, {counter_log}")

# # keep unique ids
# ls_log_ids = list(set(ls_log_ids))

In [None]:
# total IDs found in the logs (duplicates included)
len(ls_log_ids)

# Combine All Scrubbed IDs

In [None]:
# Merge the IDs gathered from written data, error files and logs into one
# list, collapsing repeats by passing them through a set.
ls_scrubbed_ids = list(set([*ls_reddit_data_ids, *ls_error_ids, *ls_log_ids]))

In [None]:
# number of unique already-scrubbed IDs
len(ls_scrubbed_ids)

# Remove Scrubbed IDs

In [None]:
# Keep only the candidate IDs that do not appear among the scrubbed ones.
set_already_scrubbed = set(ls_scrubbed_ids)
ls_split_me = list(set(ls_all) - set_already_scrubbed)

In [None]:
# how many IDs remain after removing the scrubbed ones
num_total_post_ids = len(ls_split_me)
num_total_post_ids

In [None]:
# Upper bound on how many chunks the remaining IDs are split into.
num_chunks = 12
# Round up so that num_chunks chunks are always enough to hold every ID.
chunk_size = math.ceil(num_total_post_ids / num_chunks)
print(chunk_size)
# sanity check: total chunk capacity must cover all remaining IDs
print(num_total_post_ids <= chunk_size * num_chunks)

# Split

In [None]:
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices of at most ``n`` items.

    A chunk size below 1 is clamped to 1. The result is a generator; the
    last slice may be shorter than the others, and an empty ``l`` yields
    nothing.
    """
    step = max(1, n)
    starts = range(0, len(l), step)
    return (l[start:start + step] for start in starts)

In [None]:
%%time
# Write every chunk to its own file, one ID per line; the file name carries
# the chunk index zero-padded to two digits.
counter = 0
for chunk in chunks(ls_split_me, chunk_size):
    str_path_out = f'./data/post_ids_{str(counter).zfill(2)}.csv'
    with open(str_path_out, 'w', newline='') as f:
        f.write("".join(f"{item}\n" for item in chunk))
    counter += 1