# The following shows the steps for collecting comments from users with SRDD using the PullPush API.

- PullPush API for Reddit data search: https://pullpush.io/
- each user's comments are stored as a JSON file at the end

### Scraping Data

In [None]:
# From 2019 to 2023, each calendar month is queried separately to avoid hitting the API rate limit.
# The boundaries are generated from calendar dates instead of being typed by hand, so
# every window starts and ends exactly at midnight UTC on the first day of a month.
from datetime import datetime, timezone


def _month_start_epoch(year, month):
    """Return midnight UTC on the first day of ``year``-``month`` as an epoch-seconds string."""
    return str(int(datetime(year, month, 1, tzinfo=timezone.utc).timestamp()))


def _monthly_date_ranges(first_year, last_year):
    """Build contiguous ``(start, end)`` epoch-string pairs, one per calendar month.

    Each pair runs from the first day of a month to the first day of the
    following month, covering ``first_year`` through ``last_year`` inclusive.
    The end of one pair is always the start of the next, so no time is skipped.
    """
    ranges = []
    for year in range(first_year, last_year + 1):
        for month in range(1, 13):
            # December rolls over into January of the following year.
            if month == 12:
                next_year, next_month = year + 1, 1
            else:
                next_year, next_month = year, month + 1
            ranges.append(
                (
                    _month_start_epoch(year, month),
                    _month_start_epoch(next_year, next_month),
                )
            )
    return ranges


# 60 windows, from ("1546300800", "1548979200") for 2019-01
# to ("1701388800", "1704067200") for 2023-12.
date_ranges = _monthly_date_ranges(2019, 2023)


In [None]:
# Begin collection. Before this step, the author list is kept in a separate CSV file;
# it is loaded here into a DataFrame.
import pandas as pd

part_df = pd.read_csv(
    'authors_SRDD.csv',
    encoding='latin1',
)


In [None]:
# Preview the 'author' column of the loaded DataFrame (notebook cell output only).
part_df['author']

In [None]:
import requests
import time
import json
import os
from requests.exceptions import SSLError, ConnectionError

def fetch_data(author, start_epoch, end_epoch, retries=1, timeout=30, wait_time=5):
    """Fetch one author's comments within a time window from the PullPush API.

    Args:
        author: Reddit username, sent as the ``author`` query parameter.
        start_epoch: Lower time bound in Unix epoch seconds, sent as ``after``.
        end_epoch: Upper time bound in Unix epoch seconds, sent as ``before``.
        retries: Number of extra attempts made after the first one fails.
        timeout: Seconds to wait for the server before the attempt is
            treated as failed. Without it a stalled connection would block
            the whole collection run indefinitely.
        wait_time: Seconds to sleep between two attempts.

    Returns:
        The decoded JSON body of the first successful (HTTP 200) response,
        or ``None`` when every attempt failed.
    """
    url = "https://api.pullpush.io/reddit/search/comment/"
    params = {
        "author": author,
        "after": start_epoch,
        "before": end_epoch,
    }
    # NOTE(review): no page-size parameter is sent, so the API's default cap on
    # results per request applies; confirm a single month cannot exceed it.

    for attempt in range(retries + 1):
        try:
            response = requests.get(url, params=params, timeout=timeout)
        except (SSLError, ConnectionError, requests.exceptions.Timeout) as e:
            print(f"Connection error ({e}) for {author} from {start_epoch} to {end_epoch}, attempt {attempt + 1} of {retries + 1}")
        else:
            if response.status_code == 200:
                try:
                    return response.json()
                except ValueError as e:
                    # A 200 response with a body that is not valid JSON is
                    # treated like any other failed attempt instead of crashing.
                    print(f"Invalid JSON ({e}) for {author} from {start_epoch} to {end_epoch}")
            else:
                print(f"Error fetching data for {author} from {start_epoch} to {end_epoch}: {response.status_code}")

        # Only pause when another attempt will actually follow.
        if attempt < retries:
            print(f"Waiting for {wait_time} seconds before retrying...")
            time.sleep(wait_time)

    # All attempts are used up: report the failure and return None.
    print(f"Failed to fetch data for {author} from {start_epoch} to {end_epoch} after {retries + 1} attempts")
    return None

# Get all authors with SRDD. Rows with a missing author are dropped so that no
# request is made (and no file is written) for a NaN placeholder.
authors = part_df['author'].dropna().unique()

# Build the output directory from separate path components: the literal
# 'depressed\comment' contains the invalid escape sequence '\c' and only works
# as a nested path on Windows. Create the directory up front so that open() below
# cannot fail because it is missing.
output_dir = os.path.join('depressed', 'comment')
os.makedirs(output_dir, exist_ok=True)

for author in authors:
    merged_data = []
    print(f"Collecting for user: {author}")
    # Fetch data for each date range for the current author
    for start_epoch, end_epoch in date_ranges:
        data = fetch_data(author, start_epoch, end_epoch)
        if data is None:  # the request failed for this period; skip it
            continue
        if data.get("data"):
            merged_data.extend(data["data"])
        time.sleep(3)  # To avoid hitting rate limits

    # Save the merged data to a JSON file named after the author
    file_path = os.path.join(output_dir, f'{author}.json')
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(merged_data, f)
    print(f"Collection is done for user:{author}")


# Finally, the comments are stored as JSON files. Each author has one JSON file that includes all of their raw comment data from 2019 to 2023.

