# Web Crawler

In [1]:
#Import dependencies
import praw, os
from datetime import datetime, timedelta

In [None]:
# Reusable bits:

# Make safe for CSV format
def sanitize_text(text: str) -> str:
    """Return ``text`` made safe to embed as one field of a hand-written CSV row.

    The CSV files in this notebook are written without quoting, so any
    character that would start a new row or a new column has to go:

    * line feeds and carriage returns are removed (no replacement character
      is inserted, so the text on either side is joined directly);
    * commas are replaced with full stops.

    Args:
        text: The raw string (e.g. a post title or comment body).

    Returns:
        The sanitized string; an empty string is returned unchanged.
    """
    # Carriage returns must go as well as line feeds: text with Windows-style
    # or bare-CR line endings would otherwise still split a CSV row.
    return text.replace('\r', '').replace('\n', '').replace(',', '.')

# Function to search Reddit for posts between two given times

def search_reddit_between_times(phrase, start_time, finish_time):
    """Search r/all for ``phrase`` and collect posts created inside a time window.

    For every post whose creation time lies in ``[start_time, finish_time]``
    (both bounds inclusive), one row is recorded for the post itself followed
    by one row for each of its comments. Comments are NOT filtered by time:
    they are included whenever their parent post is inside the window, and
    their own creation time is stored in 'posttime'.

    Args:
        phrase: Query string passed to the subreddit search.
        start_time: Naive ``datetime`` lower bound for the post creation time.
        finish_time: Naive ``datetime`` upper bound for the post creation time.

    Returns:
        A list of dicts with the keys 'posttime' (``datetime``), 'title'
        (sanitized title of the post), 'message' (sanitized post self-text or
        comment body) and 'score'.

    NOTE(review): ``datetime.fromtimestamp`` converts ``created_utc`` to a
    naive datetime in the machine's local time zone, so the window bounds are
    effectively interpreted as local time -- confirm this is intended.
    """
    # Credentials can be supplied through the environment (the variable names
    # follow PRAW's own ``praw_<setting>`` convention). The literals are kept
    # only as a fallback so existing runs keep working.
    # NOTE(review): a client secret is committed in source here -- it should
    # be rotated and the fallback removed.
    reddit = praw.Reddit(
        client_id=os.environ.get('praw_client_id', 'q5pBP7wJOg4ce_P9sOkyXA'),
        client_secret=os.environ.get('praw_client_secret', 'oD61dQ0l20WEPRqP35GqshO8AdZz0g'),
        redirect_uri='http://localhost:8000',
        user_agent='AdaAnalyticsBot',
        read_only=True,
        check_for_async=False
    )

    # 'all' is the r/all pseudo-subreddit, i.e. the search is site-wide.
    subreddit = reddit.subreddit('all')

    # The search itself cannot be restricted to an arbitrary time range, so
    # everything it returns is fetched and filtered by creation time below.
    posts = subreddit.search(
        query=phrase,
        sort=None,  # other useful sort modes: relevance, hot, top, new, comments
        syntax=None,  # can be one of: "cloudsearch", "lucene", or "plain"
        limit=None,
        time_filter='all'  # 'hour', 'day', 'week', 'month', 'year', or 'all'
    )

    results = []
    for post in posts:
        post_time = datetime.fromtimestamp(post.created_utc)
        if not (start_time <= post_time <= finish_time):
            continue

        # The title is shared by the post row and all of its comment rows,
        # so sanitize it once per post.
        title = sanitize_text(post.title)
        results.append({
            'posttime': post_time,
            'title': title,
            'message': sanitize_text(post.selftext),
            'score': post.score
        })

        for comment in post.comments.list():
            # The flattened comment list can hold objects that are not real
            # comments (presumably unexpanded "load more" placeholders);
            # those have no body/score to record, so skip them.
            if not isinstance(comment, praw.models.Comment):
                continue
            results.append({
                'posttime': datetime.fromtimestamp(comment.created_utc),
                'title': title,
                'message': sanitize_text(comment.body),
                'score': comment.score
            })

    return results

In [None]:
# As Needed Crawler:

# Search Reddit for the phrase over a range of days, one calendar day at a time.
# Each day gets its own .csv file containing the matching posts and their
# comments, together with the score each one received.
search_phrase = "bitcoin"
search_start_time = log_start_time = datetime(2023, 3, 20, 0, 0, 0)  # midnight at the start of the first day
search_end_time = log_finish_time = datetime(2023, 3, 24, 23, 59, 59)  # 23:59:59 on the last day
interval = timedelta(days=1)
current_day = search_start_time

print(f"searching Reddit for: {search_phrase}")
while current_day <= search_end_time:
    year, month, day = current_day.year, current_day.month, current_day.day
    log_start_time = datetime(year, month, day, 0, 0, 0)  # start the daily log at midnight
    log_finish_time = datetime(year, month, day, 23, 59, 59)  # end at 23:59:59 of the same day

    # Collect the posts (and their comments) for the search phrase whose post
    # creation time falls inside this day's window.
    results = search_reddit_between_times(search_phrase, log_start_time, log_finish_time)

    # Create a new file for each day and write the rows to it.
    filename = f"{current_day.strftime('%Y-%m-%d')}.csv"
    print(f"Results: {len(results)}. Creating {filename}")
    # Reddit text routinely contains emoji and other non-ASCII characters, so
    # the encoding is fixed to UTF-8 instead of relying on the platform default
    # (which can raise UnicodeEncodeError, e.g. on Windows).
    with open(filename, "w", encoding="utf-8") as f:
        f.write("post_time,title,post,upvote_score\n")  # column labels on the first line
        for result in results:
            f.write(f'{result["posttime"]},{result["title"]},{result["message"]},{result["score"]}\n')

    current_day += interval

searching Reddit for: bitcoin
Results: 2146. Creating 2023-03-20.csv
Results: 1832. Creating 2023-03-21.csv
Results: 2879. Creating 2023-03-22.csv
Results: 2069. Creating 2023-03-23.csv
Results: 1318. Creating 2023-03-24.csv


# New Section

# New Section

# To Do
## Main Project Implementation
* ~~Get SQL server running on Google Cloud~~ Completed
* ~~Get Access to Reddit API~~ Completed 23Feb2023
* Implement searching Reddit for target phrases, returning last x number of messages
* Implement searching Reddit for a target phrase over a defined time range (e.g., the last hour)
* Implement sentiment analysis on returned messages (Currently: Gensim word2vec, "movie reviews" training set)
* Consider other useful targets for insight (word cloud, trending posts?)
* Design SQL database data strategy (Consider what variables are important to other projects. Pay extra attention to the size of each table, as SQL performance tends to degrade with large tables. If performance is poor under heavy load, we might have to consider an alternative such as MongoDB. CloudSQL settings can be tweaked from Cloud Console for more robustness; currently in low-resource development mode as of 24Feb2023.)
* Implement SQL writes
* Implement scheduled recurring jobs (cloud VM, cron job, Docker)
## Quality Testing
* Test dashboards (Preference for Voila because it can be reused for user web dashboards and integrates well with Jupyter/Colab. Could also use R/Pandas, Python/matplotlib).

# Additional Resources

## Reddit API

* PRAW (Python Reddit API Wrapper) documentation: [https://praw.readthedocs.io/en/stable/index.html](https://praw.readthedocs.io/en/stable/index.html)

* Reddit API Wiki: [https://www.reddit.com/wiki/api/](https://www.reddit.com/wiki/api/)

## Running the Code on a Schedule (for Production)
* AppEngine: [https://cloud.google.com/appengine/docs/python](https://cloud.google.com/appengine/docs/python)

* Running Jupyter as a cron job on a Linux VM (could also just run a .py as a cron job for less overhead): [https://towardsdatascience.com/running-jupyter-notebook-in-google-cloud-platform-in-15-min-61e16da34d52](https://towardsdatascience.com/running-jupyter-notebook-in-google-cloud-platform-in-15-min-61e16da34d52)

