In [1]:
from facebook_scraper import get_posts
import json
import pandas as pd

## Scrape comments

In [2]:
def scrape_comments(fb_group_list, pages, cookies_file, options, words_list):
    """
    Made this dataframe based on: https://github.com/kevinzg/facebook-scraper
    
    Args:
        fb_group_list(str, list): a list of Facebook pages to scrape, must be public
        pages(int): number of pages to collect
        cookies_file(str): file location of cookies data
        options(dict): dictionary of options for collection, dictionary options can be found in the link above
        words_list(str, list): list of words to filter posts by
        
    Returns:
        DataFrame: a DataFrame containing comments and associated information from posts scraped. Posts scraped are gathered from 'fb_group_list' posts that 
                   that contain words inputted in 'words_list'
    """
    # If content in the 'comment_text' column is the commenters name it is because they commented with just a picture and no text
    post_list = []
    df_comments = pd.DataFrame()
    for fb_group in fb_group_list:
        for post in get_posts(fb_group, pages = pages, cookies = cookies_file, options = options):
            post['text'] = post['text'].split('\n',1)[0]
            words = words_list
            comments_list = []
            replies_list = []
            
            if any(word in post['text'] for word in words) == True:
                print(post['text']+'\n')
                post_list.append(post)
                x = 0
                
                while x < len(post_list):
                    #print(post['text'])
                    comments_list = post_list[x]['comments_full']
                    x+=1
                    
                    if comments_list != []:
                        i = 0
                        
                        while i < len(comments_list):
                            comments_list[i]['comment_text'] = comments_list[i]['comment_text'].split('\n',1)[0]
                            replies_list = comments_list[i]['replies']
                            i+=1
                            
                            if replies_list != []:
                                z = 0
                                
                                while z < len(replies_list):
                                    replies_list[z]['comment_text'] = replies_list[z]['comment_text'].split('\n',1)[0]
                                    z+=1
                            
            df_comments = df_comments.append(comments_list)
        return df_comments

In [3]:
# Initialize function parameters 
# This was edited 
all_groups = ['alaska.dhss', 'ancpublichealth', 'FairbanksMemorialHospital', 'voiceofcph', 'juneauempire', 'fairbanksDNM', 'alaskapublic',
              'frontiersman', 'HomerNews', 'sitkasentinel', 'ktvf.fairbanks', 'peninsulaclarion', 'KSRM920AM', 'KYUK-358157087544895'
                'NomeNuggetNews', 'KRBD-FM-Rainbird-Community-Radio-158267229179', 'thearcticsounder', 'AnchorageSchoolDistrict', 'fsdk12', 'kpbsd',
                'akmatsuk12', 'cbjuneau', 'MatSuBorough', 'FNSBGov', 'Kenai-Peninsula-Borough-Local-News-111407724460354', 'COBarrow']
# I couldn't find the page name for Bethel School District'
# there were a lot of hits, but just know that it was on the list and should be included when we find it
cookies = ('facebook.com_cookies.txt')

## Scrape posts
###

### The way I'm calling this here, I'm grabbing comments also. I have a dataframe with only URLs, but Christian's grabbing comments totally fine, so I might be unlucky, and we can all just try a notebook that assumes smooth sailing. Worst case scenario, we know that we won't have a choice and we'll need to break it up into two parts (scraping for post URLs and then scraping comments from them)

In [4]:
# Only collects posts
import time
from datetime import datetime
import random as rd
import math

'''
This will take in a list of fb pages and fill in a huge dataframe with as many
scrapped comments as it can before an error occurs. It creates backup dataframes of each group as it reaches
the end of scrapping a page, even if the full scrape was unsuccessful.

The backup dataframes are 'complete', 'concise', and 'incomplete'. Concise pkl files should be just what we need: (post_url, 
comment data) but I don't want to miss a field that happens to be important later, hence creating the 'complete' files. 
Those will have everything the concise files have, but lots of extra fields you'd have to cut off with pandas.
It's easy to just drop what we don't need later but a huge pain to have to scrape again for something we missed, so I 
thought it was best to make records just in case.

Warnings get thrown sometimes and they will clog up the output, but if a real exception occurs, the dataframe being collected
gets stored in its partially completed state. That dataframe goes to a file that marks it as 'incomplete'. There are print
statements for when that kind of stuff happens, because depending on how close it was, it might not be a priority to scrape the
rest. At the very least we'll know how big a problem that page is.

At the end, a message says that it's exiting and pickling all the data it could gather, so just scroll up the previous
output and see if the last scraped post from a page was like from Oct. 2021, because then we'll know we need to revisit it. 
The returned dataframe includes everything we could get, incomplete pages and all.

There's a lot here, but at it's core this is really simple. The most complicated part about it is honestly the print 
statements and the fact that I use two libaries to calculate time differences (could be cleaned up). The progress type print 
statements with time info are helpful but sometimes phrased backwards or simply wrong. 
That's cosmetic and those statements are just a quality of life thing but they should be revised- I just wanted something
to get out to you guys, and once I was sure it worked I didn't want to stop the process to edit them again. 
The important parts seem to be working fine.

Though this takes a long time to run, but because it produces dataframes while it scrapes, you will have data to work with
as it's scraping- just open another notebook and read the newly pickled file. As it finishes pages, you can append those 
pickled files to make an even bigger dataframe, which will be useful for things like making sure the keywords we have 
actually filter and other nlp stuff.

I tested this, but it takes it takes a long time to test. Suffice it to say that, to the best of
my knowledge, this works fine, but uh, your mileage may vary. If there's something the matter with it, please say something.
I just need to wanted to have something out so we could just start.
'''



def scrape_posts(fb_group_list, pages, cookies_file, options, post_list, only_comments):
    start = time.time()
    df_posts = pd.DataFrame()
    START_DATE = datetime(2020, 3, 1) ## date we don't want to scrape past
    now = begin = datetime.now()
    delta = (now - START_DATE).days ## a variable needed for a print statement
    print("time at beginning is: ", now.strftime("%H:%M:%S"))
    j = 0
    for j in range(len(fb_group_list)):
        specific_group = [] ## an empty list for each new group. Will populate a specific dataframe in case not all pages scrapped successfully
        i = 0
        fb_group = fb_group_list[j]
        try:
            for post in get_posts(fb_group, pages = pages, cookies = cookies_file, options = options):
                time.sleep(rd.randint(3,7))
                i +=1
                ## condition triggered if the next scrapped post goes past our target date
                ## this creates a full dataframe with every field, and a concise dataframe that's probably easier
                ## to immediately read and play around with.
                if (START_DATE - post["time"]).days > 0:
                    print(f'\nSuccessfully scrapped comments for {fb_group}from today til start date')
                    df = pd.DataFrame() 
                    df = df.append(specific_group) ## append only this group
                    df.to_pickle(f"{fb_group}_COMPLETE.pkl")
                    print(f"A backup of the full dataframe produced was put in this directory with the name {fb_group}_COMPLETE.pkl")
                    concise = df[['post_id', 'post_url', 'time', 'comments_full']]
                    concise.to_pickle(f"{fb_group}_CONCISE.pkl")
                    print(f"A backup concise dataframe produced was put in this directory with the name {fb_group}_CONCISE.pkl")
                    print('Be sure to check your directory to make sure that these pickled files exist')
                    print('This function may not be also returns a complete dataframe at the end')
                    now = datetime.now()
                    print(f"total time scrapping so far is at {now - begin}")
                    j += 1 ## update variable so we get the next group
                    break ## break this for loop so we can move to the next group
                    
                post_list.append(post) ## as json responses are sourced, they're kept track of even when the group changes
                specific_group.append(post) ## this keeps track of json responses per group and is reset when the group changes
                only_comments.append(post["comments_full"]) ## uncomment if comments=True
                print('-', end='')
                if i % 50 == 0:
                    print() ## testing for try/except
                    progress = datetime.now()
                    progress = progress.strftime("%H:%M:%S")
                    print(f'post {i} collected at {progress}')
                    print(f'post datetime at {post["time"]}')
                    end = time.time()
                    print(f"Getting posts {i + 1 -50 } - {i} took {(end - start)//60} minutes and {round((end - start)% 60, 2)} seconds")
                    print(f'{delta - (post["time"] - START_DATE).days} days scraped out of {delta} days for {fb_group}')
                if i % 500 == 0:
                    print(f'this week in history brought to you by #{i}:')
                    print(post['text'].split('\n',1)[0])
                    end = time.time()
                    print(f"Getting posts {i + 1 -500 } - {i} for page {fb_group} took {(end - start)//60} minutes and {round((end - start)% 60, 2)} seconds")
                    start = time.time()
                    now = datetime.now()
                    str_now = now.strftime("%H:%M:%S")
                    print("time now is at: ", str_now)
                if i % 1200 == 0:
                    print('waiting 2 minutes for safety')
                    time.sleep(120)
        except:
            progress = datetime.now()
            progress = progress.strftime("%H:%M:%S")
            print("error thrown at: ", progress)
            df = pd.DataFrame()
            df = df.append(specific_group)
            most_recent = df['time'].iloc[-1]
            df.to_pickle(f"{fb_group}_INCOMPLETE.pkl")
            print(f'\nError thrown while scrapping comments for {fb_group}')
            print(f'\nLast recieved post was {fb_group} at time {most_recent}')
            print(f'(Depending on how far this scrape got, this might be complete enough)')
            print(f"A backup of the incomplete dataframe produced was put in this directory with the name {fb_group}_INCOMPLETE.pkl")
            print('A dataframe with all data collected will still be returned by this function.')
            if j + 1 <= len(fb_group_list):
                print('Waiting for 5 minutes before proceeding with the next group so Mark get\'s busy with something else')
                j += 1
                time.sleep(300)
            
    str_now = begin.strftime("%m_%d")
    df_posts = df_posts.append(post_list) ## appending the list of every comment to a main dataframe
    df_posts.to_pickle(f"results_total_{str_now}.pkl")
    print()
    print(f'Exiting function. A dataframe with all available comments is pickled at results_total_{str_now}.pkl')
    print("while you're thinking of it, just upload that complete df to the discord or email it to yourself!")
    return df_posts

In [5]:
#Initialize function parameters 
## done list AlaskasNewsSource, MatSuRegionalMedicalCenter, anchoragedailynews
## close but no cigar alaskapublic (nov 2020)
fb_group_list = all_groups[:7]
cookies = ('facebook.com_cookies.txt')

In [None]:
post_list = []
comments = [] ## Not used right now in my implementation, but will be used when 'comments' are set to True
posts = scrape_posts(fb_group_list, 100000, cookies, {"comments": True, "reactions":True, "allow_extra_requests": True}, post_list, comments)

time at beginning is:  05:41:33




--------------



------------------------------------
post 50 collected at 05:56:17
post datetime at 2022-02-05 10:38:00
Getting posts 1 - 50 took 14.0 minutes and 44.41 seconds
19 days scraped out of 725 days for alaska.dhss
------



-----



---



------



-----



-------------



------------
post 100 collected at 06:14:13
post datetime at 2022-01-22 23:44:53
Getting posts 51 - 100 took 32.0 minutes and 40.66 seconds
33 days scraped out of 725 days for alaska.dhss
--



------



-



----



-





-----



-------



----



-



---



-



--



--



-----------
post 150 collected at 06:34:44
post datetime at 2022-01-05 17:19:33
Getting posts 101 - 150 took 53.0 minutes and 11.69 seconds
50 days scraped out of 725 days for alaska.dhss
--------------------------------------------------
post 200 collected at 06:49:16
post datetime at 2021-12-18 17:10:34
Getting posts 151 - 200 took 67.0 minutes and 43.14 seconds
68 days scraped out of 725 days for alaska.dhss
--------------------------------------------------
post 250 collected at 07:04:13
post datetime at 2021-12-03 18:55:17
Getting posts 201 - 250 took 82.0 minutes and 40.09 seconds
83 days scraped out of 725 days for alaska.dhss
--------------------------------------------------
post 300 collected at 07:18:13
post datetime at 2021-11-22 18:52:04
Getting posts 251 - 300 took 96.0 minutes and 40.75 seconds
94 days scraped out of 725 days for alaska.dhss
--------------------------------------------------
post 350 collected at 07:35:24
post datetime at 2021-11-11 12:47:01
Getting post



-



---------
post 550 collected at 09:21:03
post datetime at 2021-09-23 19:26:46
Getting posts 501 - 550 took 37.0 minutes and 51.52 seconds
154 days scraped out of 725 days for alaska.dhss
--------------------------------------------------
post 600 collected at 09:54:01
post datetime at 2021-09-10 20:12:02
Getting posts 551 - 600 took 70.0 minutes and 49.2 seconds
167 days scraped out of 725 days for alaska.dhss
------------------------



-



--



-



--



--



--



-



--





-



-



--



-



-



-



---





-



-



-
post 650 collected at 10:24:49
post datetime at 2021-08-30 23:50:17
Getting posts 601 - 650 took 101.0 minutes and 37.53 seconds
178 days scraped out of 725 days for alaska.dhss




-



---



----------------------------------------------
post 700 collected at 11:03:01
post datetime at 2021-08-17 15:30:27
Getting posts 651 - 700 took 139.0 minutes and 49.44 seconds
191 days scraped out of 725 days for alaska.dhss
-------------------------------------------