# Scraping Facebook page posts data

The purpose of this notebook is to retrieve post data from a Facebook fan page through the Graph API.

*Reference: https://developers.facebook.com/docs/graph-api/reference/post/*

In [1]:
#!pip install facebook-sdk

In [2]:
import facebook
import pandas as pd

In [3]:
# --- credentials and target page -------------------------------------------
TOKEN = 'YOUR ACCESS TOKEN'  # placeholder: put your own access token here
PAGE_NAME = 'OranjeExpress.org'

# --- request defaults -------------------------------------------------------
# number of posts asked for per API call (max value=100; see reference)
DEFAULT_NR_LIMIT = 100

# post fields requested from the API
DEFAULT_FIELDS = ['post_id', 'created_time', 'message', 'is_popular']

In [4]:
def getPostData (TOKEN, PAGE_NAME, FIELDS=DEFAULT_FIELDS, NR_LIMIT=DEFAULT_NR_LIMIT):
    '''
    Get all posts of a Facebook page through the Graph API.

    Parameters
    ----------
    TOKEN : str
        access token handed to `facebook.GraphAPI`
    PAGE_NAME : str
        page name that is first resolved to the page id
    FIELDS : list of str
        post fields to request; joined into one comma-separated query string
    NR_LIMIT : int
        number of posts requested per API call

    Returns
    -------
    list
        the `data` entries of every result page, concatenated in the order
        the API returned them
    '''
    from urllib.parse import parse_qs, urlparse

    graph = facebook.GraphAPI(access_token = TOKEN)
    # the Graph API query parameter is `fields` (plural)
    pageId = graph.get_object(PAGE_NAME, fields='id')['id']
    print("The Id of page \'%s\' is %s." % (PAGE_NAME, pageId))

    fieldQry = ','.join(FIELDS) # create a query string of combined fields
    data = []                   # collected posts of all result pages
    cursor = None               # `after` cursor; None for the first request

    while True:
        query = dict(id=pageId, connection_name='posts',
                     limit=NR_LIMIT, fields=fieldQry)
        if cursor is not None:
            query['after'] = cursor
        posts = graph.get_connections(**query)
        data += posts.get('data', [])

        # a response without a `paging` block or without `next` is the last one
        paging = posts.get('paging') or {}
        if 'next' not in paging:
            break

        # Read the cursor from the parsed query string of the `next` URL, so
        # that parameters following `after=` and percent-encoding do not end
        # up inside the cursor value; fall back to the `cursors` entry.
        afterValues = parse_qs(urlparse(paging['next']).query).get('after')
        nextCursor = afterValues[0] if afterValues else (paging.get('cursors') or {}).get('after')

        # no usable cursor, or the same one again: stop instead of looping forever
        if not nextCursor or nextCursor == cursor:
            break
        cursor = nextCursor

    print('Completed! Total %i posts scrapped ^^!' % len(data))

    return data

In [5]:
# scrape every post of the configured page (default fields and page size)
d = getPostData(TOKEN=TOKEN, PAGE_NAME=PAGE_NAME)

The Id of page 'OranjeExpress.org' is 225292010874776.
Completed! Total 1544 posts scrapped ^^!


In [6]:
# turn the scraped records into a DataFrame indexed by post id
df = pd.DataFrame(d).set_index('id')

# peek at five randomly chosen rows
df.sample(5)

Unnamed: 0_level_0,created_time,message,is_popular
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
225292010874776_947143772022926,2016-01-13T01:11:14+0000,【#荷式社會文化】菁英會議與殖民遺緒--荷蘭聯合國教科文組織國際專家會議會外篇\n作者：王舒...,False
225292010874776_924122320991738,2015-12-01T08:04:10+0000,【#荷事分享】\n\n前幾個月台灣行腳節目三立“愛玩客”，以名廚詹姆士掛帥來荷蘭拍攝，荷事生...,False
225292010874776_696375167099789,2014-09-09T01:05:00+0000,[荷式人文＆藝術]\n藝術家紀柏豪在荷蘭V2_的獨白\n作者：紀柏豪\n\n 今年獲得國立台...,False
225292010874776_1826896394047655,2018-04-07T07:00:00+0000,【#荷事分享】\n連星期日都還不到就開始犯收假厭世症了嗎？表示該換工作啦～～～有想過到歐洲工...,False
225292010874776_266613616742615,2012-02-06T16:36:24+0000,[荷式在角落 Dutchness]\n氣候變遷劇烈，荷蘭今年的嚴冬直到二月才開始。你已經開始...,False


In [7]:
# write the posts to csv ('utf_8_sig' prepends a BOM to the file)
df.to_csv(path_or_buf='./data/fb-posts.csv', encoding='utf_8_sig')

### Cleanup & hashtag assignment

In [1]:
import pandas as pd
import re
import random
# Seeds the generator of Python's built-in `random` module.
# NOTE(review): pandas `DataFrame.sample` is controlled by its own
# `random_state` argument, not by this module - confirm whether this seed
# achieves the intended reproducibility of the sampled rows.
random.seed(42) # assign seed for reproducibility

In [2]:
# read the scraped posts back from disk
fbPosts = pd.read_csv(filepath_or_buffer='./data/fb-posts.csv', encoding='utf_8_sig')

In [3]:
# show the first five rows
fbPosts.head(n=5)

Unnamed: 0,id,created_time,message,is_popular
0,225292010874776_4572366496167284,2020-12-07T07:30:09+0000,【#荷式社會文化】買棵🌲真的聖誕樹🌲放客廳吧！\n \n聖尼古拉斯節（Sinterklaas...,False
1,225292010874776_4651913241545942,2020-12-05T15:41:02+0000,【荷式社會文化】兒歌教學：整個世界都在聖尼古拉斯的禮物袋裡！\n\n今天（12月5日）是荷蘭...,False
2,225292010874776_4642138352523431,2020-12-04T08:00:15+0000,【#荷式社會文化】讓我們一起聽聽唱唱2000首歌，歡送這爛透了的2020年\n \n聖誕將至...,False
3,225292010874776_4635689033168363,2020-12-02T07:57:35+0000,【#荷蘭職場】荷蘭跨國科技業PM實習：面試準備＆實戰經驗分享\n作者：楊雅淳 Iris Ya...,False
4,225292010874776_4631485426922057,2020-12-01T07:09:00+0000,【#荷式時事】全國性口罩政策本日上路\n\n荷蘭的口罩措施於今天（12月1日）起生效：年滿1...,False


In [4]:
# || ASSIGN PARAMETERS

# all styles of bracket used to indicate hashtag/keyword
# Each entry is [opening, closing], written as regular-expression fragments.
# Raw strings are used for the ASCII brackets so that `\[` reaches the regex
# engine as written instead of being an invalid string escape sequence.
BRACKETS = [
    ["【", "】"],
    ["［", "］"],
    ["「", "」"],
    ["『", "』"],
    [r"\[", r"\]"],
    [r"\[-", r"-\]"],
    [r"\[\[", r"\]\]"]
]

# keywords indicating that the post is about a photography work
PHOTO_KEYWORDS = [ "攝影", "credit", "Credit" ]

In [5]:
def getHashtagFromMsg(msg):
    '''
    returns the hashtag/keyword of a given message
    for regex syntax, see https://regexr.com/41vj9

    Only the first line of the message is inspected. Rules, in order:
      1. text enclosed by one of the BRACKETS pairs -> that text
      2. first line contains one of PHOTO_KEYWORDS  -> "荷式攝影"
      3. otherwise                                  -> the first line itself
    A leading `#` and surrounding whitespace are removed from the result.

    input:  msg, string; any other value (e.g. NaN of a post without text)
            skips rule 1 and is handled through its `str()` form
    output: hashtag, string
    '''
    isText = isinstance(msg, str)
    firstLine = msg.split("\n")[0] if isText else msg

    hashtag = None
    if isText:
        # Try the longest opening delimiter first: "[[" and "[-" both start
        # with "[", so the plain "[" pair would otherwise always win and leave
        # part of the delimiter inside the hashtag.
        pairs = sorted(BRACKETS, key=lambda b: len(b[0]), reverse=True)
        # one capture group per bracket style holding only the enclosed text
        searchQry = "|".join("%s(.*?)%s" % (b[0], b[1]) for b in pairs)
        found = re.search(searchQry, firstLine)
        if found is not None:
            # exactly one alternative matched; its group is the one not None
            hashtag = next(g for g in found.groups() if g is not None).strip()

    if hashtag is None:
        # check if the message is about a photography post
        if any(kw in str(firstLine) for kw in PHOTO_KEYWORDS):
            hashtag = "荷式攝影"
        else:
            # return the first line itself when none of above rules matched
            hashtag = str(firstLine)

    # remove the leading `#` if present
    if hashtag.startswith("#"):
        hashtag = hashtag[1:]

    return hashtag.strip()

In [6]:
# derive the hashtag of every post from its message text
fbPosts["hashtag"] = fbPosts["message"].apply(getHashtagFromMsg)

In [7]:
# get a glance of the dataset (randomly select five rows)
# `random_state` fixes the selection: DataFrame.sample does not read the seed
# of Python's `random` module, so without it every run shows different rows
fbPosts.sample(5, random_state=42)

Unnamed: 0,id,created_time,message,is_popular,hashtag
850,225292010874776_1060406787363290,2016-07-01T09:48:38+0000,【#荷式環境科學】 本文由 眼底城事 eyes on place 授權轉載\n作者：punk...,False,荷式環境科學
817,225292010874776_1108482622555706,2016-08-25T08:06:14+0000,【#荷事分享】\n好點子就是要傳播出去，過去在荷事生非露出的文章，現在被台灣媒體拍成動態影像...,False,荷事分享
816,225292010874776_1109522552451713,2016-08-26T10:00:00+0000,【#荷式人文藝術- #轉角遇到art】作者：QB Hong\n提到street art，應該...,False,荷式人文藝術- #轉角遇到art
995,225292010874776_915850061818964,2015-11-13T01:13:58+0000,[公告] \n\n預定每月第二週五的醬子專欄，因過度食用伴手禮，導致無法準時交稿，將推延至週...,False,公告
1296,225292010874776_632683416802298,2014-05-10T09:05:17+0000,[荷式社會&文化]\nGa naar buiten!\n重視生活、鼓勵嘗試探索的荷式教育\n...,False,荷式社會&文化


In [8]:
# write the posts plus hashtag column to csv, without the row index
fbPosts.to_csv(path_or_buf='./data/fb-posts-hashtag.csv', index=False, encoding='utf_8_sig')

#### Categorise posts by retrieved hashtag

In [9]:
import pandas as pd
import random
# Seeds the generator of Python's built-in `random` module.
# NOTE(review): pandas `DataFrame.sample` is controlled by its own
# `random_state` argument, not by this module - confirm whether this seed
# achieves the intended reproducibility of the sampled rows.
random.seed(42) # assign seed for reproducibility

In [10]:
# read the posts with their hashtag column back from disk
fbPosts = pd.read_csv(filepath_or_buffer='./data/fb-posts-hashtag.csv', encoding='utf_8_sig')

In [11]:
# category -> keywords lookup used to categorise a hashtag
# entries are listed in order of priority of assignment (first one wins)
DICT_CATEGORY = {
    "社會&文化": ["社會", "文化", "歷史", "政治"],
    "建築&設計": ["建築", "設計"],
    "人文&藝術": ["人文", "藝術"],
    "環境&科學": ["環境", "科學"],
    "吃喝&玩樂": ["吃喝", "玩樂", "食"],
    "留學荷蘭": ["留學"],
    "街拍543": ["攝影"],
}

In [12]:
def mapHashtagToCategory(hashtag):
    '''
    Assign a category to a hashtag by keyword lookup in DICT_CATEGORY.

    input:  hashtag, string (any other value is compared via its str() form)
    output: tag of category, string; the first category in DICT_CATEGORY
            having a keyword contained in the hashtag, "others" when no
            keyword matches
    '''
    text = str(hashtag)
    # dict order is the priority order, so the first hit is the answer
    for label, keywords in DICT_CATEGORY.items():
        for kw in keywords:
            if kw in text:
                return label
    return "others"

In [13]:
# map every hashtag to its category
fbPosts["category"] = fbPosts["hashtag"].apply(mapHashtagToCategory)

In [14]:
# get a glance of the dataset (randomly select five rows)
# `random_state` fixes the selection: DataFrame.sample does not read the seed
# of Python's `random` module, so without it every run shows different rows
fbPosts.sample(5, random_state=42)

Unnamed: 0,id,created_time,message,is_popular,hashtag,category
561,225292010874776_1736195529784409,2018-01-29T00:30:00+0000,【荷事分享】荷蘭高等教育吸引人的秘密?!\n\n荷蘭高等教育吸引世界的人才、並以健全的法制鼓...,False,荷事分享,others
1401,225292010874776_572500889487218,2013-12-27T11:04:24+0000,[荷式環境&科學]\n嗅覺溝通\n作者：Norman\n\n根據稍早之前荷蘭烏特勒支大學（U...,False,荷式環境&科學,環境&科學
1260,225292010874776_661280307275942,2014-07-05T23:17:48+0000,[荷式爽點] 橘獅子世足\n\n恭喜荷蘭終於破碎 penalty 迷咒，下一戰前四強v.s ...,False,荷式爽點,others
771,225292010874776_1197654550305179,2016-11-18T10:00:00+0000,【#荷式環境科學】作者：張焜傑 Kim Chang\n\n荷蘭埃因霍芬科技園區中的霍斯特中心...,False,荷式環境科學,環境&科學
1305,225292010874776_626687094068597,2014-04-27T13:04:40+0000,[荷式吃喝＆玩樂] 國王節報導--愛荷蘭左擁右抱專文合作\n歡慶百年來第一個國王節--創意不...,False,荷式吃喝＆玩樂,吃喝&玩樂


In [15]:
# write the categorised posts to csv, without the row index
fbPosts.to_csv(path_or_buf='./data/fb-posts-hashtagX.csv', index=False, encoding='utf_8_sig')