## reference
 * [偷偷說爬蟲by Go](https://city.shaform.com/zh/2019/01/11/plurk-crawler/) 
 * [Requests Docs](https://2.python-requests.org/en/master/api)
 * [bs4 Docs](https://www.crummy.com/software/BeautifulSoup/bs4/doc) 
 * [bs4 simple intro](https://blog.gtwang.org/programming/python-beautiful-soup-module-scrape-web-pages-tutorial/2/)
 * [Plurk API 2.0](https://www.plurk.com/API)

In [2]:
# import packages
from __future__ import print_function

from mdutils.mdutils import MdUtils # help to create a markdown file
from bs4 import BeautifulSoup as bs # for html parsing
import requests as rq
import slimit
from slimit.parser import Parser
from slimit.visitors.nodevisitor import ASTVisitor

### 0803 - testing 
> Small goal: take a Plurk link as input and produce a Markdown file from it

In [3]:
"""
" A Visitor inheritate slimit.visitors.ASTVisitor.
" To traverse the parse tree and transform to python dictionary
"""
class JSVisitor(ASTVisitor):
    """Visitor that copies JavaScript object-literal properties into a dict.

    Every property met while traversing the parse tree is stored in the
    dictionary given at construction time (exposed as ``self.json``).
    Properties whose value cannot be converted are reported and skipped.
    """

    def __init__(self, dic):
        """Remember ``dic`` as the dictionary that receives the properties."""
        super(JSVisitor, self).__init__()
        self.json = dic

    def visit_Object(self, node):
        """Store each property of an object-literal node, then descend.

        The children of every property are still visited, so properties of
        nested object literals end up in the same flat dictionary.
        """
        for prop in node:
            left, right = prop.left, prop.right
            # Only quoted keys carry surrounding quotes; identifier or numeric
            # keys must be kept whole.
            if isinstance(left, slimit.ast.String):
                key = left.value[1:-1]
            else:
                key = left.value
            try:
                value = self.GetValue(right)
            except ValueError as e:
                # Unsupported value: report it and leave the key unset rather
                # than storing a stale value from a previous property.
                print("ValueError: {}".format(str(e)))
            else:
                self.json[key] = value
            # visit all children in turn
            self.visit(prop)

    @staticmethod
    def _parse_number(text):
        """Convert a JavaScript numeric literal to ``int`` or ``float``.

        Raises ``ValueError`` when ``text`` is not a recognised number.
        """
        try:
            return int(text)
        except ValueError:
            pass
        if text[:2].lower() == "0x":
            return int(text, 16)
        return float(text)

    def GetValue(self, node):
        """Return the Python value corresponding to an AST value node.

        Supported nodes: ``NewExpr`` (returned as the marker string
        ``"NewExpr"``), ``Boolean``, ``String``, ``Number``, ``Null`` and
        ``Array`` (converted element by element).

        Raises:
            ValueError: for an unsupported node type or an unknown literal.
        """
        if isinstance(node, slimit.ast.NewExpr):
            return "NewExpr"
        if isinstance(node, slimit.ast.Boolean):
            if node.value == 'false':
                return False
            if node.value == 'true':
                return True
            raise ValueError("Unknow value of node: {}".format(node.value))
        if isinstance(node, slimit.ast.String):
            # Strip the surrounding quotes; escape sequences are left as-is.
            return node.value[1:-1]
        if isinstance(node, slimit.ast.Number):
            return self._parse_number(node.value)
        if isinstance(node, slimit.ast.Null):
            return None
        if isinstance(node, slimit.ast.Array):
            return [self.GetValue(item) for item in node.items]
        raise ValueError("Unknow node type: {}".format(type(node)))

In [4]:
def get_content_by_link(plurk_url, timeout=30):
    """Download a plurk page and the responses attached to it.

    Args:
        plurk_url: URL of the plurk page to fetch.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A ``(plurk_content, response_content)`` tuple on success, where
        ``plurk_content`` is the dictionary filled by ``JSVisitor`` from the
        page's last inline script and ``response_content`` is the decoded
        JSON body of the ``Responses/get`` request.
        The string ``'Fail QAQ'`` when either HTTP request does not return
        an OK status.

    Raises:
        ValueError: if the page contains no inline ``<script>`` to parse.
    """
    # request plurk content from plurk.com
    with rq.Session() as sess:
        plurk = sess.get(plurk_url, timeout=timeout)
    if plurk.status_code == rq.codes.ok:
        print("Request Success! Status: {}.".format(plurk.status_code))
    else:
        print("Request fail. Status: {}.".format(plurk.status_code))
        return 'Fail QAQ'

    # read content of HTML; name the parser explicitly so the result does not
    # depend on which optional parsers happen to be installed
    soup = bs(plurk.text, "html.parser")
    # extract the last script that actually has inline code
    inline_scripts = [tag.string for tag in soup.find_all("script") if tag.string]
    if not inline_scripts:
        raise ValueError("No inline <script> found in {}".format(plurk_url))
    script = inline_scripts[-1]

    plurk_content = {}

    parser = Parser()
    json_tree = parser.parse(script)    # construct parse tree
    visitor = JSVisitor(plurk_content)
    visitor.visit(json_tree)            # traverse the tree

    # response
    request_url = "https://www.plurk.com/Responses/get"
    data = {'plurk_id': plurk_content.get('id'), 'from_response_id': '0'}
    print(data)

    with rq.Session() as sess:
        # request response from Responses/get, and use plurk_id as data to
        # tell website which plurk we are requesting (HTTP POST)
        response = sess.post(request_url, data=data, timeout=timeout)
    if response.status_code == rq.codes.ok:
        print("Request Success! Status: {}.".format(response.status_code))
    else:
        print("Request fail. Status: {}.".format(response.status_code))
        # Same failure signal as the page request above, instead of trying to
        # decode an error body as JSON.
        return 'Fail QAQ'

    response_content = response.json()
    return plurk_content, response_content
    

In [49]:
#  plurk_id, favorite_count, owner_id, 
# coins, qualifier, response_count, 
# replurkers_count, anonymous, last_edited, 
# no_comments, posted, lang, content_raw
##################### 
#                   #
#   Main Function   #
#                   #
##################### 
# plurk, content = get_content_by_link('https://www.plurk.com/p/nf00yf')
# NOTE: `content` is the second value returned by get_content_by_link (see the
# commented-out call above); it must already be defined when this cell runs.
users = content.get('users') or {}
md_file = MdUtils(file_name='plurk_test',title='plurk_test_v0')

# info: number of responses
md_file.new_line(str(content.get('response_count'))+'則回應',color='#E8E8E8')

# build one entry per response
for response in content.get('responses') or []:
    user_id = str(response.get('user_id'))
    # Look the user record up once; fall back to an empty record so a missing
    # user renders with the default avatar instead of raising AttributeError.
    user = users.get(user_id) or {}

    # User #
    # user profile image
    has_profile_image = user.get('has_profile_image')
    avatar = user.get('avatar')
    if has_profile_image == 1 and avatar is not None:
        image = "![U](https://avatars.plurk.com/"+user_id+"-small"+str(avatar)+".gif) "
    elif has_profile_image == 1:
        image = "![U](https://avatars.plurk.com/"+user_id+"-small.gif) "
    else:
        image = "![U](https://www.plurk.com/static/default_small.gif) "
    md_file.new_paragraph(image)

    # user name (fall back to the id when no display name is available)
    name = user.get('display_name') or user_id
    color = user.get('name_color')
    name_color = '#DDDDDD' if color is None else '#'+color
    md_file.write(name, bold_italics_code='b', color=name_color)

    # Response #
    # user response text followed by its posting time
    md_file.new_line(response.get('content'))
    md_file.new_line(response.get('posted'))

md_file.create_md_file()

<mdutils.fileutils.fileutils.MarkDownFile at 0x184f64c57f0>