In [1]:
import requests as re
import pandas as pd
import numpy as np
import os
from bs4 import BeautifulSoup

The goal today is to see how much of a speed-up we can get by threading the API calls (HTTP page requests) on a single network card.

In [2]:
# Load the previously saved thread listing from the data directory.
# The filters below rely on its 'forum_url' and 'thread_url' columns.
allThreads_csv = os.path.join("data", "allthreads-2018-04-24T1300.csv")
allThreads_df = pd.read_csv(allThreads_csv)

In [3]:
# Base URL of the forum site; getPosts prepends it to the relative "Next" page links when paginating a thread
site_url = 'https://forums.hardwarezone.com.sg'

In [4]:
# Keep only the threads that belong to the parenting forum; this small extract is used for the timing runs
isParentingThread = allThreads_df['forum_url'] == 'https://forums.hardwarezone.com.sg/parenting-kids-early-learning-397/'
parentingThreads_df = allThreads_df.loc[isParentingThread]
print(len(parentingThreads_df))
parentingThreads_df.head()

154


Unnamed: 0,forum_url,thread,thread_url
97846,https://forums.hardwarezone.com.sg/parenting-k...,"Welcome to HWZ's Parenting, Kids & Early Learn...",https://forums.hardwarezone.com.sg/parenting-k...
97847,https://forums.hardwarezone.com.sg/parenting-k...,Milk powder,https://forums.hardwarezone.com.sg/parenting-k...
97848,https://forums.hardwarezone.com.sg/parenting-k...,I suspect my child has ADHD,https://forums.hardwarezone.com.sg/parenting-k...
97849,https://forums.hardwarezone.com.sg/parenting-k...,Do u lose your temper when teaching your child?,https://forums.hardwarezone.com.sg/parenting-k...
97850,https://forums.hardwarezone.com.sg/parenting-k...,Cry when feeding,https://forums.hardwarezone.com.sg/parenting-k...


In [5]:
def _toIsoDatetime(datetime_raw):
    """Convert a raw forum timestamp into 'YYYY-MM-DDTHH:MM' (24-hour clock).

    NOTE(review): assumes the raw text looks like 'DD-MM-YYYY, HH:MM AM' or
    'DD-MM-YYYY, HH:MM PM' -- confirm against the live pages.
    """
    date_part = datetime_raw.split(',')[0]
    iso_date = '-'.join(reversed(date_part.split('-')))

    tokens = datetime_raw.split(' ')
    hour = int(tokens[1][0:2])
    meridiem = tokens[2]
    if meridiem == 'PM' and hour < 12:
        hour += 12
    elif meridiem == 'AM' and hour == 12:
        # 12:xx AM is just after midnight; without this it would collide with 12:xx PM
        hour = 0

    minute = datetime_raw.split(':')[1][0:2]
    return '{}T{:02d}:{}'.format(iso_date, hour, minute)


def _parsePost(post, thread_url):
    """Extract one row [thread_url, userid, timestamp, post_text] from a 'post-wrapper' element."""
    userid_url = post.find('a', {'class': 'bigusername'})['href']
    # The user id is taken to be every digit that appears in the profile link
    userid = ''.join(ch for ch in userid_url if ch.isdigit())

    # The timestamp text directly follows the anchor whose name starts with 'post'
    post_anchor = post.find('a', {'name': lambda name: bool(name) and name.startswith('post')})
    iso_datetime = _toIsoDatetime(post_anchor.next_sibling.strip())

    # A post without a message body is kept, with empty text
    message = post.find('div', {'class': 'post_message'})
    post_text = message.get_text(' ', strip=True) if message is not None else ""

    return [thread_url, userid, iso_datetime, post_text]


def getPosts(thread_url):
    """Crawl every page of one forum thread and return its posts as a DataFrame.

    Starting from ``thread_url``, follows the 'Next ›' link until there is
    none left and parses each post on each page.

    Parameters
    ----------
    thread_url : str
        URL of the first page of the thread.

    Returns
    -------
    pandas.DataFrame
        Columns 'thread_url', 'userid', 'timestamp', 'post_text'; one row per
        post. A page whose posts cannot be parsed (e.g. the thread may have
        been deleted) contributes one placeholder row with empty strings.

    Notes
    -----
    ``re`` here is the ``requests`` module (imported as ``re`` at the top of the
    notebook), not the standard-library regex module. Network errors raised by
    ``re.get`` are not caught and propagate to the caller.
    """
    print(thread_url)
    thread_cols = ['thread_url', 'userid', 'timestamp', 'post_text'] #, 'post_number', 'post_order', 'first_quote_post_number', 'likes_userid'
    # Rows are collected in a plain list and turned into a DataFrame once at the end:
    # DataFrame.append no longer exists in pandas 2.x and row-by-row growth is quadratic.
    rows = []
    thread_page_url = thread_url

    while thread_page_url is not None:
        response = re.get(thread_page_url)
        thread_page_soup = BeautifulSoup(response.text, 'html.parser')

        # Work out the following page (if any) before parsing the current one
        next_link = thread_page_soup.find('a', text='Next ›')
        if next_link is None:
            thread_page_url = None
        else:
            thread_page_url = site_url + next_link['href']

        thread_page_posts = thread_page_soup.find('div', {'id': 'posts'})

        try:
            for post in thread_page_posts.find_all('div', {'class': 'post-wrapper'}):
                rows.append(_parsePost(post, thread_url))
        except Exception:
            # posts missing or unparseable, thread may have been deleted; posts parsed
            # before the failure are kept. `Exception` (not a bare except) lets
            # KeyboardInterrupt still stop a long crawl.
            rows.append([thread_url, "", "", ""])

    return pd.DataFrame(rows, columns=thread_cols)

Let's determine the baseline time for crawling an extract.

In [None]:
# Baseline: call getPosts for each parenting thread one after another (no threading) and time the whole run
%time parentingPosts_df = pd.concat([getPosts(threadUrl) for threadUrl in parentingThreads_df['thread_url']])
print(len(parentingPosts_df))

https://forums.hardwarezone.com.sg/parenting-kids-early-learning-397/welcome-hwzs-parenting-kids-early-learning-forum-5684416.html
https://forums.hardwarezone.com.sg/parenting-kids-early-learning-397/milk-powder-5773651.html
https://forums.hardwarezone.com.sg/parenting-kids-early-learning-397/i-suspect-my-child-has-adhd-5816748.html
https://forums.hardwarezone.com.sg/parenting-kids-early-learning-397/do-u-lose-your-temper-when-teaching-your-child-5785587.html
https://forums.hardwarezone.com.sg/parenting-kids-early-learning-397/cry-when-feeding-5723877.html
https://forums.hardwarezone.com.sg/parenting-kids-early-learning-397/newborn-nanny-infant-care-maid-after-maternity-leave-5805042.html
https://forums.hardwarezone.com.sg/parenting-kids-early-learning-397/anyone-went-thru-abortion-before-5779121.html
https://forums.hardwarezone.com.sg/parenting-kids-early-learning-397/selling-obscene-films-used-panties-can-illegal-singapore-5815478.html
https://forums.hardwarezone.com.sg/parenting-kid

In [None]:
# Preview the first rows of the crawled posts
parentingPosts_df.head()

In [None]:
# Deal with unusual unicode characters in the post text: every non-ASCII character
# is replaced by its backslash escape sequence.
# Note: this overwrites the column in place, so re-running the cell escapes the backslashes again.
escaped_post_text = parentingPosts_df['post_text'].map(
    lambda text: text.encode('unicode-escape').decode('utf-8')
)
parentingPosts_df['post_text'] = escaped_post_text

In [None]:
# Save the crawled parenting posts as a UTF-8 CSV in the data directory, without the index column
parentingPosts_csv = os.path.join("data", "parentingposts-2018-04-24T1300.csv")
parentingPosts_df.to_csv(parentingPosts_csv, encoding='utf-8', index=False)

In the interim, get the posts for the threads of specific forums and write them to file.

In [None]:
#Selectively retrieving the posts of the threads in the forums of interest
#(uncomment an entry below to include that forum in the run)
foi = []
#foi.append("https://forums.hardwarezone.com.sg/current-affairs-lounge-17/")
#foi.append("https://forums.hardwarezone.com.sg/money-mind-210/")
foi.append("https://forums.hardwarezone.com.sg/credit-cards-line-credit-facilities-243/")
foi.append("https://forums.hardwarezone.com.sg/stocks-shares-indices-92/")
foi.append("https://forums.hardwarezone.com.sg/eat-drink-man-woman-16/")
for forum_url in foi:
    print("Generating posts for " + str(forum_url) + " ...")
    threads_df = allThreads_df.loc[allThreads_df['forum_url'] == forum_url]
    if threads_df.empty:
        #pd.concat raises ValueError on an empty list, which would abort the remaining forums
        print("No threads found for " + str(forum_url) + ", skipping")
        continue
    forum_posts_df = pd.concat([getPosts(threadUrl) for threadUrl in threads_df['thread_url']])
    #Replace non-ASCII characters in the post text by their backslash escape sequences
    forum_posts_df['post_text'] = forum_posts_df['post_text'].map(lambda x: x.encode('unicode-escape').decode('utf-8'))
    #File name is the forum's URL path segment with all digits removed and surrounding dashes stripped
    forum_slug = ''.join(char for char in forum_url.split('/')[3] if not char.isdigit()).strip('-')
    forum_posts_csv = os.path.join("data", forum_slug + "-2018-04-24T1300.csv")
    forum_posts_df.to_csv(forum_posts_csv, encoding='utf-8', index=False)
    print('Posts for ' + str(forum_url) + " written to " + str(forum_posts_csv))
print('Done')

In [None]:
def processQueryList(query_list, store=None):
    """Fetch the posts of every thread URL in ``query_list`` into ``store``.

    Parameters
    ----------
    query_list : iterable of str
        Thread URLs to crawl; each one is passed to ``getPosts`` exactly once.
    store : dict, optional
        Dictionary to fill, keyed by ``str(url)``. Pass a shared dictionary to
        collect the results of several calls; a new one is created when omitted.

    Returns
    -------
    dict
        ``store``, mapping each URL to whatever ``getPosts`` returned for it.
    """
    if store is None:
        store = {}
    for url in query_list:
        #One getPosts call per url: re-iterating query_list here would crawl every
        #thread len(query_list) times and store the same list under every key
        store[str(url)] = getPosts(url)
    return store

Since getPosts already handles its requests one forum thread at a time, we'll just try to parallelize the retrieval across forum threads.

In [None]:
from threading import Thread

def threadedApiCall(nthreads, master_query_list):
    """Crawl the given thread URLs using at most ``nthreads`` worker threads.

    The URLs are split into contiguous chunks, each chunk is handed to
    ``processQueryList`` on its own ``Thread``, and the collected posts are
    concatenated once every thread has finished.

    Parameters
    ----------
    nthreads : int
        Maximum number of worker threads to start (must be at least 1).
    master_query_list : iterable of str
        Thread URLs to crawl (e.g. a list or a pandas Series).

    Returns
    -------
    pandas.DataFrame
        All posts that were collected, in no guaranteed order; an empty
        DataFrame when there was nothing to crawl.

    Raises
    ------
    ValueError
        If ``nthreads`` is smaller than 1.
    """
    if nthreads < 1:
        raise ValueError("nthreads must be at least 1")

    #Materialise so that len() and positional slicing behave the same for lists and Series
    queries = list(master_query_list)
    if not queries:
        return pd.DataFrame()

    store = {}
    #Ceiling division: never 0 (which breaks range) and never more than nthreads chunks
    sublistLength = -(-len(queries) // nthreads)
    #split the master query list
    sublists = [queries[x:x+sublistLength] for x in range(0, len(queries), sublistLength)]

    threads = [Thread(target=processQueryList, args=(sublist, store)) for sublist in sublists]

    #start the threads
    for t in threads:
        t.start()
    #wait for the threads to finish
    for t in threads:
        t.join()

    #A stored value may be a single DataFrame or a list of DataFrames; flatten before concatenating
    frames = []
    for thread_posts in store.values():
        if isinstance(thread_posts, pd.DataFrame):
            frames.append(thread_posts)
        else:
            frames.extend(thread_posts)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
%%time
# Time the threaded crawl with nthreads=10.
# NOTE(review): this runs over every thread in allThreads_df, whereas the baseline timing
# earlier in the notebook only crawled parentingThreads_df -- confirm which workload is intended
# before comparing the two timings. The returned DataFrame is not assigned to a name.
threadedApiCall(10, allThreads_df['thread_url'])