# Generalized Forum Scraping

### Remarks

### Functions

In [11]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import nltk
import gensim
import re

Functions that find the tags needed to retrieve the data we are looking for

In [23]:
def find_next_tag(x, soup, tag):
    """
    Finds recursively the tag necessary to scrape the next page of a forum.

    Starting from ``x``, climbs the parent chain until it reaches a tag name for
    which ``soup.find_all(name, string="Next")`` yields an element with an ``href``.
    :param x: node to start from (the notebook passes the "Next" text found in the page)
    :param soup: BeautifulSoup
    :param tag: one-element list, overwritten in place with the tag name once found
    :return: string (``tag[0]``, left unchanged when no level matched), or None when
             ``x`` is None or has no parent
    """
    if x is None or x.parent is None:
        print("tag of the next page is not treated in this program")
        return None

    next_tag = x.parent.name
    try:
        # Probe only: we just need to know that a "Next" element with an href exists.
        soup.find_all(next_tag, string="Next")[0]['href'][1:]
    except Exception:
        # No usable "Next" link under this tag name: retry one level up.
        # `Exception` (not a bare except) so that Ctrl-C still stops the search.
        find_next_tag(x.parent, soup, tag)
    else:
        tag[0] = next_tag
        print("The tag for next page is ", next_tag)
    return tag[0]

def find_tags_recursively(x, comparator, tags):
    """
    Help function to find tags recursively.

    At each level the element's tag name (preceded by its first CSS class, when it
    has one) is appended to ``tags``; the search stops as soon as the next element
    with the same tag/class contains ``comparator``, otherwise it continues with
    the parent element.
    :param x: element to start from
    :param comparator: string expected in the text of the next matching element
    :param tags: list accumulating the names found so far (mutated in place)
    :return: list (``tags`` reversed, outermost element first), or None if no level matches
    """
    if x is None:
        # Ran out of parents without finding a match.
        print("Unable to scrape this text from this forum")
        return None

    tag = x.name
    # Only the attribute lookup is guarded, so a failure later on cannot leave
    # `tags` with a half-recorded level.
    try:
        class_name = x['class'][0]
    except (KeyError, IndexError, TypeError):
        class_name = None

    if class_name is not None:
        tags.append(class_name)
        tags.append(tag)
        next_ = x.find_next(tag, {'class': class_name})
    else:
        tags.append(tag)
        next_ = x.find_next(tag)

    if next_ is None:
        print("Unable to scrape this text from this forum")
        return None
    if comparator in next_.text:
        print("I've found the right tags, it's ", tags[::-1])
        return tags[::-1]
    return find_tags_recursively(x.parent, comparator, tags)

def find_tags(soup, param1, param2):
    """
    Finds all tags needed to retrieve the information we're looking for (titles of threads, usernames, message, ...).

    ``param1`` is looked up as literal text in the page; the tags are then derived
    from the element containing it, using ``param2`` as the text expected in the
    following element of the same kind.
    :param soup: BeautifulSoup
    :param param1: string (plain text, not a regular expression)
    :param param2: string
    :return: list, or None if ``param1`` is not found in the page
    """
    # Escape the text so that characters such as '?', '(' or '.' in a title or
    # message are matched literally instead of being read as regex syntax.
    text = soup.find(string=re.compile(re.escape(param1)))
    if text is None:
        print("Unable to scrape this text from this forum")
        return None
    return find_tags_recursively(text.parent, param2, [])
    


Functions that scrape the forum using the tags found above

In [24]:
def find_next(soup, next_tag):
    """
    Tries to find next page, or returns an empty string if there is no next page.

    The link is the ``href`` of the first ``next_tag`` element whose string is
    "Next", with its first character removed.
    :param soup: BeautifulSoup
    :param next_tag: string, or None when no "next page" tag is known
    :return: string
    """
    if next_tag is None:
        return ""
    try:
        return soup.find_all(next_tag, string="Next")[0]['href'][1:]
    except Exception:
        # No "Next" element, or it has no href: this is the last page.
        # `Exception` (not a bare except) so that Ctrl-C is not swallowed.
        return ""

def collect_forum_data(soup_row, tags, soup_row_ok = 0, msg_or_user="", link=""):
    """
    Collects all relevant data (title, link) of a thread.

    Extraction is best-effort: a row without the expected element yields an empty
    dict, and a row whose element has no ``href`` yields only the title.
    :param soup_row: BeautifulSoup
    :param tags: list
    :param soup_row_ok: boolean (0 or 1); when false, the element named by the last
                        entry of ``tags`` is searched inside ``soup_row`` first
    :param msg_or_user: string (unused here)
    :param link: string (unused here)
    :return: dict with the keys 'Title' and 'Link' that could be extracted
    """
    data = {}
    try:
        if not soup_row_ok:
            soup_row = soup_row.find(tags[-1])

        data['Title'] = soup_row.text.strip()
        # Drop the first character of the href.
        data['Link'] = soup_row['href'][1:]
    except Exception:
        # Row does not have the expected shape: keep whatever was extracted.
        # `Exception` (not a bare except) so that Ctrl-C is not swallowed.
        pass
    return data

def collect_post_data(soup_row, tags, soup_row_ok, link=""):
    """
    Collects all relevant data of a post (message, user).

    Extraction is best-effort: when the row does not have the expected shape the
    returned dict holds only what was extracted before the failure.
    :param soup_row: BeautifulSoup
    :param tags: list
    :param soup_row_ok: boolean (0 or 1); when true, ``soup_row`` already is the
                        element holding the text
    :param link: string
    :return: dict with the keys 'Link' and 'Any' that could be extracted
    """
    data = {}
    try:
        data['Link'] = link.replace(PREFIX_URL, '')

        depth = len(tags)
        if depth == 2 and not soup_row_ok:
            soup_row = soup_row.find_all(tags[-1])
        if depth == 3:
            soup_row = soup_row.find_all(tags[-1])
        elif depth == 4:
            soup_row = soup_row.find(tags[2]).find_all(tags[3])

        if soup_row_ok:
            data['Any'] = soup_row.text.strip()
        elif len(soup_row) > 1:
            # concatenate message if it spans over multiple tags (often <p>)
            data['Any'] = ''.join(msg.text.strip() for msg in soup_row)
        else:
            data['Any'] = soup_row[0].text.strip()

    except Exception:
        # Row does not have the expected shape: keep whatever was extracted.
        # `Exception` (not a bare except) so that Ctrl-C is not swallowed.
        pass
    return data

def collect_recursively(data, soup, tags, next_tag, fcte_name, link="", index=""):
    """
    Collects data page after page using a collection function from the two above.

    Pages are followed through their "Next" link until there is none. The pages are
    walked with a loop rather than one nested call per page, so the number of pages
    is not bounded by Python's recursion limit.
    :param data: list the collected rows are appended to (mutated in place)
    :param soup: BeautifulSoup of the first page
    :param tags: list
    :param next_tag: string
    :param fcte_name: function called as ``fcte_name(soup_row, tags, soup_row_ok, link)``
    :param link: string
    :param index: int page counter, printed as progress when truthy
    :return: list (the same object as ``data``)
    """
    try:
        while True:
            if index:
                print(index, end='\r', flush=True)

            soup_row_ok = 0
            if len(tags) == 1:
                soup_rows = soup.find_all(tags[0])
                soup_row_ok = 1
            elif len(tags) == 2:
                # Two entries are either [tag, class] or [outer tag, inner tag]:
                # try the class reading first and fall back to the outer tag.
                soup_rows = soup.find_all(tags[0], {'class': tags[1]})
                if not soup_rows:
                    soup_rows = soup.find_all(tags[0])
                else:
                    soup_row_ok = 1
            elif len(tags) == 3 or len(tags) == 4:
                soup_rows = soup.find_all(tags[0], {'class': tags[1]})
            else:
                # Unsupported tag layout: nothing can be collected.
                break

            data.extend([fcte_name(soup_row, tags, soup_row_ok, link) for soup_row in soup_rows])

            next_url = find_next(soup, next_tag)
            if not next_url:
                break
            soup = BeautifulSoup(requests.get(PREFIX_URL + next_url).text, 'html.parser')
            if index:
                index += 1
    except Exception as error:
        # Keep what was collected so far, but say so instead of truncating silently.
        # `Exception` (not a bare except) so that Ctrl-C really stops the scrape.
        print("Stopped collecting early:", repr(error))
    return data
                
def verify_if_treated(soup, tags):
    """
    Verifies if the case is treated yet (depending on the length and components of the tags).

    Prints a message when ``tags`` has an unsupported length (fewer than 1 or more
    than 4 entries) or when, for 3 or 4 entries, the outer element
    (``tags[0]`` with class ``tags[1]``) cannot be found in the page.
    :param soup: BeautifulSoup
    :param tags: list
    """
    if len(tags) > 4 or len(tags) < 1:
        print("This case is not treated yet")

    if len(tags) == 3 or len(tags) == 4:
        try:
            container = soup.find(tags[0], {'class': tags[1]})
        except Exception:
            container = None
        # `find` reports a missing element by returning None, not by raising.
        if container is None:
            print("This case is not treated yet")
    
def collect_all_links(soup, tags, next_tag, fcte_name):
    """
    Builds the dataframe of all threads of a forum (one row per thread, as produced by ``fcte_name``).

    Rows containing missing values are dropped from the result.
    :param soup: BeautifulSoup of the first page of the forum
    :param tags: list
    :param next_tag: string
    :param fcte_name: function used to extract one row
    :return: DataFrame
    """
    verify_if_treated(soup, tags)

    rows = collect_recursively([], soup, tags, next_tag, fcte_name)
    frame = pd.DataFrame(rows)
    return frame.dropna()

def collect(forum_df, tags, next_tag, fcte_name):
    """
    Builds the dataframe of all posts of every thread listed in ``forum_df``.

    Each value of the 'Link' column is appended to PREFIX_URL, downloaded, and
    scraped page after page; progress is printed as "<n> out of <total>".
    :param forum_df: DataFrame with a 'Link' column
    :param tags: list
    :param next_tag: string
    :param fcte_name: function used to extract one row
    :return: DataFrame
    """
    links = forum_df['Link']
    total = len(links)
    rows = []
    for position, url in enumerate(links, start=1):
        print('{} out of {}'.format(position, total), end='\r', flush=True)
        page = BeautifulSoup(requests.get(PREFIX_URL + url).text, 'html.parser')
        verify_if_treated(page, tags)
        rows += collect_recursively([], page, tags, next_tag, fcte_name, url)
    return pd.DataFrame(rows)

### 1. Collect all titles and links of the forum

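Enter the parameters needed: the URL prefix of the site, the URL of the first page of the forum, and the titles of two threads listed one after the other on that page: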

In [20]:
# Active configuration: Holiday Truths (America/Canada Discussion)
# PREFIX_URL is prepended to every relative link scraped from the site.
PREFIX_URL = 'https://www.holidaytruths.co.uk/'
# START_URL is the first page of the forum to scrape.
START_URL = PREFIX_URL + 'forum/america-canada-discussion-forum-f2-0.html'
# title1 is searched for in the start page; title2 must appear in the text of the
# element that follows it, which is how find_tags infers the tags around a title.
title1 = 'ESTA question on employment'
title2 = 'Vegas Buffets/Restaurants'

# Alternative configuration: Wrong Planet
#PREFIX_URL = 'http://wrongplanet.net/forums'
#START_URL = PREFIX_URL + '/viewforum.php?f=19'
#title1 = 'RE: Kids w/ Classic Autism, PDD-NOS & Speech Delays'
#title2 = 'Parents on the spectrum'

# Alternative configuration: Trip Advisor
#PREFIX_URL = 'https://www.tripadvisor.co.uk/'
#START_URL = PREFIX_URL + 'ShowForum-g1-i12334-Holiday_Travel.html'
#title1 = 'See TOP QUESTIONS before posting!'
#title2 = 'Use the SEARCH BOX function before posting!'

In [27]:
# Download the first page of the forum and parse it.
soup = BeautifulSoup(requests.get(START_URL).text, 'html.parser')

We find the tags to be able to scrape all titles and links of a forum and the tag needed to find the next page

In [28]:
# Tags wrapping a thread title, inferred from two sample titles of the start page.
tags = find_tags(soup, title1, title2)
# Tag of the "Next" link; the one-element list is the holder find_next_tag fills in.
# `string=` is the current name of the keyword formerly called `text=`.
next_tag = find_next_tag(soup.find(string=re.compile("Next")), soup, [''])

I've found the right tags, it's  ['span', 'title', 'a']
The tag for next page is  a


We scrape all titles and links

In [29]:
# One row per thread (title and link), following the "Next" links across forum pages.
threads = collect_all_links(soup, tags, next_tag, collect_forum_data)

In [32]:
# Save the thread titles and links as JSON.
# NOTE(review): presumably the 'Forum Data' directory must already exist — confirm.
threads.to_json('Forum Data/subjects.json')

### 2. Collect the usernames and messages of every post in every thread

Enter the parameters needed:

In [33]:
# Active configuration: Holiday Truths
# user1/user2 and msg1/msg2 are sample texts handed to find_tags to infer the tags
# around usernames and messages: the first text of each pair is searched for in the
# page, the second must appear in the text of the element that follows it.
user1 = 'AnnaM'
user2 = 'Glynis HT Admin'
msg1 = 'Hi, I am brand new and hopefully I have put this question in the right area.'
msg2 = 'd have thought that was fine. Plenty of retired people travel.'

# Alternative configuration: Wrong Planet
#user1 = 'cyberdad'
#user2 = 'Solvejg'
#msg1 = 'I cope fine in the general parenting area.'
#msg2 = 'Yes.'

# Alternative configuration: Trip Advisor
#user1 = 'BradJill'
#user2 = 'Eden7'
#msg1 = 'HOW TO USE THE HOLIDAY TRAVEL FORUM!'
#msg2 = 'Great advice BradJill.....'

In [34]:
# Download the first collected thread; its page is used to infer the post tags.
# Positional access (.iloc) is required: `threads` had incomplete rows dropped, so
# the index label 0 is not guaranteed to exist any more.
START_URL = PREFIX_URL + threads['Link'].iloc[0]
soup = BeautifulSoup(requests.get(START_URL).text, 'html.parser')

We check that prefix_url is well defined (optional, run this cell if unable to scrape usernames and messages)

In [173]:
#if START_URL != right_url:
#    print("PREFIX_URL wasn't well defined. Please check it.")

We find the tags to be able to scrape the usernames

In [35]:
# Tags wrapping a username, inferred from two sample usernames of the thread page.
user_tags = find_tags(soup, user1, user2)

I've found the right tags, it's  ['div', 'user-name', 'a']


We scrape all usernames

In [37]:
# Scrape the usernames of every thread; collect_post_data stores the scraped text
# under the generic column 'Any', which is renamed to 'Username' here.
user_posts = collect(threads, user_tags, next_tag, collect_post_data).rename(columns={'Any': 'Username'})

1434 out of 1434

We find the tags to be able to scrape the messages

In [38]:
# Tags wrapping a message, inferred from two sample messages of the thread page.
message_tags = find_tags(soup, msg1, msg2)

I've found the right tags, it's  ['div', 'col-md-10']


We scrape all messages

In [39]:
# Scrape the messages of every thread; the text is stored in the column 'Any'.
message_posts = collect(threads, message_tags, next_tag, collect_post_data)

1434 out of 1434

Sometimes unrelated data is found in the tags of the message (e.g. 'p'). This function finds the position of the first real message so that everything scraped before it can be discarded

In [40]:
def check_messages(data, msg1):
    """
    Finds the position of the first row whose 'Any' text contains ``msg1``.

    Every row is inspected, including the last one; cells that are not strings
    (for example missing values) are skipped.
    :param data: DataFrame with an 'Any' column
    :param msg1: string
    :return: int (position of the first matching row), or None if no row matches
    """
    for position, text in enumerate(data['Any']):
        if isinstance(text, str) and msg1 in text:
            return position
    return None

In [41]:
index = check_messages(message_posts, msg1)
print("index: ", index)
if index is None:
    # msg1 was not found: there is no reference point, so nothing is discarded.
    print("msg1 was not found in the scraped messages; they are left unchanged")
elif index:
    # Discard the rows scraped before the first real message by moving the column
    # up by `index` rows; the freed rows at the end become missing values.
    message_posts['Any'] = message_posts['Any'].shift(-index)

index:  0


### 3. Merge the titles, usernames, messages to have all posts

We concatenate the messages with the users

In [42]:
# Attach each message to the user row carrying the same index label. The two frames
# come from separate scraping passes, so this pairs them purely by row order.
# NOTE(review): nothing here checks that both frames have the same length — confirm.
user_posts['Message'] =  pd.Series(message_posts['Any'], index=user_posts.index)
# `posts` is a second name for the same frame, not a copy.
posts= user_posts

In [46]:
# Preview the first rows of the combined usernames and messages.
posts.head()

Unnamed: 0,Username,Link,Message
0,AnnaM,forum/esta-question-on-employment-t172445.html,"Hi, I am brand new and hopefully I have put th..."
1,Glynis HT Admin,forum/esta-question-on-employment-t172445.html,Hi Anita & \nIf you are retired Anna then I'...
2,AnnaM,forum/esta-question-on-employment-t172445.html,"Thanks for your input, My worry is if I put No..."
3,Lance Chambers,forum/esta-question-on-employment-t172445.html,This is not the case - they are more concerned...
4,AnnaM,forum/esta-question-on-employment-t172445.html,They also want to know your parents names? My...


We merge the messages and usernames with the thread titles

In [47]:
# Join every post with the title of its thread through the shared 'Link' column,
# order the columns alphabetically, then save the result as JSON.
merged_df = threads.merge(posts, on='Link', how='inner')
merged_df = merged_df.reindex(columns=sorted(merged_df.columns))
merged_df.to_json('Forum Data/threads.json')

In [48]:
# Preview the first rows of the final table (one row per post).
merged_df.head()

Unnamed: 0,Link,Message,Title,Username
0,forum/esta-question-on-employment-t172445.html,"Hi, I am brand new and hopefully I have put th...",ESTA question on employment,AnnaM
1,forum/esta-question-on-employment-t172445.html,Hi Anita & \nIf you are retired Anna then I'...,ESTA question on employment,Glynis HT Admin
2,forum/esta-question-on-employment-t172445.html,"Thanks for your input, My worry is if I put No...",ESTA question on employment,AnnaM
3,forum/esta-question-on-employment-t172445.html,This is not the case - they are more concerned...,ESTA question on employment,Lance Chambers
4,forum/esta-question-on-employment-t172445.html,They also want to know your parents names? My...,ESTA question on employment,AnnaM
