# Generalized Forum Scraping

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import nltk
import gensim



Be careful to run the cells in order

Functions that find the tags needed to retrieve the data we are looking for

In [153]:
import re

#Find the tag to scrape the next page
def find_next_tag(x, soup, tag):
    """Find the name of the tag that carries the "Next" pagination link.

    The ancestors of ``x`` are tried from the closest outwards. An ancestor
    name qualifies when ``soup.find_all(name, string="Next")`` returns at
    least one element that has an ``href`` attribute.

    Args:
        x: Node to start from (the notebook passes the first text node
            matching "Next"), or None when the page has no such text.
        soup: Parsed page that is queried with ``find_all``.
        tag: List used as an accumulator; the qualifying name is appended.

    Returns:
        ``tag[0]`` after a qualifying name was appended, or None when ``x``
        is None or no ancestor qualifies.
    """
    node = x
    while node is not None and node.parent is not None:
        candidate = node.parent.name
        try:
            # Only the lookup matters: it raises when no element with this
            # name has the string "Next", or when that element has no href.
            soup.find_all(candidate, string="Next")[0]['href'][1:]
        except (LookupError, TypeError):
            node = node.parent
            continue
        tag.append(candidate)
        print("The tag for next page is ", candidate)
        return tag[0]

    # No "Next" text at all, or the ancestors ran out without a match:
    # report it and return None instead of walking past the document root.
    print("tag of the next page is not treated in this program")
    return None

#Help function to find a tag recursively
def find_tags_recursively(x, comp, tags):
    """Climb from ``x`` until an element is found whose next sibling-like
    element (same tag, same first class) contains ``comp``.

    At every level the element's first CSS class (when it has one) and its
    tag name are appended to ``tags``. The next element with the same
    name/class is then looked up with ``findNext``; when its text contains
    ``comp`` the accumulated list is returned reversed (outermost first).

    Args:
        x: Element to start from (``find_tags`` passes the parent of the
            matched text node).
        comp: Text expected in the following element of the same kind.
        tags: List used as an accumulator across levels.

    Returns:
        ``tags`` reversed when a level matches, or None when the ancestors
        run out without a match.
    """
    node = x
    while node is not None:
        name = node.name
        try:
            class_name = node['class'][0]
        except (LookupError, TypeError):
            # No class attribute (or an empty one): identify by tag only.
            class_name = None

        if class_name is not None:
            tags.append(class_name)
            tags.append(name)
            following = node.findNext(name, {'class': class_name})
        else:
            tags.append(name)
            following = node.findNext(name)

        # findNext returns None when nothing follows; that is "no match",
        # not an error.
        if following is not None and comp in following.text:
            print("I've found the right tag, it's ", tags[::-1])
            return tags[::-1]
        node = node.parent

    print("Unable to scrape this text from this forum")
    return None

#Find all tags needed to retrieve the information we're looking for (titles of thread, messages, ...)
def find_tags(soup, param1, param2):
    """Work out the tag path needed to scrape one kind of item.

    ``param1`` is searched as literal text in ``soup``; the parent of the
    first matching text node is handed to ``find_tags_recursively`` together
    with ``param2``.

    Args:
        soup: Parsed page to search.
        param1: Text of a first example item (title, username, message...).
        param2: Text of a second example item, passed on as ``comp``.

    Returns:
        Whatever ``find_tags_recursively`` returns, or None when ``param1``
        is not found in the page.
    """
    # Escape the example text: titles such as "How to Bypass [hide] element
    # in Forums?" contain regex metacharacters and must match literally.
    text = soup.find(string=re.compile(re.escape(param1)))
    if text is None:
        print("Unable to scrape this text from this forum")
        return None
    return find_tags_recursively(text.parent, param2, [])
    


Functions that scrape the forum using the tags found above

In [75]:
def find_next(soup, next_tag):
    """Return the relative URL of the next page, or "" when there is none.

    The first ``next_tag`` element whose string is "Next" is looked up in
    ``soup`` and its ``href`` is returned without its first character.
    An empty string is returned when ``next_tag`` is None or when the
    lookup fails for any reason.
    """
    if next_tag is None:
        return ""
    try:
        candidates = soup.find_all(next_tag, string="Next")
        first_link = candidates[0]
        return first_link['href'][1:]
    except:
        # NOTE(review): bare except kept on purpose to preserve behaviour;
        # it also swallows KeyboardInterrupt.
        return ""

def collect_forum_data(soup_row, tags, soup_row_ok = 0, msg_or_user="", link=""):
    """Extract the title and link of one thread from a listing row.

    When ``soup_row_ok`` is falsy the element named by the last entry of
    ``tags`` is first looked up inside ``soup_row``; otherwise ``soup_row``
    itself is used. ``msg_or_user`` and ``link`` are accepted but not used.

    Returns a dict with 'Title' (stripped text) and 'Link' (``href`` without
    its first character). Any failure is swallowed, so the dict may be empty
    or hold only 'Title'.
    """
    record = {}
    try:
        anchor = soup_row if soup_row_ok else soup_row.find(tags[-1])
        record['Title'] = anchor.text.strip()
        record['Link'] = anchor['href'][1:]
    except:
        # Best effort: keep whatever was filled in before the failure.
        pass
    return record

def collect_post_data(soup_row, tags, soup_row_ok, link=""):
    """Extract the text of one post element (username, message, ...).

    Depending on the length of ``tags`` the function descends from
    ``soup_row`` to the element holding the text:

    * 2 entries and ``soup_row_ok`` falsy: ``find(tags[1])``
    * 3 entries: ``find(tags[2])``
    * 4 entries: ``find(tags[2]).find(tags[3])``
    * otherwise ``soup_row`` is used as is.

    Returns a dict with 'Link' (``link`` with the notebook-level
    ``PREFIX_URL`` removed) and 'Any' (stripped text). Any failure is
    swallowed, so the dict may be empty or hold only 'Link'.
    """
    record = {}
    try:
        depth = len(tags)
        if depth == 2:
            if not soup_row_ok:
                soup_row = soup_row.find(tags[1])
        elif depth == 3:
            soup_row = soup_row.find(tags[2])
        elif depth == 4:
            soup_row = soup_row.find(tags[2]).find(tags[3])

        record['Link'] = link.replace(PREFIX_URL, '')
        record['Any'] = soup_row.text.strip()
    except:
        # Best effort: keep whatever was filled in before the failure.
        pass
    return record

def collect_recursively(data, soup, tags, next_tag, fcte_name, link="", index=""):
    """Collect rows from ``soup`` and from every following page.

    For each page the row elements are selected according to ``tags``
    (1 entry: tag only; 2 entries: tag + class, falling back to tag only;
    3 or 4 entries: tag + class), converted with ``fcte_name`` and appended
    to ``data``. The next page is located with the notebook-level
    ``find_next`` and fetched from ``PREFIX_URL + next_url``.

    Pages are walked in a loop rather than one recursion level per page, so
    long paginations are not cut short by the interpreter's recursion limit.
    A next-URL that was already followed stops the walk, which prevents an
    endless loop on self-referencing "Next" links.

    Args:
        data: List that receives the collected dicts (mutated in place).
        soup: Parsed first page.
        tags: Tag path, as produced by ``find_tags``.
        next_tag: Tag name of the "Next" link, passed to ``find_next``.
        fcte_name: Callable invoked as
            ``fcte_name(row, tags, soup_row_ok, link)`` for every row.
        link: Forwarded unchanged to ``fcte_name``.
        index: Optional progress counter; when truthy it is printed for each
            page and incremented by one between pages.

    Returns:
        ``data``. Collection is best effort: on any error the rows gathered
        so far are returned and the reason is printed.
    """
    followed = set()
    try:
        while True:
            if index:
                print(index, end='\r', flush=True)

            soup_row_ok = 0
            depth = len(tags)
            if depth == 1:
                soup_rows = soup.find_all(tags[0])
                soup_row_ok = 1
            elif depth == 2:
                # Two entries are ambiguous: [tag, class] or [outer, inner].
                # Try the class reading first.
                soup_rows = soup.find_all(tags[0], {'class': tags[1]})
                if soup_rows:
                    soup_row_ok = 1
                else:
                    soup_rows = soup.find_all(tags[0])
            elif depth in (3, 4):
                soup_rows = soup.find_all(tags[0], {'class': tags[1]})
            else:
                # Unsupported tag path: nothing can be selected.
                return data

            data.extend([fcte_name(soup_row, tags, soup_row_ok, link) for soup_row in soup_rows])

            next_url = find_next(soup, next_tag)
            if not next_url or next_url in followed:
                return data
            followed.add(next_url)

            soup = BeautifulSoup(requests.get(PREFIX_URL + next_url).text, 'html.parser')
            if index:
                index = index + 1
    except Exception as err:
        # Keep the best-effort contract but make the truncation visible.
        print("Stopped collecting early:", repr(err))
    return data
                
def verify_if_treated(soup, tags):
    """Print a warning when ``tags`` cannot be handled by the scraper.

    A warning is printed when ``tags`` is missing, empty or longer than four
    entries, and when a 3- or 4-entry path does not select anything in
    ``soup`` with ``find(tags[0], {'class': tags[1]})``. ``find`` reports
    "nothing found" by returning None, so the result is checked explicitly.

    Returns None in every case; the function only reports.
    """
    warning = "This case is not treated yet"

    if not tags or len(tags) > 4:
        print(warning)
        return

    if len(tags) in (3, 4):
        try:
            found = soup.find(tags[0], {'class': tags[1]})
        except Exception:
            found = None
        if found is None:
            print(warning)
    
def collect_all_links(soup, tags, next_tag, fcte_name):      #soup or url? Combine (put verifications also to collect)
    """Scrape every listing page starting at ``soup`` into a DataFrame.

    ``verify_if_treated`` is called first, then ``collect_recursively``
    gathers the rows of all pages. Rows with any missing value are dropped
    from the resulting frame (the original index labels are kept).
    """
    verify_if_treated(soup, tags)

    collected = collect_recursively(list(), soup, tags, next_tag, fcte_name)
    frame = pd.DataFrame(collected)
    return frame.dropna()

def collect(forum_df, tags, fcte_name):
    """Scrape every thread listed in ``forum_df['Link']`` into one DataFrame.

    Each link is fetched from ``PREFIX_URL + link``, checked with
    ``verify_if_treated`` and walked page by page with
    ``collect_recursively``. Progress is printed as "<i> out of <total>".

    Note: ``next_tag`` is not a parameter; it is read from the notebook's
    global namespace, so the cell defining it must have been run first.
    """
    links = forum_df['Link']
    total = len(links)
    rows = []
    for position, url in enumerate(links, start=1):
        print('{} out of {}'.format(position, total), end='\r', flush=True)
        page = BeautifulSoup(requests.get(PREFIX_URL + url).text, 'html.parser')
        verify_if_treated(page, tags)
        rows.extend(collect_recursively([], page, tags, next_tag, fcte_name, url))
    return pd.DataFrame(rows)

### 1. Collect all titles and links of the forum

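Enter the parameters needed: the site prefix (`PREFIX_URL`), the first page of the forum section (`START_URL`), and the titles of two consecutive threads listed on that page (`title1`, `title2`):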

In [154]:
#Holiday Truths
# PREFIX_URL is prepended to every relative link followed by collect() and collect_recursively().
PREFIX_URL = 'https://www.holidaytruths.co.uk/'
# START_URL is the first listing page of the forum section to scrape.
START_URL = PREFIX_URL + 'forum/america-canada-discussion-forum-f2-0.html'
soup = BeautifulSoup(requests.get(START_URL).text, 'html.parser')
# Example titles handed to find_tags(): title1 is searched in the page text,
# title2 must appear in the next element of the same kind.
title1 = 'ESTA question on employment'
title2 = 'Vegas Buffets/Restaurants'

#Wrong Planet
#PREFIX_URL = 'http://wrongplanet.net/forums'
#START_URL = PREFIX_URL + '/viewforum.php?f=19'
#soup = BeautifulSoup(requests.get(START_URL).text, 'html.parser')
#title1 = 'RE: Kids w/ Classic Autism, PDD-NOS & Speech Delays'
#title2 = 'Parents on the spectrum'

#Stack Overflow
#PREFIX_URL = 'https://stackoverflow.com/'
#START_URL = PREFIX_URL + 'questions/tagged/forum'
#soup = BeautifulSoup(requests.get(START_URL).text, 'html.parser')
#title1 = 'Should DynamoDB adjacency lists use discrete partition keys to model each type of relationship?'
#title2 = 'How to Bypass [hide] element in Forums?'

#Au Féminin
#PREFIX_URL = 'https://astrologie.aufeminin.com/forum'
#START_URL = PREFIX_URL + '/all'
#soup = BeautifulSoup(requests.get(START_URL).text, 'html.parser')
#title1 = 'coucou....'
#title2 = 'échange serieux en mp'


#Trip Advisor
#PREFIX_URL = 'https://www.tripadvisor.co.uk/'
#START_URL = PREFIX_URL + 'ShowForum-g1-i12334-Holiday_Travel.html'
#soup = BeautifulSoup(requests.get(START_URL).text, 'html.parser')
#title1 = 'See TOP QUESTIONS before posting!'
#title2 = 'Use the SEARCH BOX function before posting!'

Find the tags needed to scrape the titles and links, and the tag of the link to the next page

In [155]:
# Tag path of the thread titles, derived from the two example titles.
right_tags = find_tags(soup, title1, title2)
# Name of the tag carrying the "Next" link, searched upwards from the first text matching "Next".
# next_tag is also read as a global inside collect().
next_tag = find_next_tag(soup.find(text=re.compile("Next")), soup, [])
print(next_tag)              #delete this

I've found the right tag, it's  ['span', 'title', 'a']
The tag for next page is  a
a


Scrape all titles and links

In [120]:
# One row per thread with 'Title' and 'Link'; rows with a missing value are dropped by collect_all_links.
threads = collect_all_links(soup, right_tags, next_tag, collect_forum_data)
print(threads)

                                                   Link  \
0        forum/esta-question-on-employment-t172445.html   
1           forum/vegas-buffets-restaurants-t35799.html   
2                           forum/new-york-t173214.html   
3                   forum/my-new-york-trip-t172567.html   
4     forum/best-places-in-colorado-do-kayaking-t172...   
5            forum/san-francisco-transfers-t172521.html   
6                       forum/ocean-florida-t91886.html   
7     forum/broadway-shows-kinky-boots-phantom-of-op...   
8                forum/new-years-in-canada-t172324.html   
9                   forum/new-york-museums-t172322.html   
10                 forum/new-york-in-march-t172279.html   
11                            forum/boston-t171965.html   
12                  forum/trip-of-lifetime-t171927.html   
13                         forum/galapagos-t171883.html   
14    forum/any-recommendations-for-few-best-places-...   
15       forum/las-vegas-accomadation-help-t171441.html 

### 2. Collect the usernames and messages of every post in every thread

Enter the parameters needed:

In [121]:
# Use the first scraped thread as the sample page for tag discovery.
# .iloc[0] picks the first row by position: collect_all_links drops incomplete
# rows without resetting the index, so the label 0 is not guaranteed to exist.
START_URL = PREFIX_URL + threads.Link.iloc[0]
soup =  BeautifulSoup(requests.get(START_URL).text, 'html.parser')
#print(soup.prettify())

#Holiday Truths
# Two example usernames and two example message excerpts taken from the sample
# thread; they are handed to find_tags() in the cells below.
user1 = 'AnnaM'
user2 = 'Glynis HT Admin'
msg1 = 'Hi, I am brand new and hopefully I have put this question in the right area.'
msg2 = 'd have thought that was fine. Plenty of retired people travel.'

#Wrong Planet
#user1 = 'cyberdad'
#user2 = 'Solvejg'

#Trip Advisor
#print(soup.prettify())
#user1 = 'BradJill'
#user2 = 'Eden7'

Find the tags to be able to scrape the usernames

In [146]:
# Tag path of the usernames, derived from the two example usernames on the sample thread page.
user_tags = find_tags(soup, user1, user2)

I've found the right tag, it's  ['div', 'user-name', 'a']


Scrape all usernames

In [123]:
# Scrape the username of every post of every thread; the generic 'Any'
# column produced by collect_post_data is renamed to 'Username'.
user_posts = collect(threads, user_tags, collect_post_data).rename(columns={'Any':'Username'})

1434 out of 1434

Find the tags to be able to scrape the messages

In [147]:
# Tag path of the message bodies, derived from the two example message excerpts.
message_tags = find_tags(soup, msg1, msg2)

I've found the right tag, it's  ['div', 'col-md-10']


Scrape all messages

In [125]:
# Scrape the text of every post of every thread; the text lands in the 'Any' column.
message_posts = collect(threads, message_tags, collect_post_data)

1434 out of 1434

In [82]:
#print(message_posts)

### 3. Merge the titles, usernames, messages to have all posts

Concatenate the messages with the users

In [126]:
# Attach the messages to the username rows by index label.
# NOTE(review): usernames and messages are scraped in two separate passes, so this
# pairing is only right if both passes returned the same rows in the same order —
# presumably worth checking len(user_posts) == len(message_posts) first; confirm.
user_posts['Message'] =  pd.Series(message_posts['Any'], index=user_posts.index)
# posts is a second name for the same DataFrame object, not a copy.
posts= user_posts

In [127]:
# Display the posts with their 'Username', 'Link' and 'Message' columns.
posts

Unnamed: 0,Username,Link,Message
0,AnnaM,forum/esta-question-on-employment-t172445.html,"Hi, I am brand new and hopefully I have put th..."
1,Glynis HT Admin,forum/esta-question-on-employment-t172445.html,Hi Anita & \nIf you are retired Anna then I'...
2,AnnaM,forum/esta-question-on-employment-t172445.html,"Thanks for your input, My worry is if I put No..."
3,Lance Chambers,forum/esta-question-on-employment-t172445.html,This is not the case - they are more concerned...
4,AnnaM,forum/esta-question-on-employment-t172445.html,They also want to know your parents names? My...
5,Fiona,forum/esta-question-on-employment-t172445.html,"Anna- if you look at that section, the only co..."
6,Fiona,forum/esta-question-on-employment-t172445.html,Parents names are pretty normal information ne...
7,AnnaM,forum/esta-question-on-employment-t172445.html,"Hi Fiona, I’ve decided to do a tour called The..."
8,Fiona,forum/esta-question-on-employment-t172445.html,Is that the Cosmos one? Haven't been to LA but...
9,AnnaM,forum/esta-question-on-employment-t172445.html,"Hi Fiona, yes it’s with Cosmos, I initially wa..."


Merge with the titles

In [128]:
# Add the thread title to every post through the shared 'Link' column (inner join).
merged_df = pd.merge(threads, posts, on='Link', how='inner')
# Put the columns in alphabetical order.
merged_df = merged_df.reindex(sorted(merged_df.columns), axis=1)
# NOTE(review): absolute, machine-specific output path — adjust before running on another machine.
merged_df.to_json('C:/Users/Meret/Documents/EPFL/3Annee/Semestre_5/Projet/Forum_Chatbot/Forum Data/holiday_truths_threads_myscraping.json')

In [129]:
# Display the merged frame (Link, Message, Title, Username).
merged_df

Unnamed: 0,Link,Message,Title,Username
0,forum/esta-question-on-employment-t172445.html,"Hi, I am brand new and hopefully I have put th...",ESTA question on employment,AnnaM
1,forum/esta-question-on-employment-t172445.html,Hi Anita & \nIf you are retired Anna then I'...,ESTA question on employment,Glynis HT Admin
2,forum/esta-question-on-employment-t172445.html,"Thanks for your input, My worry is if I put No...",ESTA question on employment,AnnaM
3,forum/esta-question-on-employment-t172445.html,This is not the case - they are more concerned...,ESTA question on employment,Lance Chambers
4,forum/esta-question-on-employment-t172445.html,They also want to know your parents names? My...,ESTA question on employment,AnnaM
5,forum/esta-question-on-employment-t172445.html,"Anna- if you look at that section, the only co...",ESTA question on employment,Fiona
6,forum/esta-question-on-employment-t172445.html,Parents names are pretty normal information ne...,ESTA question on employment,Fiona
7,forum/esta-question-on-employment-t172445.html,"Hi Fiona, I’ve decided to do a tour called The...",ESTA question on employment,AnnaM
8,forum/esta-question-on-employment-t172445.html,Is that the Cosmos one? Haven't been to LA but...,ESTA question on employment,Fiona
9,forum/esta-question-on-employment-t172445.html,"Hi Fiona, yes it’s with Cosmos, I initially wa...",ESTA question on employment,AnnaM


### Comparison with the original scraper's output (remove when done)

In [134]:
# Load the reference dataset to compare against.
# NOTE(review): absolute, machine-specific input path — adjust before running on another machine.
original_merged_df = pd.read_json('C:/Users/Meret/Documents/EPFL/3Annee/Semestre_5/Projet/Forum_Chatbot/Forum Data/holiday_truths_threads.json')

In [135]:
# Drop the 'Replies' and 'Timestamp' columns of the reference data.
# NOTE(review): the variable is overwritten, so running this cell a second time presumably
# fails because the columns are already gone — confirm, or reload the file first.
original_merged_df = original_merged_df.drop(['Replies', 'Timestamp'], axis=1)

In [136]:
# Display the reference frame for a visual comparison with merged_df.
original_merged_df

Unnamed: 0,Link,Message,Title,Username
0,forum/esta-question-on-employment-t172445.html,"Hi, I am brand new and hopefully I have put th...",ESTA question on employment,AnnaM
1,forum/esta-question-on-employment-t172445.html,Hi Anita & \nIf you are retired Anna then I'...,ESTA question on employment,Glynis HT Admin
10,forum/esta-question-on-employment-t172445.html,You will love it!\n\n \n\n ...,ESTA question on employment,Fiona
100,forum/trip-of-lifetime-t171927.html,I havent done a multi centre in the States apa...,Trip of a lifetime!,Fiona
1000,forum/thanks-for-your-help-t151857.html,Re: Thanks for your help\n\n\n ...,Thanks for your help,luci HT Mod
10000,forum/anyone-ever-used-vrbo-t63592.html,Thanks widget.\n\n \n\n ...,Anyone ever used VRBO,Waterford2005
10001,forum/anyone-ever-used-vrbo-t63592.html,VRBO\n\n\n Just wondering if an...,Anyone ever used VRBO,Waterford2005
10002,forum/anyone-ever-used-vrbo-t63592.html,which company is that?\r\nVrbo?\n\na few have ...,Anyone ever used VRBO,ALISONSVILLA
10003,forum/anyone-ever-used-vrbo-t63592.html,When I had a home in Florida I got about 90% o...,Anyone ever used VRBO,terrynewpack
10004,forum/anyone-ever-used-vrbo-t63592.html,That's really interesting terry-you must have ...,Anyone ever used VRBO,ALISONSVILLA
