# Generalized Forum Scraping and Creation of PKL files


# Table of Contents

### [1. Generalized Scraping of an Online Forum](#1)
[1.1. Functions](#11) <br/>
[1.2. Collect all Titles and Links of the Forum](#121) <br/>
[1.3. Collect all Usernames, Messages of every Posts in every Link](#122) <br/>
[1.4. Merge the Titles, Usernames, Messages to have all Posts](#13) <br/>

### [2. Creation of the PKL Files](#2)
[2.1. Sorting Data](#21) <br/>
[2.2. Precomputing word2vec](#22) <br/>
[2.3. Dot Product Similarity Functions](#23) <br/>
[2.4. Chatbot Functions](#24) <br/>

## 1. Generalized Forum Scraping

### 1.1. Functions

In [5]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import nltk
import gensim
import re

Functions that find the tags allowing to retrieve the data we need

In [6]:
def find_next_tag(x, soup, tag):
    """
    Finds recursively the tag necessary to scrape the next page of a forum.
    :param x: string
    :param soup: BeautifulSoup
    :param tag: list
    :return: string
    """
    if x != None and x.parent != None:
        next_tag = x.parent.name
        try:
            soup.find_all(next_tag, string="Next")[0]['href'][1:]
            tag[0] = next_tag
            print("The tag for next page is ", next_tag)
        except:
            find_next_tag(x.parent, soup, tag)
        return tag[0]
    else:
        print("tag of the next page is not treated in this program")
        return None

def find_tags_recursively(x, comparator, tags):
    """
    Help function to find tags recursively.
    :param x: string
    :param comparator: string
    :param tags: list
    :return: list
    """
    tag = x.name
    try:
        class_name = x['class'][0]
        tags.append(class_name)
        tags.append(tag)
        next_ = x.findNext(tag, {'class': class_name})
    except:
        tags.append(tag)
        next_ = x.findNext(tag)
        
    if next_ != None:
        if comparator in next_.text:
            print("I've found the right tags, it's ", tags[::-1])
            return tags[::-1]
        else:
            return find_tags_recursively(x.parent, comparator, tags)
    else:
        print("Unable to scrape this text from this forum") 

def find_tags(soup, param1, param2):
    """
    Finds all tags needed to retrieve the information we're looking for (titles of threads, usernames, message, ...).
    :param soup: BeautifulSoup
    :param param1: string
    :param param2: string
    :return: list
    """
    text = soup.find(text=re.compile(param1))
    if text is None:
        print("Unable to scrape this text from this forum")
        return None
    else:
        return find_tags_recursively(text.parent, param2, []) 

Functions that scrape the forum thanks to the tags found above

In [7]:
def find_next(soup, next_tag):
    """
    Tries to find the next page, or returns an empty string if there is no next page.
    :param soup: BeautifulSoup
    :param next_tag: string
    :return: string
    """
    try:
        if next_tag is not None:
            return soup.find_all(next_tag, string="Next")[0]['href'][1:]
        return ""
    except:
        return ""

def collect_forum_data(soup_row, tags, soup_row_ok = 0, msg_or_user="", link=""):
    """
    Collects all relevant data (title, link) of a thread.
    :param soup_row: BeautifulSoup
    :param tags: list
    :param soup_row_ok: boolean (0 or 1)
    :param msg_or_user: string
    :param link: string
    :return: dict
    """
    data = {}
    try:
        if not soup_row_ok:
            soup_row = soup_row.find(tags[len(tags) -1])
    
        data['Title'] = soup_row.text.strip()
        data['Link'] = soup_row['href'][1:]
        
    except:
        pass
    return data

def collect_post_data(soup_row, tags, soup_row_ok, link=""):
    """
    Collects all relevant data of a post (message, user).
    :param soup_row: BeautifulSoup
    :param tags: list
    :param soup_row_ok: boolean (0 or 1)
    :param link: string
    :return: dict
    """
    data = {}
    try:        
        data['Link'] = link.replace(PREFIX_URL, '')
        
        if len(tags) == 2 and not soup_row_ok:
            soup_row = soup_row.find_all(tags[len(tags)-1])
        if len(tags) == 3:                       
            soup_row = soup_row.find_all(tags[len(tags)-1])
        elif len(tags) == 4:
            soup_row = soup_row.find(tags[2]).find_all(tags[3])
        
        if soup_row_ok:
            data['Any'] = soup_row.text.strip()
        elif len(soup_row) > 1:
            #concatenate message if it spans over multiple tags (often <p>)
            message = ''
            for msg in soup_row:
                message += msg.text.strip()  
            data['Any'] = message
        else:
            data['Any'] = soup_row[0].text.strip()
          
    except:
        pass
    return data

def collect_recursively(data, soup, tags, next_tag, fcte_name, link="", index=""):
    """
    Collects data recursively using a collection function from the two above.
    :param data: dict
    :param soup: BeautifulSoup
    :param tags: list
    :param next_tag: string
    :param fcte_name: function
    :param link: string
    :param index: int
    :return: dict
    """
    try:
        if index:
            print(index, end='\r', flush=True)
        
        soup_row_ok = 0
        if len(tags) == 1:
            soup_rows = soup.find_all(tags[0])
            soup_row_ok = 1
        if len(tags) == 2:
            soup_rows = soup.find_all(tags[0], {'class': tags[1]})
            if not soup_rows:
                soup_rows = soup.find_all(tags[0]) 
            else:
                soup_row_ok = 1      
        if len(tags) == 3 or len(tags) == 4:
            soup_rows = soup.find_all(tags[0], {'class': tags[1]})
        
        data.extend([fcte_name(soup_row, tags, soup_row_ok, link) for soup_row in soup_rows])
        next_url = find_next(soup, next_tag)
        if next_url:
            soup = BeautifulSoup(requests.get(PREFIX_URL + next_url).text, 'html.parser')
            if index:
                return collect_recursively(data, soup, tags, next_tag, fcte_name, link, index+1)
            else:
                return collect_recursively(data, soup, tags, next_tag, fcte_name, link)
        else:
            return data
    except:
        pass
    return data
                
def verify_if_treated(soup, tags):
    """
    Verifies if the case is treated yet (depending on the length and composants of the tags).
    :param soup: BeautifulSoup
    :param tags: list
    """
    if len(tags) > 4 or len(tags) < 1:
        print("This case is not treated yet")
        
    if len(tags) == 3 or len(tags) == 4:
        try:
            soup.find(tags[0], {'class': tags[1]})
        except:
            print("This case is not treated yet")
    
def collect_all_links(soup, tags, next_tag, fcte_name):
    """
    Launches process to collect a dataframe for all threads of a forum (namely collects all links and titles of a forum).
    :param soup: BeautifulSoup
    :param tags: list
    :param next_tag: string
    :param fcte_name: function
    :return: DataFrame
    """
    verify_if_treated(soup, tags)

    data = collect_recursively([], soup, tags, next_tag, fcte_name)
    return pd.DataFrame(data).dropna()

def collect(forum_df, tags, next_tag, fcte_name):
    """
    Launches process to collect a dataframe for all posts of a thread (namely collects all usernames and messages of a thread).
    :param forum_df: DataFrame
    :param tags: list
    :param next_tag: string
    :param data_function: function
    :return: DataFrame
    """
    data = []
    total = len(forum_df['Link'])
    index = 0
    for url in forum_df['Link']:
        index += 1
        print('{} out of {}'.format(index, total), end='\r', flush=True)
        soup = BeautifulSoup(requests.get(PREFIX_URL + url).text, 'html.parser')
        verify_if_treated(soup, tags)
        data.extend(collect_recursively([], soup, tags, next_tag, fcte_name, url))
    return pd.DataFrame(data)

### 1.2. Collect all Titles and Links of the Forum

Enter the 3 parameters needed:

In [8]:
#Holiday Truths (America/Canada Discussion)
PREFIX_URL = 'https://www.holidaytruths.co.uk/'
START_URL = PREFIX_URL + 'forum/america-canada-discussion-forum-f2-0.html'
title1 = 'ESTA question on employment'
title2 = 'Vegas Buffets/Restaurants'

#Wrong Planet
#PREFIX_URL = 'http://wrongplanet.net/forums'
#START_URL = PREFIX_URL + '/viewforum.php?f=19'
#title1 = 'RE: Kids w/ Classic Autism, PDD-NOS & Speech Delays'
#title2 = 'Parents on the spectrum'

#Trip Advisor
#PREFIX_URL = 'https://www.tripadvisor.co.uk/'
#START_URL = PREFIX_URL + 'ShowForum-g1-i12334-Holiday_Travel.html'
#title1 = 'See TOP QUESTIONS before posting!'
#title2 = 'Use the SEARCH BOX function before posting!'

In [9]:
soup = BeautifulSoup(requests.get(START_URL).text, 'html.parser')

We find the tags to be able to scrape all titles and links of a forum and the tag needed to find the next page

In [10]:
tags = find_tags(soup, title1, title2)
next_tag = find_next_tag(soup.find(text=re.compile("Next")), soup, [''])

I've found the right tags, it's  ['span', 'title', 'a']
The tag for next page is  a


We scrape all titles and links

In [14]:
threads = collect_all_links(soup, tags, next_tag, collect_forum_data)
threads.head()

Unnamed: 0,Link,Title
0,forum/esta-question-on-employment-t172445.html,ESTA question on employment
1,forum/vegas-buffets-restaurants-t35799.html,Vegas Buffets/Restaurants
2,forum/new-york-t173214.html,New York
3,forum/my-new-york-trip-t172567.html,My New York Trip
4,forum/best-places-in-colorado-do-kayaking-t172...,Best places in Colorado to do kayaking?


In [12]:
threads.to_json('Forum Data/subjects.json')

### 1.3. Collect all Usernames, Messages of every Posts in every Link

Enter the parameters needed:

In [15]:
#Holiday Truths (America/Canada Discussion)
user1 = 'AnnaM'
user2 = 'Glynis HT Admin'
msg1 = 'Hi, I am brand new and hopefully I have put this question in the right area.'
msg2 = 'd have thought that was fine. Plenty of retired people travel.'

#Wrong Planet
#user1 = 'cyberdad'
#user2 = 'Solvejg'
#msg1 = 'I cope fine in the general parenting area.'
#msg2 = 'Yes.'

#Trip Advisor
#user1 = 'BradJill'
#user2 = 'Eden7'
#msg1 = 'HOW TO USE THE HOLIDAY TRAVEL FORUM!'
#msg2 = 'Great advice BradJill.....'

In [16]:
START_URL = PREFIX_URL + threads.Link[0]
soup =  BeautifulSoup(requests.get(START_URL).text, 'html.parser')

We check that prefix_url is well defined (optional, run this cell if unable to scrape usernames and messages)

In [17]:
#if START_URL != right_url:
#    print("PREFIX_URL wasn't well defined. Please check it.")

We find the tags to be able to scrape the usernames

In [18]:
user_tags = find_tags(soup, user1, user2)

I've found the right tags, it's  ['div', 'user-name', 'a']


We scrape all usernames

In [19]:
user_posts = collect(threads, user_tags, next_tag, collect_post_data)
user_posts.rename(columns={'Any':'Username'}, inplace=True)
user_posts.head()

1434 out of 1434

Unnamed: 0,Username,Link
0,AnnaM,forum/esta-question-on-employment-t172445.html
1,Glynis HT Admin,forum/esta-question-on-employment-t172445.html
2,AnnaM,forum/esta-question-on-employment-t172445.html
3,Lance Chambers,forum/esta-question-on-employment-t172445.html
4,AnnaM,forum/esta-question-on-employment-t172445.html


We find the tags to be able to scrape the messages

In [20]:
message_tags = find_tags(soup, msg1, msg2)

I've found the right tags, it's  ['div', 'col-md-10']


We scrape all messages

In [21]:
message_posts = collect(threads, message_tags, next_tag, collect_post_data)
message_posts.head()

1434 out of 1434

Unnamed: 0,Any,Link
0,"Hi, I am brand new and hopefully I have put th...",forum/esta-question-on-employment-t172445.html
1,Hi Anita & \nIf you are retired Anna then I'...,forum/esta-question-on-employment-t172445.html
2,"Thanks for your input, My worry is if I put No...",forum/esta-question-on-employment-t172445.html
3,This is not the case - they are more concerned...,forum/esta-question-on-employment-t172445.html
4,They also want to know your parents names? My...,forum/esta-question-on-employment-t172445.html


Sometimes random data is found in the tags of the message (ex. 'p'), this function gets rid of this data

In [22]:
def check_messages(data, msg1):
    for i in range(0, len(data)-1):
        if msg1 in data['Any'][i]:
            return i

In [23]:
index = check_messages(message_posts, msg1)
print("index: ", index)
message_posts['Any'] = message_posts['Any'].iloc[index:]
message_posts['Any'] = message_posts['Any'].shift(-index)

index:  0


### 1.4. Merge the Titles, Usernames, Messages to have all Posts

We concatenate the messages with the users

In [24]:
user_posts['Message'] =  pd.Series(message_posts['Any'], index=user_posts.index)
posts= user_posts

In [25]:
posts.head()

Unnamed: 0,Username,Link,Message
0,AnnaM,forum/esta-question-on-employment-t172445.html,"Hi, I am brand new and hopefully I have put th..."
1,Glynis HT Admin,forum/esta-question-on-employment-t172445.html,Hi Anita & \nIf you are retired Anna then I'...
2,AnnaM,forum/esta-question-on-employment-t172445.html,"Thanks for your input, My worry is if I put No..."
3,Lance Chambers,forum/esta-question-on-employment-t172445.html,This is not the case - they are more concerned...
4,AnnaM,forum/esta-question-on-employment-t172445.html,They also want to know your parents names? My...


We merged the messages and usernames with the titles

In [26]:
merged_df = pd.merge(threads, posts, on='Link', how='inner')
merged_df = merged_df.reindex(sorted(merged_df.columns), axis=1)
merged_df.to_json('Forum Data/threads.json')

In [27]:
merged_df.head()

Unnamed: 0,Link,Message,Title,Username
0,forum/esta-question-on-employment-t172445.html,"Hi, I am brand new and hopefully I have put th...",ESTA question on employment,AnnaM
1,forum/esta-question-on-employment-t172445.html,Hi Anita & \nIf you are retired Anna then I'...,ESTA question on employment,Glynis HT Admin
2,forum/esta-question-on-employment-t172445.html,"Thanks for your input, My worry is if I put No...",ESTA question on employment,AnnaM
3,forum/esta-question-on-employment-t172445.html,This is not the case - they are more concerned...,ESTA question on employment,Lance Chambers
4,forum/esta-question-on-employment-t172445.html,They also want to know your parents names? My...,ESTA question on employment,AnnaM


## 2. Creation of the PKL Files

In [28]:
import numpy as np
import pandas as pd
import nltk

We read the data

In [29]:
forum_subjects = pd.read_json('Forum Data/subjects.json')
forum_threads = pd.read_json('Forum Data/threads.json')

In [30]:
forum_subjects.head()

Unnamed: 0,Link,Title
0,forum/esta-question-on-employment-t172445.html,ESTA question on employment
1,forum/vegas-buffets-restaurants-t35799.html,Vegas Buffets/Restaurants
10,forum/new-york-in-march-t172279.html,New York in March
100,forum/motorhome-rv-holiday-in-arizona-nevada-t...,motorhome/RV holiday in arizona/nevada?
1000,forum/las-vegas-report-kind-of-t109997.html,Las Vegas Report (Kind Of)


In [31]:
forum_threads.head()

Unnamed: 0,Link,Message,Title,Username
0,forum/esta-question-on-employment-t172445.html,"Hi, I am brand new and hopefully I have put th...",ESTA question on employment,AnnaM
1,forum/esta-question-on-employment-t172445.html,Hi Anita & \nIf you are retired Anna then I'...,ESTA question on employment,Glynis HT Admin
10,forum/esta-question-on-employment-t172445.html,You will love it!\n\n \n\n ...,ESTA question on employment,Fiona
100,forum/vegas-buffets-restaurants-t35799.html,Thank you so much for your help!\n\n ...,Vegas Buffets/Restaurants,Zabka
1000,forum/las-vegas-2012-t152580.html,Re: Las Vegas 2012.....\n\n\n H...,Las Vegas 2012.....,luci HT Mod


If we have data coming from different discussion forums we can concatenate it.

In [None]:
"""
forum_threads_1 = pd.read_json('...')
forum_subjects_1 = pd.read_json('...')
forum_threads_2 = pd.read_json('...')
forum_subjects_2 = pd.read_json('...')

forum_subjects = pd.concat([forum_subjects_1, forum_subjects_2], ignore_index= True)
forum_threads = pd.concat([forum_threads_1, forum_threads_2], ignore_index= True)
"""

In [34]:
forum_threads.Message = forum_threads.Message.apply(lambda msg: msg.split('_________________')[0])
forum_subjects = forum_subjects.drop_duplicates(subset=['Link'], keep='first')
forum_threads = forum_threads.drop_duplicates(subset=['Link', 'Message'], keep='first')

The total number of posts is:

In [35]:
len(forum_threads)

15760

The total number of threads is:

In [36]:
len(forum_subjects)

1434

In [37]:
merged_forum_threads = pd.merge(forum_threads.drop(['Title'], axis=1), forum_subjects, 
                                on='Link', how='inner')
merged_forum_threads.to_json('Forum Data/merged.json')
merged_forum_threads.head()

Unnamed: 0,Link,Message,Username,Title
0,forum/esta-question-on-employment-t172445.html,"Hi, I am brand new and hopefully I have put th...",AnnaM,ESTA question on employment
1,forum/esta-question-on-employment-t172445.html,Hi Anita & \nIf you are retired Anna then I'...,Glynis HT Admin,ESTA question on employment
2,forum/esta-question-on-employment-t172445.html,You will love it!\n\n \n\n ...,Fiona,ESTA question on employment
3,forum/esta-question-on-employment-t172445.html,Keep your ESTA reference number\n\n\n ...,James Fletcher,ESTA question on employment
4,forum/esta-question-on-employment-t172445.html,"Thanks for your input, My worry is if I put No...",AnnaM,ESTA question on employment


In [38]:
len(merged_forum_threads)

15760

### 2.1. Sorting Data

In [39]:
import nltk.data
import subprocess
import pandas as pd

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [40]:
df = pd.read_json('Forum Data/merged.json')
df = df[df['Message'].map(lambda x: x is not None)]
df.head()

Unnamed: 0,Link,Message,Title,Username
0,forum/esta-question-on-employment-t172445.html,"Hi, I am brand new and hopefully I have put th...",ESTA question on employment,AnnaM
1,forum/esta-question-on-employment-t172445.html,Hi Anita & \nIf you are retired Anna then I'...,ESTA question on employment,Glynis HT Admin
10,forum/esta-question-on-employment-t172445.html,Is that the Cosmos one? Haven't been to LA but...,ESTA question on employment,Fiona
100,forum/vegas-buffets-restaurants-t35799.html,"Ah, the legendary Cheesecake Factory... people...",Vegas Buffets/Restaurants,grayejectbutton
1000,forum/fort-lauderdale-miami-t114861.html,Re: Fort Lauderdale to Miami\n\n\n ...,Fort Lauderdale to Miami,anstruther


We find the first post of every threads.

In [41]:
titles_with_first_post = df.groupby('Link').first().reset_index()[['Link', 'Title', 'Username', 'Message']]
titles_with_first_post.columns = ['Link', 'Title', 'Seeker', 'First_Post']
titles_with_first_post.head()

Unnamed: 0,Link,Title,Seeker,First_Post
0,forum/-20-dollar-trick-t110905.html,$20 Dollar trick,jac47,just returned from our first trip to vegas.\nI...
1,forum/-toronto-halal-t102178.html,--Toronto- halal--,Just_a_tourist,Hi! Does anyone know good places where they se...
2,forum/1st-time-florida-help-t141308.html,1st Time to Florida HELP!!!,Lelly,"Hello, we are a family of 5 going to Florida f..."
3,forum/1st-time-florida-whats-nearby-t153058.html,1st time florida - whats nearby,cart583,"hi, never been to florida before but have book..."
4,forum/1st-time-new-york-booking-advice-t126856...,1st time New York / booking advice,seagull,Starting to look into booking a break in New Y...


In [42]:
def tokenize_properly(text):
    if text is not None:
        r = [sent for sent in tokenizer.tokenize(text.replace('\n', '.')) 
            if len(sent.replace('.', '').replace(' ', '')) >= 2]
        return r

titles_with_first_post.Title = titles_with_first_post.Title.apply(tokenize_properly)
titles_with_first_post.First_Post = titles_with_first_post.First_Post.apply(tokenize_properly)
titles_with_first_post.head()

Unnamed: 0,Link,Title,Seeker,First_Post
0,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...
1,forum/-toronto-halal-t102178.html,[--Toronto- halal--],Just_a_tourist,"[Hi!, Does anyone know good places where they ..."
2,forum/1st-time-florida-help-t141308.html,[1st Time to Florida HELP!!],Lelly,"[Hello, we are a family of 5 going to Florida ..."
3,forum/1st-time-florida-whats-nearby-t153058.html,[1st time florida - whats nearby],cart583,"[hi, never been to florida before but have boo..."
4,forum/1st-time-new-york-booking-advice-t126856...,[1st time New York / booking advice],seagull,[Starting to look into booking a break in New ...


An approach is to put the titles and the first posts in lower case.

In [43]:
def lower(text):
    if text is not None:
        r = [s.lower() for s in text]
        return r
#titles_with_first_post.Title = titles_with_first_post.Title.apply(lower)
#titles_with_first_post.First_Post = titles_with_first_post.First_Post.apply(lower)
#titles_with_first_post.head()

We count the number of sentences for the titles and the first posts.

In [44]:
titles_with_first_post['Title_sent_count'] = titles_with_first_post.Title.apply(len)
titles_with_first_post['FP_sent_count'] = titles_with_first_post.First_Post.apply(len)
titles_with_first_post.head()

Unnamed: 0,Link,Title,Seeker,First_Post,Title_sent_count,FP_sent_count
0,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...,1,3
1,forum/-toronto-halal-t102178.html,[--Toronto- halal--],Just_a_tourist,"[Hi!, Does anyone know good places where they ...",1,3
2,forum/1st-time-florida-help-t141308.html,[1st Time to Florida HELP!!],Lelly,"[Hello, we are a family of 5 going to Florida ...",1,7
3,forum/1st-time-florida-whats-nearby-t153058.html,[1st time florida - whats nearby],cart583,"[hi, never been to florida before but have boo...",1,7
4,forum/1st-time-new-york-booking-advice-t126856...,[1st time New York / booking advice],seagull,[Starting to look into booking a break in New ...,1,1


Another approach is to clean the replies as for example "Reply" or "Re: title of the post" are not useful to find the best answer.

In [47]:
def cleaning_replies(text):
    if text is not None:
        r = [s for s in text if not s.startswith('Re:') and not s.startswith("Reply") ]
        return r

We get rid of everything the author of the first post is saying, we tokenize the data and we count the number of sentences of the replies.

In [48]:
titles_with_messages = pd.merge(titles_with_first_post, 
                                df[['Username', 'Message', 'Link']], on='Link')
titles_with_messages.rename(columns={'Username':'Replier'}, inplace=True)
titles_with_messages = titles_with_messages[titles_with_messages.apply(
    lambda row: not row['First_Post'] == row['Message'] and not row['Seeker'] == row['Replier'], axis=1)]
titles_with_messages.Message = titles_with_messages.Message.apply(tokenize_properly)
#titles_with_messages.Message = titles_with_messages.Message.apply(cleaning_replies)
titles_with_messages.rename(columns={'Message': 'Reply'}, inplace=True)
titles_with_messages['Reply_sent_count'] = titles_with_messages.Reply.apply(len)
titles_with_messages.head()

Unnamed: 0,Link,Title,Seeker,First_Post,Title_sent_count,FP_sent_count,Replier,Reply,Reply_sent_count
1,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...,1,3,luci HT Mod,"[Excellent news jac!, I'm so pleased you tried...",2
2,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...,1,3,lesley74,[I love the $20 trick but I haven't tried it f...,1
3,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...,1,3,luci HT Mod,[Tried it at the Bellagio in November and got ...,5
4,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...,1,3,kiershay,[],0
5,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...,1,3,luci HT Mod,"[The suggested method is, when they ask for yo...",4


In [49]:
titles_with_messages.columns

Index(['Link', 'Title', 'Seeker', 'First_Post', 'Title_sent_count',
       'FP_sent_count', 'Replier', 'Reply', 'Reply_sent_count'],
      dtype='object')

In [50]:
len(titles_with_messages)

11818

In [51]:
CHOSEN_COLUMNS = ['Link', 'Reply']
titles_with_messages[CHOSEN_COLUMNS].head()

Unnamed: 0,Link,Reply
1,forum/-20-dollar-trick-t110905.html,"[Excellent news jac!, I'm so pleased you tried..."
2,forum/-20-dollar-trick-t110905.html,[I love the $20 trick but I haven't tried it f...
3,forum/-20-dollar-trick-t110905.html,[Tried it at the Bellagio in November and got ...
4,forum/-20-dollar-trick-t110905.html,[]
5,forum/-20-dollar-trick-t110905.html,"[The suggested method is, when they ask for yo..."


In [52]:
msg_df = titles_with_messages[CHOSEN_COLUMNS]
msg_df.head()

Unnamed: 0,Link,Reply
1,forum/-20-dollar-trick-t110905.html,"[Excellent news jac!, I'm so pleased you tried..."
2,forum/-20-dollar-trick-t110905.html,[I love the $20 trick but I haven't tried it f...
3,forum/-20-dollar-trick-t110905.html,[Tried it at the Bellagio in November and got ...
4,forum/-20-dollar-trick-t110905.html,[]
5,forum/-20-dollar-trick-t110905.html,"[The suggested method is, when they ask for yo..."


In [53]:
titles_with_first_post.head()

Unnamed: 0,Link,Title,Seeker,First_Post,Title_sent_count,FP_sent_count
0,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...,1,3
1,forum/-toronto-halal-t102178.html,[--Toronto- halal--],Just_a_tourist,"[Hi!, Does anyone know good places where they ...",1,3
2,forum/1st-time-florida-help-t141308.html,[1st Time to Florida HELP!!],Lelly,"[Hello, we are a family of 5 going to Florida ...",1,7
3,forum/1st-time-florida-whats-nearby-t153058.html,[1st time florida - whats nearby],cart583,"[hi, never been to florida before but have boo...",1,7
4,forum/1st-time-new-york-booking-advice-t126856...,[1st time New York / booking advice],seagull,[Starting to look into booking a break in New ...,1,1


In [54]:
tfp_df = titles_with_first_post[['Link', 'Title', 'First_Post']]
tfp_df.head()

Unnamed: 0,Link,Title,First_Post
0,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],[just returned from our first trip to vegas..I...
1,forum/-toronto-halal-t102178.html,[--Toronto- halal--],"[Hi!, Does anyone know good places where they ..."
2,forum/1st-time-florida-help-t141308.html,[1st Time to Florida HELP!!],"[Hello, we are a family of 5 going to Florida ..."
3,forum/1st-time-florida-whats-nearby-t153058.html,[1st time florida - whats nearby],"[hi, never been to florida before but have boo..."
4,forum/1st-time-new-york-booking-advice-t126856...,[1st time New York / booking advice],[Starting to look into booking a break in New ...


### 2.2. Precomputing word2vec

We apply word2vec (we give a vector representation for every word of a sentence and then we associate every sentence with the means of its words). We do this for the titles, the first posts and the messages.

In [55]:
import gensim
import numpy as np
model = gensim.models.KeyedVectors.load_word2vec_format('Forum Data/GoogleNews-vectors-negative300.bin', binary=True)
from nltk.corpus import stopwords
stopwords = set(nltk.corpus.stopwords.words('english'))

def get_sentence_vector(sentence):
    tokens = [token for token in nltk.word_tokenize(sentence) if token not in stopwords]
    vectors = []
    for token in tokens:
        try:
            word_vec = model.wv[token]
            vectors.append(word_vec)
        except:
            pass
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return []

In [56]:
tfp_df['Title_word2vec'] = tfp_df.Title.apply(lambda sents: [get_sentence_vector(sent) for sent in sents])
tfp_df['First_Post_word2vec'] = tfp_df.First_Post.apply(lambda sents: [get_sentence_vector(sent) for sent in sents])
tfp_df.head()

  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Link,Title,First_Post,Title_word2vec,First_Post_word2vec
0,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],[just returned from our first trip to vegas..I...,"[[0.053548176, -0.06933594, -0.022298178, 0.05...","[[0.06859633, 0.07334631, -0.01894244, 0.10071..."
1,forum/-toronto-halal-t102178.html,[--Toronto- halal--],"[Hi!, Does anyone know good places where they ...","[[-0.4375, -0.36914062, 0.21484375, 0.14941406...","[[-0.087402344, 0.095703125, 0.27539062, -0.01..."
2,forum/1st-time-florida-help-t141308.html,[1st Time to Florida HELP!!],"[Hello, we are a family of 5 going to Florida ...","[[0.09753418, 0.0126953125, 0.038024902, 0.224...","[[0.039695047, 0.022238992, 0.029807352, 0.102..."
3,forum/1st-time-florida-whats-nearby-t153058.html,[1st time florida - whats nearby],"[hi, never been to florida before but have boo...","[[0.030883789, -0.016217042, 0.0061157225, 0.2...","[[-0.010480608, 0.021902902, -0.032854352, 0.0..."
4,forum/1st-time-new-york-booking-advice-t126856...,[1st time New York / booking advice],[Starting to look into booking a break in New ...,"[[-0.012568156, 0.072255455, -0.041554768, 0.0...","[[0.042194713, 0.096147016, -0.065665506, 0.10..."


In [57]:
msg_df['Reply_word2vec'] = msg_df.Reply.apply(lambda sents: [get_sentence_vector(sent) for sent in sents])
msg_df.head()

  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Link,Reply,Reply_word2vec
1,forum/-20-dollar-trick-t110905.html,"[Excellent news jac!, I'm so pleased you tried...","[[-0.1188151, -0.056722004, -0.079589844, -0.0..."
2,forum/-20-dollar-trick-t110905.html,[I love the $20 trick but I haven't tried it f...,"[[0.15499441, 0.046831403, 0.044363838, 0.1210..."
3,forum/-20-dollar-trick-t110905.html,[Tried it at the Bellagio in November and got ...,"[[0.05673828, 0.018359374, 0.0030273437, 0.101..."
4,forum/-20-dollar-trick-t110905.html,[],[]
5,forum/-20-dollar-trick-t110905.html,"[The suggested method is, when they ask for yo...","[[-0.018924968, 0.0462972, -0.009883626, 0.071..."


We write the data to pickle files

In [58]:
import pickle

output1 = open('Forum Data/tfp_df.pkl', 'wb')
pickle.dump(tfp_df, output1)

output2 = open('Forum Data/msg_df.pkl', 'wb')
pickle.dump(msg_df, output2)

### 2.3. Dot Product Similarity Functions

In [59]:
import pandas as pd
from os import listdir
import gensim
import numpy as np
import nltk
from nltk.corpus import stopwords
stopwords = set(nltk.corpus.stopwords.words('english'))

In [60]:
model = gensim.models.KeyedVectors.load_word2vec_format('Forum Data/GoogleNews-vectors-negative300.bin', binary=True)

In [61]:
tfp_df = pd.read_pickle('Forum Data/tfp_df.pkl')
tfp_df.head()

Unnamed: 0,Link,Title,First_Post,Title_word2vec,First_Post_word2vec
0,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],[just returned from our first trip to vegas..I...,"[[0.053548176, -0.06933594, -0.022298178, 0.05...","[[0.06859633, 0.07334631, -0.01894244, 0.10071..."
1,forum/-toronto-halal-t102178.html,[--Toronto- halal--],"[Hi!, Does anyone know good places where they ...","[[-0.4375, -0.36914062, 0.21484375, 0.14941406...","[[-0.087402344, 0.095703125, 0.27539062, -0.01..."
2,forum/1st-time-florida-help-t141308.html,[1st Time to Florida HELP!!],"[Hello, we are a family of 5 going to Florida ...","[[0.09753418, 0.0126953125, 0.038024902, 0.224...","[[0.039695047, 0.022238992, 0.029807352, 0.102..."
3,forum/1st-time-florida-whats-nearby-t153058.html,[1st time florida - whats nearby],"[hi, never been to florida before but have boo...","[[0.030883789, -0.016217042, 0.0061157225, 0.2...","[[-0.010480608, 0.021902902, -0.032854352, 0.0..."
4,forum/1st-time-new-york-booking-advice-t126856...,[1st time New York / booking advice],[Starting to look into booking a break in New ...,"[[-0.012568156, 0.072255455, -0.041554768, 0.0...","[[0.042194713, 0.096147016, -0.065665506, 0.10..."


Functions that compute the similarity between sentences.

In [62]:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2)/(np.linalg.norm(vec1) * np.linalg.norm(vec2))

def get_sentence_vector(sentence):
    tokens = [token for token in nltk.word_tokenize(sentence) if token not in stopwords]
    vectors = []
    for token in tokens:
        try:
            word_vec = model.wv[token]
            vectors.append(word_vec)
        except:
            pass
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return []

def is_not_null(sent_vec):
    for element in sent_vec:
        if not element == 0.0:
            return True
    return False

def sent_to_text_similarity(sent_vec, text_vec):
    similarities = []
    for vec in text_vec:
        if is_not_null(vec):
            similarities.append(np.dot(sent_vec, vec)/(np.linalg.norm(sent_vec) * np.linalg.norm(vec)))
    if similarities:
        return np.mean(similarities)
    else:
        return np.nan

def text_to_text_similarity(sent_vecs1, sent_vecs2):
    similarities = []
    for v1 in sent_vecs1:
        if is_not_null(v1):
            similarity = sent_to_text_similarity(v1, sent_vecs2)
            if not np.isnan(similarity):
                similarities.append(similarity)
    if similarities:
        return np.mean(similarities)
    else:
        return np.nan

def text_to_corpus_similarity(text, corpus):
    sent_vecs = text_to_sent_vec(text)
    corpus_vecs = [text_to_sent_vec(other_text) for other_text in corpus]
    max_sim = 0
    index = -1
    for text_index in range(len(corpus_vecs)):
        similarity = text_to_text_similarity(sent_vecs, corpus_vecs[text_index])
        if not np.isnan(similarity) and max_sim < similarity:
            max_sim = similarity
            index = text_index
    if index >= 0:
        return corpus[index]
    else:
        return None

### 2.4. Chatbot Functions

Functions that find the best matching sentences in a forum.

In [63]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [64]:
def compute_similarity(row, sent_vec):
    title_sim = 0
    title_word2vec = row['Title_word2vec']
    if len(title_word2vec) > 0:
        if len(title_word2vec[0]) > 0:
            title_sim = cosine_similarity(sent_vec, title_word2vec[0])
    return title_sim

def compute_separate_similarity(row, sent_vecs):
    title_sim = 0
    title_word2vec = row['Title_word2vec']
    if len(title_word2vec) > 0:
        if len(title_word2vec[0]) > 0:
            title_sim = np.dot(sent_vecs[0], title_word2vec[0])/(np.linalg.norm(sent_vecs[0])*np.linalg.norm(title_word2vec[0]))
    fp_sim = text_to_text_similarity(sent_vecs[1:], row['First_Post_word2vec'])
    return title_sim + fp_sim

def compute_separate_similarity_no_question(row, sent_vecs):
    fp_sim = text_to_text_similarity(sent_vecs, row['First_Post_word2vec'])
    return fp_sim

def get_most_similar_title(sentences, sent_vecs):
    """
    Finds the most similar thread in a forum (the thread which will be the most likely to have a matching response).
    Returns the row of the DataFrame of this most similar thread.
    :param sentences: list
    :param sent_vecs: list
    :return: DataFrame
    """
    if sentences == 0:
        raise ValueError('Write something!')
    elif len(sentences) == 1:
        #compares only with titles
        title_fp_sim = tfp_df.apply(lambda row: compute_similarity(row, sent_vecs[0]), axis=1)
    elif sentences[0].endswith('?'):
        #compares with titles and first posts
        title_fp_sim = tfp_df.apply(lambda row: compute_separate_similarity(row, sent_vecs), axis=1)
    else:
        #compares with first posts
        title_fp_sim = tfp_df.apply(lambda row: compute_separate_similarity_no_question(row, sent_vecs), axis=1)
    return tfp_df.loc[title_fp_sim.idxmax()]

def get_response_sentences(sent_vecs, link, max_sentences):
    """
    Finds the closest sentences (depending on max_sentences) in a particular thread.
    :param sent_vecs: list
    :param link: string
    :param max_sentences: int
    :return: string
    """
    answer_df = pd.read_pickle('Forum Data/msg_df.pkl')
    answer_df = answer_df[answer_df['Link'].map(lambda x: x == link)]
    
    if answer_df.empty:
        s = 'I did not find a matching sentence'
        return s
    
    best_answer = answer_df.loc[answer_df['Reply_word2vec'].apply(lambda other_vecs: 
                                                     text_to_text_similarity(sent_vecs, other_vecs)).idxmax()]
        
    best_sentence_idx = np.argmax([sent_to_text_similarity(sent_vec, sent_vecs) for sent_vec in best_answer.Reply_word2vec if len(sent_vec)])
    reply_sentences = best_answer.Reply
    if max_sentences <= 1:
        return reply_sentences[best_sentence_idx]
    else:
        context_sent_count = int((max_sentences - 1)/2)
        sent_count = len(reply_sentences)
        lower_bound = best_sentence_idx - context_sent_count
        upper_bound = best_sentence_idx + context_sent_count + 1
        return ' '.join(reply_sentences[max(0, lower_bound - max(0, upper_bound - sent_count)): 
                                        min(upper_bound + max(0, 0 - lower_bound) + ((max_sentences - 1) % 2), sent_count)])

def chatbot_answer(question, max_sentences=1):
    """
    Finds the closest response in the forum using forum data and word2vec.
    :param question: string
    :param max_sentences: int
    :return: string
    """
    sentences = tokenizer.tokenize(question)
    sent_vecs = [get_sentence_vector(sent) for sent in sentences]
    most_similar_title = get_most_similar_title(sentences, sent_vecs)
    return get_response_sentences(sent_vecs, most_similar_title.Link, max_sentences)
                        

In [65]:
chatbot_answer("Best hotel in Vegas?", max_sentences=3)

  if __name__ == '__main__':


"Our fave is the MGM Grand. Yes it is huge, but that is what Vegas is all about. You have everything you want in one hotel...The atmosphere is great with a very young feel to the place and the feel good factor is more evident here than at other hotels, with perhaps the NYNY being the only hotel able to compete...Rooms are to die for, service is exceptional, and location can't be beaten."