## Create Pkl Files

In [7]:
import numpy as np
import pandas as pd
import nltk

We read the data

In [9]:
forum_subjects = pd.read_json('Forum Data/subjects.json')
forum_threads = pd.read_json('Forum Data/threads.json')

In [10]:
forum_subjects.head()

Unnamed: 0,Link,Title
0,forum/esta-question-on-employment-t172445.html,ESTA question on employment
1,forum/vegas-buffets-restaurants-t35799.html,Vegas Buffets/Restaurants
10,forum/new-york-in-march-t172279.html,New York in March
100,forum/motorhome-rv-holiday-in-arizona-nevada-t...,motorhome/RV holiday in arizona/nevada?
1000,forum/las-vegas-report-kind-of-t109997.html,Las Vegas Report (Kind Of)


In [11]:
forum_threads.head()

Unnamed: 0,Link,Message,Title,Username
0,forum/esta-question-on-employment-t172445.html,"Hi, I am brand new and hopefully I have put th...",ESTA question on employment,AnnaM
1,forum/esta-question-on-employment-t172445.html,Hi Anita & \nIf you are retired Anna then I'...,ESTA question on employment,Glynis HT Admin
10,forum/esta-question-on-employment-t172445.html,You will love it!\n\n \n\n ...,ESTA question on employment,Fiona
100,forum/vegas-buffets-restaurants-t35799.html,Thank you so much for your help!\n\n ...,Vegas Buffets/Restaurants,Zabka
1000,forum/las-vegas-2012-t152580.html,Re: Las Vegas 2012.....\n\n\n H...,Las Vegas 2012.....,luci HT Mod


If we have data coming from different discussion forums we can concatenate it.

In [8]:
"""
forum_threads_1 = pd.read_json('...')
forum_subjects_1 = pd.read_json('...')
forum_threads_2 = pd.read_json('...')
forum_subjects_2 = pd.read_json('...')

forum_subjects = pd.concat([forum_subjects_1, forum_subjects_2], ignore_index= True)
forum_threads = pd.concat([forum_threads_1, forum_threads_2], ignore_index= True)
"""

In [16]:
forum_threads.Message = forum_threads.Message.apply(lambda msg: msg.split('_________________')[0])
forum_subjects = forum_subjects.drop_duplicates(subset=['Link'], keep='first')
forum_threads = forum_threads.drop_duplicates(subset=['Link', 'Message'], keep='first')

The total number of posts is:

In [13]:
len(forum_threads)

15760

The total number of threads is:

In [14]:
len(forum_subjects)

1434

In [15]:
merged_forum_threads = pd.merge(forum_threads.drop(['Title'], axis=1), forum_subjects, 
                                on='Link', how='inner')
merged_forum_threads.to_json('Forum Data/merged.json')
merged_forum_threads.head()

Unnamed: 0,Link,Message,Username,Title
0,forum/esta-question-on-employment-t172445.html,"Hi, I am brand new and hopefully I have put th...",AnnaM,ESTA question on employment
1,forum/esta-question-on-employment-t172445.html,Hi Anita & \nIf you are retired Anna then I'...,Glynis HT Admin,ESTA question on employment
2,forum/esta-question-on-employment-t172445.html,You will love it!\n\n \n\n ...,Fiona,ESTA question on employment
3,forum/esta-question-on-employment-t172445.html,Keep your ESTA reference number\n\n\n ...,James Fletcher,ESTA question on employment
4,forum/esta-question-on-employment-t172445.html,"Thanks for your input, My worry is if I put No...",AnnaM,ESTA question on employment


In [17]:
len(merged_forum_threads)

15760

### 1. Sorting data

In [64]:
import nltk.data
import subprocess
import pandas as pd

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [65]:
df = pd.read_json('Forum Data/merged.json')
df = df[df['Message'].map(lambda x: x is not None)]
df.head()

Unnamed: 0,Link,Message,Title,Username
0,forum/esta-question-on-employment-t172445.html,"Hi, I am brand new and hopefully I have put th...",ESTA question on employment,AnnaM
1,forum/esta-question-on-employment-t172445.html,Hi Anita & \nIf you are retired Anna then I'...,ESTA question on employment,Glynis HT Admin
10,forum/esta-question-on-employment-t172445.html,Is that the Cosmos one? Haven't been to LA but...,ESTA question on employment,Fiona
100,forum/vegas-buffets-restaurants-t35799.html,"Ah, the legendary Cheesecake Factory... people...",Vegas Buffets/Restaurants,grayejectbutton
1000,forum/fort-lauderdale-miami-t114861.html,Re: Fort Lauderdale to Miami\n\n\n ...,Fort Lauderdale to Miami,anstruther


We find the first post of every threads.

In [66]:
titles_with_first_post = df.groupby('Link').first().reset_index()[['Link', 'Title', 'Username', 'Message']]
titles_with_first_post.columns = ['Link', 'Title', 'Seeker', 'First_Post']
titles_with_first_post.head()

Unnamed: 0,Link,Title,Seeker,First_Post
0,forum/-20-dollar-trick-t110905.html,$20 Dollar trick,jac47,just returned from our first trip to vegas.\nI...
1,forum/-toronto-halal-t102178.html,--Toronto- halal--,Just_a_tourist,Hi! Does anyone know good places where they se...
2,forum/1st-time-florida-help-t141308.html,1st Time to Florida HELP!!!,Lelly,"Hello, we are a family of 5 going to Florida f..."
3,forum/1st-time-florida-whats-nearby-t153058.html,1st time florida - whats nearby,cart583,"hi, never been to florida before but have book..."
4,forum/1st-time-new-york-booking-advice-t126856...,1st time New York / booking advice,seagull,Starting to look into booking a break in New Y...


In [67]:
def tokenize_properly(text):
    if text is not None:
        r = [sent for sent in tokenizer.tokenize(text.replace('\n', '.')) 
            if len(sent.replace('.', '').replace(' ', '')) >= 2]
        return r

titles_with_first_post.Title = titles_with_first_post.Title.apply(tokenize_properly)
titles_with_first_post.First_Post = titles_with_first_post.First_Post.apply(tokenize_properly)
titles_with_first_post.head()

Unnamed: 0,Link,Title,Seeker,First_Post
0,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...
1,forum/-toronto-halal-t102178.html,[--Toronto- halal--],Just_a_tourist,"[Hi!, Does anyone know good places where they ..."
2,forum/1st-time-florida-help-t141308.html,[1st Time to Florida HELP!!],Lelly,"[Hello, we are a family of 5 going to Florida ..."
3,forum/1st-time-florida-whats-nearby-t153058.html,[1st time florida - whats nearby],cart583,"[hi, never been to florida before but have boo..."
4,forum/1st-time-new-york-booking-advice-t126856...,[1st time New York / booking advice],seagull,[Starting to look into booking a break in New ...


An approach is to put the titles and the first posts in lower case.

In [68]:
def lower(text):
    if text is not None:
        r = [s.lower() for s in text]
        return r
#titles_with_first_post.Title = titles_with_first_post.Title.apply(lower)
#titles_with_first_post.First_Post = titles_with_first_post.First_Post.apply(lower)
#titles_with_first_post.head()

We count the number of sentences for the titles and the first posts.

In [69]:
titles_with_first_post['Title_sent_count'] = titles_with_first_post.Title.apply(len)
titles_with_first_post['FP_sent_count'] = titles_with_first_post.First_Post.apply(len)
titles_with_first_post.head()

Unnamed: 0,Link,Title,Seeker,First_Post,Title_sent_count,FP_sent_count
0,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...,1,3
1,forum/-toronto-halal-t102178.html,[--Toronto- halal--],Just_a_tourist,"[Hi!, Does anyone know good places where they ...",1,3
2,forum/1st-time-florida-help-t141308.html,[1st Time to Florida HELP!!],Lelly,"[Hello, we are a family of 5 going to Florida ...",1,7
3,forum/1st-time-florida-whats-nearby-t153058.html,[1st time florida - whats nearby],cart583,"[hi, never been to florida before but have boo...",1,7
4,forum/1st-time-new-york-booking-advice-t126856...,[1st time New York / booking advice],seagull,[Starting to look into booking a break in New ...,1,1


Another approach is to clean the replies as for example "Reply" or "Re: title of the post" are not useful to find the best answer.

In [70]:
def cleaning_replies(text):
    print(text)
    if text is not None:
        r = [s for s in text if not s.startswith('Re:') and not s.startswith("Reply") ]
        return r

We get rid of everything the author of the first post is saying, we tokenize the data and we count the number of sentences of the replies.

In [71]:
titles_with_messages = pd.merge(titles_with_first_post, 
                                df[['Username', 'Message', 'Link']], on='Link')
titles_with_messages.rename(columns={'Username':'Replier'}, inplace=True)
titles_with_messages = titles_with_messages[titles_with_messages.apply(
    lambda row: not row['First_Post'] == row['Message'] and not row['Seeker'] == row['Replier'], axis=1)]
titles_with_messages.Message = titles_with_messages.Message.apply(tokenize_properly)
#titles_with_messages.Message = titles_with_messages.Message.apply(cleaning_replies)
titles_with_messages.rename(columns={'Message': 'Reply'}, inplace=True)
titles_with_messages['Reply_sent_count'] = titles_with_messages.Reply.apply(len)
titles_with_messages.head()

Unnamed: 0,Link,Title,Seeker,First_Post,Title_sent_count,FP_sent_count,Replier,Reply,Reply_sent_count
1,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...,1,3,luci HT Mod,"[Re: $20 Dollar trick..., Excellent news jac!,...",4
2,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...,1,3,lesley74,"[Re: $20 Dollar trick..., I love the $20 trick...",3
3,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...,1,3,luci HT Mod,"[Re: $20 Dollar trick..., Tried it at the Bell...",7
4,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...,1,3,kiershay,[Re: $20 Dollar trick... does t...,2
5,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...,1,3,luci HT Mod,"[Re: $20 Dollar trick..., The suggested method...",6


In [72]:
titles_with_messages.columns

Index(['Link', 'Title', 'Seeker', 'First_Post', 'Title_sent_count',
       'FP_sent_count', 'Replier', 'Reply', 'Reply_sent_count'],
      dtype='object')

In [73]:
len(titles_with_messages)

11818

In [74]:
CHOSEN_COLUMNS = ['Link', 'Reply']
titles_with_messages[CHOSEN_COLUMNS].head()

Unnamed: 0,Link,Reply
1,forum/-20-dollar-trick-t110905.html,"[Re: $20 Dollar trick..., Excellent news jac!,..."
2,forum/-20-dollar-trick-t110905.html,"[Re: $20 Dollar trick..., I love the $20 trick..."
3,forum/-20-dollar-trick-t110905.html,"[Re: $20 Dollar trick..., Tried it at the Bell..."
4,forum/-20-dollar-trick-t110905.html,[Re: $20 Dollar trick... does t...
5,forum/-20-dollar-trick-t110905.html,"[Re: $20 Dollar trick..., The suggested method..."


In [75]:
msg_df = titles_with_messages[CHOSEN_COLUMNS]
msg_df.head()

Unnamed: 0,Link,Reply
1,forum/-20-dollar-trick-t110905.html,"[Re: $20 Dollar trick..., Excellent news jac!,..."
2,forum/-20-dollar-trick-t110905.html,"[Re: $20 Dollar trick..., I love the $20 trick..."
3,forum/-20-dollar-trick-t110905.html,"[Re: $20 Dollar trick..., Tried it at the Bell..."
4,forum/-20-dollar-trick-t110905.html,[Re: $20 Dollar trick... does t...
5,forum/-20-dollar-trick-t110905.html,"[Re: $20 Dollar trick..., The suggested method..."


In [80]:
titles_with_first_post.head()

Unnamed: 0,Link,Title,Seeker,First_Post,Title_sent_count,FP_sent_count
0,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],jac47,[just returned from our first trip to vegas..I...,1,3
1,forum/-toronto-halal-t102178.html,[--Toronto- halal--],Just_a_tourist,"[Hi!, Does anyone know good places where they ...",1,3
2,forum/1st-time-florida-help-t141308.html,[1st Time to Florida HELP!!],Lelly,"[Hello, we are a family of 5 going to Florida ...",1,7
3,forum/1st-time-florida-whats-nearby-t153058.html,[1st time florida - whats nearby],cart583,"[hi, never been to florida before but have boo...",1,7
4,forum/1st-time-new-york-booking-advice-t126856...,[1st time New York / booking advice],seagull,[Starting to look into booking a break in New ...,1,1


In [81]:
tfp_df = titles_with_first_post[['Link', 'Title', 'First_Post']]
tfp_df.head()

Unnamed: 0,Link,Title,First_Post
0,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],[just returned from our first trip to vegas..I...
1,forum/-toronto-halal-t102178.html,[--Toronto- halal--],"[Hi!, Does anyone know good places where they ..."
2,forum/1st-time-florida-help-t141308.html,[1st Time to Florida HELP!!],"[Hello, we are a family of 5 going to Florida ..."
3,forum/1st-time-florida-whats-nearby-t153058.html,[1st time florida - whats nearby],"[hi, never been to florida before but have boo..."
4,forum/1st-time-new-york-booking-advice-t126856...,[1st time New York / booking advice],[Starting to look into booking a break in New ...


### 2. Precomputing word2vec

We apply word2vec (we give a vector representation for every word of a sentence and then we associate every sentence with the mean of its words). We do this for the titles, the first posts and the messages.

In [82]:
import gensim
import numpy as np
model = gensim.models.KeyedVectors.load_word2vec_format('Forum Data/GoogleNews-vectors-negative300.bin', binary=True)
from nltk.corpus import stopwords
stopwords = set(nltk.corpus.stopwords.words('english'))

def get_sentence_vector(sentence):
    tokens = [token for token in nltk.word_tokenize(sentence) if token not in stopwords]
    vectors = []
    for token in tokens:
        try:
            word_vec = model.wv[token]
            vectors.append(word_vec)
        except:
            pass
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return []

In [83]:
tfp_df['Title_word2vec'] = tfp_df.Title.apply(lambda sents: [get_sentence_vector(sent) for sent in sents])
tfp_df['First_Post_word2vec'] = tfp_df.First_Post.apply(lambda sents: [get_sentence_vector(sent) for sent in sents])
tfp_df.head()

  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Link,Title,First_Post,Title_word2vec,First_Post_word2vec
0,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],[just returned from our first trip to vegas..I...,"[[0.053548176, -0.06933594, -0.022298178, 0.05...","[[0.06859633, 0.07334631, -0.01894244, 0.10071..."
1,forum/-toronto-halal-t102178.html,[--Toronto- halal--],"[Hi!, Does anyone know good places where they ...","[[-0.4375, -0.36914062, 0.21484375, 0.14941406...","[[-0.087402344, 0.095703125, 0.27539062, -0.01..."
2,forum/1st-time-florida-help-t141308.html,[1st Time to Florida HELP!!],"[Hello, we are a family of 5 going to Florida ...","[[0.09753418, 0.0126953125, 0.038024902, 0.224...","[[0.039695047, 0.022238992, 0.029807352, 0.102..."
3,forum/1st-time-florida-whats-nearby-t153058.html,[1st time florida - whats nearby],"[hi, never been to florida before but have boo...","[[0.030883789, -0.016217042, 0.0061157225, 0.2...","[[-0.010480608, 0.021902902, -0.032854352, 0.0..."
4,forum/1st-time-new-york-booking-advice-t126856...,[1st time New York / booking advice],[Starting to look into booking a break in New ...,"[[-0.012568156, 0.072255455, -0.041554768, 0.0...","[[0.042194713, 0.096147016, -0.065665506, 0.10..."


In [84]:
msg_df['Reply_word2vec'] = msg_df.Reply.apply(lambda sents: [get_sentence_vector(sent) for sent in sents])
msg_df.head()

  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Link,Reply,Reply_word2vec
1,forum/-20-dollar-trick-t110905.html,"[Re: $20 Dollar trick..., Excellent news jac!,...","[[0.026672363, 0.012939453, 0.06286621, 0.0266..."
2,forum/-20-dollar-trick-t110905.html,"[Re: $20 Dollar trick..., I love the $20 trick...","[[0.026672363, 0.012939453, 0.06286621, 0.0266..."
3,forum/-20-dollar-trick-t110905.html,"[Re: $20 Dollar trick..., Tried it at the Bell...","[[0.026672363, 0.012939453, 0.06286621, 0.0266..."
4,forum/-20-dollar-trick-t110905.html,[Re: $20 Dollar trick... does t...,"[[0.011083985, 0.012243653, 0.04169922, 0.1027..."
5,forum/-20-dollar-trick-t110905.html,"[Re: $20 Dollar trick..., The suggested method...","[[0.026672363, 0.012939453, 0.06286621, 0.0266..."


We write the data to pickle files

In [86]:
import pickle

output1 = open('Forum Data/tfp_df.pkl', 'wb')
pickle.dump(tfp_df, output1)

output2 = open('Forum Data/msg_df.pkl', 'wb')
pickle.dump(msg_df, output2)

### 3. Dot Product Similarity Functions

In [87]:
import pandas as pd
from os import listdir
import gensim
import numpy as np
import nltk
from nltk.corpus import stopwords
import ast
stopwords = set(nltk.corpus.stopwords.words('english'))

In [88]:
model = gensim.models.KeyedVectors.load_word2vec_format('Forum Data/GoogleNews-vectors-negative300.bin', binary=True)

In [89]:
tfp_df = pd.read_pickle('Forum Data/tfp_df.pkl')
tfp_df.head()

Unnamed: 0,Link,Title,First_Post,Title_word2vec,First_Post_word2vec
0,forum/-20-dollar-trick-t110905.html,[$20 Dollar trick],[just returned from our first trip to vegas..I...,"[[0.053548176, -0.06933594, -0.022298178, 0.05...","[[0.06859633, 0.07334631, -0.01894244, 0.10071..."
1,forum/-toronto-halal-t102178.html,[--Toronto- halal--],"[Hi!, Does anyone know good places where they ...","[[-0.4375, -0.36914062, 0.21484375, 0.14941406...","[[-0.087402344, 0.095703125, 0.27539062, -0.01..."
2,forum/1st-time-florida-help-t141308.html,[1st Time to Florida HELP!!],"[Hello, we are a family of 5 going to Florida ...","[[0.09753418, 0.0126953125, 0.038024902, 0.224...","[[0.039695047, 0.022238992, 0.029807352, 0.102..."
3,forum/1st-time-florida-whats-nearby-t153058.html,[1st time florida - whats nearby],"[hi, never been to florida before but have boo...","[[0.030883789, -0.016217042, 0.0061157225, 0.2...","[[-0.010480608, 0.021902902, -0.032854352, 0.0..."
4,forum/1st-time-new-york-booking-advice-t126856...,[1st time New York / booking advice],[Starting to look into booking a break in New ...,"[[-0.012568156, 0.072255455, -0.041554768, 0.0...","[[0.042194713, 0.096147016, -0.065665506, 0.10..."


Functions that compute the similarity between sentences.

In [91]:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2)/(np.linalg.norm(vec1) * np.linalg.norm(vec2))

def get_sentence_vector(sentence):
    tokens = [token for token in nltk.word_tokenize(sentence) if token not in stopwords]
    vectors = []
    for token in tokens:
        try:
            word_vec = model.wv[token]
            vectors.append(word_vec)
        except:
            pass
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return []

def is_not_null(sent_vec):
    for element in sent_vec:
        if not element == 0.0:
            return True
    return False

def sent_to_text_similarity(sent_vec, text_vec):
    similarities = []
    for vec in text_vec:
        if is_not_null(vec):
            similarities.append(np.dot(sent_vec, vec)/(np.linalg.norm(sent_vec) * np.linalg.norm(vec)))
    if similarities:
        return np.mean(similarities)
    else:
        return np.nan

def text_to_text_similarity(sent_vecs1, sent_vecs2):
    similarities = []
    for v1 in sent_vecs1:
        if is_not_null(v1):
            similarity = sent_to_text_similarity(v1, sent_vecs2)
            if not np.isnan(similarity):
                similarities.append(similarity)
    if similarities:
        return np.mean(similarities)
    else:
        return np.nan

def text_to_corpus_similarity(text, corpus):
    sent_vecs = text_to_sent_vec(text)
    corpus_vecs = [text_to_sent_vec(other_text) for other_text in corpus]
    max_sim = 0
    index = -1
    for text_index in range(len(corpus_vecs)):
        similarity = text_to_text_similarity(sent_vecs, corpus_vecs[text_index])
        if not np.isnan(similarity) and max_sim < similarity:
            max_sim = similarity
            index = text_index
    if index >= 0:
        return corpus[index]
    else:
        return None

### 4. Chatbot Functions

Functions that find the best matching sentences in a forum.

In [92]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [101]:
def compute_similarity(row, sent_vec):
    title_sim = 0
    title_word2vec = row['Title_word2vec']
    if len(title_word2vec) > 0:
        if len(title_word2vec[0]) > 0:
            title_sim = cosine_similarity(sent_vec, title_word2vec[0])
    return title_sim

def compute_separate_similarity(row, sent_vecs):
    title_sim = 0
    title_word2vec = row['Title_word2vec']
    if len(title_word2vec) > 0:
        if len(title_word2vec[0]) > 0:
            title_sim = np.dot(sent_vecs[0], title_word2vec[0])/(np.linalg.norm(sent_vecs[0])*np.linalg.norm(title_word2vec[0]))
    fp_sim = text_to_text_similarity(sent_vecs[1:], row['First_Post_word2vec'])
    return title_sim + fp_sim

def compute_separate_similarity_no_question(row, sent_vecs):
    fp_sim = text_to_text_similarity(sent_vecs, row['First_Post_word2vec'])
    return fp_sim

def get_most_similar_title(sentences, sent_vecs):
    """
    Finds the most similar thread in a forum (the thread which will be the most likely to have a matching response).
    Returns the row of the DataFrame of this most similar thread.
    :param sentences: list
    :param sent_vecs: list
    :return: DataFrame
    """
    if sentences == 0:
        raise ValueError('Write something!')
    elif len(sentences) == 1:
        #compares only with titles
        title_fp_sim = tfp_df.apply(lambda row: compute_similarity(row, sent_vecs[0]), axis=1)
    elif sentences[0].endswith('?'):
        #compares with titles and first posts
        title_fp_sim = tfp_df.apply(lambda row: compute_separate_similarity(row, sent_vecs), axis=1)
    else:
        #compares with first posts
        title_fp_sim = tfp_df.apply(lambda row: compute_separate_similarity_no_question(row, sent_vecs), axis=1)
    return tfp_df.loc[title_fp_sim.idxmax()]

def get_response_sentences(sent_vecs, link, max_sentences):
    """
    Finds the closest sentences (depending on max_sentences) in a particular thread.
    :param sent_vecs: list
    :param link: string
    :param max_sentences: int
    :return: string
    """
    answer_df = pd.read_pickle('Forum Data/msg_df.pkl')
    answer_df = answer_df[answer_df['Link'].map(lambda x: x == link)]
    
    if answer_df.empty:
        s = 'I did not find a matching sentence'
        return s
    
    best_answer = answer_df.loc[answer_df['Reply_word2vec'].apply(lambda other_vecs: 
                                                     text_to_text_similarity(sent_vecs, other_vecs)).idxmax()]
        
    best_sentence_idx = np.argmax([sent_to_text_similarity(sent_vec, sent_vecs) for sent_vec in best_answer.Reply_word2vec if len(sent_vec)])
    reply_sentences = best_answer.Reply
    if max_sentences <= 1:
        return reply_sentences[best_sentence_idx]
    else:
        context_sent_count = int((max_sentences - 1)/2)
        sent_count = len(reply_sentences)
        lower_bound = best_sentence_idx - context_sent_count
        upper_bound = best_sentence_idx + context_sent_count + 1
        return ' '.join(reply_sentences[max(0, lower_bound - max(0, upper_bound - sent_count)): 
                                        min(upper_bound + max(0, 0 - lower_bound) + ((max_sentences - 1) % 2), sent_count)])

def chatbot_answer(question, max_sentences=1):
    """
    Finds the closest response in the forum using forum data and word2vec.
    :param question: string
    :param max_sentences: int
    :return: string
    """
    sentences = tokenizer.tokenize(question)
    sent_vecs = [get_sentence_vector(sent) for sent in sentences]
    most_similar_title = get_most_similar_title(sentences, sent_vecs)
    return get_response_sentences(sent_vecs, most_similar_title.Link, max_sentences)
                        

In [106]:
chatbot_answer("Best hotel in Vegas?", max_sentences=3)

  if __name__ == '__main__':


"Our fave is the MGM Grand. Yes it is huge, but that is what Vegas is all about. You have everything you want in one hotel...The atmosphere is great with a very young feel to the place and the feel good factor is more evident here than at other hotels, with perhaps the NYNY being the only hotel able to compete...Rooms are to die for, service is exceptional, and location can't be beaten."