In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KDTree

In [2]:
url = 'https://www.toptal.com/faq'

# Fail fast on network problems: without a timeout a stalled connection blocks the
# kernel indefinitely, and without raise_for_status() an HTTP error page would be
# parsed below as if it were the FAQ (yielding empty question/answer lists).
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

In [3]:
# CSS paths to the answer bodies and the question headings on the FAQ page.
# NOTE(review): the class names look auto-generated, so these selectors may stop
# matching when the site is redeployed — verify if the lists come back empty.
a_selector = 'body > main > section > section._1K1faYl9 > div '
q_selector = 'body > main > section > section > h3._3r3EnTQQ'

# Collect the whitespace-trimmed text of every matching node, in document order.
answers = [node.text.strip() for node in soup.select(a_selector)]
questions = [node.text.strip() for node in soup.select(q_selector)]

In [4]:
# Sanity check: both scraped lists must have the same length before being paired row by row.
print(len(questions), len(answers), sep='\n')

# One row per FAQ entry: column 'q' holds the question, column 'a' its answer.
data = pd.DataFrame(dict(q=questions, a=answers))
data.head()

17
17


Unnamed: 0,q,a
0,Are there any upfront recruiting or contractua...,We require an initial deposit of $500 that wil...
1,How is the trial period no-risk?,We make sure to start each engagement with a t...
2,What happens if I’m not satisfied with a Topta...,We allow our clients to begin a trial period w...
3,How is Toptal different?,Many organizations are faced with the issue of...
4,Are English skills ever an issue when working ...,No. Each and every Toptal expert writes and sp...


In [5]:
# Text embedding pipeline: TF-IDF -> truncated SVD -> L2 normalisation.
# random_state pins the randomized SVD solver so Restart & Run All reproduces
# the same vectors (and therefore the same distances) on every run.
# NOTE(review): n_components=100 acts only as an upper bound here — the recorded
# run yields 34-dimensional vectors (see `vectors.shape`), presumably because
# only 34 documents are fitted; confirm before tuning this value.
vectorizer = make_pipeline(
    TfidfVectorizer(),
    TruncatedSVD(n_components=100, random_state=42),
    Normalizer(),
)

In [6]:
# Fit on questions and answers together so the vocabulary covers both kinds of text.
vectorizer.fit(pd.concat([data['q'], data['a']]))

In [7]:
# Only the questions are embedded: user queries are matched against questions, not answers.
vectors = vectorizer.transform(data['q'])

In [8]:
# Peek at the embedding of the first question (output of the SVD + Normalizer steps).
vectors[0]

array([ 7.54691752e-02, -1.25717402e-02,  6.53698532e-02,  3.60078198e-02,
       -3.05375594e-01,  1.02691349e-01, -1.75892407e-01, -1.94839287e-01,
       -7.66138232e-02,  5.80294384e-01,  3.45992153e-01,  9.40694305e-03,
       -2.66489779e-04,  9.76987003e-02, -8.16484850e-02,  1.63595787e-01,
        4.79086141e-01,  1.01183413e-01, -3.50946892e-04,  5.97702345e-02,
       -1.29137666e-01, -1.02070936e-01, -1.06678284e-01,  2.67980014e-02,
        9.97766799e-02, -3.37748636e-02,  3.23366559e-02, -5.94705205e-03,
       -8.30275621e-02,  7.62528824e-02,  7.08876774e-03, -3.54801070e-03,
       -1.80641234e-02,  5.18540595e-02])

In [9]:
# (number of questions, embedding dimensions).
# NOTE(review): the recorded output has 34 dimensions although the SVD was asked
# for 100 — presumably capped by the 34 fitted documents; confirm.
vectors.shape

(17, 34)

In [10]:
# Nearest-neighbour index over the question embeddings; queries return row
# positions into `vectors`, which line up with the rows of `data`.
index = KDTree(vectors)

In [11]:
# Embed a sample user question, then fetch its three closest FAQ questions.
query_vector = vectorizer.transform(['What can I use for payment?'])
distances, indices = index.query(query_vector, k=3)

In [12]:
# Distances from the query to its k nearest questions (one row per query).
distances

array([[0.59189449, 1.15648822, 1.17639995]])

In [13]:
# Row positions of those nearest questions, in the same order as `distances`.
indices

array([[16,  2, 10]], dtype=int64)

In [14]:
# Show each neighbour as: distance, row position, matched question text.
for dist, row in zip(distances[0], indices[0]):
    print(dist, row, data['q'][row])

0.5918944941978405 16 What methods of payment do you accept?
1.1564882235137919 2 What happens if I’m not satisfied with a Toptal expert?
1.1763999527568623 10 What happens if a Toptal expert is unavailable for the days or times they are scheduled to work with me?


In [15]:
# Second sample query: embed it, then look up its three closest FAQ questions.
query_vector = vectorizer.transform(['Where is Toptal located?'])
distances, indices = index.query(query_vector, k=3)

In [16]:
# Show each neighbour as: distance, row position, matched question text.
for dist, row in zip(distances[0], indices[0]):
    print(dist, row, data['q'][row])

0.5183400367740291 6 Where are your experts located?
0.9226845240730206 7 Where do Toptal experts work?
1.1406549609960133 3 How is Toptal different?


In [17]:
# Row position of the single closest question for the query above.
indices[0][0]

6

In [18]:
def respond(text, threshold=0.65):
    """Print the FAQ answer that best matches ``text``, or a suggestion if none is close.

    The query is embedded with the notebook-level ``vectorizer`` and looked up
    in the notebook-level KD-tree ``index``; the matched row of ``data``
    supplies the question/answer text.

    Parameters
    ----------
    text : str
        The user's question.
    threshold : float, default 0.65
        Largest nearest-neighbour distance still treated as a match. When the
        closest question is farther away, it is only suggested instead of
        being answered.

    Returns
    -------
    None
        The reply is printed, not returned.
    """
    # Only the best match is used, so a single neighbour is enough.
    distances, indices = index.query(vectorizer.transform([text]), k=1)
    nearest_distance = distances[0][0]
    # The tree returns row *positions*; use .iloc so the lookup stays correct
    # even if `data` does not carry a default 0..n-1 index.
    position = int(indices[0][0])

    if nearest_distance > threshold:
        suggestion = data['q'].iloc[position]
        print(f"Unfortunately, I cannot answer this question yet. Maybe, you wanted to know '{suggestion}'")
    else:
        print(data['a'].iloc[position])

In [19]:
# On-topic question: answered directly in the recorded run.
respond('Where is Toptal located?')

Currently, we have experts in over 100 countries, most being located in the Americas and Europe.


In [20]:
# Off-topic question: the recorded run falls back to suggesting the closest FAQ question.
respond('Where is the nearest restaurant?')

Unfortunately, I cannot answer this question yet. Maybe, you wanted to know 'Where do Toptal experts work?'


In [21]:
# Loose paraphrase of an FAQ entry: close enough to be answered in the recorded run.
respond('What if Toptal expert fails?')

We allow our clients to begin a trial period with up to three experts from our network per position. While we rigorously screen all applicants to ensure talent and intelligence, we understand that not every person will be a perfect fit for every company. For this reason, we allow our clients to work with multiple experts for each position before they decide on a candidate with whom they are confident and comfortable.


In [22]:
# Paraphrase that ends up beyond the distance cutoff: the recorded run only
# suggests the (relevant) full-time hiring question instead of answering.
respond('Can I invite Toptal expert to the company?')

Unfortunately, I cannot answer this question yet. Maybe, you wanted to know 'Can I hire a full-time worker from Toptal and bring him/her into our company?'


In [23]:
# Ownership question: answered with the intellectual-property entry in the recorded run.
respond('Who owns the results of work?')

The client. Toptal’s only business is connecting world-class experts to the clients who want to work with them - not intellectual property. Our contracts provide that all work created by a Toptal expert is the property of the client, not Toptal.


In [24]:
# Known weakness: this differently-meant question still falls under the cutoff
# and receives the intellectual-property answer in the recorded run.
respond('Who owns the worker?')

The client. Toptal’s only business is connecting world-class experts to the clients who want to work with them - not intellectual property. Our contracts provide that all work created by a Toptal expert is the property of the client, not Toptal.


In [42]:
class ToptalFAQBot:
    """Minimal FAQ bot that replies with the answer of the closest known question.

    Relies on the notebook-level ``vectorizer`` (text -> vector), ``index``
    (KD-tree over question vectors) and ``data`` (DataFrame with columns
    ``'q'`` and ``'a'``) being defined before ``reply`` is called.

    Parameters
    ----------
    threshold : float, default 0.65
        Largest nearest-neighbour distance still treated as a match.
    """

    def __init__(self, threshold=0.65):
        self.threshold = threshold
        # Most recent question passed to reply(); None until the first call.
        self.text = None

    def reply(self, text):
        """Print the best-matching answer, or a suggestion if nothing is close enough.

        Parameters
        ----------
        text : str
            The user's question; also stored on ``self.text``.

        Returns
        -------
        None
            The reply is printed, not returned.
        """
        self.text = text
        # Only the best match is used, so a single neighbour is enough.
        distances, indices = index.query(vectorizer.transform([self.text]), k=1)
        # The tree returns row *positions*; .iloc keeps the lookup correct
        # even if `data` does not carry a default 0..n-1 index.
        position = int(indices[0][0])

        if distances[0][0] > self.threshold:
            suggestion = data['q'].iloc[position]
            print(f"Unfortunately, I cannot answer this question yet. Maybe, you wanted to know '{suggestion}'")
        else:
            print(data['a'].iloc[position])

In [43]:
# Same question as asked through respond() earlier; the recorded output is the same answer.
bot = ToptalFAQBot()
bot.reply('Who owns the results of work?')

The client. Toptal’s only business is connecting world-class experts to the clients who want to work with them - not intellectual property. Our contracts provide that all work created by a Toptal expert is the property of the client, not Toptal.
