In [1]:
# ! pip install apify-client chromadb tiktoken SQLAlchemy==2.0.1

In [2]:
import os
import pandas as pd

from langchain.docstore.document import Document
from langchain.indexes import VectorstoreIndexCreator
from langchain_community.utilities import ApifyWrapper

### Use Apify to scrape a website

Create a free account at:

https://apify.com/apify/website-content-crawler

Uncoment to run the crawler

In [3]:
# Scrape a website

# apify = ApifyWrapper()

# # Call the Actor to obtain text from the crawled webpages

# start_url = 'https://tickets.dominodatalab.com/'

# loader = apify.call_actor(
#     actor_id="apify/website-content-crawler",
#     run_input={
#         "startUrls": [{"url": start_url}],
#         "max_items": 10
#     },
#     dataset_mapping_function=lambda item: Document(
#         page_content=item["text"] or "", metadata={"source": item["url"]}
#     ),
# )

Download your raw scraped csv file and upload it to a Domino Dataset.

Some data prep and cleaning can help RAG app performance.

In [4]:
data = pd.read_csv("/mnt/data/{}/raw_data/domino_help.csv".format(os.environ['DOMINO_PROJECT_NAME']))
data.head()

Unnamed: 0,url,markdown
0,https://tickets.dominodatalab.com/,[![DOMINO SUPPORT Help Center home page](//the...
1,https://tickets.dominodatalab.com/hc/en-us/req...,Description\n\nPlease enter the details of you...
2,https://tickets.dominodatalab.com/hc/en-us/req...,![](https://popup.dominodatalab.com/datapopup/...
3,https://tickets.dominodatalab.com/hc/en-us/cat...,## Domino Support Service Information\n\nInfor...
4,https://tickets.dominodatalab.com/hc/en-us/sig...,![](https://popup.dominodatalab.com/datapopup/...


In [5]:
import pandas as pd

# Load the CSV dataset from Apify download
data = pd.read_csv("/mnt/data/{}/raw_data/domino_help.csv".format(os.environ['DOMINO_PROJECT_NAME']))

# Extract the url and txt columns
data = data[['url', 'markdown']]

# Write the extracted data to a new file
data.to_csv("/mnt/data/{}/raw_data/domino_help.csv".format(os.environ['DOMINO_PROJECT_NAME']), index=False)

import random

i = random.randint(0,100)
print(i)

data['markdown'][i]

67


'![](https://tickets.dominodatalab.com/system/photos/361254859151/Screen_Shot_2021-12-01_at_7.17.31_PM.png)\n\n[Follow](https://tickets.dominodatalab.com/hc/en-us/articles/20288906500244-Installing-R-packages-from-github-into-my-Domino-environment/subscription.html "Opens a sign-in dialog")\n\nThere are a few methods of installing external R packages from git into your environment. One of the most common methods we see is to use the devtools package. The full docs for the devtools package can be found [here](https://github.com/r-lib/devtools), but generally you\'ll want to install the devtools package into your Dockerfile as shown below:\xa0\n\ninstall.packages("devtools")\n\nThen install the full package:\xa0\n\n```\nRUN R -e "devtools::install_github(repo = "owner/project", host = "https://github.com", auth_token = "{token}")"\n```\n\nIf you\xa0 encounter any issues using the devtools package, you can also try using the "remotes" package. The docs can be found [here](https://cran.r-p

### Clean up help data

The raw data from the crawler contains duplicate rows, links and other metadata that isn't needed.

In [6]:
data = pd.read_csv("/mnt/data/{}/raw_data/domino_help.csv".format(os.environ['DOMINO_PROJECT_NAME']))

print('Original shape: {}'.format(data.shape))
print('Drop duplicate rows')

data = data.drop_duplicates(subset='markdown').reset_index()

print(data.shape)

print('Filter for pages that contain help questions')

def filter_rows(df, column, search_string):
    filtered_df = df[df[column].str.contains(search_string)]
    return(filtered_df)

# Indicates page contains a question:
search_string = 'Issue:'
column = 'markdown'
data = filter_rows(df=data, column=column, search_string=search_string).copy()
data = data.reset_index()
print(data.shape)

# Extract only the question & answer
data['markdown'] = data['markdown'].apply(lambda x : x.split('Issue:')[1])
data['markdown'] = data['markdown'].apply(lambda x : x.split('Notes/Information')[0])
data = data[['url', 'markdown']]


data.to_csv("/mnt/data/{}/domino_help_clean.csv".format(os.environ['DOMINO_PROJECT_NAME']), index=False)

data.head()

Original shape: (1050, 2)
Drop duplicate rows
(664, 3)
Filter for pages that contain help questions
(208, 4)


Unnamed: 0,url,markdown
0,https://tickets.dominodatalab.com/hc/en-us/search,... Some scheduled jobs and workspaces can in...
1,https://tickets.dominodatalab.com/hc/en-us/art...,"**\n\nOn Git-based projects, when starting a n..."
2,https://tickets.dominodatalab.com/hc/en-us/art...,"**\n\nBefore Domino 5.3, GIT based projects do..."
3,https://tickets.dominodatalab.com/hc/en-us/art...,**\n\nThe Admin >> Data >> Datasets page allow...
4,https://tickets.dominodatalab.com/hc/en-us/art...,** A common question from endusers is how they...


# Domino Docs

In [7]:
import os
import pandas as pd

from langchain.docstore.document import Document
from langchain.indexes import VectorstoreIndexCreator
from langchain_community.utilities import ApifyWrapper

In [8]:
# Scrape a website

# apify = ApifyWrapper()

# # Call the Actor to obtain text from the crawled webpages

# start_url = 'https://docs.dominodatalab.com/en/latest/user_guide/'

# loader = apify.call_actor(
#     actor_id="apify/website-content-crawler",
#     run_input={
#         "startUrls": [{"url": start_url}],
#         "max_items": 10
#     },
#     dataset_mapping_function=lambda item: Document(
#         page_content=item["text"] or "", metadata={"source": item["url"]}
#     ),
# )

Download your raw scraped csv file and upload it to a Domino Dataset.

Some data prep and cleaning can help RAG app performance.

In [9]:
import pandas as pd

# Load the CSV dataset
docs = pd.read_csv("/mnt/data/{}/raw_data/domino_docs.csv".format(os.environ['DOMINO_PROJECT_NAME']))

# Extract the url and txt columns
docs = docs[['url', 'markdown']]

# Write the extracted data to a new file called help.csv
docs.to_csv("/mnt/data/{}/raw_data/domino_docs.csv".format(os.environ['DOMINO_PROJECT_NAME']), index=False)

import random

i = random.randint(0,100)
print(i)

docs['markdown'][i]

4


'## Get Domino version\n\nTo see the Domino version that you are running, go to `<your domino url>/version`.\n\nFor example, users on `try.domino.tech` can see the version at `try.domino.tech/version`.'

In [10]:
docs = pd.read_csv("/mnt/data/{}/raw_data/domino_docs.csv".format(os.environ['DOMINO_PROJECT_NAME']))

print('Original shape: {}'.format(docs.shape))
print('Drop duplicate rows')

docs = docs.drop_duplicates(subset='markdown').reset_index()

print(docs.shape)

print('Filter for pages that containe help questions')

def filter_rows(df, column, search_string):
    filtered_df = df[df[column].str.contains(search_string)]
    return(filtered_df)

docs = docs[['url', 'markdown']]


docs.to_csv("/mnt/data/{}/domino_docs_clean.csv".format(os.environ['DOMINO_PROJECT_NAME']), index=False)

docs.head()

Original shape: (383, 2)
Drop duplicate rows
(383, 3)
Filter for pages that containe help questions


Unnamed: 0,url,markdown
0,https://docs.dominodatalab.com/en/latest/user_...,Page not foundLet's search!
1,https://docs.dominodatalab.com/en/latest/user_...,## User guide\n\nDomino is a data science plat...
2,https://docs.dominodatalab.com/en/latest/user_...,## Troubleshooting guidelines for users\n\n[Tr...
3,https://docs.dominodatalab.com/en/latest/user_...,About Domino\n\n[Domino Data Lab](https://domi...
4,https://docs.dominodatalab.com/en/latest/user_...,## Get Domino version\n\nTo see the Domino ver...
