In [52]:
import os

# Seed the API token into the process environment for the cells below.
# The value is intentionally blank here: fill it in locally and never commit
# a real token with the notebook.
os.environ["INSTILL_API_TOKEN"] = ""


In [93]:
import requests
import json
import re


# --- Instill API connection settings (used by get_url / call_api) ---
domain = "api.instill.tech"
namespace = "chunhao094"
# Read from the environment; raises KeyError if the variable is not set.
api_token = os.environ["INSTILL_API_TOKEN"]

# --- Catalog that the pipelines below read from / write to ---
catalog_id = "aftee-customer-service"

# --- Pipeline IDs passed as the first argument of call_api() ---
get_links_pipeline = "get-links"
upload_web_data = "upload-web-data"
talk_to_aftee_pipeline = "talk-to-aftee"
get_file_names_pipeline = "get-all-files-in-catalog"

def get_url(domain, namespace, pipeline_name):
    """Build the HTTPS trigger URL for a user's pipeline.

    Args:
        domain: API host name, without scheme.
        namespace: User namespace that owns the pipeline.
        pipeline_name: ID of the pipeline to trigger.

    Returns:
        The full ``https://.../trigger`` URL as a string.
    """
    trigger_path = f"v1beta/users/{namespace}/pipelines/{pipeline_name}/trigger"
    return f"https://{domain}/{trigger_path}"


def get_payload(data):
    """Wrap ``data`` as the single element of an ``"inputs"`` list.

    Args:
        data: The input object for one pipeline run.

    Returns:
        ``{"inputs": [data]}`` (``data`` itself is not copied).
    """
    inputs = [data]
    return {"inputs": inputs}

def call_api(pipeline_name, data):
    """POST ``data`` as a JSON body to a pipeline's trigger URL.

    The URL is built from the module-level ``domain`` and ``namespace``; the
    request carries the module-level ``api_token`` as a Bearer token.

    Args:
        pipeline_name: ID of the pipeline to trigger.
        data: JSON-serialisable request body.

    Returns:
        The response body decoded from JSON. The HTTP status is not checked,
        so an error body is returned like any other.
    """
    endpoint = get_url(domain, namespace, pipeline_name)
    request_headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_token}',
    }
    body = json.dumps(data)
    reply = requests.post(endpoint, headers=request_headers, data=body)
    return reply.json()

def extract_markdown_links(text):
    """Collect every inline markdown link ``[title](url)`` found in ``text``.

    Args:
        text: Markdown source to scan.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts in order of appearance;
        empty when there are no links.
    """
    # Non-greedy groups: 1 = text between the brackets, 2 = text between the parentheses.
    link_re = re.compile(r'\[(.*?)\]\((.*?)\)')

    found = []
    for hit in link_re.finditer(text):
        found.append({"title": hit.group(1), "url": hit.group(2)})
    return found



def get_output(response, key_name):
    """Return one field from the first entry of a pipeline response's outputs.

    Args:
        response: Decoded JSON response, expected to contain an ``"outputs"`` list.
        key_name: Field to read from ``response["outputs"][0]``.

    Returns:
        The value stored under ``key_name`` in the first output.

    Raises:
        KeyError: If ``response`` has no ``"outputs"`` key (for example an API
            error body with ``code``/``message``), or the first output lacks
            ``key_name``.
        IndexError: If ``"outputs"`` is an empty list.
    """
    # An error body has no "outputs"; surface what the API said instead of a
    # bare KeyError('outputs').
    if isinstance(response, dict) and "outputs" not in response:
        detail = response.get("message", response)
        raise KeyError(f"response has no 'outputs'; API returned: {detail!r}")
    return response["outputs"][0][key_name]

# Freshdesk solution-category pages; each one is sent to the get-links
# pipeline as "category_link" to discover article links.
category_links = [
    "https://netprotections.freshdesk.com/support/solutions/70000089079",
    "https://netprotections.freshdesk.com/support/solutions/70000126449",
    "https://netprotections.freshdesk.com/support/solutions/70000320060",
    "https://netprotections.freshdesk.com/support/solutions/70000126448",
    "https://netprotections.freshdesk.com/support/solutions/70000321543",
    "https://netprotections.freshdesk.com/support/solutions/70000126447",
    "https://netprotections.freshdesk.com/support/solutions/70000126452",
    ]


In [94]:
# Crawl each category page and collect every article link once (first-seen
# order), together with a ".md" file title derived from the link text.
# links_to_catalog[i] and titles[i] describe the same article.
links_to_catalog = []
titles = []

for category_link in category_links:
    links_response = call_api(get_links_pipeline, get_payload({"category_link": category_link}))
    markdown = get_output(links_response, "markdown")

    for url_map in extract_markdown_links(markdown):
        link_with_domain = url_map["url"]

        # Keep article pages only, and skip links that were already collected.
        if not link_with_domain.startswith("http://netprotections.freshdesk.com/support/solutions/articles/"):
            continue
        if link_with_domain in links_to_catalog:
            continue

        links_to_catalog.append(link_with_domain)

        # Expand the escape sequences in the link text, then reinterpret the
        # resulting code points as UTF-8 bytes to recover the readable title.
        unescaped = url_map["title"].encode('utf-8').decode('unicode_escape')
        readable_title = unescaped.encode('latin1').decode('utf-8')
        titles.append(readable_title + ".md")

for label, collected in (("title length: ", titles), ("links length: ", links_to_catalog)):
    print(label, len(collected))
for collected in (titles, links_to_catalog):
    print(json.dumps(collected, indent=4))



title length:  65
links length:  65
[
    "\u4ec0\u9ebc\u662f AFTEE\u5148\u4eab\u5f8c\u4ed8 \uff1f.md",
    "\u5982\u4f55\u4f7f\u7528 AFTEE\u5148\u4eab\u5f8c\u4ed8 \uff1f.md",
    "AFTEE\u5148\u4eab\u5f8c\u4ed8 \u4f7f\u7528\u898f\u7d04.md",
    "AFTEE\u5148\u4eab\u5f8c\u4ed8 \u4f86\u81ea\u54ea\u9593\u516c\u53f8\uff1f.md",
    "\u4f7f\u7528 AFTEE\u5148\u4eab\u5f8c\u4ed8 \u9700\u8981\u624b\u7e8c\u8cbb\u6216\u984d\u5916\u8cbb\u7528\u55ce\uff1f.md",
    "\u4ec0\u9ebc\u6642\u5019\u6703\u6536\u5230\u7e73\u8cbb\u55ae\uff1f.md",
    "\u6c92\u6709\u6536\u5230\u7e73\u8cbb\u55ae.md",
    "\u8a02\u55ae\u53d6\u6d88\u4ecd\u6536\u5230\u7e73\u8cbb\u901a\u77e5.md",
    "\u7e73\u8cbb\u55ae\u5167\u5bb9\u932f\u8aa4/\u6c92\u5370\u8c61.md",
    "\u7e73\u8cbb\u689d\u78bc/\u532f\u6b3e\u5e33\u865f\u7121\u6cd5\u4f7f\u7528.md",
    "\u8a72\u5982\u4f55\u4ed8\u6b3e\uff1f.md",
    "\u5982\u4f55\u900f\u904e AFTEE APP \u4ed8\u6b3e\uff1f.md",
    "\u7e73\u8cbb\u5f8c\u5c1a\u672a\u5165\u5e33.md",
    "\u4ec0\u9ebc\u662f

In [107]:
# Ask the listing pipeline which files the catalog currently holds.
file_names_request = get_payload({"catalog_id": catalog_id, "namespace": namespace})
get_file_names = call_api(get_file_names_pipeline, file_names_request)
print(get_file_names)




{'outputs': [{'file_names': {'result_0': ['29_如何使用_AFTEE先享後付_結帳.md', '17_如何使用悠遊付付款.md', '7_訂單取消仍收到繳費通知.md', '0_什麼是_AFTEE先享後付_.md', '3_AFTEE先享後付_來自哪間公司.md', '2_AFTEE先享後付_使用規約.md', '11_如何透過_AFTEE_APP_付款.md', '4_使用_AFTEE先享後付_需要手續費或額外費用嗎.md', '20_超過繳費期限該怎麼辦.md', '21_若持續未付款會發生什麼事.md']}}], 'metadata': None}


In [108]:
# File names already in the catalog; only the "result_0" entry is consulted.
# NOTE(review): the recorded run failed with "file already exists" for a name
# that is absent from this listing, so the listing may be incomplete - confirm.
uploaded_file_names = get_output(get_file_names, "file_names")["result_0"]

# Catalog file name = "<position>_<title>", with "/" and spaces turned into "_"
# and the full-width question mark dropped.
idx_titles = [
    f"{position}_" + raw_title.replace("/", "_").replace(" ", "_").replace("？", "")
    for position, raw_title in enumerate(titles)
]

# Keep only the (file name, link) pairs whose file name is not listed yet.
pending = [
    (file_name, link)
    for file_name, link in zip(idx_titles, links_to_catalog)
    if file_name not in uploaded_file_names
]
to_be_uploaded_titles = [file_name for file_name, _ in pending]
to_be_uploaded_urls = [link for _, link in pending]

upload_payload = get_payload({
    "catalog_id": catalog_id,
    "links": to_be_uploaded_urls,
    "namespace": namespace,
    "file_names": to_be_uploaded_titles,
})
upload_response = call_api(upload_web_data, upload_payload)

print(upload_response)

{'code': 3, 'message': 'Component artifact-0 failed to execute. failed to upload file: rpc error: code = Unknown desc = file already exists in the catalog. file: {1_如何使用_AFTEE先享後付_.md}', 'details': []}


In [109]:
# Questions put to the talk-to-aftee pipeline, one request per question.
questions = [
    "什麼是AFTEE，我能用它做什麼？",
    "15至45天的繳費天數是如何計算的",
    "為什麼我被收滯納金？",
    "為什麼我的額度被調低？",
    "我的貨怎麼還沒到？",
    "我可以在哪裡查詢我的訂單？",
    "我可以分期幾次？",
    "我可以分期幾期？",
    "國外可以用 AFTEE 嗎？",
    "我沒有台灣門號可以用 AFTEE 嗎？",
    "為什麼我的額度那麼低？",
    "為什麼我收到催繳簡訊？",
    "AFTEE 跟信用卡的差別是什麼？可以買什麼？"
]
# answers[i] / chunks[i] hold the pipeline's reply for questions[i].
answers = []

chunks = []

for question in questions:
    # Send the question of the current iteration. The earlier version always
    # sent questions[0], so every row received the first question's answer.
    talk_to_aftee = call_api(talk_to_aftee_pipeline,
                            get_payload({"catalog_id": catalog_id,
                                        "namespace": namespace,
                                        "question": question}))

    answers.append(get_output(talk_to_aftee, "answer"))
    chunks.append(get_output(talk_to_aftee, "chunks"))
    print(talk_to_aftee)

{'outputs': [{'answer': 'AFTEE APP 會員不需任何登錄費或年費，只需下載安裝 AFTEE APP 並完成註冊即可使用以下服務：\n\n- 輸入手機號碼及APP密碼即可完成結帳手續\n- 輕鬆確認您使用 AFTEE先享後付 的交易紀錄\n- 即時確認您的額度上限及可用額度\n- 繳費期限將延長最短15至最長45日\n- 繳費期限內可自由選擇繳費金額', 'chunks': [{'chunk-uid': '344ec04a-5863-4053-b5e5-4040cefd669f', 'similarity-score': 0.6648455, 'source-file-name': '42_AFTEE_APP_會員是什麼.md', 'text-content': '輸入搜尋字詞 搜尋 /support/solutions /support/solutions/70000126447 /support/solutions/articles/70000242110-aftee-app-%E6%9C%83%E5%93%A1%E6%98%AF%E4%BB%80%E9%BA%BC- \r\n\r\nAFTEE APP 會員是什麼？ \r\n\r\n 修改於： 星期五, 九月 22, 2023 在 1:36 PM\r\n\r\n AFTEE APP 會員不需任何登錄費或年費，只需下載安裝 AFTEE APP 並完成註冊即可使用以下服務：\r\n\r\n● 輸入手機號碼及APP密碼即可完成結帳手續● 輕鬆確認您使用 AFTEE先享後付 的交易紀錄● 即時確認您的額度上限及可用額度● 繳費期限將延長最短15至最長45日● 繳費期限內可自由選擇繳費金額\r\n\r\n\r\n立即下載 AFTEE APP\xa0\xa0↓\r\n\r\nhttps://itunes.apple.com/tw/app/aftee-%E5%BE%8C%E6%94%AF%E4%BB%98/id1420585964?mt=8https://play.google.com/store/apps/details?id=tw.netprotections.afteeapphttps://play.google.com/store/apps/details?id=tw.netprotect

In [110]:
import csv
# Export one CSV row per question: the question, its answer, and the
# retrieved chunks (written using the chunk object's string form).
with open('aftee_data.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    csv_out = csv.writer(csv_file)

    # Header row first, then the data rows.
    csv_out.writerow(["Question", "Answer", "Chunks"])
    csv_out.writerows(
        [asked, answered, retrieved]
        for asked, answered, retrieved in zip(questions, answers, chunks)
    )

print("Data exported to aftee_data.csv")


Data exported to aftee_data.csv
