### Imports

In [1]:
import os
import glob
import json
import pandas as pd
import re
import pickle
import concurrent.futures
import requests
import openai
from tqdm import tqdm
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import MarkdownTextSplitter

### Load Pytorch Docs

In [2]:
df = pd.read_json('blogs.json')
df.shape

(123, 2)

### Using langchain - split the data into multiple pages

In [3]:
def split_pages(df):
    markdown_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
    pages = []
    for index, row in df.iterrows():
        markdown_text = row["text"]
        metadata = row["metadata"]
        docs = markdown_splitter.create_documents([markdown_text], [metadata])
        pages.extend(docs)
    
    print('total pages:', len(pages))
    return pages


pages = split_pages(df)
len(pages)

total pages: 2539


2539

### Run this step multiple times with the start and end values as the multiple of 1000
```
start = 0, end = 1000
start = 1000, end = 2000
start = 2000, end = 3000
start = 3000, end = 4070
```

### At the end of each iteration, blogs_qa_openai_*.csv will be generated

In [11]:
api_key = ''

def get_qa_openai(context):
    try:
        completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo", api_key = api_key,
                messages=[
                    {"role": "user", "content": context}
                  ]
                )

        qa = completion.choices[0].message.content

    except requests.exceptions.RequestException as e:
        print(f'Request failed with error: {str(e)}.')
        print(f'Waiting for 3 minutes before trying again...')
        time.sleep(180)
    
    return qa

questions_ans = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    
    futures = []
    start = 0
    end = 2540
    for i in pages[start:end]:
        
        context = f"Generate question and answer only in this format 'Question: Answer:' using this context \
        and you can decide the number of question and answer to generate based \
        on context size but don't generate too many same kind of questions: {i.page_content}"
        
        futures.append(executor.submit(get_qa_openai, context))

    for future, i in tqdm(zip(concurrent.futures.as_completed(futures), pages[start:end]), total=len(pages[start:end])):
        try:
            qa = future.result()
            questions_ans.append({'text':qa, 'metadata':i.metadata})
        except Exception as exc:
            print(f'generated an exception: {exc}')

df1 = pd.DataFrame(questions_ans)
df1.to_csv(f'blogs_qa_openai_{start}_{end}.csv')
df1.shape

 15%|████████████████████████                                                                                                                                           | 374/2539 [03:21<12:07,  2.98it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 557e40cc5bc0e37013c0261df8e0542b in your message.)


 24%|███████████████████████████████████████                                                                                                                            | 608/2539 [05:33<22:21,  1.44it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 06039c1cecc114462fb7285a40d366a6 in your message.)


 27%|████████████████████████████████████████████▍                                                                                                                      | 692/2539 [06:18<16:51,  1.83it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID de02e84ba9cb18ff081d9cb762f8d48e in your message.)


 36%|██████████████████████████████████████████████████████████▊                                                                                                        | 916/2539 [08:13<09:57,  2.72it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 1547fd67b6fe635997de5c561b4ba631 in your message.)


 53%|█████████████████████████████████████████████████████████████████████████████████████▏                                                                            | 1336/2539 [12:02<17:15,  1.16it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID d5f6767d8ed587f00a5f5074d4722b07 in your message.)


 55%|█████████████████████████████████████████████████████████████████████████████████████████▍                                                                        | 1402/2539 [12:35<07:49,  2.42it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 746b002ff36bd88d2ffecb9931304c04 in your message.)


 60%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                | 1528/2539 [13:41<09:27,  1.78it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 43fa05bb28c5868620520ede95293ba6 in your message.)


 63%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                           | 1610/2539 [14:24<10:00,  1.55it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 0f9317bb81e2dc0473c379bbfa266f47 in your message.)


 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                      | 1686/2539 [15:03<03:34,  3.98it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 62d6ec3cfd413d86646f4347d4a0491f in your message.)


 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 1900/2539 [16:56<04:02,  2.64it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 61a2697b78b546a14f7c4e56125b63b2 in your message.)


 79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                  | 2002/2539 [17:52<02:21,  3.80it/s]

generated an exception: HTTP code 502 from API (<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html>
)


 81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 2062/2539 [18:23<04:32,  1.75it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 9bb658aec7414dc4f76e663ffe00cada in your message.)


 88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 2235/2539 [19:52<03:35,  1.41it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID a0270d650bd151200a5babdbd96caac0 in your message.)


 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                  | 2255/2539 [20:01<03:13,  1.47it/s]

generated an exception: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 7df5c0c07be76d55852c7d2772dde790 in your message.)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2539/2539 [22:38<00:00,  1.87it/s]


(2525, 2)

### Each row contains multiple question and answers. Split it into multiple rows to have one question and answer per row

In [16]:
def split_sentence_by_word(sentence, split_word):
    sentences = sentence.split(split_word)
    result = [''.join([split_word, s.strip()]) for s in sentences if s.strip()]
    return result


In [17]:
def final_qa(df1):

    final_text = []

    for index,i in df1.iterrows():
        result = split_sentence_by_word(i['text'], 'Question: ')
        metadata = i['metadata']
        for i in result:
            final_text.append({'text':i, 'metadata':metadata})
    return final_text

In [19]:
blogs_qa_list = final_qa(df1)
len(blogs_qa_list)

11887

### Remove the rows where LLM couldnt find the answer

In [22]:
to_be_excluded = []
for i in range(len(blogs_qa_list)):
    row = blogs_qa_list[i]
    text = row["text"]
    answer = text.split("Answer: ")[-1]

    if answer.strip() == "None":
        to_be_excluded.append(i)
        continue
        
    if "not" in answer.lower() and "supported" in answer.lower():
        to_be_excluded.append(i)
        continue
        

In [23]:
for index in sorted(to_be_excluded, reverse=True):
    del blogs_qa_list[index]
    
len(docs_qa_list)

11887

### Write the final dataset

In [24]:

with open("blogs_qa_dataset.json", "w") as fp:
    json.dump(blogs_qa_list, fp)