### Imports

In [None]:
import os
import glob
import json
import pandas as pd
import re
import pickle
import concurrent.futures
import requests
import openai
from tqdm import tqdm
from langchain.text_splitter import CharacterTextSplitter

### Load PyTorch Docs

In [None]:
# Load the raw docs and keep only entries whose text is longer than 100
# characters, renumbering the surviving rows from zero.
df = pd.read_json('docs.json')
long_enough = df['text'].str.len() > 100
df = df[long_enough].reset_index(drop=True)

# Strip runs of three or more asterisks, then collapse doubled newlines.
pattern = r'\*{3,}'
stripped = df['text'].apply(lambda raw: re.sub(pattern, '', raw))
df['text'] = stripped
df['text'] = df['text'].str.replace('\n\n', '\n')

# Persist the cleaned frame and show it as the cell output.
df.to_csv('docs_cleaned.csv')
df

### Using langchain - split the data into multiple pages

In [None]:
def split_pages(df):
    """Split every document in *df* into smaller pages and pickle the result.

    Each row's ``text`` is chunked with a newline-separated
    ``CharacterTextSplitter`` (``chunk_size=2048``); the row's ``metadata``
    is passed alongside the text to ``create_documents``.

    Args:
        df: DataFrame with ``text`` and ``metadata`` columns.

    Returns:
        The list of documents produced by the splitter, which is also
        written to ``pages.pkl`` in the current working directory.
    """
    splitter = CharacterTextSplitter(separator="\n", chunk_size=2048)
    print('chunking pages into smaller sub-pages')

    pages = []
    for _, row in df.iterrows():
        pages.extend(splitter.create_documents([row['text']], [row['metadata']]))

    print('saving pages as pages.pkl')
    # Use a context manager so the file handle is flushed and closed
    # deterministically instead of being left to the garbage collector.
    with open('pages.pkl', 'wb') as fh:
        pickle.dump(pages, fh)

    print('total pages:', len(pages))
    return pages


# Chunk the cleaned docs; split_pages also writes the result to pages.pkl.
pages = split_pages(df)
# Cell output: number of chunks produced.
len(pages)

### Run this step multiple times, with the start and end values set to successive multiples of 1000 (see the ranges below)
```
start = 0, end = 1000
start = 1000, end = 2000
start = 2000, end = 3000
start = 3000, end = 4070
```

### At the end of each iteration, docs_qa_openai_*.csv will be generated

In [None]:
# NOTE: fill this in locally; never commit a real key.
api_key = ''


def get_qa_openai(context, max_retries=3, wait_seconds=180):
    """Send *context* to gpt-3.5-turbo as a single user message and return the reply.

    Transient failures (network errors, rate limits, timeouts, server-side
    errors) are retried after a pause. Once the retry budget is exhausted
    the last error is re-raised so the caller can handle it.

    Args:
        context: Prompt text sent as the user message.
        max_retries: Maximum number of attempts (at least one is always made).
        wait_seconds: Seconds to sleep between failed attempts.

    Returns:
        The content of the first choice's message.

    Raises:
        requests.exceptions.RequestException or openai.error.OpenAIError:
            if every attempt fails with a transient error. Non-transient
            errors propagate immediately.
    """
    # Local import: `time` is not among the file-level imports.
    import time

    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                api_key=api_key,
                messages=[{"role": "user", "content": context}],
            )
        except (
            requests.exceptions.RequestException,
            # The legacy openai client wraps transport failures in its own
            # exception types, so these must be caught too.
            openai.error.RateLimitError,
            openai.error.APIConnectionError,
            openai.error.Timeout,
            openai.error.ServiceUnavailableError,
            openai.error.APIError,
        ) as e:
            if attempt == attempts:
                raise
            print(f'Request failed with error: {str(e)}.')
            print(f'Waiting for {wait_seconds} seconds before trying again...')
            time.sleep(wait_seconds)
        else:
            return completion.choices[0].message.content

questions_ans = []

# Slice of pages handled by this run; change these and re-run the cell for
# each batch.
start = 3000
end = 4070

with concurrent.futures.ThreadPoolExecutor() as executor:

    # Map each future to the page it was created from. as_completed yields
    # futures in completion order, not submission order, so pairing it with
    # the page list positionally would attach the wrong metadata.
    future_to_page = {}
    for i in pages[start:end]:

        context = f"Generate question and answer only in this format 'Question: Answer:' using this context \
        and you can decide the number of question and answer to generate based \
        on context size but don't generate too many same kind of questions: {i.page_content}"

        future_to_page[executor.submit(get_qa_openai, context)] = i

    for future in tqdm(concurrent.futures.as_completed(future_to_page), total=len(future_to_page)):
        page = future_to_page[future]
        try:
            qa = future.result()
        except Exception as exc:
            # Best effort: report the failure and skip this page.
            print(f'generated an exception: {exc}')
        else:
            questions_ans.append({'text': qa, 'metadata': page.metadata})

# One output file per batch, named after the slice bounds.
df1 = pd.DataFrame(questions_ans)
df1.to_csv(f'docs_qa_openai_{start}_{end}.csv')
df1.shape

### Combine all the output files into a single file

In [None]:
# glob returns matches in arbitrary order; sort so the combined row order is
# reproducible between runs.
docs_qa_list = sorted(glob.glob("docs_qa_openai_*"))

# Read each batch file into its own frame. A comprehension is used so the
# notebook-global `df` (the cleaned docs) is not overwritten.
df_list = [
    pd.read_csv(filename, index_col=None, header=0)
    for filename in docs_qa_list
]

docs_qa = pd.concat(df_list, axis=0, ignore_index=True)
docs_qa.shape

### Each row contains multiple questions and answers. Split it into multiple rows so that each row holds one question and its answer

In [None]:
def split_sentence_by_word(sentence, split_word):
    """Split *sentence* on *split_word* and re-prefix each non-blank piece.

    Pieces are stripped of surrounding whitespace; pieces that are empty
    after stripping are dropped. Every kept piece is returned with
    *split_word* put back in front of it.
    """
    result = []
    for piece in sentence.split(split_word):
        trimmed = piece.strip()
        if trimmed:
            result.append(split_word + trimmed)
    return result


In [None]:
def final_qa(df1):
    """Expand each row of *df1* into one record per 'Question: ' segment.

    The row's ``text`` is split with ``split_sentence_by_word`` on
    ``'Question: '``; every resulting segment becomes a dict holding that
    segment as ``text`` together with the row's ``metadata``.

    Returns:
        A list of ``{'text': ..., 'metadata': ...}`` dicts.
    """
    records = []

    for _, row in df1.iterrows():
        segments = split_sentence_by_word(row['text'], 'Question: ')
        metadata = row['metadata']
        records.extend(
            {'text': segment, 'metadata': metadata} for segment in segments
        )

    return records

In [None]:
# Rebinds docs_qa_list (previously the list of CSV filenames) to the
# per-question records built from the combined frame.
docs_qa_list = final_qa(docs_qa)
# Cell output: number of question/answer records.
len(docs_qa_list)

### Remove the rows where the LLM couldn't find the answer

In [None]:
# Collect the positions of records to drop: those whose answer (the text
# after the last "Answer: ") is exactly "None", or mentions both "not" and
# "supported" (case-insensitive substring checks).
to_be_excluded = []
for i, row in enumerate(docs_qa_list):
    text = row["text"]
    answer = text.split("Answer: ")[-1]
    lowered = answer.lower()

    is_none = answer.strip() == "None"
    if is_none or ("not" in lowered and "supported" in lowered):
        to_be_excluded.append(i)
        

In [None]:
# Remove from the highest position downwards so earlier removals do not
# shift the positions still waiting to be removed.
for index in reversed(sorted(to_be_excluded)):
    docs_qa_list.pop(index)

len(docs_qa_list)

### Write the final dataset

In [None]:

# Serialize the remaining records as a single JSON array.
with open("docs_qa_dataset.json", "w") as fp:
    fp.write(json.dumps(docs_qa_list))