# Dataset Creation with OpenAI API

In [None]:
!pip -q install langchain tiktoken cohere openai PyPDF2 datasets


from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter,CharacterTextSplitter
import re
import openai
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.9/48.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.4/221.4 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.5/181.5 kB[0m [31m17.4 M

In [None]:
# OpenAI API key: taken from the environment when already set, otherwise
# prompted for without echo, so the secret is never stored in the notebook source.
import os
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
  os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [None]:
# Mount Google Drive so files under /content/gdrive can be read and written
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Open the HP troubleshooting guide PDF stored on the mounted Drive
HP_GUIDE_PDF_PATH = '/content/gdrive/MyDrive/Colab Notebooks/Langchain/HP_project/HP_Troubleshooting_Guide.pdf'
hp_guide = PdfReader(HP_GUIDE_PDF_PATH)


In [None]:
# Extract the text of every page and concatenate it into one string.
# Pages for which extract_text() returns nothing are skipped.
page_texts = (page.extract_text() for page in hp_guide.pages)
hp_guide_text = ''.join(text for text in page_texts if text)



In [None]:
# Break the guide text into overlapping chunks to send to ChatGPT one at a time
CHUNK_SIZE = 3000
CHUNK_OVERLAP = 200
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
split_text = text_splitter.split_text(hp_guide_text)

chunk_count = len(split_text)
print("Number of text chunks: {}  \n".format(chunk_count))
print("Example of text_chunk \n")

# Last expression of the cell: the notebook displays the first chunk
split_text[0]


Number of text chunks: 41  

Example of text_chunk 



'Troubleshooting Guide www.hp.com 1–11\nComputer Diagnostic Features\nDiagnostics for Windows\nThe Diagnostics for Windows (DFW)  utility allows you to view \ninformation about the hardware and software configuration of the computer while running Microsoft Windows XP. It also allows you to perform hardware and software te sts on the subsystems of the \ncomputer.\nWhen you invoke Diagnostics for Wi ndows, the Overview screen is \ndisplayed, which shows the current configuration of the computer. From the Overview screen, there is access to several categories of information about th e computer and the Test tab. The information in \nevery screen of the utility can be saved to a file or printed.\n✎To test all subsystems, you must log in as the administrator. If you do \nnot log in as the administrator, you will be unable to test some \nsubsystems. The inability to test a subsystem will be indicated by an \nerror message under the subsystem na me in the Test window or by \nshaded check boxes

In [None]:
# Whitespace-delimited word count of the first chunk
example_word_count = len(split_text[0].split())
print("Number of words in example text chunk: {}".format(example_word_count))

Number of words in example text chunk: 461


In [None]:
#Function to obtain question/answer pairs from the text chunks created above. The prompt gives specific instructions on how to format the q/a pairs so they are easy to parse after they are created.
#Each q/a pair can be separated from the others with .split("&&&"). The question can then be separated from the answer with .split("###").

def q_a_retrieval(prompt):
  """Ask the chat model to write question/answer pairs about a piece of text.

  The system message instructs the model to produce 12 question/answer pairs,
  each formatted as ``question### answer&&&``.

  Args:
    prompt: Text sent as the user message.

  Returns:
    The text content of the first returned choice, or an empty string when
    that choice carries no text content.
  """
  response=openai.chat.completions.create(
         model="gpt-4",
         messages=[{'role':"system",
                    "content":'''You are a helpful assistant that does not make up answers. Text will be provided by the user. You are to create a question and answer pair.
                    You are to create 12 of those question and answer pairs. These question answer pairs will be used as training data to train a Large Langauge Model.
                    The question answer pairs need to be different from each other. The output for each question and answer pair should be as
                    follows:  user prompt### model answer&&& . Here is an example:  What is the capital of France ### The capital of France is Paris.&&&
                    Do not number the question/answers response. Do not use bulletpoints for the question/answers response  '''},
                  {"role": "user",
                   "content":prompt}],
         temperature=0.0,
         max_tokens=1500,
         top_p=0.3,
         n=1)
  content=response.choices[0].message.content

  # The API types message.content as optional; normalise a missing value to ""
  # so callers can safely concatenate the results of several calls.
  return content if content is not None else ""

In [None]:
#Getting the Q/A pairs from ChatGPT
# Iterate over the chunks themselves rather than a fixed index range, so that
# no chunk is skipped and a document with fewer chunks does not raise IndexError.
q_a_pairs = "".join(q_a_retrieval(chunk) for chunk in split_text)

# Last expression of the cell: the notebook displays the combined raw output
q_a_pairs

In [None]:
#Function to separate the questions and answers. The original output, questions and answers are then placed into a pandas dataframe.
def q_a_dataframe(openai_output):
  """Parse raw model output into a dataframe of question/answer pairs.

  Entries are delimited by "&&&"; inside an entry the question and the answer
  are delimited by "###". The number of entries before and after filtering is
  printed.

  Args:
    openai_output: Concatenated text returned by the model.

  Returns:
    A pandas DataFrame with the columns "q_and_a" (the stripped entry),
    "question" and "answer".
  """
  entries=[entry.strip() for entry in openai_output.split("&&&")]
  print(len(entries))

  # Split each entry once; an entry without "###" yields a single part and is
  # dropped as malformed ChatGPT output that holds no Q/A pair.
  split_entries=[(entry, entry.split("###")) for entry in entries]
  valid_entries=[(entry, parts) for entry, parts in split_entries if len(parts)>=2]
  print(len(valid_entries))

  columns={
      "q_and_a":[entry for entry, _ in valid_entries],
      'question':[parts[0].strip() for _, parts in valid_entries],
      "answer":[parts[1].strip() for _, parts in valid_entries],
  }
  return pd.DataFrame(columns)


# Parse the accumulated model output into the question/answer dataframe
df_q_a_hp_guide = q_a_dataframe(openai_output=q_a_pairs)


In [None]:
# Persist the Q/A dataframe to Google Drive as a CSV file
HP_QA_CSV_PATH = "/content/gdrive/MyDrive/Colab Notebooks/Langchain/HP_project/df_q_a_hp_guide.csv"
df_q_a_hp_guide.to_csv(HP_QA_CSV_PATH)

In [None]:
# Reload the saved Q/A dataset, using the first CSV column as the dataframe index
HP_QA_CSV_PATH = "/content/gdrive/MyDrive/Colab Notebooks/Langchain/HP_project/df_q_a_hp_guide.csv"
df_q_a_hp_guide = pd.read_csv(HP_QA_CSV_PATH, index_col=0)

In [None]:
# Report the dataset size and preview the first rows. The dataframe loaded in
# the cell above is named df_q_a_hp_guide.
print("\n\nLength of q/a dataset: {}".format(df_q_a_hp_guide.shape[0]))
df_q_a_hp_guide.head(15)




Length of q/a dataset: 452


Unnamed: 0,q_and_a,question,answer
,,,
0.0,"""What is the Diagnostics for Windows (DFW) uti...","""What is the Diagnostics for Windows (DFW) uti...",The Diagnostics for Windows (DFW) utility is u...
1.0,"""How can you access the information about the ...","""How can you access the information about the ...","When you invoke Diagnostics for Windows, the O..."
2.0,"""What are the limitations of using Diagnostics...","""What are the limitations of using Diagnostics...","If you do not log in as the administrator, you..."
3.0,"""What is the purpose of using Diagnostics for ...","""What is the purpose of using Diagnostics for ...",Diagnostics for Windows is used to determine i...
4.0,"""What should you do if third party devices are...","""What should you do if third party devices are...",Third party devices not supported by HP may no...
5.0,"""How can you determine whether Diagnostics for...","""How can you determine whether Diagnostics for...",To determine whether Diagnostics for Windows i...
6.0,"""What should you do if Diagnostics for Windows...","""What should you do if Diagnostics for Windows...",If Diagnostics for Windows is not installed on...
7.0,"""What are the steps to install the Diagnostics...","""What are the steps to install the Diagnostics...",To install the Diagnostics for Windows utility...
8.0,"""What should you do if the Diagnostics for Win...","""What should you do if the Diagnostics for Win...",If the Diagnostics for Windows utility is load...
