In [116]:
import openai
from utils import load_api_key

In [117]:
openai.api_key = load_api_key("/Users/gursi/Desktop/openai-api.txt")

In [56]:
def note_generator(chat_completion: openai.ChatCompletion, outline: dict[str, list[str]], level: str, subject: str):
    prompt = engineer_prompt_note_gen(outline, level, subject)
    model_output = chat_completion.create(
        model = "gpt-3.5-turbo",
        messages = [
            # {"role": "system", "content":f"Your are a {level} {subject} notes generator."},
            {"role": "system", "content":f"Your are a {level} {subject} notes generator. Generate notes in following format: \n Topic heading: \n \t Notes..."},
            {"role": "user", "content":prompt}
        ]
    )
    model_output = parse_results(model_output)
    return model_output
        
    
def engineer_prompt_note_gen(outline: dict[str, list[str]], level: str, subject: str):
    prompt = f'Generate quick revision notes for me on the following topics for a {subject} course on a {level} level. Include all equations. Explain each variable and concept. Do not say "sure, here are some notes..".\n\n'
    for topic in outline:
        prompt += f"- {topic}\n"
        if outline[topic] is not None:
            for subtopic in outline[topic]:
                prompt += f"    - {subtopic}\n"
    return prompt


def parse_results(chatgpt_output: dict) -> str:
    return chatgpt_output["choices"][0]["message"]["content"]

In [118]:
openai.api_key = load_api_key("/Users/gursi/Desktop/openai-api.txt")
chat_complete = openai.ChatCompletion

In [246]:
outline = {
    "Solution to -1/2 factorial":None,
}
output = note_generator(chat_complete, outline, "College", "Mathematics")

In [247]:
print(output)

Topic heading: Solution to -1/2 factorial

Notes:
The notation for a factorial is given by the exclamation mark, which is represented by the symbol (!). Factorials are used to calculate the number of permutations and arrangements of items. For non-negative integers, the factorial of a number is calculated as follows:

n! = n * (n - 1) * (n - 2) * ... * 2 * 1

For example, 5! = 5 * 4 * 3 * 2 * 1 = 120.

However, for non-integer values, including negative numbers, the factorial is calculated using the gamma function. The gamma function is defined as follows:

Γ(z) = ∫(0,inf) t^(z-1) * e^(-t) dt

Where z is a complex number.

To calculate -1/2 factorial using gamma function, substitute -1/2 into the equation above:

Γ(-1/2) = ∫(0,inf) t^(-3/2) * e^(-t) dt

Using integration by parts, we get:

Γ(-1/2) = 2 * √π

Therefore, -1/2 factorial is equal to 2 times the square root of pi, or:

(-1/2)! = 2 * √π

Note that "factorial" for negative non-integer numbers does not have a commonly accepted 

In [11]:
from utils import parse_results, parse_pdf, create_chat_object
from sklearn.cluster import KMeans
import numpy as np
import openai
import string

In [23]:
def extract_keywords_from_prompt(chat_completion: openai.ChatCompletion, prompt: str):
    model_output = chat_completion.create(
        model = "gpt-3.5-turbo",
        messages = [
            {"role": "system", "content":f"Your job is to extract key words from text. Generic words should never be extracted, only topic specific words."},
            {"role": "user", "content":f"Extract the keywords from the following instruction. Output a single list of comma separated values only once. \n\n {prompt}"}
        ]
    )
    output = parse_results(model_output)
    return output.split(",")

def generate_single_summary(chat_completion: openai.ChatCompletion, input_text: str, summary_prompt: str) -> str:
    model_output = chat_completion.create(
        model = "gpt-3.5-turbo",
        messages = [
            {"role": "system", "content":f"Your job is to summarize text based on a prompt. If relevant data is not found, return nothing."},
            {"role": "user", "content":f"{input_text} \n\n {summary_prompt}"}
        ]
    )
    output = parse_results(model_output)
    return output

def generate_summary(
        chat_completion: openai.ChatCompletion,
        text_body: str,
        prompt: str,
        buffer: int = 600
) -> str:
    keywords = extract_keywords_from_prompt(chat_completion, prompt)
    keywords = [k.translate(str.maketrans('', '', string.punctuation)).strip().lower() for k in keywords]
    print(f"{len(keywords)} keywords found...")
    kw = []
    [[kw.append(w) for w in word.split(" ")] for word in keywords]
    arr_text = np.array(text_body.lower().split())

    print("Matching keywords in text...")
    idxs = np.array([])
    max_idx = len(arr_text)
    for keyw in kw:
        kw_idxs = np.where(arr_text == keyw)[0] / max_idx
        idxs = np.concatenate([idxs, kw_idxs])

    print("Clustering...")
    kmeans = KMeans(n_clusters = len(keywords))
    _ = kmeans.fit_predict(idxs.reshape(-1, 1))
    centroid_idxs = list((kmeans.cluster_centers_ * len(arr_text)).astype(int).reshape(-1))

    print("Generating summary...")
    summaries = []
    for centroid_idx in centroid_idxs:
        text_input = list(arr_text[max(0, centroid_idx - buffer):min(len(arr_text), centroid_idx + buffer)])
        text_input = " ".join(text_input)
        summary = generate_single_summary(chat_completion, text_input, prompt)
        summaries.append(summary)
    summaries = " ".join(summaries)
    final_summary = generate_single_summary(chat_completion, summaries, prompt)
    return final_summary

In [24]:
text_body = parse_pdf("/Users/gursi/Desktop/nst.pdf")
prompt = "Summarize the loss function, optimization method and model architecture used in this study for me."
chat_completion = create_chat_object("/Users/gursi/Desktop/openai-api.txt")
summary = generate_summary(
    chat_completion,
    text_body,
    prompt,
)
print(summary)

4 keywords found...
Matching keywords in text...
Clustering...
Generating summary...
The study used an artificial neural system that separated image content from style using a VGG neural network and optimizing the white noise image through gradient descent. The loss function included a squared-error loss for feature representation and a mean-squared distance for style representation, with adjustable trade-off between the two. The model architecture consisted of 16 convolutional and 5 pooling layers, with max-pooling replaced by average pooling for image synthesis. Fully connected layers were not used, and the model was publicly available in the Caffe framework. The weighting factors for the contribution of each layer to the total loss were also included.


In [248]:
text_body = parse_pdf("/Users/gursi/desktop/223.pdf")
prompt = "Summarize linear independence."
chat_completion = create_chat_object("/Users/gursi/Desktop/openai-api.txt")
summary = generate_summary(
    chat_completion,
    text_body,
    prompt,
)
print(summary)

2 keywords found...
Matching keywords in text...
Clustering...
Generating summary...
Linear independence is a property of a set of vectors where none of them can be expressed as a combination of the others. It is important in linear algebra for subspaces and bases. A basis is a set of linearly independent vectors that spans the subspace, and the dimension of a subspace is the number of vectors in a basis. Linearly independent vectors can be used as a basis for a vector space, allowing any vector in that space to be expressed as a unique linear combination of the basis vectors. The determinant of their matrix must be non-zero for a set of vectors to be linearly independent.


In [27]:
text_body = parse_pdf("/Users/gursi/desktop/eco.pdf")
prompt = "Summarize the topic on the intertemporal budget line."
chat_completion = create_chat_object("/Users/gursi/Desktop/openai-api.txt")
summary = generate_summary(
    chat_completion,
    text_body,
    prompt,
)
print(summary)

3 keywords found...
Matching keywords in text...
Clustering...
Generating summary...
The intertemporal budget line is a concept that represents the trade-off between current and future consumption. It shows the different combinations of present and future goods that a consumer can afford given their income and the interest rate. The slope of the line represents the opportunity cost of present consumption in terms of future consumption, and the intercepts represent the maximum levels of current and future consumption. The economic interpretation of the intercepts is that they represent the present value of future income or the future value of current income. A numerical example is provided to illustrate how the intertemporal budget line works in practice. Overall, the intertemporal budget line is a useful tool for analyzing the intertemporal allocation of resources and understanding how individuals make choices between present and future consumption. However, the given text does not con

In [62]:
text_body = parse_pdf("/Users/gursi/Desktop/test.pdf")

In [264]:
model_output = chat_completion.create(
        model = "gpt-3.5-turbo",
        messages = [
            {"role": "system", "content":f"Your job is to extract questions and their respective answers from the given texts. Output them in the given format: \n\n Question: \n Answer: \n"},
            {"role": "user", "content":f"{text_body[:-9000]} \n\n Make sure to extract questions and their answers."}
        ]
    )
output = parse_results(model_output)

In [265]:
print(output)

Sorry, I cannot extract questions and answers as the given text does not include any specific questions and answers. It only provides information about the course syllabus, instructors, TAs, textbook, and assessment system.


In [250]:
def read_questions_answers(chat_completion: openai.ChatCompletion, text_body: str, div_length: int = 5000) -> dict[str, str]:
    output_strs = ""
    for i in range(1, (len(text_body) // div_length) + 2):
        x = i * div_length
        substr = text_body[(x - i):x]
        model_output = chat_completion.create(
            model = "gpt-3.5-turbo",
            messages = [
                # {"role": "system", "content":f"Your job is to extract suspected questions and their answers from the given text. Output them in the given format: \n\n Question: \n Answer: \n"},
                # {"role": "user", "content":f"Find and extract question and answer pairs from this test: {text_body}"}
                {"role": "system", "content":f"Your task is to analyze a given text and extract questions along with their respective answers. While examining the text, ensure that you pay close attention to accuracy, clarity, and any specific formatting requirements within the text. If a question contains unique elements like programming code or math equations, make sure to include those in your output as well. Present your findings using the following format: Question: [Insert extracted question here] Answer: [Insert corresponding answer here] While maintaining the original meaning of both questions and answers, demonstrate flexibility and creativity in extracting information from various types of texts (e.g., articles, interviews, forums). Your response should be adaptable enough to capture unique insights found within different sources while still accurately presenting the extracted data. Please provide at least three examples of questions and answers extracted from the provided text."},
                {"role": "user", "content":f"{substr} \n\n Make sure to extract questions and their answers."}
            ]
        )
        output = parse_results(model_output)
        output_strs += output + "\n\n"

    # output_dict = {}
    # for qna_pair in output.split("Question:"):
    #     if qna_pair != "":
    #         q, a = qna_pair.split("Answer:")
    #         q, a = q.strip(), a.strip()
    #         output_dict[q] = a
    # return output_dict
    return output_strs

In [251]:
text_body = parse_pdf("/Users/gursi/Desktop/test.pdf")
output_dict = read_questions_answers(chat_completion, text_body)

In [252]:
print(output_dict)

Sure, I apologize for the mistake. Can you please provide me with the text that you would like me to analyze and extract questions and their answers from?

Question: What is the task I am supposed to perform on the given text? 
Answer: You are supposed to analyze the given text and extract questions along with their respective answers. 

Question: What are the things to be considered while examining the text?
Answer: Accuracy, clarity, and any specific formatting requirements within the text should be considered while examining the text.

Question: How should the extracted information be presented?
Answer: The extracted information should be presented in the format of Question and Answer.

Apologies for the confusion. Here are three examples of questions and their corresponding answers extracted from the provided text:

Question: What is the main purpose of this text?
Answer: The main purpose of this text is to provide instructions on how to analyze a given text and extract questions a

In [274]:
sample_question = list(output_dict.keys())[1]
expected_answer = output_dict[sample_question]
my_answer = "Missing values in the DOB field are represented with the NA values. The code will not produce correct outputs if these are not removed. You would remove them using the is.na() and filter() functions in R."

def grade_answer(chat_completion: openai.ChatCompletion, question: str, my_answer: str, correct_answer: str) -> str:
    model_output = chat_completion.create(
        model = "gpt-3.5-turbo",
        messages = [
            {"role": "system", "content":f"You are a teacher and your job is to grade my answer based on the Expected Answer and Question. My answer need not exactly match the Expected Answer. Give your output in the following format: \n Score (Out of 100): \n Feedback: ...\n"},
            {"role": "user", "content":f"Question: {question} \n Expected Answer: {correct_answer} \n My answer: {my_answer}"}
        ]
    )
    output = parse_results(model_output)
    return output

print(grade_answer(chat_completion, sample_question, my_answer, expected_answer))

Score (Out of 100): 60
Feedback: Your answer is discussing specific steps for data cleaning in R rather than addressing the question of what should be considered while examining text. While the information you provided may be useful in some contexts, it is not directly related to the topic at hand. To improve your score, try to focus on answering the question asked and providing relevant information.


In [258]:
def generate_syllabus(chat_completion: openai.ChatCompletion, text_body: str, div_length: int = 5000) -> str:
    model_output = chat_completion.create(
        model = "gpt-3.5-turbo",
        messages = [
            # {"role": "system", "content":f"Your job is to read the information and extract the Grade Weightages, Office Hour timings, whther or not the course has mandatory attendance, the late submission policy and the regrading policy and output it in the given format:\n\n - Grade weightages/Breakdown: \n  - Component 1 - X% \n   - Component 2 - X% \n Office Hours: \n Mandatory Attendance: <Yes/No>\n Late submissing policy: \n Regrading policy: "},
            {"role": "system", "content":f"Your job is to read the information and extract the Grade Weightages, Office Hour timings, whther or not the course has mandatory attendance, the late submission policy and the regrading policy and output it in the given format:\n\n - Grade weightages/Breakdown: \n  - Component 1 - X% \n   - Component 2 - X% \n Office Hours: \n Mandatory Attendance: <Yes/No>\n Late submissing policy: \n Regrading policy: "},
            {"role": "user", "content":f"{text_body}"}
            ]
        )
    output = parse_results(model_output)
    return output

text_body = parse_pdf("/Users/gursi/Desktop/223-syl.pdf")
output = generate_syllabus(chat_completion, text_body)
print(output)

Grade weightages/Breakdown:
- Reading Assignments - 14%
- Online Standards-based Homework - 15%
- Group Reports - 30%
- Individual Reflective Writing Assignments - 6%
- Standards-based, in-person final exam or final project - 35%

Office Hours: Posted on Quercus

Mandatory Attendance: Not mentioned

Late submission policy:
- Late reading assignments are not accepted
- MathMatize homework assignments must be completed by the due dates listed above
- Late group reports are not accepted except for special circumstances

Regrading policy: Students can resubmit online standards-based homework and group reports without penalty. The higher of the original score and resubmitted score will be used for course marks. Final exam and individual reflective writing assignments cannot be resubmitted.


In [260]:
model_output = chat_completion.create(
    model = "gpt-3.5-turbo",
    messages = [
        {"role": "system", "content":f"You are a college exam maker for the mathematics department. Your job is to generate question and answer pairs for questions of the given topic in the format: \n Question: \n Answer: "},
        {"role": "user", "content":f"Improper integrals"}
        ]
    )
output = parse_results(model_output)
print(output)

Question: What is an improper integral?

Answer: An improper integral is a type of definite integral where either the limits of integration or the integrand itself is unbounded or undefined. This means that the integral cannot be evaluated using the traditional methods of integration and requires special techniques to determine its value.


In [287]:
def generate_questions(chat_completion: openai.ChatCompletion, level: str, subject: str, topic: str, difficulty: str, num_questions: int) -> dict[str, str]:
    model_output = chat_completion.create(
        model = "gpt-3.5-turbo",
        messages = [
            {"role": "system", "content":f"You are a {level} exam maker for the {subject} department. Your job is to generate the given number of question and answer pairs for questions of the given topic in the format. Do not number the questions: \n Question: \n Answer: \n\n"},
            {"role": "user", "content":f"Generate {str(num_questions)} of difficulty {difficulty} on {topic}"}
            ]
        )
    output = parse_results(model_output)
    output_dict = {}
    for qna_pair in output.strip().split("Question:"):
        if qna_pair != "":
            q, a = qna_pair.split("Answer:")
            q, a = q.strip(), a.strip()
            output_dict[q] = a
    return output_dict

In [288]:
output = generate_questions(chat_completion, "High school", "Biology", "Plant biology", "Easy", 5)

In [289]:
print(output)

{'What is photosynthesis?': 'Photosynthesis is the process by which plants convert light energy into chemical energy to be stored as glucose.', 'What is the function of the root system in a plant?': 'The root system in a plant anchors it to the soil and absorbs water and nutrients.', 'What is the function of stomata in plants?': 'Stomata are small openings on the surface of leaves that allow for the exchange of gases, such as carbon dioxide and oxygen.', 'What is the function of chloroplasts in plant cells?': 'Chloroplasts are organelles within plant cells that carry out photosynthesis.', 'What is the function of the stem in a plant?': 'The stem provides support for the plant and transports water and nutrients throughout the plant.'}


In [266]:
sample_output = """
Question: What is the task I am supposed to perform on the given text? 
Answer: You are supposed to analyze the given text and extract questions along with their respective answers. 

Question: What are the things to be considered while examining the text?
Answer: Accuracy, clarity, and any specific formatting requirements within the text should be considered while examining the text.

Question: How should the extracted information be presented?
Answer: The extracted information should be presented in the format of Question and Answer.

Apologies for the confusion. Here are three examples of questions and their corresponding answers extracted from the provided text:

Question: What is the main purpose of this text?
Answer: The main purpose of this text is to provide instructions on how to analyze a given text and extract questions along with their respective answers.

Question: What should be paid attention to while examining the text?
Answer: While examining the text, one should pay close attention to accuracy, clarity, and any specific formatting requirements within the text.

Question: What should be included in the output if a question contains unique elements like programming code or math equations?
Answer: If a question contains unique elements like programming code or math equations, they should be included in the output along with the question and answer.
"""

In [271]:
output_dict = {}
for qna_pair in sample_output.strip().split("Question:"):
    if qna_pair != "":
        q, a = qna_pair.split("Answer:")
        q, a = q.strip(), a.strip()
        output_dict[q] = a

In [272]:
output_dict

{'What is the task I am supposed to perform on the given text?': 'You are supposed to analyze the given text and extract questions along with their respective answers.',
 'What are the things to be considered while examining the text?': 'Accuracy, clarity, and any specific formatting requirements within the text should be considered while examining the text.',
 'How should the extracted information be presented?': 'The extracted information should be presented in the format of Question and Answer.\n\nApologies for the confusion. Here are three examples of questions and their corresponding answers extracted from the provided text:',
 'What is the main purpose of this text?': 'The main purpose of this text is to provide instructions on how to analyze a given text and extract questions along with their respective answers.',
 'What should be paid attention to while examining the text?': 'While examining the text, one should pay close attention to accuracy, clarity, and any specific formatt