<a href="https://colab.research.google.com/github/eslqian/410Project/blob/task%2Frm%2FstudyGuideHelperPhase1/410_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CS 410 Project: Part 1 - Finding best document to answer guiding question

### 1 Unzip transcript files to use

In [None]:
import zipfile
import os

# Archive holding the week's transcripts, and the folder it is unpacked into.
# Point zip_path at your own archive if it lives somewhere else.
zip_path = '/content/Week7.zip'
extract_path = '/content/'

# Ensure the destination exists; exist_ok avoids an error when it already does
os.makedirs(extract_path, exist_ok=True)

# Unpack every member of the archive into the destination folder
archive = zipfile.ZipFile(zip_path, 'r')
with archive:
    archive.extractall(extract_path)

print(f"Extracted to {extract_path}")


Extracted to /content/


### View the overview questions

In [None]:
import re
import os

# Define the path to your overview file
overview_file = "/content/week7_overview.txt"


# Defined at module level (not inside the file-exists check) so that later
# cells can call it regardless of which branch ran here.
def extract_content(heading, text, next_heading=None):
    """Return the non-empty lines of the section that follows ``heading``.

    The section body is expected to start after ``heading`` and one blank line.

    :param heading: Literal heading text that opens the section.
    :param text: Full text to search.
    :param next_heading: Literal heading that closes the section. When omitted,
        the section runs to a line holding a single capital letter or to the
        end of the text.
    :return: List of stripped, non-empty lines; empty list if nothing matches.
    """
    # Headings are literal text, so escape them before building the regex;
    # otherwise characters such as "(" or "?" would be read as regex syntax.
    opening = re.escape(heading) + r"\n\n(.*?)"
    if next_heading:
        pattern = re.compile(opening + r"\n" + re.escape(next_heading), re.DOTALL)
    else:  # last section
        pattern = re.compile(opening + r"(?=\n[A-Z]\n|$)", re.DOTALL)

    match = pattern.search(text)
    content = match.group(1).strip() if match else ""
    return [line.strip() for line in content.split('\n') if line.strip()]


# Check if the file exists before proceeding
if os.path.exists(overview_file):
    with open(overview_file, 'r') as file:
        text = file.read()

    # Extract each section
    goals_and_objectives = extract_content("Goals and Objectives", text, "Guiding Questions")
    guiding_questions = extract_content("Guiding Questions", text, "Key Phrases and Concepts")
    key_phrases_and_concepts = extract_content("Key Phrases and Concepts", text)

    # Print the extracted sections; every title after the first is preceded
    # by a blank line.
    sections = [
        ("Goals and Objectives:", goals_and_objectives),
        ("\nGuiding Questions:", guiding_questions),
        ("\nKey Phrases and Concepts:", key_phrases_and_concepts),
    ]
    for title, items in sections:
        print(title)
        for item in items:
            print("-", item)
else:
    print(f"The file at {overview_file} does not exist.")


Goals and Objectives:
- Explain some basic concepts in natural language processing.
- Explain different ways to represent text data.
- Explain the two basic types of word associations and how to mine paradigmatic relations from text data.

Guiding Questions:
- What does a computer have to do in order to understand a natural language sentence?
- What is ambiguity?
- Why is natural language processing (NLP) difficult for computers?
- What is bag-of-words representation?
- Why is this word-based representation more robust than representations derived from syntactic and semantic analysis of text?
- What is a paradigmatic relation?
- What is a syntagmatic relation?
- What is the general idea for discovering paradigmatic relations from text?
- What is the general idea for discovering syntagmatic relations from text?
- Why do we want to do Term Frequency Transformation when computing similarity of context?
- How does BM25 Term Frequency transformation work?
- Why do we want to do Inverse Docu

## For each overview question and key phrase, find the documents most likely to contain the answer

In [None]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# NOTE: extract_content must already be defined (it is not defined in this cell)

transcripts_folder = "/content/Week7"
overview_file = "/content/week7_overview.txt"

# Read the whole overview file into memory
with open(overview_file, 'r') as overview_handle:
    overview_text = overview_handle.read()

# Pull the two sections that are used as retrieval queries
guiding_questions = extract_content("Guiding Questions", overview_text, "Key Phrases and Concepts")
key_phrases_and_concepts = extract_content("Key Phrases and Concepts", overview_text)

# Questions first, then key phrases
queries = [*guiding_questions, *key_phrases_and_concepts]

# documents[i] is the text of the transcript named document_names[i]
documents = []
document_names = []
for entry in os.listdir(transcripts_folder):
    if not entry.endswith(".txt"):
        continue  # only transcript text files are indexed
    transcript_path = os.path.join(transcripts_folder, entry)
    with open(transcript_path, 'r') as transcript_handle:
        transcript_text = transcript_handle.read()
    documents.append(transcript_text)
    document_names.append(entry)

# TF-IDF model with scikit-learn's built-in English stop-word list
vectorizer = TfidfVectorizer(stop_words='english')

# Fit on the transcripts; one row of TF-IDF features per document
tfidf_matrix = vectorizer.fit_transform(documents)

# Rank the indexed transcripts against a single query
def find_top_documents(query, top_n=3):
    """Return ``(document name, cosine similarity)`` pairs for the best matches.

    The query is projected with the module-level ``vectorizer`` and compared
    with every row of ``tfidf_matrix``; pairs are ordered from highest to
    lowest similarity.
    """
    query_vector = vectorizer.transform([query])
    scores = cosine_similarity(query_vector, tfidf_matrix).flatten()
    ascending = scores.argsort()
    # Take the tail of the ascending order, then flip it so the best is first
    best_first = ascending[-top_n:][::-1]
    ranked = []
    for idx in best_first:
        ranked.append((document_names[idx], scores[idx]))
    return ranked

# Report the best-matching transcripts for every query
for query in queries:
    top_documents = find_top_documents(query)
    report = [f"Query: {query}"]
    report.extend(
        f"Matching document: {doc} with score {score:.4f}"
        for doc, score in top_documents
    )
    report.append("\n")  # same trailing blank lines as a separate print("\n")
    print("\n".join(report))


Query: What does a computer have to do in order to understand a natural language sentence?
Matching document: 7_3_Natural_Language_Content_Analysis_Part_1.txt with score 0.4608
Matching document: 7_5_Text_Representation_Part_1.txt with score 0.1435
Matching document: 7_7_Word_Association_Mining_And_Analysis.txt with score 0.1224


Query: What is ambiguity?
Matching document: 7_3_Natural_Language_Content_Analysis_Part_1.txt with score 0.1418
Matching document: 7_6_Text_Representation_Part_2.txt with score 0.0000
Matching document: 7_5_Text_Representation_Part_1.txt with score 0.0000


Query: Why is natural language processing (NLP) difficult for computers?
Matching document: 7_3_Natural_Language_Content_Analysis_Part_1.txt with score 0.3344
Matching document: 7_4_Natural_Language_Content_Analysis_Part_2.txt with score 0.2958
Matching document: 7_5_Text_Representation_Part_1.txt with score 0.1379


Query: What is bag-of-words representation?
Matching document: 7_5_Text_Representation_Par

# GPT Question Generation

## Have GPT generate a list of guiding questions and key concepts based on the video transcripts

In [None]:
# Read the key from the environment instead of pasting it into the notebook.
# Set OPENAI_API_KEY before running this cell; the value is None when unset.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [None]:
from openai import OpenAI
import os
import re

# Folder that holds the transcript .txt files
transcripts_folder = "/content/Week7"

# Instruction text placed in front of every transcript chunk. It asks for a
# "Guiding Questions" section (Q1/A1 pairs) followed by a "Key Concepts"
# section ("1. {Term} - {Definition}").
pre_prompt = (
    "The following text is a part of a lecture transcript from a master-level text information course. "
    "Please provide a response in two sections. "
    "In the first section, called Guiding Questions, list down potential Guiding Questions and provide an answer to each as a numbered list in the format of Q1: {question} A1: {answer to Q1}. "
    "In the second section, called Key Concepts, identify Key Concepts and provide a definition for each as numbered list in the format of 1. {Term} - {Definition}: "
)

def query_chat_gpt(prompt):
    """Send ``prompt`` as one user message and return the reply text.

    A client is created from ``OPENAI_API_KEY`` on every call and the request
    goes to the ``gpt-3.5-turbo-16k`` chat model.
    """
    request = {
        "model": "gpt-3.5-turbo-16k",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 4096,  # upper bound on the length of the generated reply
    }
    client = OpenAI(api_key=OPENAI_API_KEY)
    completion = client.chat.completions.create(**request)
    # Only the first returned choice is used
    first_choice = completion.choices[0]
    return first_choice.message.content

def parse_questions_answers(text):
    """Extract ``Qn: ... / An: ...`` pairs from ``text``.

    The answer label must carry the same number as its question and start on
    the line after it. Returns a list of ``{"Q": question, "A": answer}``
    dicts in order of appearance.
    """
    qa_regex = re.compile(r"Q(\d+): (.*?)\nA\1: (.+?)(?=\nQ\d+:|\Z)", re.DOTALL)
    questions_answers = []
    for found in qa_regex.finditer(text):
        # group 1 is the shared number; groups 2 and 3 are the texts
        questions_answers.append({"Q": found.group(2), "A": found.group(3)})
    return questions_answers

def parse_key_concepts(text):
    """Extract ``N. Term - Definition`` entries from ``text``.

    Returns a list of dicts with the string fields ``id``, ``term`` and
    ``definition``, in order of appearance.
    """
    concept_regex = re.compile(r"(\d+)\. (.*?) - (.*?)(?=\n\d+\. |$)", re.DOTALL)
    field_names = ("id", "term", "definition")
    return [
        dict(zip(field_names, groups))
        for groups in concept_regex.findall(text)
    ]

# Accumulators. Only all_responses is filled by the loop below; the other two
# lists are not populated in this cell.
all_guiding_questions = []
all_key_concepts = []
all_responses = []

# Maximum number of characters of transcript text sent per request
max_chunk_size = 10000

# Walk the transcript folder and query the model once per chunk
for filename in os.listdir(transcripts_folder):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(transcripts_folder, filename)
    with open(file_path, 'r') as file:
        text = file.read()

        # Cut the transcript into consecutive slices of max_chunk_size characters
        chunk_starts = range(0, len(text), max_chunk_size)
        chunks = [text[start:start + max_chunk_size] for start in chunk_starts]

        for chunk in chunks:
            # The instructions come first, then the transcript slice
            prompt = pre_prompt + chunk
            response_text = query_chat_gpt(prompt)
            all_responses.append(response_text)


# Output the accumulated results
print("Accumulated Guiding Questions and Answers:")
for qa in all_guiding_questions:
    print("Q{}:\nA{}".format(qa['Q'], qa['A']))

print("\nAccumulated Key Concepts:")
for concept in all_key_concepts:
    print("{}. {} - {}".format(concept['id'], concept['term'], concept['definition']))


Accumulated Guiding Questions and Answers:

Accumulated Key Concepts:


## Parse the guiding questions and answers

In [None]:
import re

def parse_key_concepts(text):
    """Extract ``N. Term - Definition`` entries from ``text``.

    Each numbered item runs until the next ``"\\nN. "`` marker or the end of
    the text. Within an item the term and definition are split at the first
    dash that has whitespace on both sides, so hyphenated terms such as
    "Bag-of-words" stay intact. If no such dash exists, the first dash of any
    kind is used. Items without a dash are skipped.

    :param text: Text of the key-concepts section.
    :return: List of dicts with string fields ``id``, ``term``, ``definition``.
    """
    item_pattern = re.compile(r"(\d+)\.\s*(.+?)(?=\n\d+\. |$)", re.DOTALL)
    key_concepts = []
    for concept_id, body in item_pattern.findall(text):
        # Prefer " - " as the separator; a bare "-" is usually part of the term
        parts = re.split(r"\s+-\s+", body, maxsplit=1)
        if len(parts) != 2:
            parts = re.split(r"\s*-\s*", body, maxsplit=1)
        if len(parts) != 2:
            continue
        term, definition = parts
        key_concepts.append({"id": concept_id, "term": term.strip(), "definition": definition.strip()})
    return key_concepts
def parse_questions_answers(text):
    """Extract question/answer pairs from ``text``.

    Accepts labels with or without a number ("Q1:"/"A1:" or "Q:"/"A:"). An
    answer ends where the next question label starts, whether that label
    begins its line directly or sits behind list numbering and indentation
    (for example "2. Q2:"), or at the end of the text.

    :param text: Text of the guiding-questions section.
    :return: List of ``{"Q": question, "A": answer}`` dicts, whitespace-stripped.
    """
    pattern = re.compile(
        # The lookahead tolerates an optional "N. " list prefix before the
        # next "Q" label so numbered items do not leak into the previous answer.
        r"Q(\d+)?:\s*(.*?)\s*A\1?:\s*(.+?)(?=\n\s*(?:\d+\.\s*)?Q\d*:|\Z)",
        re.DOTALL,
    )
    return [
        {"Q": match.group(2).strip(), "A": match.group(3).strip()}
        for match in pattern.finditer(text)
    ]

# 'all_responses' is expected to be a list of strings, one per model response.
all_guiding_questions = []
all_key_concepts = []

for response_text in all_responses:
    # Split the response into its two sections. The guiding-questions section
    # runs from its heading to the "Key Concepts:" heading, or to the end of
    # the response when that heading is missing. Stopping at the first blank
    # line would cut the section short whenever the pairs are separated by
    # blank lines.
    guiding_questions_section = re.search(r"Guiding Questions:(.*?)(?=Key Concepts:|\Z)", response_text, re.DOTALL)
    key_concepts_section = re.search(r"Key Concepts:(.*)", response_text, re.DOTALL)

    # Extract the questions and answers (nothing when the heading is absent)
    questions_answers = parse_questions_answers(guiding_questions_section.group(1)) if guiding_questions_section else []
    # Extract the key concepts (nothing when the heading is absent)
    key_concepts = parse_key_concepts(key_concepts_section.group(1)) if key_concepts_section else []

    # Accumulate the questions and concepts across all responses
    all_guiding_questions.extend(questions_answers)
    all_key_concepts.extend(key_concepts)

# Output the accumulated results
print("Accumulated Guiding Questions and Answers:")
for qa in all_guiding_questions:
    print(f"Q) {qa['Q']}:\nA) {qa['A']}")

print("\nAccumulated Key Concepts:")
for concept in all_key_concepts:
    print(f"{concept['id']}. {concept['term']} - {concept['definition']}")


Accumulated Guiding Questions and Answers:
Q) What is the main focus of text mining?:
A) The main focus of text mining is to turn text data into actionable knowledge.
Q) What is the Expected Overlap of Words in Context method?:
A) It is a method that represents each context with a word vector based on the probability of a word in the context and measures similarity using the product of the probabilities.
Q) What is the difference between text mining and text analytics?:
A) Text mining and text analytics are used interchangeably, but there is a subtle difference in emphasis. Text mining focuses on the process, while text analytics emphasizes the result or the problem at hand.
2. Q2: What is the main goal of text mining and text analytics?
   A2: The main goal is to turn text data into high-quality information or actionable knowledge.
3. Q3: What is the difference between high-quality information and actionable knowledge?
   A3: High-quality information refers to concise information that

## For each question in the Overview Guiding Questions, have GPT generate answers using the top 3 documents as context

In [None]:
import openai

# Single-turn helper around the chat completions endpoint
def query_gpt4(prompt):
    """Send ``prompt`` as one user message to ``gpt-4-1106-preview``.

    A client is created from ``OPENAI_API_KEY`` on every call. Returns the
    text content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    client = openai.OpenAI(api_key=OPENAI_API_KEY)
    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[user_message],
        max_tokens=4096,  # upper bound on the length of the generated reply
    )
    reply = completion.choices[0].message
    return reply.content

# Function to construct prompt with context and query GPT-4
def ask_gpt_with_context(question, context_documents, document_names, max_length=128000):
    """Ask the model ``question`` with the given documents as context.

    The prompt starts with the question, followed by each document preceded
    by its name.

    :param question: Question to answer.
    :param context_documents: Document texts, in the same order as the names.
    :param document_names: Name shown above each document in the prompt.
    :param max_length: Maximum prompt length in characters (not tokens).
    :return: Whatever ``query_gpt4`` returns for the assembled prompt.
    """
    header = f"Question: {question}\n\nContext:\n"
    context = "".join(
        f"Document: {name}\n{doc}\n\n"
        for doc, name in zip(context_documents, document_names)
    )

    # Truncate from the end so the question at the start of the prompt is
    # always kept; only the tail of the context is dropped.
    prompt = (header + context)[:max_length]

    # Query GPT-4 and return the response
    return query_gpt4(prompt)

# One result dict per guiding question, in question order
outputs = []

for query in guiding_questions:
    top_documents = find_top_documents(query)

    # Collect the matched names and look up the text stored for each name
    top_doc_names = []
    top_doc_contents = []
    for doc_name, _score in top_documents:
        top_doc_names.append(doc_name)
        top_doc_contents.append(documents[document_names.index(doc_name)])

    answer = ask_gpt_with_context(query, top_doc_contents, top_doc_names)

    # Record the result
    output = dict(question=query, documents_used=top_doc_names, answer=answer)
    outputs.append(output)

    # Show the result as soon as it is available
    print(f"Question: {query}")
    print(f"Documents used: {', '.join(top_doc_names)}")
    print("Answer:", answer)
    print("-------------------------------------------------------------------------------------------------------------\n\n")


Question: What does a computer have to do in order to understand a natural language sentence?
Documents used: 7_3_Natural_Language_Content_Analysis_Part_1.txt, 7_5_Text_Representation_Part_1.txt, 7_7_Word_Association_Mining_And_Analysis.txt
Answer: A computer must perform several steps to understand a natural language sentence. These steps include:

1. **Word Segmentation**: Identifying individual words in a sentence, which can usually be done by looking for spaces in languages like English.

2. **Lexical Analysis**: Determining the syntactical categories (parts of speech) of each word, such as nouns, verbs, adjectives, etc. This step is also known as part-of-speech tagging.

3. **Syntactical Parsing**: Analyzing the structure of the sentence to determine how words and phrases are related. This step involves identifying noun phrases, verb phrases, prepositional phrases, etc., and how they connect to convey meaning. The result is a parse tree that shows the sentence's syntactical struct

## Create a multiple choice quiz based on the above output questions and answers

In [None]:
# Step 1: One "Topic / Key Points" summary per answered question
summaries = [
    f"Topic: {output['question']}\nKey Points: {output['answer']}\n"
    for output in outputs
]

# Step 2: Wrap the summaries in the quiz instructions
quiz_prompt = "".join([
    "Based on the following summaries, create a multiple-choice quiz to test knowledge on these topics:\n\n",
    "\n".join(summaries),
    "\n\nCreate a multiple-choice quiz:",
])

# Step 3: Ask the model for the quiz
quiz_questions = query_gpt4(quiz_prompt)

# Show the generated quiz
print(quiz_questions)


**Quiz: Understanding Natural Language Processing and Related Concepts**

1. What is the first step a computer must perform to understand a natural language sentence?
   A. Semantic Analysis
   B. Syntactical Parsing
   C. Word Segmentation
   D. Lexical Analysis

2. What is lexical ambiguity in language processing?
   A. When a pronoun refers to more than one possible antecedent.
   B. When a word has multiple syntactic parses.
   C. When a word has multiple meanings.
   D. When a sentence carries an unstated presupposition.

3. Which of the following is NOT a challenge for Natural Language Processing (NLP)?
   A. The variety of expressions in human language.
   B. The simplicity of grammatical structures.
   C. Disambiguating pronouns and presuppositions.
   D. Understanding the pragmatic aspects of sentences.

4. What is the main idea behind the bag-of-words representation in text mining?
   A. To preserve the grammatical structure of sentences.
   B. To capture the exact order of w

# EXPERIMENT

In [None]:
import os
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources used by the lemmatizer and the stop-word filter
for nltk_package in ('wordnet', 'stopwords'):
    nltk.download(nltk_package)

# Normalise text before vectorisation
def preprocess_and_lemmatize(text):
    """Lowercase ``text``, drop English stop words and lemmatize the rest.

    Runs of non-word characters are replaced by a single space before the
    text is split into tokens. Returns the lemmas joined by single spaces.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    normalized = re.sub(r'\W+', ' ', text.lower())
    lemmas = []
    for token in normalized.split():
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

# Preprocess and store documents
preprocessed_documents = [preprocess_and_lemmatize(doc) for doc in documents]

# NOTE: this rebinds the module-level vectorizer and tfidf_matrix that the
# earlier find_top_documents reads.
# Unigrams and bigrams; terms must occur in at least 2 documents (min_df) and
# in at most half of them (max_df).
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.5, min_df=2)

# Convert the documents to a matrix of TF-IDF features
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)

# Optional: Apply dimensionality reduction.
# The number of components may not exceed the number of features, so cap it
# for small vocabularies; a fixed seed keeps the randomized SVD (and the
# scores derived from it) reproducible between runs.
n_components = min(100, tfidf_matrix.shape[1])
svd = TruncatedSVD(n_components=n_components, random_state=42)
tfidf_matrix_reduced = svd.fit_transform(tfidf_matrix)

# Rank transcripts against a query in the reduced (SVD) space
def find_top_documents(query, top_n=3):
    """Return ``(document name, cosine similarity)`` pairs for the best matches.

    The query goes through the same preprocessing, TF-IDF vectorizer and SVD
    projection as the documents; pairs are ordered from highest to lowest
    similarity.
    """
    cleaned_query = preprocess_and_lemmatize(query)
    query_vector = svd.transform(vectorizer.transform([cleaned_query]))
    scores = cosine_similarity(query_vector, tfidf_matrix_reduced).flatten()
    # Tail of the ascending order, flipped so the best match comes first
    best_first = np.argsort(scores)[-top_n:][::-1]
    ranked = []
    for idx in best_first:
        ranked.append((document_names[idx], scores[idx]))
    return ranked

# Show the ranking produced by the experimental pipeline for every query
line_template = "Matching document: {} with score {:.4f}"
for query in queries:
    top_documents = find_top_documents(query)
    print("Query: {}".format(query))
    for doc, score in top_documents:
        print(line_template.format(doc, score))
    print("\n")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Query: What does a computer have to do in order to understand a natural language sentence?
Matching document: 7_4_Natural_Language_Content_Analysis_Part_2.txt with score 0.9841
Matching document: 7_3_Natural_Language_Content_Analysis_Part_1.txt with score 0.9804
Matching document: 7_5_Text_Representation_Part_1.txt with score 0.3874


Query: What is ambiguity?
Matching document: 7_6_Text_Representation_Part_2.txt with score 0.0000
Matching document: 7_5_Text_Representation_Part_1.txt with score 0.0000
Matching document: 7_3_Natural_Language_Content_Analysis_Part_1.txt with score 0.0000


Query: Why is natural language processing (NLP) difficult for computers?
Matching document: 7_4_Natural_Language_Content_Analysis_Part_2.txt with score 0.9845
Matching document: 7_3_Natural_Language_Content_Analysis_Part_1.txt with score 0.9725
Matching document: 7_5_Text_Representation_Part_1.txt with score 0.3748


Query: What is bag-of-words representation?
Matching document: 7_7_Word_Association_Mi

In [None]:
# Keep an independent snapshot: a plain assignment would only alias the same
# list, so later in-place changes to `outputs` would also alter the backup.
import copy

backup_outputs = copy.deepcopy(outputs)

In [None]:
backup_outputs

## This one is faster because it runs the requests in parallel, but beware: the API cost adds up quickly

In [None]:
%%time
import concurrent.futures
import openai

def query_gpt4(prompt):
    """Return the model's reply to ``prompt`` sent as a single user message.

    Uses the ``gpt-4-1106-preview`` chat model through a client created from
    ``OPENAI_API_KEY`` on each call.
    """
    messages = [{"role": "user", "content": prompt}]
    client = openai.OpenAI(api_key=OPENAI_API_KEY)
    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=messages,
        max_tokens=4096,
    )
    # Only the first returned choice is used
    return completion.choices[0].message.content

def ask_gpt_with_context(question, context_documents, document_names, max_length=128000):
    """Ask the model ``question`` with the given documents as context.

    :param question: Question to answer; placed at the start of the prompt.
    :param context_documents: Document texts, in the same order as the names.
    :param document_names: Name shown above each document in the prompt.
    :param max_length: Maximum prompt length in characters (not tokens).
    :return: Whatever ``query_gpt4`` returns for the assembled prompt.
    """
    parts = [f"Question: {question}\n\nContext:\n"]
    for doc, name in zip(context_documents, document_names):
        parts.append(f"Document: {name}\n{doc}\n\n")
    prompt = "".join(parts)

    # Cut the tail, not the head: the question comes first and must be kept,
    # so only trailing context is dropped when the prompt is too long.
    if len(prompt) > max_length:
        prompt = prompt[:max_length]

    return query_gpt4(prompt)

def process_question(query):
    """Answer one question from its best-matching transcripts.

    Returns a dict with the keys ``question``, ``documents_used`` (names of
    the retrieved transcripts) and ``answer``.
    """
    top_doc_names = []
    top_doc_contents = []
    for doc_name, _score in find_top_documents(query):
        top_doc_names.append(doc_name)
        # Names and texts are stored in parallel lists
        top_doc_contents.append(documents[document_names.index(doc_name)])

    answer = ask_gpt_with_context(query, top_doc_contents, top_doc_names)
    return dict(question=query, documents_used=top_doc_names, answer=answer)

# Fan the questions out over a thread pool; results arrive in completion order
outputs2 = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Remember which question each future belongs to
    future_to_query = {}
    for query in guiding_questions:
        future_to_query[executor.submit(process_question, query)] = query

    # Handle each result as soon as its future finishes
    for future in concurrent.futures.as_completed(future_to_query):
        query = future_to_query[future]
        try:
            output = future.result()
            outputs2.append(output)
            used = ', '.join(output['documents_used'])
            print(f"Question: {query}")
            print(f"Documents used: {used}")
            print("Answer:", output['answer'])
            print("-------------------------------------------------------------------------------------------------------------\n\n")
        except Exception as exc:
            # A failed question is reported and the remaining ones continue
            print(f"{query} generated an exception: {exc}")



Question: What is a paradigmatic relation?
Documents used: 7_8_Paradigmatic_Relation_Discovery_Part_1.txt, 7_7_Word_Association_Mining_And_Analysis.txt, 7_9_Paradigmatic_Relation_Discovery_Part_2.txt
Answer: A paradigmatic relation is a type of word association where two words are considered to be paradigmatically related if they can be substituted for each other without significantly altering the meaning of the sentence. This relates to them being in the same semantic or syntactic class.

In summary:
- **Paradigmatic Relation**: Two words A and B have a paradigmatic relation if they can be substituted for each other in a sentence, usually reflecting that they belong to the same semantic or syntactic class. This substitutability indicates that they occur in similar contexts within the text. For example, "cat" and "dog" can both be used in the sentence "The ___ sleeps on the couch," indicating a paradigmatic relation between the words as they are both household animals that can perform 

### Faster and cheaper with GPT-3.5 Turbo

In [None]:
%%time
import concurrent.futures
import openai

def query_gpt4(prompt):
    """Return the model's reply to ``prompt`` sent as a single user message.

    Despite the function name, this version sends the request to the
    ``gpt-3.5-turbo-1106`` chat model.
    """
    request = dict(
        model="gpt-3.5-turbo-1106",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=4096,
    )
    client = openai.OpenAI(api_key=OPENAI_API_KEY)
    choices = client.chat.completions.create(**request).choices
    return choices[0].message.content

def ask_gpt_with_context(question, context_documents, document_names, max_length=16000):
    """Ask the model ``question`` with the given documents as context.

    :param question: Question to answer; placed at the start of the prompt.
    :param context_documents: Document texts, in the same order as the names.
    :param document_names: Name shown above each document in the prompt.
    :param max_length: Maximum prompt length in characters (not tokens).
    :return: Whatever ``query_gpt4`` returns for the assembled prompt.
    """
    prompt = f"Question: {question}\n\nContext:\n"
    prompt += "".join(
        f"Document: {name}\n{doc}\n\n"
        for doc, name in zip(context_documents, document_names)
    )

    # Keep the beginning of the prompt. Slicing from the end would discard
    # the question itself whenever the context is longer than the limit.
    prompt = prompt[:max_length]

    return query_gpt4(prompt)

def process_question(query):
    """Retrieve context for ``query``, ask the model and package the result.

    Returns a dict with the keys ``question``, ``documents_used`` and
    ``answer``.
    """
    ranked = find_top_documents(query)
    top_doc_names = [pair[0] for pair in ranked]

    # Look up each transcript text through its position in document_names
    positions = [document_names.index(name) for name in top_doc_names]
    top_doc_contents = [documents[position] for position in positions]

    result = {"question": query, "documents_used": top_doc_names}
    result["answer"] = ask_gpt_with_context(query, top_doc_contents, top_doc_names)
    return result

# Run all questions concurrently; results are collected as they complete
outputs3 = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map every submitted future back to its question
    future_to_query = {
        executor.submit(process_question, query): query
        for query in guiding_questions
    }

    for future in concurrent.futures.as_completed(future_to_query):
        query = future_to_query[future]
        try:
            output = future.result()
            outputs3.append(output)
            # Assemble the report first, then emit it with a single print
            report = [
                f"Question: {query}",
                f"Documents used: {', '.join(output['documents_used'])}",
                f"Answer: {output['answer']}",
                "-------------------------------------------------------------------------------------------------------------\n\n",
            ]
            print("\n".join(report))
        except Exception as exc:
            # Report the failure and carry on with the other questions
            print(f"{query} generated an exception: {exc}")



Question: What is ambiguity?
Documents used: 7_6_Text_Representation_Part_2.txt, 7_5_Text_Representation_Part_1.txt, 7_3_Natural_Language_Content_Analysis_Part_1.txt
Answer: Ambiguity is the presence of multiple possible meanings or interpretations within a given context. In the context of natural language processing and text mining, ambiguity refers to the difficulty in accurately interpreting and understanding language due to the multiple possible meanings of words, sentences, and phrases. The presence of ambiguity makes it challenging for computers to process and analyze text data accurately.
-------------------------------------------------------------------------------------------------------------


Question: What is bag-of-words representation?
Documents used: 7_7_Word_Association_Mining_And_Analysis.txt, 7_8_Paradigmatic_Relation_Discovery_Part_1.txt, 7_9_Paradigmatic_Relation_Discovery_Part_2.txt
Answer: The bag-of-words representation is a method of representing text data whe

## Possible next step: fine-tuning (see [OpenAI - Fine Tuning Models](https://platform.openai.com/docs/guides/fine-tuning))

In [None]:
import os
import json
import openai
import re
from pdfplumber import open as open_pdf

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF, joined by newlines.

    Pages for which no text is extracted are skipped.

    :param pdf_path: Path of the PDF file to read.
    :return: Extracted text as one string.
    """
    page_texts = []
    with open_pdf(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract once per page and reuse the result for both the
            # emptiness check and the output.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    return "\n".join(page_texts)

def process_lecture_transcripts(folder_path):
    """Read every ``.txt`` file in ``folder_path`` and return their contents.

    Files are read in sorted filename order so the result is the same on
    every run; ``os.listdir`` alone returns entries in arbitrary order.

    :param folder_path: Directory containing the transcript files.
    :return: List with the text of each transcript.
    """
    lectures = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        with open(os.path.join(folder_path, filename), 'r') as file:
            lectures.append(file.read())
    return lectures

def split_textbook_into_sections(textbook_content, section_pattern):
    """Split ``textbook_content`` at every match of ``section_pattern``.

    Returns the pieces with surrounding whitespace removed; pieces that are
    empty after stripping are dropped.
    """
    kept = []
    for piece in re.split(section_pattern, textbook_content):
        trimmed = piece.strip()
        if trimmed:
            kept.append(trimmed)
    return kept

def create_jsonl_data_lectures(lectures):
    """
    Build prompt/completion records from lecture transcripts.

    :param lectures: List of lecture transcripts.
    :return: List of dicts with the keys "prompt" and "completion", one per lecture.
    """
    def to_record(lecture):
        # NOTE(review): split_lecture_into_qa is not defined in the visible
        # source; it must be supplied and return a (question, answer) pair.
        question, answer = split_lecture_into_qa(lecture)
        return {"prompt": question, "completion": answer}

    return [to_record(lecture) for lecture in lectures]

def create_jsonl_data_textbook(textbook_sections):
    """
    Build prompt/completion records from textbook sections.

    :param textbook_sections: List of sections from the textbook.
    :return: List of dicts with the keys "prompt" and "completion", one per section.
    """
    records = []
    for section in textbook_sections:
        # NOTE(review): split_section_into_topic_content is not defined in the
        # visible source; it must be supplied and return a (topic, content) pair.
        topic, content = split_section_into_topic_content(section)
        records.append(dict(prompt=topic, completion=content))
    return records


# Extract text from textbook PDF
textbook_content = extract_text_from_pdf('/content/textbook.pdf')

# Process lecture transcripts
lectures = process_lecture_transcripts('/content/lecture_transcripts/')

# Split textbook into sections
section_pattern = r"\n(?:Chapter|Section) \d+:"  # Adjust the pattern based on your textbook
textbook_sections = split_textbook_into_sections(textbook_content, section_pattern)

# Create training records from lectures and textbook sections
jsonl_data_lectures = create_jsonl_data_lectures(lectures)
jsonl_data_textbook = create_jsonl_data_textbook(textbook_sections)

# Combine both datasets
jsonl_data = jsonl_data_lectures + jsonl_data_textbook

# Save the data as a JSONL file. gpt-3.5-turbo is a chat model, so each
# prompt/completion record is written as a user/assistant message pair.
with open('training_data.jsonl', 'w') as f:
    for item in jsonl_data:
        chat_example = {
            "messages": [
                {"role": "user", "content": item["prompt"]},
                {"role": "assistant", "content": item["completion"]},
            ]
        }
        f.write(json.dumps(chat_example) + "\n")

# Use the same client-based API as the rest of the notebook
client = openai.OpenAI(api_key=OPENAI_API_KEY)

# Upload the data; the context manager closes the file handle afterwards
with open("training_data.jsonl", "rb") as training_file:
    uploaded_file = client.files.create(file=training_file, purpose="fine-tune")

# Create a fine-tuning job on the file that was just uploaded
response = client.fine_tuning.jobs.create(
    training_file=uploaded_file.id,
    model="gpt-3.5-turbo"
)

# NOTE: the model name is only filled in once the job has finished; directly
# after creation it is expected to be None. Retrieve the job again later to
# read the final name.
fine_tuned_model_name = response.fine_tuned_model
print("Fine-tuned model name:", fine_tuned_model_name)

# Check response for details about the fine-tuning job
print(response)


In [None]:
# Query the fine-tuned model through the chat completions endpoint, using the
# same client-based API as the rest of the notebook.
client = openai.OpenAI(api_key=OPENAI_API_KEY)
response = client.chat.completions.create(
  model=fine_tuned_model_name,  # Use the name of your fine-tuned model
  messages=[{"role": "user", "content": "Your prompt here"}],
  max_tokens=50
)
print(response.choices[0].message.content)
