In [None]:
%pip install PyMuPDF 
%pip install pandas 

In [None]:
import pymupdf
import json
import csv
import pandas as pd
import glob
import os

In [None]:
import google.generativeai as genai

In [None]:
import os
from dotenv import load_dotenv

# Load the .env file
load_dotenv()

# Read the variables
api_key = os.getenv("GOOGLE_API_KEY")



In [None]:
# Hand the API key read from the environment to the google.generativeai client.
genai.configure(api_key=api_key)
# Generation settings; Agent passes this dict as generation_config to its model.
generate_config = {
    "temperature": 0.5,
    "top_p":0.95,
    "top_k":64,
    "max_output_tokens": 8192,
    "response_mime_type":"application/json"  # the main loop json.loads() the reply text
}

In [None]:
class Agent:
    """A named generative model bound to a fixed system instruction (its role)."""

    def __init__(self, name, role, model_name='gemini-1.5-flash'):
        """Create the agent and its underlying model.

        Args:
            name: Human-readable label for the agent.
            role: System instruction text given to the model.
            model_name: Identifier of the model to instantiate; defaults to
                the model this notebook has always used.
        """
        self.name = name
        self.role = role
        # generate_config is the module-level generation settings dict.
        self.model = genai.GenerativeModel(model_name,
                                           generation_config=generate_config,
                                           system_instruction=role)

    def generate_response(self, prompt):
        """Send ``prompt`` to the model and return the reply text.

        Args:
            prompt: The user prompt to send.

        Returns:
            The reply text on success, or ``None`` when the request (or
            reading the reply text) fails. The failure is printed, not raised.
        """
        try:
            response = self.model.generate_content(prompt)
            # Read .text inside the try: accessing it can itself raise, and
            # previously that happened outside of any handler.
            text = response.text
        except Exception as e:
            # Broad on purpose: the exception types of the client library are
            # not visible here, and the caller only needs "it failed".
            print(f"Generate response failed: {e}")
            # Returning explicitly avoids the UnboundLocalError the old
            # fall-through `return response.text` produced on failure.
            return None

        print("Response generated")
        print("response : ", text)
        return text
        
            
        
        

In [None]:
# System instruction for the QA agent.
# The JSON keys named here must be exactly "question", "answer" and
# "question_type": saveCSV writes those three columns, and csv.DictWriter
# raises ValueError on any other key (the earlier "questin_type" spelling).
QA_role = """
# Your Role 
Your are a highly intelligent AI tasked with generating a diverse and high-quality set of questions and answers from a provided document page.  
Your goal is imitate how a curious human would engage with the material by asking insightful, challenging, and varied types of questions.

# Instructions:
Given a page text of a document, generate high-quality question-answer pairs in JSON format. 
Each pair must contain the fields question, answer and question_type.

## Ensure that:
### Language Consistency: The questions and ansers must be in the same language as the given text.
for example if the provided text in Turkish the question and answers must be in Turkish as well.
### Question Variety: Include multiple types of questions such as:

Factual: Direct question seeking specific information(e.g., "what does X mean?")
Conceptual: Questions exploring the ideas or principles behind the content (e.g., "Why is X important?")
Contextual: Questions about the broader context or background of the topic (e.g., "in What context is X mentioned?")
Causal: Questions asking about reasons or causes (e.g., "what causes X to happen?)
Procedural: Questions focused on processes or steps (e.g., "how is X achieved?")
Analytical: Questions comparing, contrasting, or evaluating elements (e.g., "how does X compare to Y?")
Hypothetical: Questions based on imagined scenarios (e.g., "what would happen if x?")
Reflective: Questions about implications or consequences (e.g., "what implications does X have?")
Speculativa: Opinion-based or exploratory questions when apprapriate (e.g., "why might somoone disagree withh X?")
Listing: Questions asking for a list of items, steps or elements related to a topic (e.g., "What are the key elements of X?")
Sumarizing: Questions asking for a brief summary or the mein points of a topic (e.g., "what is the main takeaway from X?")
Balanced Difficulty: Include simple, intermediate, and complex questions.


### Answer Precision: Provide concise, accurate answers directly based on the provided content.
However, ensure that the answers are not too short; they should give enough context to explain the answer clearly while maintaining clarity and completeness.

### Context Awareness: Ensure all questions are deeply rooted in the content of the provided text and demonstrate an understanding of its meaning, themes, and nuances.

### Avoid Redundancy: Ensure all questions are distinct and unique.

### Avoid generelly Questions: Generate specific questions that can be answered based on the given text, rather than general questions like "What is the main theme of this text?" or "What is the main idea of this text?"

## Output Format: Return the result as a JSON object structured as follows:

json

[
    {
        "question": "What is the definition of X?",
        "answer": "X is defined as ...",
        "question_type": "Factual"
    },
    {
        "question": "Why is X considered significant?",
        "answer": "X is significant because ...",
        "question_type": "Conceptual"
    }
    
    // Continue until reaching the specified number of questions
]


"""

In [None]:
class QA_agent(Agent):
    """Agent specialised in producing question-answer pairs from a text."""

    def prepare_QA(self, text, number_of_questions):
        """Build the QA request for ``text`` and return the model's reply.

        Args:
            text: The source text the questions must be based on.
            number_of_questions: How many question-answer pairs to request.

        Returns:
            Whatever ``generate_response`` returns for the built prompt.
        """
        print("prepare_QA started")
        # The request repeats the desired count twice, once in the sentence
        # and once in the delimited parameter section.
        qa_request = f"""
        Given the text below, generate {number_of_questions} high-quality question-answer pairs in JSON format. Each pair must contain the fields question, answer, and question_type.
        Ensure generating various types of questions.  
        Identify the text language first, the prepare the question-answer pairs in the same language.  
        ------------------------------
        Text: {text}
        Number of questions and answer pairs: {number_of_questions}
        ------------------------------
        """
        # Logged once the prompt is built, i.e. before the model is called.
        print("prepare_QA fnished")
        model_reply = self.generate_response(qa_request)
        return model_reply

In [None]:
def generate_QA(qa_agent, text, answer_type):
    """Request question-answer pairs for ``text`` from ``qa_agent``.

    Args:
        qa_agent: Object exposing ``prepare_QA(text, number_of_questions)``.
        text: The source text to generate questions from.
        answer_type: Despite its name, this is the number of question-answer
            pairs to request; the name is kept so existing callers still work.

    Returns:
        The value returned by ``qa_agent.prepare_QA``.
    """
    # Use the argument the caller passed instead of silently reading a
    # module-level `number_of_questions`, which raised NameError whenever
    # that global had not been defined yet.
    return qa_agent.prepare_QA(text, answer_type)

In [None]:
def saveCSV(filename, rows=None):
    """Write question-answer rows to ``<filename>.csv``.

    Args:
        filename: Target path without the ``.csv`` extension.
        rows: Iterable of dicts with the keys ``question``, ``answer`` and
            ``question_type``. When omitted, the module-level ``qa_list`` is
            written, which is what existing one-argument calls rely on.

    Raises:
        ValueError: From ``csv.DictWriter`` if a row contains a key other
            than the three column names.
    """
    if rows is None:
        # Backward-compatible default: the list filled by the main loop.
        rows = qa_list

    target = filename + ".csv"
    # Create the destination folder (e.g. "csv/") on first use instead of
    # failing with FileNotFoundError.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(target, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["question", "answer", "question_type"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
        

In [None]:
def convert_pdf_to_text(pdf_file_name):
    """Extract the plain text of every page of a PDF file.

    Args:
        pdf_file_name: Path of the PDF file to read.

    Returns:
        A list of strings in page order, one string per page.

    Raises:
        Any exception raised by ``pymupdf.open`` (for example when the file
        is missing or cannot be parsed). Errors are not swallowed here, so
        the caller sees the real cause.
    """
    # The context manager closes the document even if text extraction fails;
    # previously the handle was never closed.
    with pymupdf.open(pdf_file_name) as doc:
        print("Document has ", doc.page_count, "pages")
        return [page.get_text() for page in doc]

In [None]:
# Upper bound on generation attempts for a single page. Without a bound, a
# page that never yields valid JSON would call the API forever.
MAX_ATTEMPTS_PER_PAGE = 3


def _generate_page_QA(qa_agent, page_text, number_of_questions,
                      max_attempts=MAX_ATTEMPTS_PER_PAGE):
    """Return the QA pairs for one page as a list of dicts ([] on failure)."""
    for attempt in range(1, max_attempts + 1):
        generated_QA = generate_QA(qa_agent, page_text, number_of_questions)
        try:
            parsed = json.loads(generated_QA)
        except (json.JSONDecodeError, TypeError) as e:
            # TypeError covers a reply that is not a string (e.g. None).
            print(f"Error decoding JSON: {e}")
            print(f"Problematic string: {generated_QA}")
        else:
            # Only a list of objects can be written as CSV rows; extending
            # qa_list with a bare dict would add its keys as rows.
            if isinstance(parsed, list) and all(isinstance(item, dict) for item in parsed):
                return parsed
            print(f"Unexpected JSON structure: {generated_QA}")
        if attempt < max_attempts:
            print("Retrying...")
    print(f"Giving up on this page after {max_attempts} attempts")
    return []


if __name__ == "__main__":
    # qa_list and number_of_questions stay module-level names because
    # saveCSV / generate_QA may read them as globals.
    qa_list = []
    qa_agent = QA_agent("QA_agent", QA_role)
    number_of_questions = 2
    pdf_name = "./pdf/yolharitasi.pdf"
    pages = convert_pdf_to_text(pdf_name)
    # File name without folder or extension; the previous split(".")[1]
    # only worked for paths that start with "./".
    csv_name = os.path.splitext(os.path.basename(pdf_name))[0]
    # Make sure the output folder exists before the first batch is saved.
    os.makedirs("csv", exist_ok=True)
    start_page = 0
    end_page = len(pages)
    batch_size = 5

    for i in range(start_page, end_page, batch_size):
        # Process pages in batches; each batch is saved to its own CSV file.
        batch_number = i // batch_size + 1
        batch_pages = pages[i:min(i + batch_size, end_page)]
        print(f"Batch {batch_number} started")
        for page in batch_pages:
            if not page.strip():
                # No extractable text (e.g. a scanned page): nothing to ask about.
                print("empty page skipped...")
                continue
            qa_list.extend(_generate_page_QA(qa_agent, page, number_of_questions))
            print(f"a page in the batch processed...")
        print(f"Batch {batch_number} finished")
        saveCSV(f"csv/{csv_name}_batch {batch_number}")
        print(f"{csv_name}_batch {batch_number} saved...")
        qa_list = []
  

In [50]:
import os
import pandas as pd

def merge_csv_files(input_folder, output_file):
    """Concatenate every CSV file in ``input_folder`` into ``output_file``.

    Files are merged in natural name order (``batch 2`` before ``batch 10``)
    so the result is the same on every run. If ``output_file`` itself lives
    in ``input_folder`` it is left out, so re-running does not duplicate rows.

    Args:
        input_folder: Folder that contains the CSV files to merge.
        output_file: Path of the merged CSV file to write.

    Returns:
        None. Prints a message and writes nothing when no CSV file is found.
    """
    import re  # local import: only needed for the natural sort key

    def natural_key(file_name):
        # Split into text / digit runs so embedded numbers compare as ints.
        return [int(part) if part.isdigit() else part.lower()
                for part in re.split(r"(\d+)", file_name)]

    output_path = os.path.abspath(output_file)
    # os.listdir returns names in arbitrary order, hence the explicit sort.
    all_files = sorted(
        (f for f in os.listdir(input_folder)
         if f.endswith('.csv')
         and os.path.abspath(os.path.join(input_folder, f)) != output_path),
        key=natural_key,
    )

    if not all_files:
        print("No CSV files found in the directory.")
        return

    dataframes = [pd.read_csv(os.path.join(input_folder, file)) for file in all_files]
    merged_df = pd.concat(dataframes, ignore_index=True)

    merged_df.to_csv(output_file, index=False)
    print(f"Merged CSV file saved as {output_file}")

# Usage
directory = "csv"  # folder that contains the CSV files
destination_file = "merged_output.csv"  # name of the output file

merge_csv_files(directory, destination_file)


Merged CSV file saved as merged_output.csv
