## Setting Up the Environment

In [None]:
# %pip install PyPDF2
# %pip install pandas
# (no install needed for `re`: it is part of the Python standard library)
# %pip install openai==0.28
# %pip install openpyxl
# %pip install pdf2image pytesseract
%pip install PyMuPDF

In [None]:

import openai
import os
import glob
from PyPDF2 import PdfReader
import pandas as pd
import re
from pdf2image import convert_from_path
import pytesseract
import fitz  # PyMuPDF

In [None]:
# Take the key from the OPENAI_API_KEY environment variable so the secret is
# never stored in the notebook; falls back to an empty string when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

## Extract Text from the PDF Files

In [None]:
def is_text_extracted_meaningful(text, min_chars=100):
    """Return True when *text* is long enough to be treated as real content.

    Args:
        text: Extracted text; ``None`` or an empty string counts as not
            meaningful.
        min_chars: The text, with leading and trailing whitespace removed,
            must be strictly longer than this many characters.

    Returns:
        bool: Whether the stripped text exceeds ``min_chars`` characters.
    """
    if not text:
        return False
    return len(text.strip()) > min_chars

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF, falling back to OCR when little text is found.

    The embedded text layer is read with PyMuPDF first. If
    ``is_text_extracted_meaningful`` rejects that result, every page is
    rendered with ``convert_from_path`` and passed to
    ``pytesseract.image_to_string`` instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text with newlines and tabs replaced by spaces, or
        ``None`` when any step raised an exception (the error is printed).
    """
    try:
        # The context manager closes the document even if a page fails to
        # load, so the file handle is not leaked.
        with fitz.open(pdf_path) as document:
            text = "".join(page.get_text("text") or "" for page in document)

        # Fallback to OCR if text extraction is not meaningful
        if not is_text_extracted_meaningful(text):
            text = "".join(
                pytesseract.image_to_string(image)
                for image in convert_from_path(pdf_path)
            )

        return text.replace("\n", " ").replace("\t", " ")
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller decide.
        print(f"Error occurred: {e}")
        return None


## ChatGPT Extraction

### 1. Personality Analysis

In [None]:
def gpt_extract_personality(statement):
    """Ask the chat model for a personality analysis of *statement*.

    The essay is sent as the user message of an ``openai.ChatCompletion``
    request to the ``gpt-4o`` model, together with a fixed system prompt that
    asks for 10 personality keywords with explanations plus the
    'Order Explanation' and 'Process of Thinking' sections.

    Args:
        statement: The essay text to analyse.

    Returns:
        The message content of the first choice in the response.
    """
    system_prompt = "Find out 10 keywords that describe this person personality based on their essay and generate the explanation of each personality based on the content put it with the header 'Personality and Explanation', if the content is in traditional chinese then the output should be in traditional chinese, if the content is in english then the output should be in english. please explain why do you output the keyword in that order with the header 'Order Explanation', and also explain straightforwardly your process of thinking, put it in the header 'Process of Thinking' "
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": statement},
    ]

    # Change the model name here if a different model should be used.
    completion = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=conversation,
    )

    # Only the first choice of the response is used.
    first_choice = completion['choices'][0]
    return first_choice['message']['content']



## Parsing the Output

In [None]:
def parse_analyzed_text(analyzed_text):
    """Parse the model's markdown reply into ``[keyword, explanation]`` pairs.

    Two keyword layouts are recognised at the start of a line:
    ``1. **Keyword** explanation`` and ``**1. Keyword:** explanation``.
    Non-empty lines that follow a keyword line are joined onto its
    explanation with single spaces. A keyword with no explanation text is
    dropped.

    Separator punctuation between keyword and explanation (``:``, ``：`` or a
    dash) is removed from both. Lines under an 'Order Explanation' or
    'Process of Thinking' header are ignored so that they are neither
    appended to the last explanation nor parsed as extra keywords; a
    'Personality and Explanation' header switches parsing back on.

    Args:
        analyzed_text: The raw reply text.

    Returns:
        list[list[str]]: One ``[keyword, explanation]`` entry per keyword, in
        order of appearance.
    """
    keyword_patterns = (
        re.compile(r'\d+\.\s*\*\*(.*?)\*\*'),      # 1. **Keyword** ...
        re.compile(r'\*\*\d+\.\s*(.*?):\*\*'),     # **1. Keyword:** ...
    )
    keyword_section = 'personality and explanation'
    other_sections = {'order explanation', 'process of thinking'}
    # Markdown decoration/numbering that may surround a section header.
    header_noise = '#*:：.0123456789 \t'
    # Punctuation commonly placed between the keyword and its explanation.
    separators = ':：-–— \t'

    parsed_data = []

    def store(keyword, parts):
        # Keep an entry only when both the keyword and explanation are present.
        explanation = ' '.join(parts).strip()
        if keyword and explanation:
            parsed_data.append([keyword, explanation])

    current_keyword = ""
    explanation_parts = []
    # Parsing starts enabled because the reply may omit the first header.
    in_keyword_section = True

    for line in analyzed_text.split('\n'):
        stripped = line.strip()

        heading = stripped.strip(header_noise).lower()
        if heading == keyword_section or heading in other_sections:
            store(current_keyword, explanation_parts)
            current_keyword, explanation_parts = "", []
            in_keyword_section = heading == keyword_section
            continue
        if not in_keyword_section:
            continue

        match = next(
            (m for m in (p.match(line) for p in keyword_patterns) if m),
            None,
        )
        if match:
            store(current_keyword, explanation_parts)
            current_keyword = match.group(1).strip().rstrip(':：').strip()
            # The explanation may start on the keyword line itself.
            rest = line[match.end():].lstrip(separators).strip()
            explanation_parts = [rest] if rest else []
        elif current_keyword and stripped:
            explanation_parts.append(stripped)

    # Add the last keyword and explanation if present
    store(current_keyword, explanation_parts)

    return parsed_data



### Save Output to Excel

In [None]:
def save_to_excel(parsed_data, ID, output_file):
    """Append one student's parsed results to an Excel workbook.

    Args:
        parsed_data: Rows of ``[personality, explanation]``.
        ID: Value written to the 'File Name' column of every new row.
        output_file: Path of the workbook. If it already exists, its rows are
            read back and kept ahead of the new ones; the file is then
            rewritten in full.
    """
    new_rows = pd.DataFrame(parsed_data, columns=['Personality', 'Explanation'])
    # 'File Name' goes first so each row shows which student it belongs to.
    new_rows.insert(0, 'File Name', ID)

    if os.path.exists(output_file):
        previous_rows = pd.read_excel(output_file)
        combined = pd.concat([previous_rows, new_rows], ignore_index=True)
    else:
        combined = new_rows

    combined.to_excel(output_file, index=False)
    print(f"Results have been saved to {output_file}")

## Main Code

In [None]:
# Directory traversal and processing.
# For every student folder under each sub-directory: read all PDFs, join their
# text, send it to the model, parse the reply and append the rows to the
# output workbook.
base_dir = '2021-ET'
sub_dirs = ['et-rejected']
output_file = '2021et_rejected_GPTanalysis.xlsx'

for sub_dir in sub_dirs:
    sub_dir_path = os.path.join(base_dir, sub_dir)
    print(f"Processing subdirectory: {sub_dir_path}")

    student_dirs = sorted(glob.glob(os.path.join(sub_dir_path, '2021-ET-*')))

    if not student_dirs:
        print(f"No student directories found in {sub_dir_path}\n")
        continue

    for student_dir in student_dirs:
        print(f"Processing student directory: {student_dir}\n")

        pdf_files = sorted(glob.glob(os.path.join(student_dir, '*.pdf')))
        if not pdf_files:
            print(f"No PDF files found in {student_dir}")
            continue

        extracted_texts = []
        for pdf_file in pdf_files:
            print(f"Reading PDF file: {pdf_file}")
            pdf_text = extract_text_from_pdf(pdf_file)
            # extract_text_from_pdf returns None on failure; skip that file
            # instead of crashing the whole batch on `None + " "`.
            if pdf_text is None:
                print(f"Skipping unreadable PDF file: {pdf_file}")
                continue
            extracted_texts.append(pdf_text)

        # Each document's text is followed by a single space, as before.
        concatenated_text = "".join(text + " " for text in extracted_texts)

        # Do not spend an API call on a student with no extracted text.
        if not concatenated_text.strip():
            print(f"No text could be extracted for {student_dir}; skipping analysis")
            continue

        print(f"Concatenated text for {student_dir}:\n{concatenated_text[:1000]}...")
        print(f"Total length of concatenated text: {len(concatenated_text)}")

        analyzed_text = gpt_extract_personality(concatenated_text)
        print(f"Analyzed text for {student_dir}:\n{analyzed_text}\n")
        parsed_data = parse_analyzed_text(analyzed_text)
        print(f"Parsed data for {student_dir}:\n{parsed_data}\n")

        # The folder name identifies the student in the 'File Name' column.
        student_dir_name = os.path.basename(student_dir)
        save_to_excel(parsed_data, student_dir_name, output_file)

print(f"Results have been saved to {output_file}")
   
