## Setting Up the Environment

In [None]:
%pip install PyPDF2
%pip install pandas
%pip install re
%pip install openai==0.28
%pip install openpyxl

In [None]:

import openai
import os
import glob
from PyPDF2 import PdfReader
import pandas as pd
import re

In [None]:
# Read the key from the OPENAI_API_KEY environment variable so the secret does
# not have to be pasted into the notebook; falls back to "" when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

## Extract Text from the PDF File

In [None]:
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one flattened string.

    Page texts are concatenated in page order with no separator, then every
    newline and tab is replaced by a single space.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated, flattened text of all pages.
    """
    with open(pdf_path, "rb") as file:
        reader = PdfReader(file)
        # Join once instead of growing a string page by page. `or ""` is
        # defensive: a falsy extraction result is treated as an empty page
        # rather than breaking the concatenation.
        text = "".join(page.extract_text() or "" for page in reader.pages)
    return text.replace("\n", " ").replace("\t", " ")

# Manual check of the extractor on a sample file; as the last expression of
# the cell, the returned text becomes the cell output.
extract_text_from_pdf("sample_input.pdf")

## ChatGPT Extraction

### 1. Personality Analysis

In [None]:
def gpt_extract_personality(statement):
    """Ask the chat model for personality keywords and their explanations.

    Args:
        statement: Essay text, sent to the model as the user message.

    Returns:
        The message content of the first choice in the API response.
    """
    # Instruction passed to the model as the system message.
    system_prompt = "Find out 10 keywords that describe this person personality based on their essay and generate the explanation of each personality based on the content, if the content is in chinese then the output should be in chinese, if the content is in english then the output should be in english."

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": statement},
    ]

    completion = openai.ChatCompletion.create(
        model="gpt-4o",  # Use the appropriate model name
        messages=conversation,
    )

    # Only the first returned choice is used.
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

## Parsing the Output

In [None]:
# Heading of one numbered item: "<number>." followed by a bold keyword and a
# colon. The colon may be ASCII or full-width (\uff1a, common in Chinese text)
# and may sit just outside ("**kw**:") or just inside ("**kw:**") the bold.
_KEYWORD_HEADING = re.compile(
    r'\d+\.\s*\*\*(.*?)(?:\*\*\s*[:\uff1a]|[:\uff1a]\s*\*\*)'
)


def parse_analyzed_text(analyzed_text):
    """Split a numbered, bold-keyword list into [keyword, explanation] pairs.

    A line starting with a heading such as ``1. **Keyword**: text`` opens a
    new item. Every following line that is not a heading is appended to the
    current item's explanation, separated by a space. Lines before the first
    heading are ignored. An item is dropped when its accumulated explanation
    is empty.

    Args:
        analyzed_text: The model's reply as a single string; may be empty or
            None.

    Returns:
        A list of ``[keyword, explanation]`` lists in order of appearance.
    """
    if not analyzed_text:
        return []

    parsed_data = []
    current_keyword = ""
    current_explanation = ""

    for line in analyzed_text.split('\n'):
        match = _KEYWORD_HEADING.match(line)
        if match:
            # A new heading closes the item collected so far.
            if current_keyword and current_explanation:
                parsed_data.append([current_keyword, current_explanation.strip()])
            current_keyword = match.group(1).strip()
            current_explanation = line[match.end():].strip()
        elif current_keyword:
            # Continuation line of the current item's explanation.
            current_explanation += ' ' + line.strip()

    # Flush the last item, which has no following heading to close it.
    if current_keyword and current_explanation:
        parsed_data.append([current_keyword, current_explanation.strip()])

    return parsed_data

### Save Output to Excel

In [None]:
def save_to_excel(parsed_data, ID,output_file):
    """Append one file's parsed rows to the 'Sheet1' sheet of an Excel workbook.

    Args:
        parsed_data: Rows of ``[personality, explanation]``.
        ID: Value written in the 'File Name' column of every row.
        output_file: Path of the .xlsx workbook. It is created when it does
            not exist yet; otherwise the rows are added below the existing
            content of 'Sheet1'.
    """
    # Create a DataFrame from the parsed data
    df = pd.DataFrame(parsed_data, columns=['Personality', 'Explanation'])

    # Add a column for the file name
    df.insert(0, 'File Name', ID)

    # Append mode needs an existing workbook, and `if_sheet_exists` is only
    # accepted in append mode, so pick the writer options accordingly.
    if os.path.exists(output_file):
        writer_options = {'mode': 'a', 'if_sheet_exists': 'overlay'}
    else:
        writer_options = {'mode': 'w'}

    with pd.ExcelWriter(output_file, engine='openpyxl', **writer_options) as writer:
        # Continue below the last used row; a missing sheet starts at the top.
        sheet = writer.sheets.get('Sheet1')
        startrow = sheet.max_row if sheet is not None else 0

        # The header row is written only when starting at the top of the sheet.
        df.to_excel(
            writer,
            sheet_name='Sheet1',
            index=False,
            header=startrow == 0,
            startrow=startrow,
        )

    print(f"Results have been saved to {output_file}")

## Main Code

In [None]:
# Directory traversal and processing
base_dir = '2019-C'
sub_dirs = ['c-rejected']
output_file = '2019c_rejected_GPTanalysis.xlsx'

# Create an empty Excel file with the header if it doesn't exist
if not os.path.exists(output_file):
    df = pd.DataFrame(columns=['File Name', 'Personality', 'Explanation'])
    df.to_excel(output_file, index=False)

for sub_dir in sub_dirs:
    sub_dir_path = os.path.join(base_dir, sub_dir)
    print(f"Processing subdirectory: {sub_dir_path}")
    # NOTE(review): the pattern is '2019-A-*' while base_dir is '2019-C' --
    # confirm the student folders really carry the '2019-A-' prefix.
    # Sorted so that rows are written in a reproducible order.
    student_dirs = sorted(glob.glob(os.path.join(sub_dir_path, '2019-A-*')))
    if not student_dirs:
        print(f"No student directories found in {sub_dir_path}")
    for student_dir in student_dirs:
        print(f"Processing student directory: {student_dir}")

        pdf_files = sorted(glob.glob(os.path.join(student_dir, '*.pdf')))
        if not pdf_files:
            print(f"No PDF files found in {student_dir}")
            # Nothing to analyze: skip instead of sending an empty prompt.
            continue

        # All PDFs of one student are analyzed together as a single text.
        concatenated_text = ""
        for pdf_file in pdf_files:
            print(f"Reading PDF file: {pdf_file}")
            pdf_text = extract_text_from_pdf(pdf_file)
            print(f"Extracted text length from {pdf_file}: {len(pdf_text)}")
            concatenated_text += pdf_text + " "

        # Print the concatenated text for verification
        print(f"Concatenated text for {student_dir}:\n{concatenated_text[:1000]}...")  # Print first 1000 characters for brevity
        print(f"Total length of concatenated text: {len(concatenated_text)}")

        if not concatenated_text.strip():
            # PDFs were found but produced no text; avoid a pointless API call.
            print(f"No extractable text found in {student_dir}; skipping")
            continue

        # Analyze concatenated text with ChatGPT
        analyzed_text = gpt_extract_personality(concatenated_text)

        # Parse the analyzed text to get keywords and explanations
        parsed_data = parse_analyzed_text(analyzed_text)

        # Get the student directory name
        student_dir_name = os.path.basename(student_dir)

        # Save the parsed data to Excel
        save_to_excel(parsed_data, student_dir_name, output_file)

print(f"Results have been saved to {output_file}")