## Setting Up the Environment

In [None]:
%pip install PyPDF2
%pip install pandas
%pip install re
%pip install openai==0.28
%pip install openpyxl
%pip install pdf2image pytesseract
%pip install PyMuPDF

In [1]:
import openai
import glob
from PyPDF2 import PdfReader
import pandas as pd
import re
from pdf2image import convert_from_path
import pytesseract
from dotenv import load_dotenv
import os
import fitz
import json
from openpyxl import Workbook

Load the OpenAI API key

In [2]:
# Read variables from a local .env file (if any) into the process environment.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

# `openai` was imported before load_dotenv() ran, so a key that lives only in
# the .env file has not been picked up by the library; hand it over explicitly.
if api_key:
    openai.api_key = api_key
else:
    print("Warning: OPENAI_API_KEY is not set; OpenAI API calls will fail until it is provided.")

#### Data Cleaning: Extract Text from the PDF

In [3]:
def is_text_extracted_meaningful(text, min_length=100):
    """Return True when extracted text is long enough to be considered usable.

    Args:
        text: The extracted text. ``None`` or an empty string counts as
            not meaningful.
        min_length: The stripped text must be strictly longer than this many
            characters. Defaults to 100, the threshold previously hard-coded.

    Returns:
        bool: True if ``len(text.strip()) > min_length``, otherwise False.
    """
    if not text:
        return False
    return len(text.strip()) > min_length

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF, falling back to OCR when needed.

    The embedded text layer is read with PyMuPDF first. If that text is not
    meaningful according to ``is_text_extracted_meaningful``, every page is
    rendered to an image and run through Tesseract instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text with newlines and tabs replaced by spaces, or
        ``None`` if any step raised an exception (the error is printed).
        Callers must handle the ``None`` case before concatenating.
    """
    try:
        # Try extracting text using PyMuPDF. The context manager closes the
        # document even when reading a page fails, so the file handle is
        # not leaked.
        with fitz.open(pdf_path) as document:
            text = "".join(page.get_text("text") or "" for page in document)

        # Fallback to OCR if text extraction is not meaningful
        if not is_text_extracted_meaningful(text):
            images = convert_from_path(pdf_path)
            text = "".join(pytesseract.image_to_string(image) for image in images)

        return text.replace("\n", " ").replace("\t", " ")
    except Exception as e:
        print(f"Error occurred: {e}")
        return None

### GPT 4o

#### 1. Personality Analysis

In [None]:
# System persona for the personality analysis. It asks for an analysis of the
# student's *personality*, matching the task described in personality_prompt.
personality_persona = (
    "You are an expert in analyzing student applications for a university. "
    "You have been asked to review a student's application essay and provide "
    "an analysis on the student's personality."
)

# Task instructions appended to the persona. This is a plain string (nothing
# is interpolated), so the braces of the JSON template are written literally.
personality_prompt = """
Please analyze the following student's application statement to identify 10 keywords that indicate the student's personality. Provide an explanation of the student personality based on the keyword. For each keyword, provide the following information in the specified format:

[
  {
    "Keyword": "",
    "Explanation": "",
    "Sentence Related to Keyword": ""
  }
]

The output should be in this JSON format and no other output is accepted.

"""


#### 2. Motivation Analysis

In [None]:
def gpt_input(statement,persona,prompt):
    """Send a statement to the gpt-4o chat model and return its reply text.

    Args:
        statement: Text sent as the user message.
        persona: Persona description; forms the first part of the system message.
        prompt: Task instructions; appended to the persona after a single space.

    Returns:
        The ``content`` of the first choice in the API response.
    """
    system_message = {"role": "system", "content": f"{persona} {prompt}"}
    user_message = {"role": "user", "content": statement}

    completion = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[system_message, user_message],
    )

    # Only the first returned choice is used.
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

### Parsing the Output

1. Personality

In [None]:
def parse_personality_text(analyzed_text):
    """Split a numbered, bold-keyword analysis into [keyword, explanation] pairs.

    Two heading layouts are recognised at the start of a line:
    ``1. **Keyword**`` and ``**1. Keyword:**``. Whatever follows the heading
    on the same line starts the explanation, and every later non-blank line
    is appended to it (space-separated) until the next heading.

    Args:
        analyzed_text: The raw model output, one or more lines.

    Returns:
        A list of ``[keyword, explanation]`` lists. A keyword whose
        explanation is empty is not included; lines before the first heading
        are ignored.
    """
    heading_patterns = (
        re.compile(r'\d+\.\s*\*\*(.*?)\*\*'),
        re.compile(r'\*\*\d+\.\s*(.*?):\*\*'),
    )

    results = []
    keyword = ""
    explanation = ""

    def flush():
        # Store the entry collected so far, but only when both parts exist.
        if keyword and explanation:
            results.append([keyword, explanation.strip()])

    for raw_line in analyzed_text.split('\n'):
        # The first layout takes precedence over the second, as before.
        heading = next(
            (found for found in (p.match(raw_line) for p in heading_patterns) if found),
            None,
        )

        if heading is not None:
            flush()
            keyword = heading.group(1).strip()
            # The remainder of the heading line begins the explanation.
            explanation = raw_line[heading.end():].strip()
            continue

        stripped = raw_line.strip()
        if keyword and stripped:
            explanation += ' ' + stripped

    # Emit the final entry, if any.
    flush()
    return results

2. Motivation

In [None]:
def parse_motivation_text(analyzed_text):
    """Decode the model's JSON reply, tolerating a Markdown code fence.

    Replies are often wrapped as ```` ```json ... ``` ````. Both the opening
    fence line (with or without a language tag) and the closing fence are
    removed before decoding, and surrounding whitespace is ignored.

    Args:
        analyzed_text: Raw reply text from the model, or ``None``.

    Returns:
        The decoded JSON value, or ``None`` when the input is empty or is
        not valid JSON (a message is printed in either case).
    """
    # Check if the input text is empty or None
    if not analyzed_text or not analyzed_text.strip():
        print("The input text is empty or None.")
        return None

    cleaned = analyzed_text.strip()

    if cleaned.startswith("```"):
        # Drop the whole opening fence line. partition() leaves an empty
        # remainder (instead of raising) when the reply is only a fence.
        _, _, cleaned = cleaned.partition("\n")
        cleaned = cleaned.rstrip()
        # Drop the closing fence, which json.loads would reject as extra data.
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]

    try:
        # Attempt to decode the JSON
        data = json.loads(cleaned)
    except json.JSONDecodeError as e:
        print(f"JSON decoding error: {e}")
        return None

    return data

### Save Output to Excel

In [None]:
def save_to_excel(parsed_data, ID, output_file):
    """Append parsed keyword records for one student to an Excel file.

    Args:
        parsed_data: A list of records (dicts keyed by ``Keyword``,
            ``Explanation`` and ``Sentence Related to Keyword``), a single
            such dict, or ``None``. ``None`` is reported and skipped.
        ID: Identifier written to the ``File Name`` column of every new row.
        output_file: Path of the Excel workbook. If it already exists, its
            rows are read and the new rows are appended after them.

    Returns:
        None. The workbook at ``output_file`` is (re)written as a side effect.
    """
    # Check if parsed_data is valid
    if parsed_data is None:
        print("No data to save.")
        return

    # A lone JSON object would make the DataFrame constructor fail on scalar
    # values; treat it as a one-record list instead.
    if isinstance(parsed_data, dict):
        parsed_data = [parsed_data]

    # Create a DataFrame from the parsed data
    df = pd.DataFrame(parsed_data, columns=['Keyword', 'Explanation', 'Sentence Related to Keyword'])

    # Add a column for the file name
    df.insert(0, 'File Name', ID)

    if os.path.exists(output_file):
        # If the file exists, read the existing data
        existing_df = pd.read_excel(output_file)
        # Append the new data
        df = pd.concat([existing_df, df], ignore_index=True)

    # Save the updated DataFrame back to the Excel file
    df.to_excel(output_file, index=False, engine='openpyxl')
    print(f"Results have been saved to {output_file}")

### Main Code

In [None]:
# System persona used for the motivation analysis.
motivation_persona = (
    "You are an expert in analyzing student applications for a university. "
    "You have been asked to review a student's application essay and provide "
    "an analysis on the student's motivation."
)

# Task instructions appended to the persona. The doubled braces are f-string
# escapes and reach the model as single literal braces.
motivation_prompt = f"""
Please analyze the following student's application statement to identify 10 keywords that indicate their motivation to study in university. For each keyword, provide the following information in the specified format:
[
  {{
    "Keyword": "",
    "Explanation": "",
    "Sentence Related to Keyword": ""
  }}
]
The output should be in this format and no other output is accepted.
"""

# The personality persona and prompt are defined in the "Personality Analysis"
# cell; pass those to gpt_input instead to run the personality variant.

# Input layout: <base_dir>/<sub_dir>/<student directory>/*.pdf
base_dir = 'temp'
sub_dirs = ['c-enrolled']
output_file = 'temp_22c.xlsx'

for sub_dir in sub_dirs:
    sub_dir_path = os.path.join(base_dir, sub_dir)
    print(f"Processing subdirectory: {sub_dir_path}")

    student_dirs = sorted(glob.glob(os.path.join(sub_dir_path, '2021-A-*')))

    if not student_dirs:
        print(f"No student directories found in {sub_dir_path}\n")
        continue

    for student_dir in student_dirs:
        print(f"Processing student directory: {student_dir}\n")

        pdf_files = sorted(glob.glob(os.path.join(student_dir, '*.pdf')))
        if not pdf_files:
            print(f"No PDF files found in {student_dir}")
            continue

        concatenated_text = ""
        for pdf_file in pdf_files:
            print(f"Reading PDF file: {pdf_file}")
            pdf_text = extract_text_from_pdf(pdf_file)
            # extract_text_from_pdf returns None on failure; concatenating
            # None would raise TypeError and abort the whole run.
            if pdf_text is None:
                print(f"Skipping unreadable PDF file: {pdf_file}")
                continue
            concatenated_text += pdf_text + " "

        print(f"Concatenated text for {student_dir}:\n{concatenated_text[:1000]}...")
        print(f"Total length of concatenated text: {len(concatenated_text)}")

        # Do not spend an API call on a student with no extractable text.
        if not concatenated_text.strip():
            print(f"No text could be extracted for {student_dir}; skipping analysis.\n")
            continue

        analyzed_text = gpt_input(concatenated_text, motivation_persona, motivation_prompt)
        print(f"Analyzed text for {student_dir}:\n{analyzed_text}\n")
        parsed_data = parse_motivation_text(analyzed_text)
        print(f"Parsed data for {student_dir}:\n{parsed_data}\n")
        student_dir_name = os.path.basename(student_dir)
        save_to_excel(parsed_data, student_dir_name, output_file)

print(f"Results have been saved to {output_file}")

#### Motivation Category Analysis by GPT API

In [None]:
file_path = 'Statistics_2022eecs_GPT.xlsx'

# Load the keyword rows from the ENROLLED_MOTIVATION sheet of the workbook.
df = pd.read_excel(file_path, sheet_name='ENROLLED_MOTIVATION')

# Distinct keywords in order of first appearance. Blank cells are read as NaN
# and are dropped so that "nan" never ends up in the keyword list.
unique_values = df['Keyword'].dropna().unique()

# Create a new DataFrame for the unique values
unique_df = pd.DataFrame(unique_values, columns=['Unique Values'])
unique_list = unique_values.tolist()
print(unique_list)

Give ChatGPT a persona

In [None]:
# System persona for the keyword-categorisation request.
persona = """
You are a data analyst specializing in natural language processing and keyword extraction.
You will be given a list of keywords about student motivation to study in university, reorganize the following keywords into appropriate categories.
"""

In [None]:
def gpt_personality_conclusion(statement,persona):
    """Ask the gpt-4o chat model to respond to a statement under a persona.

    Args:
        statement: Text sent as the user message.
        persona: Text sent unchanged as the system message.

    Returns:
        The ``content`` of the first choice in the API response.
    """
    conversation = [
        {"role": "system", "content": persona},
        {"role": "user", "content": statement},
    ]
    completion = openai.ChatCompletion.create(model="gpt-4o", messages=conversation)

    # Only the first returned choice is used.
    reply_message = completion['choices'][0]['message']
    return reply_message['content']

In [None]:
# Step 1: Find the top keywords from all the keywords in excel
keywords_list = unique_list

keywords_prompt = f"""{persona}\n. Here is your task:\n
                    1. Read all of the Keywords.\n
                    2. Take the keyword input and recategorized this keyword into 20 category.\n
                    2. If they mention school name such as NTHU, make the category School.\n
                    3. Make sure all keyword got into category.\n
                    4. Make the category more specific. No miscellaneous category.\n
                    5. You should output the data in the following format:\n\n
                        1. [Category]: Keyword, Keyword, Keyword,..\n
                    The output should be in this format and no other output is accepted.\n
                    Here is the list of keywords that shows student motivation to study in university: {keywords_list}\n
                    """
keywords_category = gpt_personality_conclusion(keywords_prompt,persona)
print("Category and Keyword:")
print(keywords_category)


In [None]:
# Use regular expressions to extract keyword categories and keywords
pattern = re.compile(r'\d+\.\s+([^:]+):\s+(.+)')
matches = pattern.findall(keywords_category)

# Prepare data for the DataFrame
data = []
for category, keywords in matches:
    keywords_list = [k.strip() for k in keywords.split(',')]
    for keyword in keywords_list:
        data.append({'keyword category': category.strip(), 'keyword': keyword})

# Convert the data to a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to an Excel file
df.to_excel('categorized_keywords.xlsx', index=False)

print("Data has been saved to categorized_keywords.xlsx")

# Print the DataFrame to verify
print(df)

Find Sentence in Sentiment Analysis

In [15]:
# Input layout for the sentence search: <base_dir>/<sub_dir>/<student directory>/*.pdf
base_dir = 'BM_P'
sub_dirs = ['Ambitious']

# System persona sent with every sentence-identification request.
sentence_persona = "You are an expert in analyzing student application for a university. You have been asked to review a student's application essay"

def gpt_sentence_identification(essay_text, trait="ambitious"):
    """Ask gpt-4o for three essay sentences that suggest a personality trait.

    Args:
        essay_text: The student's essay, embedded in the user prompt.
        trait: Adjective describing the trait to look for. Defaults to
            "ambitious", which reproduces the previously hard-coded prompt.

    Returns:
        The raw reply text of the first choice. The prompt requests three
        comma-separated ``{"Sentence": ""}`` objects, but the reply is
        returned as-is and may not follow that format.
    """
    # Prompt to send to the model; doubled braces are f-string escapes.
    prompt = f"""
    Here is a student's essay:
    
    {essay_text}
    
    Identify three specific sentences or parts of the text that suggest the student is {trait}.

    The output should be like this format and no other format is accepted:
    {{
        "Sentence": ""
    }},
    {{
        "Sentence": ""
    }},
    {{
        "Sentence": ""
    }}
    """

    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system", 
                "content": sentence_persona
            },
            {"role": "user", "content": prompt},
        ],
    )
    
    assistant_reply = response['choices'][0]['message']['content']
    return assistant_reply

def process_gpt_output(assistant_reply):
    """Extract the "Sentence" values from the model's reply.

    The prompt asks for comma-separated ``{"Sentence": ""}`` objects, which
    is not valid JSON by itself, so the reply is wrapped in ``[...]`` before
    decoding. A surrounding Markdown code fence, a trailing comma, and a
    reply that is already a JSON array are all accepted.

    Args:
        assistant_reply: Raw reply text from the model (``None`` is treated
            as an empty reply).

    Returns:
        A list of sentence strings; an empty list when the reply cannot be
        decoded or lacks the expected structure (the error is printed).
    """
    try:
        payload = (assistant_reply or "").strip()

        # Remove a Markdown code fence (``` or ```json ... ```), if present.
        if payload.startswith("```"):
            _, _, payload = payload.partition("\n")
            payload = payload.rstrip()
            if payload.endswith("```"):
                payload = payload[:-3]

        # A trailing comma after the last object would make the array invalid.
        payload = payload.strip().rstrip(",").strip()

        # Wrap bare comma-separated objects into an array; leave real arrays alone.
        if not payload.startswith("["):
            payload = f"[{payload}]"

        # Convert the string to JSON format
        sentences_data = json.loads(payload)
        # Extract just the sentences
        return [entry["Sentence"] for entry in sentences_data]
    except Exception as e:
        print(f"Error processing GPT-4 output: {e}")
        return []

# main code
# Collects one record per analysed student: the directory and the sentences found.
data = []
for sub_dir in sub_dirs:
    sub_dir_path = os.path.join(base_dir, sub_dir)
    print(f"Processing subdirectory: {sub_dir_path}")

    student_dirs = sorted(glob.glob(os.path.join(sub_dir_path, '2019-B-010')))

    if not student_dirs:
        print(f"No student directories found in {sub_dir_path}\n")
        continue

    for student_dir in student_dirs:
        print(f"Processing student directory: {student_dir}\n")

        pdf_files = sorted(glob.glob(os.path.join(student_dir, '*.pdf')))
        if not pdf_files:
            print(f"No PDF files found in {student_dir}")
            continue

        concatenated_text = ""
        for pdf_file in pdf_files:
            print(f"Reading PDF file: {pdf_file}")
            pdf_text = extract_text_from_pdf(pdf_file)
            # extract_text_from_pdf returns None on failure; concatenating
            # None would raise TypeError and abort the whole run.
            if pdf_text is None:
                print(f"Skipping unreadable PDF file: {pdf_file}")
                continue
            concatenated_text += pdf_text + " "

        print(f"Concatenated text for {student_dir}:\n{concatenated_text[:1000]}...")
        print(f"Total length of concatenated text: {len(concatenated_text)}")

        # Do not spend an API call on a student with no extractable text.
        if not concatenated_text.strip():
            print(f"No text could be extracted for {student_dir}; skipping analysis.\n")
            continue

        # The analysis runs inside the student loop so that every student is
        # analysed with their own text, not only the last one visited.
        analyzed_text = gpt_sentence_identification(concatenated_text)
        if analyzed_text:
            print(f"Analyzed text for {student_dir}:\n{analyzed_text}\n")

            # Process the output and get the sentences
            sentences = process_gpt_output(analyzed_text)

            # Add to data for further processing
            data.append({"Student Directory": student_dir, "Ambitious Sentences": "\n".join(sentences)})

Processing subdirectory: BM_P/Ambitious
Processing student directory: BM_P/Ambitious/2019-B-010

Reading PDF file: BM_P/Ambitious/2019-B-010/讀書計畫.pdf
Concatenated text for BM_P/Ambitious/2019-B-010:
Sample extracted text from PDF ...
Total length of concatenated text: 31
Analyzed text for BM_P/Ambitious/2019-B-010:
I'm sorry, I would need the actual text from the student's essay to identify specific sentences or parts that suggest the student is ambitious. Please provide the relevant content.

Error processing GPT-4 output: Expecting value: line 1 column 2 (char 1)
