In [32]:
import fitz
import re
import os
import PyPDF2
import pandas as pd
from datetime import datetime
import logging
#from textblob import TextBlob
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ethanpanal/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [33]:
# Define paths
input_folder = "/Users/ethanpanal/Documents/UP/capstone_raw_files/php/raw_php_pdf"
#output_folder = "./bps/bps_redacted"

In [34]:
data = []

In [35]:
def extract_biggest_emotions(text):
    """Extracts biggest emotions as a comma-separated string, capturing emotions before and after the rating."""
    match = re.search(r'Biggest emotion/rating:\s*(.*?)(?:,\s*)?(\d{1,2})/10(?:,\s*(.*))?', text)
    
    if match:
        before_rating = match.group(1).strip()  # Emotions before the rating
        after_rating = match.group(3).strip() if match.group(3) else ""  # Emotions after the rating
        
        # Combine emotions before and after the rating
        emotions = before_rating
        if after_rating:
            emotions += ", " + after_rating
        
        # Clean up by removing leading commas/spaces
        emotions = emotions.lstrip(', ').strip()

        # Remove quotes, parentheses, and replace slashes with commas
        emotions = re.sub(r'[\"()]', '', emotions)  # Corrected regular expression
        emotions = emotions.replace('/', ',')  # Replace slashes with commas
        emotions = re.sub(r'\d+', '', emotions)  # Remove numbers
        emotions = emotions.lower()  # Convert to lowercase
        
        return emotions
    
    return None


def extract_strongest_rating(text):
    """Extracts the strongest rating before any craving/impulse section or the end of the text."""
    # Extract only the section between 'Biggest emotion/rating:' and 'Craving/impulse'
    match = re.search(r'Biggest emotion/rating:\s*(.*?)(?:Craving/impulse|$)', text, re.DOTALL)
    
    if match:
        section = match.group(1)  # Extracted section before cravings
        
        # Find ratings in the format of either 'emotion, rating' or 'rating, emotion'
        ratings = re.findall(r'(\d{1,2})/10', section)  # Extract ratings in /10 format
        
        if ratings:
            # Convert ratings to integers and return the highest rating (capped at 10)
            return min(max(map(int, ratings)), 10)
    
    return None  # Return None if no valid rating is found
    
def extract_cravings_rating(text):
    """Extracts the first cravings/impulse rating."""
    match = re.search(r'Craving[s]?/impulse.*?:\s*(\d{1,2})/10', text, re.IGNORECASE)
    if match:
        return int(match.group(1))  # Extract the first rating as an integer
    return None


def extract_coping_skill(text):
    if not text or not isinstance(text, str):
        return None  # Or handle it differently if necessary
        
    """Extracts the full coping skill reflection summary."""
    # Match 'reflection summary' (case insensitive) with flexible spacing and punctuation variations
    match = re.search(r'coping skill.*?summary[:]\s*(.*?)(?=\n[A-Z]|\Z)', text, re.IGNORECASE | re.DOTALL)
    
    # If a match is found, return the captured text
    if match:
        return match.group(1).strip()
    return None

def extract_coping_skills_yesterday(text):
    if not text or not isinstance(text, str):
        return None  # Or handle it differently if necessary
        
    """Extracts coping skills used yesterday by searching for keywords or quoted skills."""
    match = re.search(r'(?:yesterday.*?"([^"]+)"|[Cc]lient use[d]? "([^"]+)")', text, re.IGNORECASE)
    
    # Check which capturing group matched
    if match:
        return match.group(1) or match.group(2)
    
    return None

def extract_coping_skills_today(text):
    if not text or not isinstance(text, str):
        return None  # Or handle it differently if necessary
        
    """Extracts coping skills planned for today by looking for 'today' and quoted text."""
    match = re.search(r'(?:today.*?"([^"]+)"|".*?" today)', text, re.IGNORECASE)
    return match.group(1).strip() if match else None

def extract_all_coping_skills(text):
    """Extracts all coping skills found in quotes."""
    if not text or not isinstance(text, str):
        return None  # Or handle it differently if necessary
        
    # Regular expression to find all quoted coping skills
    matches = re.findall(r'"([^"]+)"', text)
    
    if matches:
        return ", ".join(matches)  # Join all skills found in quotes as a comma-separated string
    
    return None  # Return None if no skills are found

In [36]:
def classify_emotion_sentiment_with_vader(emotion_text):
    """Classify sentiment using VADER (NLTK)."""
    if not emotion_text:
        return "neutral"
    
    # Initialize the VADER sentiment analyzer
    sid = SentimentIntensityAnalyzer()
    
    # Get the sentiment scores
    sentiment_scores = sid.polarity_scores(emotion_text)
    
    # If compound score is positive, the sentiment is positive, negative if compound is negative
    if sentiment_scores['compound'] > 0:
        return "positive"
    elif sentiment_scores['compound'] < 0:
        return "negative"
    else: 
        return "negative"

In [37]:
def adjust_rating(emotion_sentiment, biggest_rating):
    # Check if biggest_rating is None
    if biggest_rating is None:
        return None  # Or some default value if you'd like, e.g., 0
    
    if emotion_sentiment == 'negative':
        return biggest_rating * -1
    else:
        return biggest_rating

In [38]:
# Iterate through all files in the folder
for filename in os.listdir(input_folder):
    if filename.endswith(".pdf"):
        # Extract group identifier from filename (format is bps_xxxxxx.pdf)
        group_identifier = filename.split('_')[1].replace('.pdf', '')

        filedate = pd.to_datetime(filename.split('_')[2].replace('.pdf',''),
                                  format='%Y%m%d').strftime('%m/%d/%Y')

        # PDF file path
        pdf_path = os.path.join(input_folder, filename)
        
        # text from the PDF 
        doc = fitz.open(pdf_path)
        extracted_text = ""
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            extracted_text += page.get_text("text")

        #print(extracted_text)

        coping_skill_text = extract_coping_skill(extracted_text)
        emotion_text = extract_biggest_emotions(extracted_text)
        emotion_sentiment = classify_emotion_sentiment_with_vader(emotion_text)

        strong_rating = extract_strongest_rating(extracted_text)

        # Combine the extracted data into a single dictionary
        result = {
            "group_identifier": group_identifier,
            "assmt_dt": filedate,
            "biggest_emotions": emotion_text,
            "emotion_sentiment": emotion_sentiment,
            "biggest_rating": strong_rating,
            "adjusted_rating": adjust_rating(emotion_sentiment, strong_rating),  # Calculate adjusted rating
            "cravings_rating": extract_cravings_rating(extracted_text),
            "coping_skill_reflection": coping_skill_text,
            "coping_skills": extract_all_coping_skills(coping_skill_text)
            #"coping_skill_yesterday": extract_coping_skills_yesterday(coping_skill_text),
            #"coping_skill_today": extract_coping_skills_today(coping_skill_text)

        }

        #result.update(bps_scores)  # Add the scores to the result

        # Append the result to the data list
        data.append(result)

In [39]:
indv_php = pd.DataFrame(data)

In [40]:
indv_php.tail()

Unnamed: 0,group_identifier,assmt_dt,biggest_emotions,emotion_sentiment,biggest_rating,adjusted_rating,cravings_rating,coping_skill_reflection,coping_skills
87,fc16b67cf2c3,02/03/2025,"joy, boredom",positive,6.0,6.0,3.0,"Client used ""positive affirmations"" yesterday ...",positive affirmations
88,e0f34e89bcb6,02/05/2025,"anxiety, shame",negative,9.0,-9.0,7.0,"Client used coping skill ""calling my mom"" yest...",calling my mom
89,6ebe69c5f8a8,01/21/2025,,neutral,,,2.0,"Client used ""patience"" as a coping skill yeste...",patience
90,98e234ba5e17,01/08/2025,joy,positive,5.0,5.0,0.0,"CL stated, ""I don't know, I got anxious at the...",
91,e0f34e89bcb6,02/10/2025,anxiety,negative,8.0,-8.0,4.0,"Client used ""going to the beach"" as a coping s...",going to the beach


In [41]:
indv_php.to_csv('/Users/ethanpanal/Documents/UP/capstone_raw_files/indv_php_anon.csv')