## Always run the below cell

In [26]:
import os
from openai import AzureOpenAI
from dotenv import load_dotenv
from src.sys_prompt import mc_prompt_emotion_v2, mc_prompt_commitment_v2, mc_prompt_purpose_v2, mc_prompt_how_to_raise_funds_v1
from src.data_processing import get_data
import re
import json
import pandas as pd
import logging
import datetime

# Set file names

# Path of the processed file
file_to_save_processed = r'/Users/harshgarg/Desktop/mc-lm-genai/data/LM Applications_test4.xlsx'
# Path of the unprocessed file
file_to_import_unprocessed = r"/Users/harshgarg/Desktop/mc-lm-genai/data/LM Applications.xlsx"
# Path of partially processed file
file_to_import_partiallyProcessed = r"/Users/harshgarg/Desktop/mc-lm-genai/data/LM Applications.xlsx"
# Path of the file to run a single row from
file_to_import_single_cell = r"/Users/harshgarg/Desktop/mc-lm-genai/data/LM Applications.xlsx"

# Configure logging with timestamp
log_filename = f"mc_scoring_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    # handlers=[
    #     logging.FileHandler(log_filename),
    #     logging.StreamHandler()
    # ]
    filename=log_filename,
    force=True
)

# Log the start of the script
logging.info("Script execution started")


load_dotenv()
endpoint = os.getenv("ENDPOINT_URL")
deployment = os.getenv("DEPLOYMENT_NAME_1")
endpoint = os.getenv("ENDPOINT_URL")
subscription_key = os.getenv("AZURE_OPEN_AI_API_KEY")

# Initialize Azure OpenAI Service client with key-based authentication
client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version="2024-05-01-preview",
)

necessary_columns = ['id',
       'In Memory', 'Matched Fundraising',
       'How will you match your fundraising target', 'Pledge Amount',
       'Finish Time',
       'Reason', 'PR', 'Ballot']


## Helper functions (the next 2 cells)

In [27]:
def get_llm_output(prompt, row_data):
    # Log the start of the function call
    logging.info(f"Processing row data using prompt: {prompt[:50]}...")
    logging.info(f"Processing row: {row_data}")
    
    try:
        # Prepare the chat prompt
        chat_prompt = [{"role": "user", "content": [{"type": "text", "text": prompt + "\n" + str(row_data)}]}]
        messages = chat_prompt

        # Generate the completion
        logging.info("Sending request to OpenAI API")
        completion = client.chat.completions.create(
            model=deployment,
            messages=messages,
            max_tokens=800,
            temperature=0.7,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
            stream=False,
        )

        # Parse the JSON content from the completion
        response_content = completion.choices[0].message.content
        logging.info("Received response")
        logging.info(f"Response: {response_content}")

        
        # Extract the JSON part from the markdown code block
        json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', response_content, re.DOTALL)
        
        if json_match:
            json_str = json_match.group(1)
            result = json.loads(json_str)
            logging.info("Successfully parsed JSON response")
        else:
            # Fallback if regex doesn't match
            # Try to evaluate directly (be cautious with eval)
            logging.warning("JSON regex match failed, attempting direct eval")
            result = eval(response_content)
            
        return result
    
    except Exception as e:
        # Log any exceptions that occur
        logging.error(f"Error processing: {str(e)}")
        # Return a default value in case of error
        return {"error": str(e), "score": 0}

In [28]:
def process_row(row_data, value, index):
    # Process fundraising scoring
    if pd.notna(row_data['How will you match your fundraising target']) and len(str(row_data['How will you match your fundraising target'])) > 0:
        logging.info(f"Processing fundraising for ID: {value['id']}")
        fundraising_result = get_llm_output(mc_prompt_how_to_raise_funds_v1, {'How will you match your fundraising target': row_data['How will you match your fundraising target']})
        logging.info(f"Fundraising score for ID {value['id']}: {fundraising_result.get('fundraising_score', 0)}")
    else:
        logging.info(f"No fundraising data for ID: {value['id']}, assigning default score 0")
        fundraising_result = {"fundraising_score": 0}

    # Process PR data for emotion, purpose and commitment scoring
    if pd.notna(row_data['PR']) and len(str(row_data['PR'])) > 0:
        logging.info(f"Processing emotional content for ID: {value['id']}")
        emotion_result = get_llm_output(mc_prompt_emotion_v2, {"PR": row_data['PR']})
        logging.info(f"Emotion score for ID {value['id']}: {emotion_result.get('emotion', 0)}")
        
        logging.info(f"Processing purpose content for ID: {value['id']}")
        purpose_result = get_llm_output(mc_prompt_purpose_v2, {"PR": row_data['PR']})
        logging.info(f"Purpose score for ID {value['id']}: {purpose_result.get('purpose', 0)}")
        
        logging.info(f"Processing commitment content for ID: {value['id']}")
        commitment_result = get_llm_output(mc_prompt_commitment_v2, {"PR": row_data['PR']})
        logging.info(f"Commitment score for ID {value['id']}: {commitment_result.get('commitment', commitment_result.get('committment', 0))}")
    else:
        logging.info(f"No PR data for ID: {value['id']}, assigning default scores 0")
        emotion_result = {"emotion": 0}
        purpose_result = {"purpose": 0}
        commitment_result = {"commitment": 0}

    # Update the dataframe with scores
    try:
        df.at[index, 'fundraising'] = fundraising_result['fundraising_score']
        df.at[index, 'emotion'] = emotion_result['emotion']
        df.at[index, 'purpose'] = purpose_result['purpose']
        
        # Handle the commitment/committment spelling variation in the result
        if 'commitment' in commitment_result:
            df.at[index, 'commitment'] = commitment_result['commitment']
        elif 'committment' in commitment_result:
            df.at[index, 'commitment'] = commitment_result['committment']
        else:
            df.at[index, 'commitment'] = 0
            logging.warning(f"No commitment score found for ID {value['id']}")
            
        # Calculate final score
        df.at[index, 'Final_score'] = 0  # Reset the score before adding components
        
        # Add scores from LLM evaluations
        df.at[index, 'Final_score'] += float(df.at[index, 'emotion']) + float(df.at[index, 'purpose']) + float(df.at[index, 'commitment']) + float(df.at[index, 'fundraising'])
        logging.info(f"Base scores sum for ID {value['id']}: {df.at[index, 'Final_score']}")
        
        # Score for Matched Fundraising
        if value['Matched Fundraising'].endswith("Yes"):
            df.at[index, 'Final_score'] += 1
            logging.info(f"Added 1 point for Matched Fundraising (Yes) for ID {value['id']}")
        elif value['Matched Fundraising'].endswith("No"):
            df.at[index, 'Final_score'] += 0
            logging.info(f"Added 0 points for Matched Fundraising (No) for ID {value['id']}")
        else:
            df.at[index, 'Final_score'] += 0.5
            logging.info(f"Added 0.5 points for Matched Fundraising (Other) for ID {value['id']}")
        
        # Score for In Memory
        if value['In Memory'].startswith("Y"):
            df.at[index, 'Final_score'] += 1
            logging.info(f"Added 1 point for In Memory (Y) for ID {value['id']}")
        elif value['In Memory'].startswith("N"):
            df.at[index, 'Final_score'] += 0
            logging.info(f"Added 0 points for In Memory (N) for ID {value['id']}")
        else:
            df.at[index, 'Final_score'] += 0.5
            logging.info(f"Added 0.5 points for In Memory (Other) for ID {value['id']}")
        
        # Score for Public Ballot
        if value['Ballot'].endswith("Yes"):
            df.at[index, 'Final_score'] += 1
            logging.info(f"Added 1 point for Ballot (Yes) for ID {value['id']}")
        elif value['Ballot'].endswith("No"):
            df.at[index, 'Final_score'] += 0
            logging.info(f"Added 0 points for Ballot (No) for ID {value['id']}")
        else:
            df.at[index, 'Final_score'] += 0.5
            logging.info(f"Added 0.5 points for Ballot (Other) for ID {value['id']}")
        
        # Score for Pledge Amount (normalized)
        pledge_score = df.at[index, 'Pledge Amount']/max_amount if pd.notna(df.at[index, 'Pledge Amount']) else 0
        df.at[index, 'Final_score'] += pledge_score
        logging.info(f"Added {pledge_score:.4f} points for Pledge Amount for ID {value['id']}")
        
        logging.info(f"Final score for ID {value['id']}: {df.at[index, 'Final_score']}")
        
    except Exception as e:
        logging.error(f"Error updating scores for ID {value['id']}: {str(e)}")
    
    # Log progress every 10 applications
    if index % 10 == 0 and index > 0:
        logging.info(f"Progress: {index}/{len(df)} applications processed ({index/len(df)*100:.1f}%)")

## Process a fresh (unprocessed file)

In [31]:

# Import data
df = get_data(path=file_to_import_unprocessed)

logging.info(f"Processing {len(df) if 'df' in locals() else 0} applications")

df["emotion"] = ""
df["purpose"] = ""
df["commitment"] = ""
df["fundraising"] = ""
df['Final_score'] = 0

max_amount = max(df['Pledge Amount'])

df['Matched Fundraising'] = df['Matched Fundraising'].fillna("No")
df['In Memory'] = df['In Memory'].fillna("N")
df['Ballot'] = df['Ballot'].fillna("No")

for index, value in df.iterrows():
    logging.info(f"Processing application {index+1}/{len(df)}, ID: {value['id']}")
    
    row_data = value[necessary_columns].to_dict()
    
    process_row(row_data, value, index)

    # adding 2 to index to indicate the row number in excel
    print(f"Row {index + 2} processed.")

# Save to excel

df.to_excel(file_to_save_processed, index=False)

  df.at[index, 'Final_score'] += float(df.at[index, 'emotion']) + float(df.at[index, 'purpose']) + float(df.at[index, 'commitment']) + float(df.at[index, 'fundraising'])


Row 2 processed.
Row 3 processed.
Row 4 processed.
Row 5 processed.
Row 6 processed.
Row 7 processed.
Row 8 processed.
Row 9 processed.
Row 10 processed.
Row 11 processed.
Row 12 processed.
Row 13 processed.
Row 14 processed.
Row 15 processed.
Row 16 processed.
Row 17 processed.
Row 18 processed.
Row 19 processed.
Row 20 processed.
Row 21 processed.
Row 22 processed.
Row 23 processed.
Row 24 processed.
Row 25 processed.
Row 26 processed.
Row 27 processed.
Row 28 processed.
Row 29 processed.
Row 30 processed.
Row 31 processed.
Row 32 processed.
Row 33 processed.
Row 34 processed.
Row 35 processed.
Row 36 processed.
Row 37 processed.
Row 38 processed.
Row 39 processed.
Row 40 processed.
Row 41 processed.
Row 42 processed.
Row 43 processed.
Row 44 processed.
Row 45 processed.
Row 46 processed.
Row 47 processed.
Row 48 processed.
Row 49 processed.
Row 50 processed.
Row 51 processed.
Row 52 processed.
Row 53 processed.
Row 54 processed.
Row 55 processed.
Row 56 processed.
Row 57 processed.


KeyboardInterrupt: 

In [36]:
df.to_excel(r"/Users/harshgarg/Desktop/mc-lm-genai/data/LM Applications_test4.xlsx", index=False)

In [33]:
df.head(15)

Unnamed: 0,id,event_number,booking_number,event_desc,option_desc,In Memory,Matched Fundraising,How will you match your fundraising target,Pledge Amount,Finish Time,...,T&Cs,Recruitment Source,Reason,PR,Ballot,emotion,purpose,commitment,fundraising,Final_score
0,39CQ3L9I,11184,100681750,TCS London Marathon 2025,Guaranteed Place - Rejected,Y - In memory,Matched fundraising : Unsure,I will fundraise my money through sponsor shee...,2500.0,03:50:00,...,Yes: General T&C,Web/Social Media,The activity appeals to me,I would love to run the London Marathon repres...,Have you entered the public ballot? Yes,4.5,4.5,4.7,1.5,17.783333
1,BC5O13N4,11184,100681773,TCS London Marathon 2025,Guaranteed Place - Rejected,N - Not in memory,Matched fundraising: No,I will set up a just giving page. I will take ...,2100.0,06:00:00,...,Yes: General T&C,"Other, see notes",Marie Curie provides/ has provided care for a ...,2 of my grandparents used the Marie Curie serv...,Have you entered the public ballot? No,4.8,5.0,5.0,4.0,18.87
2,849B0V7D,11184,100681780,TCS London Marathon 2025,Guaranteed Place - Rejected,N - Not in memory,Matched fundraising : Unsure,I have four churches and work across a large a...,4000.0,04:00:00,...,Yes: General T&C,Web/Social Media,Marie Curie provides/ has provided care for a ...,I have personally experienced the incredible w...,Have you entered the public ballot? Yes,4.8,5.0,5.0,4.5,20.933333
3,73830OD8,11184,100681781,TCS London Marathon 2025,Guaranteed Place - Rejected,Y - In memory,Matched fundraising: No,I will host dinner parties and bake sells. Als...,3000.0,04:30:00,...,Yes: General T&C,"Other, see notes",Marie Curie provides/ has provided care for a ...,I want to run the marathon for Marie Curie bec...,Have you entered the public ballot? Yes,4.8,5.0,5.0,3.5,20.4
4,B53KQUEZ,11184,100681782,TCS London Marathon 2025,Guaranteed Place - Rejected,Y - In memory,Matched fundraising : Unsure,I have large cooperate connections plus I plan...,3000.0,04:42:00,...,Yes: General T&C,"Other, see notes",Marie Curie provides/ has provided care for a ...,Marie curie did great work for my auntie when ...,Have you entered the public ballot? Yes,5.0,5.0,5.0,4.5,22.1
5,07GVWHLL,11184,100681783,TCS London Marathon 2025,Guaranteed Place - Rejected,Y - In memory,Matched fundraising : Unsure,"I hope that my colleagues, friends and family ...",2500.0,06:00:00,...,Yes: General T&C,"Other, see notes",Marie Curie provides/ has provided care for a ...,Marie curie provided end of life care for my g...,Have you entered the public ballot? Yes,5.0,5.0,5.0,2.5,20.083333
6,2EPQG86F,11184,100681786,TCS London Marathon 2025,Guaranteed Place - Confirmed,Y - In memory,Matched fundraising : Unsure,I will fundraise via the following:- Charit...,2500.0,04:30:00,...,Yes: General T&C,"Other, see notes",Marie Curie provides/ has provided care for a ...,Marie Curie have been instrumental in caring f...,Have you entered the public ballot? Yes,4.7,4.8,4.8,4.5,21.383333
7,I9E308O2,11184,100681790,TCS London Marathon 2025,Guaranteed Place - Rejected,Y - In memory,Matched fundraising : Unsure,I will fundraise using a go fund me from my fa...,2000.0,04:30:00,...,Yes: General T&C,"Other, see notes",Marie Curie provides/ has provided care for a ...,I would like to run for Marie Curie as they su...,Have you entered the public ballot? Yes,4.8,5.0,5.0,2.5,19.866667
8,O3121NUP,11184,100681792,TCS London Marathon 2025,Guaranteed Place - Rejected,Y - In memory,Matched fundraising : Unsure,I plan to mainly gain my sponsors through my f...,4000.0,04:10:00,...,Yes: General T&C,"Other, see notes",Marie Curie provides/ has provided care for a ...,"When my dad was terminally ill with cancer, Ma...",Have you entered the public ballot? No,5.0,5.0,5.0,3.5,20.133333
9,1DFNSI3F,11184,100681793,TCS London Marathon 2025,Guaranteed Place - Rejected,Y - In memory,Matched fundraising : Unsure,"Sponsorship from family , friends and clients",3000.0,05:00:00,...,Yes: General T&C,"Other, see notes",Marie Curie provides/ has provided care for a ...,My mother in law decided that she no longer wi...,Have you entered the public ballot? Yes,5.0,5.0,5.0,2.0,19.6


In [None]:
# To test out a single row

df = pd.read_excel(file_to_import_single_cell, index_col=False)

row_index_in_excel_to_check = 16
index = row_index_in_excel_to_check - 2
value = df.iloc[index]

row_data = value[necessary_columns].to_dict()
print(row_data)
    
process_row(row_data, value, index)


2025-04-30 14:16:46,994 - INFO - Processing fundraising for ID: VVEYG9G5
2025-04-30 14:16:46,995 - INFO - Processing row data using prompt: 
You are a helpful assistant evaluating responses ...
2025-04-30 14:16:46,995 - INFO - Processing row: {'How will you match your fundraising target': 'I would have my lovely family, friends, work colleagues supporting me and would have a fundraising event. My sister done this and raised an amazing amount of money for Marie Curie!'}
2025-04-30 14:16:46,995 - INFO - Sending request to OpenAI API


{'id': 'VVEYG9G5', 'In Memory': 'Y - In memory', 'Matched Fundraising': 'Matched fundraising : Unsure', 'How will you match your fundraising target': 'I would have my lovely family, friends, work colleagues supporting me and would have a fundraising event. My sister done this and raised an amazing amount of money for Marie Curie!', 'Pledge Amount': 2000.0, 'Finish Time': '04:00:00', 'Reason': 'Marie Curie provides/ has provided care for a loved one', 'PR': 'Rielston', 'Ballot': 'Have you entered the public ballot?  Yes'}


2025-04-30 14:16:47,491 - INFO - HTTP Request: POST https://marie-curie-temp-embedding-model-deployment.openai.azure.com/openai/deployments/gpt-4/chat/completions?api-version=2024-05-01-preview "HTTP/1.1 200 OK"
2025-04-30 14:16:47,497 - INFO - Received response
2025-04-30 14:16:47,497 - INFO - Response: ```json
{"fundraising_score": 3.5}
```
2025-04-30 14:16:47,497 - INFO - Successfully parsed JSON response
2025-04-30 14:16:47,498 - INFO - Fundraising score for ID VVEYG9G5: 3.5
2025-04-30 14:16:47,498 - INFO - Processing emotional content for ID: VVEYG9G5
2025-04-30 14:16:47,498 - INFO - Processing row data using prompt: 
You are a very helpful assistant who is a talente...
2025-04-30 14:16:47,498 - INFO - Processing row: {'PR': 'Rielston'}
2025-04-30 14:16:47,498 - INFO - Sending request to OpenAI API
2025-04-30 14:16:47,897 - INFO - HTTP Request: POST https://marie-curie-temp-embedding-model-deployment.openai.azure.com/openai/deployments/gpt-4/chat/completions?api-version=2024-05-01

In [35]:
# to continue if file was iterrupted

df = pd.read_excel(r"/Users/harshgarg/Desktop/mc-lm-genai/data/LM Applications_test4.xlsx", index_col=False)

last_checked_index_in_excel = 515
# Configure the starting index to continue from where we left off
start_index = last_checked_index_in_excel - 2
logging.info(f"Starting processing from index {start_index} (out of {len(df)} total applications)")

# Start iterating from the specified index
for index, value in df.iloc[start_index:].iterrows():
    logging.info(f"Processing application {index+1}/{len(df)}, ID: {value['id']}")
    
    row_data = value[necessary_columns].to_dict()
    
    process_row(row_data, value, index)
    print(f"Row {index + 2} processed.")



Row 515 processed.
Row 516 processed.
Row 517 processed.
Row 518 processed.
Row 519 processed.
Row 520 processed.
Row 521 processed.
Row 522 processed.
Row 523 processed.
Row 524 processed.
Row 525 processed.
Row 526 processed.
Row 527 processed.
Row 528 processed.
Row 529 processed.
Row 530 processed.
Row 531 processed.
Row 532 processed.
Row 533 processed.
Row 534 processed.
Row 535 processed.
Row 536 processed.
Row 537 processed.
Row 538 processed.
Row 539 processed.
Row 540 processed.
Row 541 processed.
Row 542 processed.
Row 543 processed.
Row 544 processed.
Row 545 processed.
Row 546 processed.
Row 547 processed.
Row 548 processed.
Row 549 processed.
Row 550 processed.
Row 551 processed.
Row 552 processed.
Row 553 processed.
Row 554 processed.
Row 555 processed.
Row 556 processed.
Row 557 processed.
Row 558 processed.
Row 559 processed.
Row 560 processed.
Row 561 processed.
Row 562 processed.
Row 563 processed.
Row 564 processed.
Row 565 processed.
Row 566 processed.
Row 567 proc