![ethics-ai4-1.png](ethics-ai4-1.png)


# Ethical Guidelines Classifier Function
#### by Frank Metty started on 02_20_2025

The **Ethical Guidelines** classifier categorizes the questions being asked in the ThriveAi application, based on a corpus of keywords. It assigns each question an intent class on a scale of 1-7.

| Intent Class | Description | Response Category |
|---|---|---|
| 1 | Question within database domain | Providing an Answer |
| 2 | Question not within database domain | No Answer - Redirect to LLM |
| 3 | Inappropriate Question - Practicing medicine | Inappropriate Question - Medical |
| 4 | Inappropriate Question - Privacy or PHI | Inappropriate Question - Privacy/PHI |
| 5 | Doctor with attribution for patient | Physician with Patient Attribution |
| 6 | Doctor without attribution for patient | Physician without Patient Attribution |
| 7 | Dangerous, racist, bullying, unethical | Dangerous, Racist, Bullying, Unethical Content |

## Step 1 - Import Necessary Libraries

In [1]:
import csv
import datetime
import json
import os
import re

def _normalize_text(text):
    """Lower-case *text* and map typographic apostrophes to ASCII ones."""
    return text.lower().replace("\u2019", "'")


def _compile_category(keywords):
    """Split one category's keywords into substring and acronym matchers.

    Returns a ``(substrings, acronym_pattern)`` tuple. ``substrings`` are
    normalized keywords matched anywhere in the question. ``acronym_pattern``
    is a compiled regex (or ``None``) for keywords written in all caps; those
    are matched as whole words so that e.g. "AI" does not fire on "pain".
    """
    substrings = []
    acronyms = []
    for keyword in keywords:
        normalized = _normalize_text(keyword)
        if keyword.isupper():
            acronyms.append(re.escape(normalized))
        else:
            substrings.append(normalized)

    acronym_pattern = None
    if acronyms:
        acronym_pattern = re.compile(
            r"(?<!\w)(?:" + "|".join(acronyms) + r")(?!\w)"
        )
    return substrings, acronym_pattern


def _classify_question(question, compiled_categories):
    """Return the first category whose keywords match, else ``"Unknown"``."""
    text = _normalize_text(question)
    for category, (substrings, acronym_pattern) in compiled_categories:
        if any(keyword in text for keyword in substrings):
            return category
        if acronym_pattern is not None and acronym_pattern.search(text):
            return category
    return "Unknown"


def get_ethical_guidelines(csv_file, log_file="training_data.json"):
    """
    Classify questions from a CSV file into categories (1-7) by keyword.

    No ML model is used, just case-insensitive keyword matching. Categories
    are tried in ascending order and the first one with a matching keyword
    wins; a question that matches nothing is labelled "Unknown".

    Args:
        csv_file: Path to an ISO-8859-1 CSV file. The first row is always
            skipped as a header and questions are read from the first column.
        log_file: Path to the JSON log. Existing entries are loaded (an
            unparsable file is treated as empty), new entries are appended,
            and the file is rewritten.

    Side effects:
        Rewrites ``log_file``, writes the full log to "exported_data.csv" in
        the current working directory, and prints a summary line.

    Returns:
        None.
    """
    # Define category keywords
    category_keywords = {
        1: ["symptoms", "blood pressure", "diabetes", "heart disease", "stroke", "hospitalization",
            "COPD", "cancer", "mortality", "disease", "asthma", "hypertension", "obesity", "cholesterol",
            "arthritis", "depression", "mental health", "injuries", "infection", "cardiovascular", "pain",
            "illness", "screenings", "hba1c", "WNY_health", "obese", "diabetic", "hypertensive", "length",
            "fare", "survival", "ticket class", "bp"],
        2: ["investing", "stock market", "insurance", "legal rights", "finance", "money", "budget",
            "tax", "retirement", "loan", "mortgage", "bankruptcy", "real estate", "economy", "credit score",
            "lawsuit", "contract", "policy", "business", "regulations", "financial planning", "AI", "blockchain",
            "5G", "trends", "future", "machine learning", "biotechnology", "digital health", "telemedicine",
            "virtual reality", "wearable devices", "genetics", "robotics in medicine", "quantum computing",
            "innovation", "machine learning models"],
        3: ["vaccine", "safe", "pregnant", "COVID", "medical advice", "diagnose", "condition",
            "medication", "side effects", "doctor recommendation", "prescription", "therapy", "mental health treatment",
            "surgery", "alternative medicine", "home remedies", "is it normal if", "should I be worried about",
            "self-diagnose", "medical treatment", "Ozempic", "food", "diet", "nutrition", "exercise", "lifestyle",
            "weight loss", "calories", "healthy eating", "workout", "fitness", "yoga", "gym", "meal plan",
            "hydration", "intermittent fasting", "superfoods", "organic food", "vegan", "protein intake",
            "lbs", "pounds", "my hba1c", "my blood pressure", "I smoke"],
        4: ["medical records", "patient information", "primary care", "hospital access", "health history",
            "their lab results", "test results", "prescription details", "MRI scans", "blood test results",
            "X-ray", "health data", "hospital database", "personal medical records", "cybersecurity",
            "data privacy", "How many patients does Dr"],
        5: ["my patient", "follow-up appointment", "schedule consultation", "patient discharge summary",
            "referral for a specialist", "patient transfer request", "continuity of care", "care coordination",
            "patient's active medications"],
        6: ["check someone’s medical record", "access patient data outside my clinic",
            "radiology report for a non-assigned patient", "lab results of someone outside my care",
            "how can I get the medical history of someone else's patient?", "get prescription records of someone not in my care",
            "treatment plan for a patient not under my care", "lab results of someone outside my care",
            "medication history of a non-assigned patient"],
        7: ["hack", "steal", "database", "access personal", "illegal", "fake license", "credit card",
            "cheating", "black market", "bypass security", "exploit", "leak data", "buy medical records",
            "find personal details", "spy", "forged documents", "deep web", "scam", "identity theft",
            "breach system", "fake ID", "get private data", "sell patient information", "phishing",
            "unauthorized access", "dark web", "leak passwords"]
    }

    # Questions are lower-cased before matching, so keywords must be
    # normalized the same way; otherwise any keyword containing a capital
    # letter ("COPD", "AI", "Ozempic", ...) could never match.
    compiled_categories = [
        (category, _compile_category(keywords))
        for category, keywords in category_keywords.items()
    ]

    # Load existing training data
    if os.path.exists(log_file):
        with open(log_file, "r", encoding="ISO-8859-1") as file:
            try:
                log_data = json.load(file)
            except json.JSONDecodeError:
                log_data = []
    else:
        log_data = []

    # Read questions from the CSV file
    with open(csv_file, mode="r", encoding="ISO-8859-1") as file:
        csv_reader = csv.reader(file)
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(csv_reader, None)
        questions = [row[0] for row in csv_reader if row]  # Questions are in the first column

    # (question, class) pairs already logged, for O(1) duplicate checks.
    seen = {
        (entry["Question"], entry["Ethical Guideline Class"])
        for entry in log_data
    }

    # Process each question
    for question in questions:
        # Entries store the stripped text, so duplicates must be detected on
        # the stripped text as well.
        cleaned_question = question.strip()
        assigned_category = _classify_question(question, compiled_categories)

        # Avoid duplicates
        entry_key = (cleaned_question, assigned_category)
        if entry_key in seen:
            continue
        seen.add(entry_key)

        # Prepare log entry
        friendly_timestamp = datetime.datetime.now().strftime("%b %d, %Y - %I:%M %p")
        log_data.append({
            "Timestamp": friendly_timestamp,
            "Question": cleaned_question,
            "Ethical Guideline Class": assigned_category
        })

    # Export processed data to JSON
    with open(log_file, "w", encoding="ISO-8859-1") as file:
        json.dump(log_data, file, indent=4)

    export_csv = "exported_data.csv"

    # Export processed data to CSV
    with open(export_csv, mode="w", encoding="ISO-8859-1", newline="") as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(["Timestamp", "Question", "Ethical Guideline Class"])
        for entry in log_data:
            csv_writer.writerow([entry["Timestamp"], entry["Question"], entry["Ethical Guideline Class"]])

    print(f"CSV file '{csv_file}' processed! Results saved to '{log_file}' and exported to '{export_csv}'.")
    
# Example usage: classify every question in the sample corpus and log the results.
csv_file = "corpus.csv"  # Replace with the path to your own CSV file
get_ethical_guidelines(csv_file=csv_file)


CSV file 'corpus.csv' processed! Results saved to 'training_data.json' and exported to 'exported_data.csv'.
