# Step 1: Text Analysis with OpenAI API
This notebook demonstrates how to use the OpenAI API for text analysis. We will:
1. Test the OpenAI API connection.
2. Analyze a single text for sentiment and key topics.
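3. Process multiple texts and save the results to a JSON file.
4. Group the analyzed reviews by sentiment and collect their key topics.
5. Generalize the key topics into broader categories and save the updated dataset.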

In [None]:
import os
from openai import OpenAI

# Build the API client; the key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Model name reused by the requests made later in this notebook.
openai_model = "gpt-4o-mini"

# Connectivity check: send a single short user message to the model.
chat_completion = client.chat.completions.create(
    model=openai_model,
    messages=[{"role": "user", "content": "Say this is a test"}],
)

# Show the text of the first (and only requested) choice.
first_choice = chat_completion.choices[0]
print(first_choice.message.content)

# Step 2: Analyze a Single Text
We will send a single piece of text to the OpenAI API and request:
1. Sentiment analysis (positive, neutral, negative).
2. Extraction of key topics.

In [None]:
import json

# Sampling temperature sent with the chat completion requests made by the functions below.
openai_temperature=0.7

def analyze_text_with_openai(text):
    """
    Analyze a single text with the OpenAI API and return sentiment and key topics.

    The model is asked for a JSON object containing "key_topics" and "sentiment"
    ("summary" plus "reasoning"). The reply is unwrapped from a markdown code
    fence if present, parsed, checked to be a JSON object, and extended with the
    original text under the "review" key.

    Args:
        text (str): Input text to analyze.

    Returns:
        dict | None: The parsed analysis with an added "review" key, or None when
        the input is empty or not a string, the reply is empty, the reply is not a
        JSON object, or the request fails. Errors are printed, never raised.
    """
    # Do not spend an API call on input that cannot be analyzed
    # (empty string, None, NaN coming from a pandas column, ...).
    if not isinstance(text, str) or not text.strip():
        print("Error: No text provided for analysis.")
        return None

    # Defined before the try block so the JSON error handler can always print it.
    structured_analysis = ""
    try:
        response = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant for text analysis.",
                },
                {
                    "role": "user",
                    "content": f"""
                        Analyze the following text and provide the result in JSON format. The JSON should include:
                        - "key_topics": A list of key topics mentioned in the text.
                        - "sentiment": An object that contains a summary of the overall sentiment (only values allowed as overall sentiment are: "positive", "neutral", or "negative") along with reasoning.

                        Text: {text}

                        Your response should be in JSON format. Do not include any explanations, only provide RFC8259 compliant JSON.

                        The JSON output should be in the following format:
                        {{
                            "key_topics": [
                                "hotel cleanliness",
                                "service speed"
                            ],
                            "sentiment": {{
                                "summary": "neutral",
                                "reasoning": "the text mentions a positive aspect of cleanliness"
                            }}
                        }}

                        Do not include markdown code blocks in your response. Remove the ```json markdown from the output.
                    """
                }
            ],
            model=openai_model,
            temperature=openai_temperature
        )

        # The message content can be None; treat that the same as an empty reply.
        structured_analysis = (response.choices[0].message.content or "").strip()

        # The model may still wrap the JSON in a ```json ... ``` fence despite the prompt.
        if structured_analysis.startswith("```"):
            structured_analysis = structured_analysis.strip("`").strip()
            if structured_analysis[:4].lower() == "json":
                structured_analysis = structured_analysis[4:].strip()

        if not structured_analysis:
            print("Error: Received an empty response from OpenAI.")
            return None

        # Convert the JSON string to a Python object and make sure it is a JSON object,
        # because the "review" key is added to it below.
        analysis_result = json.loads(structured_analysis)
        if not isinstance(analysis_result, dict):
            print("Error: Expected a JSON object from OpenAI.")
            print("Raw Response:", structured_analysis)
            return None

        analysis_result["review"] = text
        return analysis_result

    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        print("Raw Response:", structured_analysis)
        return None
    except Exception as e:
        # Best-effort: any request/SDK failure is reported and mapped to None.
        print(f"Error analyzing text: {e}")
        return None

# Run the analysis on one sample review and pretty-print the outcome.
sample_text = "The hotel was clean, but the service was slow and unhelpful."
result = analyze_text_with_openai(sample_text)

# Serialize first, then print, so the output is indented JSON ("null" if the analysis failed).
pretty_result = json.dumps(result, indent=4)
print(pretty_result)

# Step 3: Batch Process Multiple Texts
We will apply the text analysis function to multiple texts from a sample dataset.

In [None]:
import pandas as pd

# Small in-memory sample of hotel reviews; each one becomes a row of the "text" column.
sample_reviews = [
    "The hotel was amazing, and the staff was very helpful.",
    "The room was dirty, and the food was terrible.",
    "Good location, but the service was slow.",
]
data = pd.DataFrame({"text": sample_reviews})

# Run the analysis once per row and keep each result next to its text.
data["analysis"] = data["text"].apply(analyze_text_with_openai)

# Show the texts together with their analysis results.
print(data)

# Step 4: Save Analysis Results to JSON

In this step, we save the analyzed results to a JSON file. The JSON format provides a clear structure for the data, making it easier to read and process in downstream tasks.

In [None]:
# Save results to a JSON file
output_file = "../data/processed/text_analysis_results.json"

# Make sure the target directory exists so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# analyze_text_with_openai returns None when an analysis fails. Keep only the
# dictionaries so the saved file contains no null entries that would break the
# steps that index into each entry.
results = [analysis for analysis in data['analysis'].tolist() if isinstance(analysis, dict)]

skipped_count = len(data) - len(results)
if skipped_count:
    print(f"Skipped {skipped_count} text(s) whose analysis failed.")

# Save the list to a JSON file
with open(output_file, "w", encoding="utf-8") as json_file:
    json.dump(results, json_file, indent=4)

print(f"Results saved to {output_file}")

# Step 5: Group Reviews by Sentiment

In this step, we group the reviews into three categories:
1. Positive
2. Negative
3. Neutral

The reviews are grouped based on the `sentiment["summary"]` field from the analysis JSON file.

In [None]:
# Load the analysis results
input_file = "../data/processed/text_analysis_results.json"
with open(input_file, "r", encoding="utf-8") as file:
    data = json.load(file)

# Group reviews by sentiment. The lists hold the same dict objects as `data`,
# so fields added to a grouped review later are also visible through `data`.
grouped_reviews = {"positive": [], "negative": [], "neutral": []}
skipped_entries = 0

for entry in data:
    # Entries can be null (failed analysis) or lack a well-formed "sentiment"
    # object, because the structure comes from a model reply; skip those
    # instead of raising.
    sentiment_info = entry.get("sentiment") if isinstance(entry, dict) else None
    summary = sentiment_info.get("summary") if isinstance(sentiment_info, dict) else None

    # Normalize casing/whitespace so labels such as "Positive" are still grouped.
    sentiment = summary.strip().lower() if isinstance(summary, str) else None

    if sentiment in grouped_reviews:
        grouped_reviews[sentiment].append(entry)
    else:
        skipped_entries += 1

if skipped_entries:
    print(f"Skipped {skipped_entries} entries without a recognized sentiment.")

# Display grouped reviews counts
for sentiment, reviews in grouped_reviews.items():
    print(f"{sentiment.capitalize()} reviews: {len(reviews)}")

# Step 6: Create Common Lists of Key Topics for Each Sentiment (Excluding Neutral)

In this step:
1. For each sentiment category (positive and negative), we aggregate `key_topics`.
2. Neutral reviews are excluded for now to maintain focus on clear positive and negative insights.

In [None]:
# Only the clearly polarized sentiments are aggregated; neutral reviews are left out for now.
polarized_sentiments = ("positive", "negative")

# One flat list of key topics per sentiment, in review order (duplicates are kept).
key_topics_by_sentiment = {
    label: [topic for review in grouped_reviews[label] for topic in review["key_topics"]]
    for label in polarized_sentiments
}

# Print the aggregated topics of each sentiment.
for label, topic_list in key_topics_by_sentiment.items():
    print(f"\nKey Topics for {label.capitalize()} Sentiment (Excluding Neutral):")
    print(topic_list)

# Step 7: Define and Test `generalize_key_topics_with_openai`

### Why This Function is Necessary

In our analysis pipeline, we need to generalize raw `key_topics` into broader categories while maintaining traceability to the original topics:
1. **Traceability**:
   - Grouped categories should map back to their specific topics for detailed analysis.
   - This helps us identify which specific issues contributed to each general topic.
   
2. **Actionable Insights**:
   - By generalizing topics, we reduce redundancy and simplify reporting.
   - This allows us to calculate meaningful statistics and generate recommendations effectively.

### Goals of This Step

1. Define the `generalize_key_topics_with_openai` function.
2. Test it with a small sample of topics to ensure it works correctly.
3. Validate that AI returns generalized topics as a JSON object mapping general categories to specific topics.

In [None]:
def generalize_key_topics_with_openai(topics_list):
    """
    Use OpenAI to group similar topics into generalized categories, keeping traceability.

    The topics are sent to the model one per line, and the model is asked for a
    JSON object whose keys are generalized topics and whose values are lists of
    the specific topics they cover.

    Args:
        topics_list (list of str): A flat list of all topics for a specific sentiment.

    Returns:
        dict: A dictionary mapping generalized topics to their specific topics.
        An empty dict is returned when there are no topics, when the reply is not
        a JSON object, or when the request/parsing fails (the error is printed).
    """
    # Nothing to group: skip the API call instead of asking the model about an empty list.
    if not topics_list:
        return {}

    try:
        # De-duplicate while preserving order; repeated topics add nothing to the
        # grouping and only lengthen the prompt.
        unique_topics = list(dict.fromkeys(str(topic) for topic in topics_list))
        topics_string = "\n".join(unique_topics)
        prompt = f"""
            Group the following topics into specific, distinct categories based on logical themes. Avoid overly broad or generic groupings. 

            Each generalized topic should clearly represent a single theme, such as cleanliness, service, food, or facilities. Do not combine unrelated topics into the same group.

            Your goal is to maximize diversity in categories while keeping logical connections. 
            Provide each generalized topic as a key, and list the specific topics it includes as values.
            Do not include any explanations, only provide a JSON output that is RFC8259 compliant.

            Topics:
            {topics_string}

            Example output:
            {{
            "Cleanliness": ["room cleanliness"],
            "Staff Performance": ["staff helpfulness", "service speed"],
            "Food Quality": ["food quality", "breakfast variety"],
            "Overall Hotel Quality": ["hotel quality"]
            }}

            Do not include markdown code blocks in your response. Remove the ```json markdown from the output.
        """
        # Send the request to OpenAI
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant for topic analysis."},
                {"role": "user", "content": prompt}
            ],
            model=openai_model,
            temperature=openai_temperature
        )

        # The message content can be None; treat that as an empty reply
        # (json.loads then fails and the handler below returns {}).
        raw_reply = (response.choices[0].message.content or "").strip()

        # The model may still wrap the JSON in a ```json ... ``` fence despite the prompt.
        if raw_reply.startswith("```"):
            raw_reply = raw_reply.strip("`").strip()
            if raw_reply[:4].lower() == "json":
                raw_reply = raw_reply[4:].strip()

        # Parse the JSON response; callers iterate over .items(), so require an object.
        generalized_topics = json.loads(raw_reply)
        if not isinstance(generalized_topics, dict):
            print("Error generalizing topics with OpenAI: response is not a JSON object.")
            return {}
        return generalized_topics

    except Exception as e:
        # Best-effort: report the failure and fall back to "no generalization".
        print(f"Error generalizing topics with OpenAI: {e}")
        return {}

# A handful of representative topics used to try out the generalization function.
sample_topics = [
    "hotel quality",
    "room cleanliness",
    "staff helpfulness",
    "service speed",
    "food quality",
    "breakfast variety",
]

# Ask the model to group the sample topics.
generalized_topics = generalize_key_topics_with_openai(sample_topics)

# Show the input list, then the grouping returned by the model as indented JSON.
print("Sample Topics:", sample_topics, sep="\n")
print("\nGeneralized Topics Returned by AI:", json.dumps(generalized_topics, indent=4), sep="\n")

# Step 8: Apply Generalization to Aggregated Topics

In this step:
1. Use the `generalize_key_topics_with_openai` function to generalize the aggregated `key_topics` for positive and negative sentiments.
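2. Add a `generalized_key_topics` field to each positive and negative review, listing the generalized topics that cover its original `key_topics` (the original `key_topics` are kept, and neutral reviews are left unchanged).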
3. Save the updated dataset to a new JSON file.

In [None]:
# Initialize storage for generalized topics
generalized_topics_by_sentiment = {}

# Apply generalization for positive and negative sentiments
for sentiment, topics in key_topics_by_sentiment.items():
    print(f"Generalizing topics for {sentiment.capitalize()} sentiment...")
    generalized_topics = generalize_key_topics_with_openai(topics)
    generalized_topics_by_sentiment[sentiment] = generalized_topics
    print(f"Generalized Topics for {sentiment.capitalize()} Sentiment:")
    print(json.dumps(generalized_topics, indent=4))

# Extend reviews with generalized topics
for sentiment, reviews in grouped_reviews.items():
    # Sentiments that were not generalized (neutral) are left untouched.
    if sentiment not in generalized_topics_by_sentiment:
        continue
    generalized_topics = generalized_topics_by_sentiment[sentiment]
    for review in reviews:
        # The mapping's keys are already unique, so a plain list is enough.
        # Unlike a set, it keeps the categories in the mapping's order, which
        # makes the saved file identical from run to run for the same mapping.
        review["generalized_key_topics"] = [
            general_topic
            for general_topic, specific_topics in generalized_topics.items()
            if any(specific_topic in review["key_topics"] for specific_topic in specific_topics)
        ]

# Save the updated dataset. `data` holds the same review dicts that were grouped
# into grouped_reviews, so the field added above is part of what gets written.
output_file = "../data/processed/text_analysis_generalized.json"

# Make sure the target directory exists so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

with open(output_file, "w", encoding="utf-8") as file:
    json.dump(data, file, indent=4)

print(f"Generalized topics and updated dataset saved to {output_file}")