In [1]:

# General modules
import os
import openai
from dotenv import load_dotenv

# Language models
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = 'gpt-4o-mini'
embed_model_name = "all-MiniLM-L6-v2"
# embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'


# Paths

root_dir = r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis'
path_input = os.path.join(root_dir, "Data", "2024 Trimester 1.xlsx")
path_db_prepared = os.path.join(root_dir, "Data", "db_prepared_HRC.json")

path_db_analysed = os.path.join(root_dir, "Data", "db_analysed_empty.json")
path_db_progress_backup = os.path.join(root_dir, "Data", "db_progress_backup.json")

path_db_embedded = os.path.join(root_dir, "Data", "db_embedded.json")
path_db_clustered = os.path.join(root_dir, "Data", "db_clustered.json")
path_db_final = os.path.join(root_dir, "Data", "db_final.json")

In [32]:
import logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

logging.getLogger("openai").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR)
import pandas as pd

def load_excel_to_data(excel_path):
    """
    Loads an Excel file and converts it into a list of dictionaries,
    ensuring proper encoding.
    
    Args:
        excel_path (str): Path to the input Excel file.
    Returns:
        list: List of dictionaries representing the data.
    """
    try:
        logger.info("Loading Excel file: %s", excel_path)
        dataframe = pd.read_excel(excel_path, engine='openpyxl')  # Ensure the correct engine is used
        data_as_dict = dataframe.to_dict(orient='records')
        if data_as_dict:
            logger.info("Removing the first entry of the dataset.")
            data_as_dict = data_as_dict[1:]  # Remove the first row if necessary
        logger.info("Excel data successfully loaded and converted to dictionary.")
        return data_as_dict
    except Exception as e:
        logger.error("Error loading Excel: %s", e)
        raise


In [31]:
data = load_excel_to_data(path_input)

2024-11-19 15:49:31,587 - INFO - Loading Excel file: C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\2024 Trimester 1.xlsx
2024-11-19 15:49:39,008 - INFO - Removing the first entry of the dataset.
2024-11-19 15:49:39,008 - INFO - Excel data successfully loaded and converted to dictionary.


In [44]:
data[14]

{'Unnamed: 0': 15,
 'Respondent ID': 114588000000,
 'Collector ID': 431039728,
 'Start Date': Timestamp('2024-04-29 15:05:34'),
 'End Date': Timestamp('2024-04-29 15:09:57'),
 'IP Address': '132.184.130.75',
 'Email Address': nan,
 'First Name': nan,
 'Last Name': nan,
 'Custom Data 1': nan,
 '1_Rate_Overall Rival Stars Horse Racing': 2,
 '1_Rate_Breeding': 2,
 '1_Rate_Training my horses': 2,
 '1_Rate_Racing - Story': 2,
 '1_Rate_Steeplechase': 2,
 '1_Rate_Cross Country': 2,
 '1_Rate_Free Roam': 2,
 '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': 2,
 '1_Rate_Completing Story Goals': 2,
 '1_Rate_Foal Caring': 2,
 '1_Rate_Pasture': 2,
 '1_Rate_Foaling season event': 2,
 '1_Rate_Steeplechase Stars': 2,
 '1_Rate_Arabian Days': 2,
 '1_Rate_Empowering my horse with skills': '3 (Neutral)',
 '1_Rate_Customizing my horse with tack': 2,
 '1_Rate_Teams': '1 (Strongly Dislike)',
 '2_LeastEnjoy_Breeding horses for Coats': nan,
 '2_LeastEnjoy_Breeding horses for Stats': 'Breeding 

In [35]:
import json
import numpy as np

def clean_json_data(data):
    """
    Cleans a list of dictionaries to ensure all entries are JSON-serializable 
    and handles missing or invalid values appropriately.
    Args:
        data (list): List of dictionaries representing the dataset.
    Returns:
        list: Cleaned list of dictionaries.
    """
    def is_serializable(value):
        """
        Checks if a value can be serialized to JSON.
        """
        try:
            json.dumps(value)
            return True
        except (TypeError, ValueError):
            return False

    def clean_value(value):
        """
        Cleans individual values in the dataset:
        - Replaces NaN or None with an empty string.
        - Leaves JSON-serializable values unchanged.
        """
        if value is None or (isinstance(value, float) and np.isnan(value)):
            return ""
        if is_serializable(value):
            return value
        # For any non-serializable value, convert it to a string
        return str(value)

    def clean_entry(entry):
        """
        Cleans a single dictionary by applying `clean_value` to each field.
        """
        return {key: clean_value(value) for key, value in entry.items()}

    cleaned_data = [clean_entry(entry) for entry in data]
    original_count = len(data)
    cleaned_count = len(cleaned_data)
    print(f"Cleaned {original_count - cleaned_count} entries from the dataset.")
    return cleaned_data


In [36]:
data_cleaned = clean_json_data(data)

Cleaned 0 entries from the dataset.


In [37]:
data_cleaned[2]

{'Unnamed: 0': 3,
 'Respondent ID': 114585000000,
 'Collector ID': 431039728,
 'Start Date': '2024-04-24 19:30:24',
 'End Date': '2024-04-29 20:41:02',
 'IP Address': '31.94.74.128',
 'Email Address': '',
 'First Name': '',
 'Last Name': '',
 'Custom Data 1': '',
 '1_Rate_Overall Rival Stars Horse Racing': '5 (Strongly Like)',
 '1_Rate_Breeding': '3 (Neutral)',
 '1_Rate_Training my horses': '3 (Neutral)',
 '1_Rate_Racing - Story': 4,
 '1_Rate_Steeplechase': 2,
 '1_Rate_Cross Country': 4,
 '1_Rate_Free Roam': '3 (Neutral)',
 '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': '5 (Strongly Like)',
 '1_Rate_Completing Story Goals': '3 (Neutral)',
 '1_Rate_Foal Caring': '3 (Neutral)',
 '1_Rate_Pasture': '3 (Neutral)',
 '1_Rate_Foaling season event': '3 (Neutral)',
 '1_Rate_Steeplechase Stars': '1 (Strongly Dislike)',
 '1_Rate_Arabian Days': 2,
 '1_Rate_Empowering my horse with skills': '5 (Strongly Like)',
 '1_Rate_Customizing my horse with tack': 4,
 '1_Rate_Teams': '5 (Str

In [86]:
def filter_and_enrich_data(data, columns_of_interest):
    """
    Filters and enriches a dataset by specified columns.

    Args:
        data (list): List of dictionaries representing the dataset.
        columns_of_interest (list): List of column names to check in each entry.

    Returns:
        list: Filtered and enriched dataset.
    """
    removed_count = 0  # Track the number of removed entries
    filtered_data = []  # List to store the remaining entries

    for entry_idx, entry in enumerate(data):
        try:
            # Check if all specified columns are empty
            if all(
                not str(entry.get(col, "")).strip() for col in columns_of_interest
            ):
                removed_count += 1  # Count this entry as removed
                logger.debug(f"Removed entry #{entry_idx}: All columns empty: {entry}")
            else:
                # Create a new key 'player_response' with concatenated text from specified columns
                player_response = " ".join(
                    str(entry.get(col, "")).strip()
                    for col in columns_of_interest
                    if str(entry.get(col, "")).strip()
                )
                
                # Check if player_response has more than 3 words
                if len(player_response.split()) > 3:
                    entry["player_response"] = player_response
                    filtered_data.append(entry)
                else:
                    removed_count += 1  # Count this entry as removed
                    logger.debug(f"Removed entry #{entry_idx}: player_response too short: {entry}")
        except Exception as e:
            logger.error(f"Error processing entry #{entry_idx}: {e}")
            raise

    # Log and print the number of removed entries
    logger.info(f"Total entries removed: {removed_count}")
    print(f"Total entries removed: {removed_count}")

    return filtered_data


In [88]:
columns_of_interest = [
    "24_[OPTIONAL] Is there anything you'd like to share about the game (good, bad, frustrating, improvement, wishlist, etc.)?"
]


In [89]:
filtered_data = filter_and_enrich_data(data_cleaned, columns_of_interest)


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

2024-11-19 16:55:21,282 - INFO - Total entries removed: 4397
IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



Total entries removed: 4397


In [90]:
filtered_data[10]

{'Unnamed: 0': 31,
 'Respondent ID': 114588000000,
 'Collector ID': 431039728,
 'Start Date': '2024-04-29 13:26:55',
 'End Date': '2024-04-29 13:40:08',
 'IP Address': '99.251.136.92',
 'Email Address': '',
 'First Name': '',
 'Last Name': '',
 'Custom Data 1': '',
 '1_Rate_Overall Rival Stars Horse Racing': '5 (Strongly Like)',
 '1_Rate_Breeding': 4,
 '1_Rate_Training my horses': 4,
 '1_Rate_Racing - Story': 4,
 '1_Rate_Steeplechase': 4,
 '1_Rate_Cross Country': 4,
 '1_Rate_Free Roam': 4,
 '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': 4,
 '1_Rate_Completing Story Goals': 4,
 '1_Rate_Foal Caring': '3 (Neutral)',
 '1_Rate_Pasture': 2,
 '1_Rate_Foaling season event': '5 (Strongly Like)',
 '1_Rate_Steeplechase Stars': 4,
 '1_Rate_Arabian Days': '5 (Strongly Like)',
 '1_Rate_Empowering my horse with skills': '3 (Neutral)',
 '1_Rate_Customizing my horse with tack': '5 (Strongly Like)',
 '1_Rate_Teams': '5 (Strongly Like)',
 '2_LeastEnjoy_Breeding horses for Coats': '',


In [48]:
len(filtered_data)

2760

In [16]:
def save_to_json(data, output_path):
    """
    Saves data to a JSON file with proper encoding.
    
    Args:
        data (list): Data to save.
        output_path (str): Path to the output JSON file.
    """
    try:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        logger.info(f"Data successfully saved to {output_path}")
    except Exception as e:
        logger.error(f"Error saving JSON: {e}")
        raise

In [92]:
save_to_json(filtered_data, path_db_prepared)


2024-11-19 16:56:00,914 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_prepared.json


In [93]:
import random

def get_random_sample(data, sample_size, seed=None):
    """
    Returns a random sample of the specified size from the dataset.
    """
    if seed is not None:
        random.seed(seed)  # Set the random seed for reproducibility
    
    if sample_size > len(data):
        raise ValueError(f"Sample size ({sample_size}) cannot exceed the dataset size ({len(data)}).")
    
    return random.sample(data, sample_size)


In [94]:
sample_size = 100
seed = 42
sample_data = get_random_sample(filtered_data, sample_size, seed=seed)

In [96]:
sample_data[0]

{'Unnamed: 0': 1520,
 'Respondent ID': 114587000000,
 'Collector ID': 431039728,
 'Start Date': '2024-04-26 21:01:16',
 'End Date': '2024-04-26 21:09:56',
 'IP Address': '172.97.49.205',
 'Email Address': '',
 'First Name': '',
 'Last Name': '',
 'Custom Data 1': '',
 '1_Rate_Overall Rival Stars Horse Racing': '5 (Strongly Like)',
 '1_Rate_Breeding': '1 (Strongly Dislike)',
 '1_Rate_Training my horses': '1 (Strongly Dislike)',
 '1_Rate_Racing - Story': '3 (Neutral)',
 '1_Rate_Steeplechase': '3 (Neutral)',
 '1_Rate_Cross Country': '3 (Neutral)',
 '1_Rate_Free Roam': '3 (Neutral)',
 '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': 2,
 '1_Rate_Completing Story Goals': 4,
 '1_Rate_Foal Caring': '1 (Strongly Dislike)',
 '1_Rate_Pasture': 2,
 '1_Rate_Foaling season event': '5 (Strongly Like)',
 '1_Rate_Steeplechase Stars': '3 (Neutral)',
 '1_Rate_Arabian Days': '1 (Strongly Dislike)',
 '1_Rate_Empowering my horse with skills': 4,
 '1_Rate_Customizing my horse with tack': 4,

In [97]:
save_to_json(sample_data, path_db_prepared)

2024-11-19 16:56:39,212 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_prepared.json


## Translate

In [2]:
from lingua import Language, LanguageDetectorBuilder


def detect_player_language(data):
    """
    Detects the language of the 'player_response' field for each JSON tuple
    and adds a new key 'player_language' with the detected language.

    Args:
        data (list): List of JSON-like dictionaries.

    Returns:
        list: Updated list of dictionaries with 'player_language' key.
    """
    # Initialize the language detector
    detector = LanguageDetectorBuilder.from_languages(
        Language.ENGLISH, Language.SPANISH, Language.CHINESE, Language.GERMAN, Language.FRENCH
    ).build()

    for entry in data:
        player_response = entry.get("player_response", "")
        if player_response.strip():  # Check if 'player_response' is not empty
            language = detector.detect_language_of(player_response)
            entry["player_language"] = language.name.lower()  # Store language name in lowercase
        else:
            entry["player_language"] = None  # No language detected for empty responses
    return data


In [3]:
import json

def read_json(file_path):
    """
    Reads a JSON file and returns its contents as a Python object.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        object: The contents of the JSON file as a Python data structure (e.g., dict or list).
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        raise
    except json.JSONDecodeError as e:
        print(f"Error: Failed to decode JSON from file '{file_path}': {e}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred while reading the file '{file_path}': {e}")
        raise


In [4]:
data = read_json(path_db_prepared)

In [5]:
language = detect_player_language(data)

In [6]:
language[3]

{'Unnamed: 0': 3046,
 'Respondent ID': 114585000000,
 'Collector ID': 431039728,
 'Start Date': '2024-04-25 07:47:24',
 'End Date': '2024-04-25 08:15:36',
 'IP Address': '68.229.99.20',
 'Email Address': '',
 'First Name': '',
 'Last Name': '',
 'Custom Data 1': '',
 '1_Rate_Overall Rival Stars Horse Racing': '5 (Strongly Like)',
 '1_Rate_Breeding': 4,
 '1_Rate_Training my horses': '5 (Strongly Like)',
 '1_Rate_Racing - Story': '5 (Strongly Like)',
 '1_Rate_Steeplechase': '5 (Strongly Like)',
 '1_Rate_Cross Country': '5 (Strongly Like)',
 '1_Rate_Free Roam': '3 (Neutral)',
 '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': '3 (Neutral)',
 '1_Rate_Completing Story Goals': '5 (Strongly Like)',
 '1_Rate_Foal Caring': '3 (Neutral)',
 '1_Rate_Pasture': 4,
 '1_Rate_Foaling season event': 4,
 '1_Rate_Steeplechase Stars': '5 (Strongly Like)',
 '1_Rate_Arabian Days': '5 (Strongly Like)',
 '1_Rate_Empowering my horse with skills': '3 (Neutral)',
 '1_Rate_Customizing my horse wit

In [7]:
# Initialize token counters
prompt_tokens = 0
completion_tokens = 0

# Global API settings
api_settings = {"client": None, "model": None}

def configure_api(api_client, model_name):
    """
    Configures the global API client and model.
    Args:
        api_client: The initialized OpenAI client.
        model_name (str): The model name to use.
    """
    global api_settings
    api_settings["client"] = api_client
    api_settings["model"] = model_name


def track_tokens(response):
    """
    Updates the global token counters based on the response.
    """
    global prompt_tokens, completion_tokens
    prompt_tokens += response.usage.prompt_tokens
    completion_tokens += response.usage.completion_tokens

In [8]:
configure_api(client, chat_model_name)

In [9]:
from langchain.prompts import PromptTemplate

# Define the translation prompt template with examples
prompt_template_translation = PromptTemplate.from_template(
'''You are a professional translator. Translate the following text into English if it is not already in English.

[h0]==================================================================[\h0]
TEXT: "兄弟们，我把星空退款的钱拿来买这个了，我做的对吗"

TRANSLATION: "Brothers, I used the refund money from the stars to buy this. Did I do the right thing?"

[h0]==================================================================[\h0]
TEXT: "Me toma demasiado tiempo entrenar caballos."

TRANSLATION: "It takes too much time to train horses."

[h0]==================================================================[\h0]
TEXT: "Grinding just to get good tack, grain etc. Itâ€™s very time consuming sadly"

TRANSLATION: "Grinding just to get good tack, grain etc. It's very time consuming sadly"

[h0]==================================================================[\h0]
TEXT: "{text}"

TRANSLATION:
'''
)


In [10]:
def translate_player_responses(data, api_settings):
    """
    Translates the 'player_response' field for entries where 'player_language' is not 'english'.

    Args:
        data (list): List of JSON-like dictionaries.
        api_settings (dict): Dictionary with API settings, including the client and model.

    Returns:
        list: Updated list with translated 'player_response' fields.
    """
    for entry_idx, entry in enumerate(data):
        try:
            detected_language = entry.get("player_language", "none")
            player_response = entry.get("player_response", "")

            # Skip translation for English or empty responses
            if detected_language in ["english", "none"] or not player_response.strip():
                continue

            logger.info(f"Translating entry #{entry_idx} (Language: {detected_language})")
            
            # Format the prompt
            prompt_translation = prompt_template_translation.format(
                text=player_response
            )

            # Make API call to translate
            response = api_settings["client"].chat.completions.create(
                model=api_settings["model"],
                messages=[
                    {"role": "system", "content": "You are a helpful assistant for translation."},
                    {"role": "user", "content": prompt_translation},
                ],
                max_tokens=1024
            )

            # Track tokens if needed
            if "track_tokens" in api_settings:
                api_settings["track_tokens"](response)

            # Extract translation from the response
            translation_text = response.choices[0].message.content.strip()

            # Replace the 'player_response' with the translated text
            entry["player_response"] = translation_text

        except Exception as e:
            logger.error(f"Error translating entry #{entry_idx}: {e}")
            raise

    return data

In [13]:
translated_data = translate_player_responses(data, api_settings)

2024-11-20 08:10:01,380 - INFO - Translating entry #23 (Language: german)
2024-11-20 08:10:03,406 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [14]:
translated_data[10]

{'Unnamed: 0': 428,
 'Respondent ID': 114588000000,
 'Collector ID': 431039728,
 'Start Date': '2024-04-28 16:34:49',
 'End Date': '2024-04-28 16:46:12',
 'IP Address': '172.226.36.83',
 'Email Address': '',
 'First Name': '',
 'Last Name': '',
 'Custom Data 1': '',
 '1_Rate_Overall Rival Stars Horse Racing': '3 (Neutral)',
 '1_Rate_Breeding': '3 (Neutral)',
 '1_Rate_Training my horses': 2,
 '1_Rate_Racing - Story': '3 (Neutral)',
 '1_Rate_Steeplechase': '3 (Neutral)',
 '1_Rate_Cross Country': '3 (Neutral)',
 '1_Rate_Free Roam': 2,
 '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': '3 (Neutral)',
 '1_Rate_Completing Story Goals': '3 (Neutral)',
 '1_Rate_Foal Caring': 2,
 '1_Rate_Pasture': '3 (Neutral)',
 '1_Rate_Foaling season event': '3 (Neutral)',
 '1_Rate_Steeplechase Stars': '3 (Neutral)',
 '1_Rate_Arabian Days': '3 (Neutral)',
 '1_Rate_Empowering my horse with skills': 2,
 '1_Rate_Customizing my horse with tack': '3 (Neutral)',
 '1_Rate_Teams': 2,
 '2_LeastEnjoy_B

In [17]:
save_to_json(translated_data, path_db_prepared)

2024-11-20 08:10:56,628 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_prepared.json


In [18]:
import random

def list_player_responses(data, shuffle=False, sample_size=20):
    """
    Lists a specified number of 'player_response' values from the dataset.

    Args:
        data (list): List of dictionaries containing player responses.
        shuffle (bool): Whether to shuffle the data before listing. Defaults to False.
        sample_size (int): Number of player responses to list. Defaults to 20.

    Returns:
        list: A list of 'player_response' values.
    """
    if shuffle:
        random.shuffle(data)  # Shuffle data for randomness

    responses = [
        entry.get("player_response", "No response available") for entry in data
    ]

    # Limit the output to the sample size
    return responses[:sample_size]


In [19]:
# List 20 player responses, shuffled
responses = list_player_responses(translated_data, shuffle=True)
print(responses)


['wishlist ( things that could be added) :D   1. Show jumping!   2. The ability to train the horse or at least earn XP while riding in free roam.   3. Being able to clean, feed, pet, hug the horses. build Relationships with them fr.ex., doing good things for it like feeding it cleaning it hugging it would increase the bond but not doing these things or doing the opposite would decrease the bond. Also! The ability clean their stables. ( with this, maybe an option to sign a contract with a permanent Stable hand - each hire has different skills and characteristics and pay (Same could be done with jockeys) )  4.Stable customizations (construction and color wise )   5. More focus on our human character. More outfit customizations that could match with the saddles. More physical appearance customizations, and maybe Personality.', 'I would really love to be able to pet my horses or brush them and interact with them more, and also more animations. I would also like more western disciplines, es

## Topic extraction

In [20]:
prompt_template_topic = PromptTemplate.from_template(
'''Please list the most important topics and their respective original context in the review of a game in a JSON format with "Topic", "Category", and "Context" arguments. No more than 10 topics.
Topics should focus on specific game features or aspects. A feature in the game should be a noun rather than a verb or an adjective.
Each topic should be categorized as a "fact" or a "request".
Respond in JSON format.

[h0]==================================================================[\h0]
REVIEW: 

"The customization options for characters are so limited, and it's frustrating not to have more outfit choices. Also, why can't I rename my horse after I buy it? However, I do enjoy the free roam mode—riding through open fields feels relaxing and immersive."

TOPICS:

{{"Topics":
    [
        {{
            "Topic": "Character Customization",
            "Category": "request",
            "Context": "The customization options for characters are so limited, and it's frustrating not to have more outfit choices."
        }},
        {{
            "Topic": "Horse Renaming",
            "Category": "request",
            "Context": "It's frustrating not to be able to rename my horse after I buy it."
        }},
        {{
            "Topic": "Free Roam",
            "Category": "fact",
            "Context": "Riding through open fields feels relaxing and immersive."
        }}
    ]
}}

[h0]==================================================================[\h0]
REVIEW: 

"Too much useless nonsense."

TOPICS:

{{"Topics":
    [
        {{"Topic": "Game Content",
          "Category": "request",
          "Context": "Too much useless nonsense."
        }}
    ]
}}

[h0]==================================================================[\h0]
REVIEW: 

"This game has great mechanics, but the breeding system feels random and unfair. I've bred so many horses, yet the coats and stats don't seem to follow any logical pattern. On the other hand, I appreciate how detailed the horse animations are—it makes the game come alive."

TOPICS:

{{"Topics":
    [
        {{
            "Topic": "Game Mechanics",
            "Category": "fact",
            "Context": "This game has great mechanics"
        }},
        {{
            "Topic": "Breeding System",
            "Category": "request",
            "Context": "The breeding system feels random and unfair. Coats and stats don't seem to follow any logical pattern."
        }},
        {{
            "Topic": "Horse Animations",
            "Category": "fact",
            "Context": "The horse animations are detailed and make the game come alive."
        }}
    ]
}}

[h0]==================================================================[\h0]
REVIEW: 

"{review}"

TOPICS:

'''
)


In [28]:
# Initialize token counters
prompt_tokens = 0
completion_tokens = 0

def configure_api(api_client, model_name):
    """
    Configures the global API client and model.

    Args:
        api_client: The initialized API client.
        model_name (str): The model name to use.
    """
    global api_settings
    api_settings["client"] = api_client
    api_settings["model"] = model_name
    logger.info(f"API configured with model: {model_name}")

def track_tokens(response):
    """
    Updates the global token counters based on the API response.

    Args:
        response: The API response containing token usage.
    """
    global prompt_tokens, completion_tokens
    prompt_tokens += response.usage.prompt_tokens
    completion_tokens += response.usage.completion_tokens
    logger.info(f"Tokens used - Prompt: {response.usage.prompt_tokens}, Completion: {response.usage.completion_tokens}")


# def extract_topics(entry, prompt_template_topic, api_settings, review_fields):
#     """
#     Extracts topics from an entry's combined review fields using a prompt template.
# 
#     Args:
#         entry (dict): The review entry containing multiple fields to combine.
#         prompt_template (PromptTemplate): The template used for topic extraction.
#         api_settings (dict): API configuration with 'client' and 'model'.
#         review_fields (list): List of fields to combine for the review.
# 
#     Returns:
#         dict: Extracted topics in JSON format.
#     """
#     # Combine review fields
#     combined_review = " ".join(entry.get(field, "").strip() for field in review_fields)
# 
#     # Generate the prompt using the combined review
#     prompt_topic = prompt_template_topic.format(review=combined_review)
# 
#     # Log the entry ID (assumes the entry has an 'ID' field for tracking)
#     entry_id = entry.get("Respondent ID", "unknown")
#     logger.info(f"Extracting topics for entry ID {entry_id}")
# 
#     try:
#         # Send the request to the API
#         response = api_settings["client"].chat.completions.create(
#             model=api_settings["model"],
#             messages=[
#                 {"role": "system", "content": "You are a helpful assistant for game review analysis."},
#                 {"role": "user", "content": prompt_topic},
#             ],
#             max_tokens=1024,
#             response_format={"type": "json_object"}
#         )
#         track_tokens(response)
#         return json.loads(response.choices[0].message.content)
# 
#     except Exception as e:
#         logger.error(f"Error extracting topics for entry ID {entry_id}: {e}")
#         return {"error": str(e)}


In [24]:
# Configure the API
configure_api(client, chat_model_name)

2024-11-20 08:38:39,429 - INFO - API configured with model: gpt-4o-mini


In [29]:
review_fields = ["player_response"]
topics = extract_topics(translated_data[1], prompt_template_topic, api_settings, review_fields)

2024-11-20 08:45:19,022 - INFO - Extracting topics for entry ID 114585000000
2024-11-20 08:45:22,906 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-20 08:45:22,906 - INFO - Tokens used - Prompt: 658, Completion: 235


In [30]:
topics

{'Topics': [{'Topic': 'Horse Interaction',
   'Category': 'request',
   'Context': 'I would really love to be able to pet my horses or brush them and interact with them more.'},
  {'Topic': 'Western Disciplines',
   'Category': 'request',
   'Context': 'I would like more western disciplines, especially barrel racing.'},
  {'Topic': 'Stable Visits',
   'Category': 'request',
   'Context': 'Being able to visit other people’s stables would be fun too.'},
  {'Topic': 'Player Interaction',
   'Category': 'request',
   'Context': 'More interacting with other players, for example, a team could organize a weekly trail ride together.'},
  {'Topic': 'Foal Care',
   'Category': 'request',
   'Context': 'Take care of other people’s foals and horses.'},
  {'Topic': "Friend's Studs",
   'Category': 'request',
   'Context': 'Use our friend’s studs.'}]}

In [31]:
translated_data[1]

{'Unnamed: 0': 4524,
 'Respondent ID': 114585000000,
 'Collector ID': 431039728,
 'Start Date': '2024-04-24 22:11:39',
 'End Date': '2024-04-24 22:43:48',
 'IP Address': '142.170.79.11',
 'Email Address': '',
 'First Name': '',
 'Last Name': '',
 'Custom Data 1': '',
 '1_Rate_Overall Rival Stars Horse Racing': '5 (Strongly Like)',
 '1_Rate_Breeding': 4,
 '1_Rate_Training my horses': 4,
 '1_Rate_Racing - Story': '5 (Strongly Like)',
 '1_Rate_Steeplechase': 4,
 '1_Rate_Cross Country': '3 (Neutral)',
 '1_Rate_Free Roam': '3 (Neutral)',
 '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': '5 (Strongly Like)',
 '1_Rate_Completing Story Goals': '5 (Strongly Like)',
 '1_Rate_Foal Caring': 2,
 '1_Rate_Pasture': 4,
 '1_Rate_Foaling season event': 4,
 '1_Rate_Steeplechase Stars': '5 (Strongly Like)',
 '1_Rate_Arabian Days': '3 (Neutral)',
 '1_Rate_Empowering my horse with skills': '5 (Strongly Like)',
 '1_Rate_Customizing my horse with tack': '5 (Strongly Like)',
 '1_Rate_Teams': 

In [33]:
def append_topics(data, prompt_template, api_settings, review_fields):
    """
    Iterates through the dataset, extracts topics for each entry,
    and appends the extracted topics directly to the JSON tuple.

    Args:
        data (list): List of dictionaries containing the dataset.
        prompt_template (PromptTemplate): The template used for topic extraction.
        api_settings (dict): API configuration with 'client' and 'model'.
        review_fields (list): List of fields to combine for the review.

    Returns:
        list: Updated dataset with extracted topics appended directly.
    """
    updated_data = []

    for entry in data:
        try:
            # Extract topics for the current entry
            extracted_topics = extract_topics(entry, prompt_template, api_settings, review_fields)
            
            # Append the topics directly to the entry
            entry["topics"] = extracted_topics.get("Topics", [])
            
            # Add the updated entry to the result list
            updated_data.append(entry)

        except Exception as e:
            logger.error(f"Failed to process entry ID {entry.get('ID', 'unknown')}: {e}")
            entry["topics"] = [{"error": str(e)}]
            updated_data.append(entry)

    return updated_data


In [34]:
updated_data = append_topics(translated_data, prompt_template_topic, api_settings, review_fields)

2024-11-20 08:56:10,910 - INFO - Extracting topics for entry ID 114586000000
2024-11-20 08:56:17,951 - INFO - Tokens used - Prompt: 751, Completion: 373
2024-11-20 08:56:17,967 - INFO - Extracting topics for entry ID 114585000000


KeyboardInterrupt: 

In [35]:
import os
import json

def load_existing_data(file_path):
    """
    Loads the existing analysed data if the file exists, otherwise returns an empty list.

    Args:
        file_path (str): Path to the existing analysed file.

    Returns:
        tuple: A list of processed entries and a set of IDs already processed.
    """
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as f:
            existing_data = json.load(f)
        processed_ids = {entry["ID"] for entry in existing_data}
        return existing_data, processed_ids
    return [], set()


def save_progress(file_path, data):
    """
    Saves the current progress to the specified file.

    Args:
        file_path (str): Path to the file where progress should be saved.
        data (list): List of entries to save.
    """
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def process_with_failsafe_and_batching(data, prompt_template, api_settings, review_fields, output_path, batch_size=10):
    """
    Processes the dataset in batches, extracts topics, skips already processed entries,
    and saves progress incrementally after each batch.

    Args:
        data (list): List of dictionaries containing the dataset.
        prompt_template (PromptTemplate): The template used for topic extraction.
        api_settings (dict): API configuration with 'client' and 'model'.
        review_fields (list): List of fields to combine for the review.
        output_path (str): Path to save the analysed data.
        batch_size (int): Number of entries to process before saving progress. Defaults to 10.

    Returns:
        None
    """
    # Load existing data and processed IDs
    existing_data, processed_ids = load_existing_data(output_path)
    logger.info(f"Loaded {len(existing_data)} entries from existing file.")

    # Prepare a list for the updated data
    updated_data = existing_data
    batch_counter = 0

    for entry in data:
        entry_id = entry.get("ID")

        # Skip if the ID is already processed
        if entry_id in processed_ids:
            logger.info(f"Skipping already processed entry ID {entry_id}.")
            continue

        try:
            # Extract topics
            extracted_topics = extract_topics(entry, prompt_template, api_settings, review_fields)
            entry["topics"] = extracted_topics.get("Topics", [])

            # Append the entry to updated data
            updated_data.append(entry)
            batch_counter += 1

            # Save progress after processing a batch
            if batch_counter >= batch_size:
                save_progress(output_path, updated_data)
                logger.info(f"Progress saved after processing a batch of {batch_size} entries.")
                batch_counter = 0  # Reset the batch counter

        except Exception as e:
            logger.error(f"Error processing entry ID {entry_id}: {e}")
            entry["topics"] = [{"error": str(e)}]
            updated_data.append(entry)

            # Save progress even after an error
            save_progress(output_path, updated_data)
            logger.info(f"Progress saved after error on entry ID {entry_id}.")

    # Final save after all entries are processed
    save_progress(output_path, updated_data)
    logger.info("Final progress saved after completing all entries.")


In [38]:
batch_size = 10
process_with_failsafe_and_batching(translated_data, prompt_template_topic, api_settings, review_fields, path_db_analysed, batch_size=batch_size)

2024-11-20 09:04:34,506 - INFO - Loaded 0 entries from existing file.
2024-11-20 09:04:34,507 - INFO - Extracting topics for entry ID 114586000000
2024-11-20 09:04:37,196 - INFO - Tokens used - Prompt: 751, Completion: 154
2024-11-20 09:04:37,196 - INFO - Extracting topics for entry ID 114585000000
2024-11-20 09:04:41,441 - INFO - Tokens used - Prompt: 658, Completion: 267
2024-11-20 09:04:41,441 - INFO - Extracting topics for entry ID 114585000000
2024-11-20 09:04:43,192 - INFO - Tokens used - Prompt: 617, Completion: 94
2024-11-20 09:04:43,208 - INFO - Extracting topics for entry ID 114588000000


KeyboardInterrupt: 

## TOPIC + SENTIMENT

In [49]:
prompt_template_topic = PromptTemplate.from_template(
'''Please list the most important topics and their respective original context in the review of a game in a JSON format with "Topic", "Category", and "Context" arguments. No more than 10 topics.
Topics should focus on specific game features or aspects. A feature in the game should be a noun rather than a verb or an adjective.
Each topic should be categorized as a "fact" or a "request".
Respond in JSON format.

[h0]==================================================================[\h0]
REVIEW: 

"The customization options for characters are so limited, and it's frustrating not to have more outfit choices. Also, why can't I rename my horse after I buy it? However, I do enjoy the free roam mode—riding through open fields feels relaxing and immersive."

TOPICS:

{{"Topics":
    [
        {{
            "Topic": "Character Customization",
            "Category": "request",
            "Context": "The customization options for characters are so limited, and it's frustrating not to have more outfit choices."
        }},
        {{
            "Topic": "Horse Renaming",
            "Category": "request",
            "Context": "It's frustrating not to be able to rename my horse after I buy it."
        }},
        {{
            "Topic": "Free Roam",
            "Category": "fact",
            "Context": "Riding through open fields feels relaxing and immersive."
        }}
    ]
}}

[h0]==================================================================[\h0]
REVIEW: 

"Too much useless nonsense."

TOPICS:

{{"Topics":
    [
        {{"Topic": "Game Content",
          "Category": "request",
          "Context": "Too much useless nonsense."
        }}
    ]
}}

[h0]==================================================================[\h0]
REVIEW: 

"This game has great mechanics, but the breeding system feels random and unfair. I've bred so many horses, yet the coats and stats don't seem to follow any logical pattern. On the other hand, I appreciate how detailed the horse animations are—it makes the game come alive."

TOPICS:

{{"Topics":
    [
        {{
            "Topic": "Game Mechanics",
            "Category": "fact",
            "Context": "This game has great mechanics"
        }},
        {{
            "Topic": "Breeding System",
            "Category": "request",
            "Context": "The breeding system feels random and unfair. Coats and stats don't seem to follow any logical pattern."
        }},
        {{
            "Topic": "Horse Animations",
            "Category": "fact",
            "Context": "The horse animations are detailed and make the game come alive."
        }}
    ]
}}

[h0]==================================================================[\h0]
REVIEW: 

"{review}"

TOPICS:

'''
)

prompt_template_sentiment = PromptTemplate.from_template(
'''What's the sentiment of the review with regard to the topic?
Always answer with 'Positive' or 'Negative' or 'Inconclusive'.

REVIEW: My first D&D experience and I'm enjoying it a lot.
TOPIC: D&D
SENTIMENT: Positive 

REVIEW: This game lacks a proper ending or epilog
TOPIC: epilogue
SENTIMENT: Negative

REVIEW: Posted: August 8
TOPIC: release date
SENTIMENT: Inconclusive 

REVIEW: {review}
TOPIC: {topic}
SENTIMENT: '''
)

In [46]:
# import json
# import os
# from pathlib import Path
# 
# # Global API settings
# api_settings = {"client": None, "model": None}
# 
# def configure_api(api_client, model_name):
#     """
#     Configures the global API client and model.
#     Args:
#         api_client: The initialized OpenAI client.
#         model_name (str): The model name to use.
#     """
#     global api_settings
#     api_settings["client"] = api_client
#     api_settings["model"] = model_name
# 
# def track_tokens(response):
#     """
#     Updates the global token counters based on the API response.
# 
#     Args:
#         response: The API response containing token usage.
#     """
#     global prompt_tokens, completion_tokens
#     prompt_tokens += response.usage.prompt_tokens
#     completion_tokens += response.usage.completion_tokens
#     logger.info(f"Tokens used - Prompt: {response.usage.prompt_tokens}, Completion: {response.usage.completion_tokens}")
# 
# 
# def extract_topics(entry, id_column, prompt_template_topic, api_settings, review_fields):
#     """
#     Extracts topics from an entry's combined review fields using a prompt template.
# 
#     Args:
#         entry (dict): The review entry containing multiple fields to combine.
#         prompt_template (PromptTemplate): The template used for topic extraction.
#         api_settings (dict): API configuration with 'client' and 'model'.
#         review_fields (list): List of fields to combine for the review.
# 
#     Returns:
#         dict: Extracted topics in JSON format.
#     """
#     # Combine review fields
#     combined_review = " ".join(entry.get(field, "").strip() for field in review_fields)
# 
#     # Generate the prompt using the combined review
#     prompt_topic = prompt_template_topic.format(review=combined_review)
# 
#     # Log the entry ID (assumes the entry has an 'ID' field for tracking)
#     entry_id = entry.get(f"{id_column}", "unknown")
#     logger.info(f"Extracting topics for entry ID {entry_id}")
# 
#     try:
#         # Send the request to the API
#         response = api_settings["client"].chat.completions.create(
#             model=api_settings["model"],
#             messages=[
#                 {"role": "system", "content": "You are a helpful assistant for game review analysis."},
#                 {"role": "user", "content": prompt_topic},
#             ],
#             max_tokens=1024,
#             response_format={"type": "json_object"}
#         )
#         track_tokens(response)
#         return json.loads(response.choices[0].message.content)
# 
#     except Exception as e:
#         logger.error(f"Error extracting topics for entry ID {entry_id}: {e}")
#         return {"error": str(e)}
#  
#     
# def load_existing_progress(output_path, id_column):
#     """
#     Loads existing progress from the output file if it exists.
#     Returns the processed data as a list and the set of processed IDs.
# 
#     Args:
#         output_path (str): Path to the output JSON file.
#         id_column (str): The column name where IDs are stored.
# 
#     Returns:
#         tuple: A list of processed data and a set of processed IDs.
#     """
#     if Path(output_path).exists():
#         logger.info(f"Loading existing progress from {output_path}")
#         with open(output_path, "r", encoding="utf-8") as f:
#             processed_data = json.load(f)
#         processed_ids = {entry[id_column] for entry in processed_data}
#     else:
#         logger.info(f"No existing progress found. Starting fresh.")
#         processed_data = []
#         processed_ids = set()
#     return processed_data, processed_ids
# 
# def save_progress(processed_data, output_path):
#     """
#     Saves the current progress to a JSON file.
# 
#     Args:
#         processed_data (list): The list of processed entries.
#         output_path (str): Path to the output JSON file.
#     """
#     try:
#         logger.info(f"Saving progress to {output_path}")
#         with open(output_path, "w", encoding="utf-8") as f:
#             json.dump(processed_data, f, indent=4, ensure_ascii=False)
#     except Exception as e:
#         logger.error(f"Failed to save progress: {e}")
#         raise
# 
# def process_entry(entry, id_column, prompt_template_topic, prompt_template_sentiment, review_fields):
#     """
#     Processes a single entry by extracting topics and appending them to the entry.
# 
#     Args:
#         entry (dict): The JSON entry to process.
#         prompt_template (PromptTemplate): The prompt template for topic extraction.
#         review_fields (list): List of fields to combine for review text.
# 
#     Returns:
#         dict: The processed entry with extracted topics.
#     """
#     try:
#         topics = extract_topics(entry, id_column, prompt_template_topic, api_settings, review_fields)
#         analyze_sentiments(entry, topics, prompt_template_sentiment, api_settings)
#     except Exception as e:
#         logger.error(f"Error processing entry ID {entry['ID']}: {e}")
#         entry["topics"] = [{"error": str(e)}]
#     return entry
# 
# def analyze_sentiments(entry, id_column, topics, prompt_template_sentiment):
#     """
#     Performs sentiment analysis on extracted topics.
#     """
#     entry["topics"] = []
#     for topic in topics.get("Topics", []):
#         logger.info(f"Analyzing sentiment for topic '{topic['Topic']}' (Entry ID {entry[f'{id_column}']})")
#         prompt_sentiment = prompt_template_sentiment.format(
#             review=topic["Context"],
#             topic=topic["Topic"]
#         )
#         try:
#             response = api_settings["client"].chat.completions.create(
#                 model=api_settings["model"],
#                 messages=[
#                     {"role": "system", "content": "You are a helpful assistant expert in sentiment analysis."},
#                     {"role": "user", "content": prompt_sentiment},
#                 ],
#                 max_tokens=1024
#             )
#             track_tokens(response)
#             sentiment = response.choices[0].message.content.strip()
#             entry["topics"].append({
#                 "topic": topic["Topic"],
#                 "sentiment": sentiment,
#                 "category": topic["Category"],
#                 "sentence": topic["Context"]
#             })
#         except Exception as e:
#             logger.error(f"Error analyzing sentiment for topic '{topic['Topic']}' (Entry ID {entry['ID']}): {e}")
#             raise
# 
# def analyse_data(translated_data, id_column, output_path, prompt_template_topic, prompt_template_sentiment, api_settings, review_fields, batch_size=10):
#     """
#     Main function to analyse translated data with fail-safe batching and progress saving.
# 
#     Args:
#         translated_data (list): The dataset to process.
#         id_column (str): Column name where IDs are stored.
#         output_path (str): Path to save the analysed data.
#         prompt_template (PromptTemplate): The prompt template for topic extraction.
#         api_settings (dict): API configuration with 'client' and 'model'.
#         review_fields (list): List of fields to combine for review text.
#         batch_size (int): Number of entries to process before saving progress. Defaults to 10.
#     """
#     # Load existing progress
#     processed_data, processed_ids = load_existing_progress(output_path, id_column)
# 
#     try:
#         batch_counter = 0
# 
#         for entry in translated_data:
#             entry_id = entry.get(id_column)
#             
#             # Skip already processed entries
#             if entry_id in processed_ids:
#                 logger.info(f"Skipping already processed entry ID {entry_id}")
#                 continue
# 
#             # Process the entry
#             processed_entry = process_entry(entry,id_column, prompt_template_topic, prompt_template_sentiment, review_fields)
#             processed_data.append(processed_entry)
#             processed_ids.add(entry_id)
#             batch_counter += 1
# 
#             # Save progress after every batch
#             if batch_counter >= batch_size:
#                 save_progress(processed_data, output_path)
#                 logger.info(f"Progress saved after processing {batch_counter} entries.")
#                 batch_counter = 0  # Reset batch counter
# 
#     except KeyboardInterrupt:
#         # Save progress if interrupted
#         logger.warning("Processing interrupted by user. Saving progress...")
#         save_progress(processed_data, output_path)
#         raise
# 
#     # Final save after all entries are processed
#     save_progress(processed_data, output_path)
#     logger.info("Processing completed. Final progress saved.")


In [47]:
id_column = "Unnamed: 0"
review_fields = ["player_response"]
batch_size = 5

# Analyse data
analyse_data(
    translated_data=translated_data,
    id_column=id_column,
    output_path=path_db_analysed,
    prompt_template_topic=prompt_template_topic,
    prompt_template_sentiment=prompt_template_sentiment,
    api_settings=api_settings,
    review_fields=review_fields,
    batch_size=batch_size
)

2024-11-20 10:02:57,124 - INFO - Loading existing progress from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_analysed.json
2024-11-20 10:02:57,124 - INFO - Skipping already processed entry ID 2085
2024-11-20 10:02:57,124 - INFO - Skipping already processed entry ID 4524
2024-11-20 10:02:57,124 - INFO - Skipping already processed entry ID 4146
2024-11-20 10:02:57,139 - INFO - Skipping already processed entry ID 616
2024-11-20 10:02:57,139 - INFO - Skipping already processed entry ID 2041
2024-11-20 10:02:57,139 - INFO - Skipping already processed entry ID 1369
2024-11-20 10:02:57,139 - INFO - Skipping already processed entry ID 3953
2024-11-20 10:02:57,139 - INFO - Skipping already processed entry ID 2701
2024-11-20 10:02:57,139 - INFO - Extracting topics for entry ID 5005
2024-11-20 10:02:57,139 - ERROR - Error extracting topics for entry ID 5005: 'NoneType' object has no attribute 'chat'


KeyError: 'ID'

## chat gpt

In [65]:
import json
import os
from pathlib import Path

api_settings = {"client": None, "model": None}

def configure_api(api_client, model_name):
    """
    Configures the global API client and model.
    Args:
        api_client: The initialized OpenAI client.
        model_name (str): The model name to use.
    """
    global api_settings
    api_settings["client"] = api_client
    api_settings["model"] = model_name


def track_tokens(response):
    """
    Updates the global token counters based on the API response.

    Args:
        response: The API response containing token usage.
    """
    global prompt_tokens, completion_tokens
    prompt_tokens += response.usage.prompt_tokens
    completion_tokens += response.usage.completion_tokens
    # logger.info(f"Tokens used - Prompt: {response.usage.prompt_tokens}, Completion: {response.usage.completion_tokens}")

    
def load_existing_progress(output_path, id_column):
    """
    Loads existing progress from the output file if it exists.
    Returns the processed data as a list and the set of processed IDs.

    Args:
        output_path (str): Path to the output JSON file.
        id_column (str): The column name where IDs are stored.

    Returns:
        tuple: A list of processed data and a set of processed IDs.
    """
    if Path(output_path).exists():
        logger.info(f"Loading existing progress from {output_path}")
        with open(output_path, "r", encoding="utf-8") as f:
            processed_data = json.load(f)
        processed_ids = {entry[id_column] for entry in processed_data}
    else:
        logger.info(f"No existing progress found. Starting fresh.")
        processed_data = []
        processed_ids = set()
    return processed_data, processed_ids

def save_progress(processed_data, output_path):
    """
    Saves the current progress to a JSON file.

    Args:
        processed_data (list): The list of processed entries.
        output_path (str): Path to the output JSON file.
    """
    try:
        logger.info(f"Saving progress to {output_path}")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(processed_data, f, indent=4, ensure_ascii=False)
    except Exception as e:
        logger.error(f"Failed to save progress: {e}")
        raise


def extract_topics(entry, entry_id, prompt_template_topic, api_settings, review_fields):
    """
    Extracts topics from an entry's combined review fields using a prompt template.

    Args:
        entry (dict): The review entry containing multiple fields to combine.
        prompt_template_topic (PromptTemplate): The template used for topic extraction.
        api_settings (dict): API configuration with 'client' and 'model'.
        review_fields (list): List of fields to combine for the review.

    Returns:
        dict: Extracted topics in JSON format.
    """
    combined_review = " ".join(entry.get(field, "").strip() for field in review_fields)
    prompt_topic = prompt_template_topic.format(review=combined_review)
    logger.info(f"Extracting topics for entry ID {entry_id}")

    try:
        response = api_settings["client"].chat.completions.create(
            model=api_settings["model"],
            messages=[
                {"role": "system", "content": "You are a helpful assistant for game review analysis."},
                {"role": "user", "content": prompt_topic},
            ],
            max_tokens=1024,
            response_format={"type": "json_object"}
        )
        track_tokens(response)
        return json.loads(response.choices[0].message.content)
    except Exception as e:
        logger.error(f"Error extracting topics for entry ID {entry_id}: {e}")
        return {"error": str(e)}


def analyze_sentiments(entry, entry_id, topics, prompt_template_sentiment, api_settings):
    """
    Performs sentiment analysis on extracted topics.

    Args:
        entry (dict): The JSON entry being processed.
        topics (dict): Topics extracted from the review.
        prompt_template_sentiment (PromptTemplate): Template for sentiment analysis.
        api_settings (dict): API configuration with 'client' and 'model'.

    Returns:
        list: Topics with appended sentiment data.
    """
    entry["topics"] = []
    for topic in topics.get("Topics", []):
        logger.info(f"Analyzing sentiment for topic '{topic['Topic']}' (Entry ID {entry_id})")
        try:
            prompt_sentiment = prompt_template_sentiment.format(
                review=topic["Context"], topic=topic["Topic"]
            )
            response = api_settings["client"].chat.completions.create(
                model=api_settings["model"],
                messages=[
                    {"role": "system", "content": "You are a helpful assistant for sentiment analysis."},
                    {"role": "user", "content": prompt_sentiment},
                ],
                max_tokens=1024,
            )
            track_tokens(response)
            sentiment = response.choices[0].message.content.strip()
            entry["topics"].append({
                "topic": topic["Topic"],
                "sentiment": sentiment,
                "category": topic["Category"],
                "sentence": topic["Context"]
            })
        except Exception as e:
            logger.error(f"Error analyzing sentiment for topic '{topic['Topic']}' (Entry ID {entry_id}): {e}")
            raise


def process_entry(entry, id_column, prompt_template_topic, prompt_template_sentiment, api_settings, review_fields):
    """
    Processes a single entry by extracting topics and analyzing their sentiments.

    Args:
        entry (dict): The JSON entry to process.
        id_column (str): The ID column name.
        prompt_template_topic (PromptTemplate): Template for topic extraction.
        prompt_template_sentiment (PromptTemplate): Template for sentiment analysis.
        api_settings (dict): API configuration.
        review_fields (list): List of review fields to combine.

    Returns:
        dict: Processed entry with topics and sentiments.
    """
    global prompt_tokens, completion_tokens
    logger.info(f"Tokens used so far: Prompt Tokens: {prompt_tokens}, Completion Tokens: {completion_tokens}")
    
    try:
        entry_id = entry.get(id_column, "unknown")
        topics = extract_topics(entry, entry_id, prompt_template_topic, api_settings, review_fields)
        analyze_sentiments(entry, entry_id, topics, prompt_template_sentiment, api_settings)        
    except Exception as e:
        logger.error(f"Error processing entry ID {entry[{id_column}]}: {e}")
    return entry


def analyse_data(translated_data, id_column, output_path, prompt_template_topic, prompt_template_sentiment, api_settings, review_fields, batch_size=10):
    """
    Main function to analyse translated data with fail-safe batching and progress saving.

    Args:
        translated_data (list): Dataset to process.
        id_column (str): Column name where IDs are stored.
        output_path (str): Path to save analysed data.
        prompt_template_topic (PromptTemplate): Template for topic extraction.
        prompt_template_sentiment (PromptTemplate): Template for sentiment analysis.
        api_settings (dict): API configuration.
        review_fields (list): List of review fields to combine.
        batch_size (int): Number of entries to process before saving progress.
    """
    processed_data, processed_ids = load_existing_progress(output_path, id_column)

    try:
        batch_counter = 0
        for entry in translated_data:
            entry_id = entry.get(id_column)
            if entry_id in processed_ids:
                logger.info(f"Skipping already processed entry ID {entry_id}")
                continue

            processed_entry = process_entry(
                entry, id_column, prompt_template_topic, prompt_template_sentiment, api_settings, review_fields
            )
            processed_data.append(processed_entry)
            processed_ids.add(entry_id)
            batch_counter += 1

            if batch_counter >= batch_size:
                save_progress(processed_data, output_path)
                logger.info(f"Progress saved after processing {batch_counter} entries.")
                batch_counter = 0

    except KeyboardInterrupt:
        logger.warning("Processing interrupted by user. Saving progress...")
        save_progress(processed_data, output_path)
        raise
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
    finally:
        save_progress(processed_data, output_path)
        logger.info("Processing completed. Final progress saved.")


In [67]:
#initialize api

configure_api(client, chat_model_name)

# Parameters
id_column = "Unnamed: 0"                # Column name for entry IDs
review_fields = ["player_response"]     # Which cols should be analyzed?
batch_size = 5                          # Fail-safe batching. The higher the number, the less often the progress is saved.

# Run analysis
analyse_data(
    translated_data=translated_data,
    id_column=id_column,
    output_path=path_db_analysed,
    prompt_template_topic=prompt_template_topic,
    prompt_template_sentiment=prompt_template_sentiment,
    api_settings=api_settings,
    review_fields=review_fields,
    batch_size=batch_size
)


2024-11-20 10:39:13,153 - INFO - Loading existing progress from C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_analysed.json
2024-11-20 10:39:13,153 - INFO - Skipping already processed entry ID 2085
2024-11-20 10:39:13,153 - INFO - Skipping already processed entry ID 4524
2024-11-20 10:39:13,153 - INFO - Skipping already processed entry ID 4146
2024-11-20 10:39:13,153 - INFO - Skipping already processed entry ID 616
2024-11-20 10:39:13,153 - INFO - Skipping already processed entry ID 2041
2024-11-20 10:39:13,153 - INFO - Tokens used so far: Prompt Tokens: 19248, Completion Tokens: 4011
2024-11-20 10:39:13,153 - INFO - Extracting topics for entry ID 1369
2024-11-20 10:39:16,473 - INFO - Analyzing sentiment for topic 'Breeding Space' (Entry ID 1369)
2024-11-20 10:39:16,968 - INFO - Analyzing sentiment for topic 'New Breeds' (Entry ID 1369)
2024-11-20 10:39:17,353 - INFO - Analyzing sentiment for topic 'Live Events' (Entry ID 1369)
2024-11-20 10:39:17,759 - INFO - An

KeyboardInterrupt: 