In [1]:

# General modules
import os
import openai
from dotenv import load_dotenv

# Language models
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_api_key
client = openai.Client()

chat_model_name = 'gpt-4o-mini'
embed_model_name = "all-MiniLM-L6-v2"
# embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'


# Paths

root_dir = r'C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis'
path_input = os.path.join(root_dir, "Data", "2024 Trimester 1.xlsx")
path_db_prepared = os.path.join(root_dir, "Data", "db_prepared.json")

path_db_analysed = os.path.join(root_dir, "Data", "db_analysed.json")
path_db_progress_backup = os.path.join(root_dir, "Data", "db_progress_backup.json")

path_db_embedded = os.path.join(root_dir, "Data", "db_embedded.json")
path_db_clustered = os.path.join(root_dir, "Data", "db_clustered.json")
path_db_final = os.path.join(root_dir, "Data", "db_final.json")

In [30]:
import pandas as pd
import logging
# Configure logging
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
import pandas as pd

def load_excel_to_data(excel_path):
    """
    Loads an Excel file and converts it into a list of dictionaries,
    ensuring proper encoding.
    
    Args:
        excel_path (str): Path to the input Excel file.
    Returns:
        list: List of dictionaries representing the data.
    """
    try:
        logger.info("Loading Excel file: %s", excel_path)
        dataframe = pd.read_excel(excel_path, engine='openpyxl')  # Ensure the correct engine is used
        data_as_dict = dataframe.to_dict(orient='records')
        if data_as_dict:
            logger.info("Removing the first entry of the dataset.")
            data_as_dict = data_as_dict[1:]  # Remove the first row if necessary
        logger.info("Excel data successfully loaded and converted to dictionary.")
        return data_as_dict
    except Exception as e:
        logger.error("Error loading Excel: %s", e)
        raise


In [31]:
data = load_excel_to_data(path_input)

2024-11-19 15:49:31,587 - INFO - Loading Excel file: C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\2024 Trimester 1.xlsx
2024-11-19 15:49:39,008 - INFO - Removing the first entry of the dataset.
2024-11-19 15:49:39,008 - INFO - Excel data successfully loaded and converted to dictionary.


In [44]:
data[14]

{'Unnamed: 0': 15,
 'Respondent ID': 114588000000,
 'Collector ID': 431039728,
 'Start Date': Timestamp('2024-04-29 15:05:34'),
 'End Date': Timestamp('2024-04-29 15:09:57'),
 'IP Address': '132.184.130.75',
 'Email Address': nan,
 'First Name': nan,
 'Last Name': nan,
 'Custom Data 1': nan,
 '1_Rate_Overall Rival Stars Horse Racing': 2,
 '1_Rate_Breeding': 2,
 '1_Rate_Training my horses': 2,
 '1_Rate_Racing - Story': 2,
 '1_Rate_Steeplechase': 2,
 '1_Rate_Cross Country': 2,
 '1_Rate_Free Roam': 2,
 '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': 2,
 '1_Rate_Completing Story Goals': 2,
 '1_Rate_Foal Caring': 2,
 '1_Rate_Pasture': 2,
 '1_Rate_Foaling season event': 2,
 '1_Rate_Steeplechase Stars': 2,
 '1_Rate_Arabian Days': 2,
 '1_Rate_Empowering my horse with skills': '3 (Neutral)',
 '1_Rate_Customizing my horse with tack': 2,
 '1_Rate_Teams': '1 (Strongly Dislike)',
 '2_LeastEnjoy_Breeding horses for Coats': nan,
 '2_LeastEnjoy_Breeding horses for Stats': 'Breeding 

In [35]:
import json
import numpy as np

def clean_json_data(data):
    """
    Cleans a list of dictionaries to ensure all entries are JSON-serializable 
    and handles missing or invalid values appropriately.
    Args:
        data (list): List of dictionaries representing the dataset.
    Returns:
        list: Cleaned list of dictionaries.
    """
    def is_serializable(value):
        """
        Checks if a value can be serialized to JSON.
        """
        try:
            json.dumps(value)
            return True
        except (TypeError, ValueError):
            return False

    def clean_value(value):
        """
        Cleans individual values in the dataset:
        - Replaces NaN or None with an empty string.
        - Leaves JSON-serializable values unchanged.
        """
        if value is None or (isinstance(value, float) and np.isnan(value)):
            return ""
        if is_serializable(value):
            return value
        # For any non-serializable value, convert it to a string
        return str(value)

    def clean_entry(entry):
        """
        Cleans a single dictionary by applying `clean_value` to each field.
        """
        return {key: clean_value(value) for key, value in entry.items()}

    cleaned_data = [clean_entry(entry) for entry in data]
    original_count = len(data)
    cleaned_count = len(cleaned_data)
    print(f"Cleaned {original_count - cleaned_count} entries from the dataset.")
    return cleaned_data


In [36]:
data_cleaned = clean_json_data(data)

Cleaned 0 entries from the dataset.


In [37]:
data_cleaned[2]

{'Unnamed: 0': 3,
 'Respondent ID': 114585000000,
 'Collector ID': 431039728,
 'Start Date': '2024-04-24 19:30:24',
 'End Date': '2024-04-29 20:41:02',
 'IP Address': '31.94.74.128',
 'Email Address': '',
 'First Name': '',
 'Last Name': '',
 'Custom Data 1': '',
 '1_Rate_Overall Rival Stars Horse Racing': '5 (Strongly Like)',
 '1_Rate_Breeding': '3 (Neutral)',
 '1_Rate_Training my horses': '3 (Neutral)',
 '1_Rate_Racing - Story': 4,
 '1_Rate_Steeplechase': 2,
 '1_Rate_Cross Country': 4,
 '1_Rate_Free Roam': '3 (Neutral)',
 '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': '5 (Strongly Like)',
 '1_Rate_Completing Story Goals': '3 (Neutral)',
 '1_Rate_Foal Caring': '3 (Neutral)',
 '1_Rate_Pasture': '3 (Neutral)',
 '1_Rate_Foaling season event': '3 (Neutral)',
 '1_Rate_Steeplechase Stars': '1 (Strongly Dislike)',
 '1_Rate_Arabian Days': 2,
 '1_Rate_Empowering my horse with skills': '5 (Strongly Like)',
 '1_Rate_Customizing my horse with tack': 4,
 '1_Rate_Teams': '5 (Str

In [86]:
def filter_and_enrich_data(data, columns_of_interest):
    """
    Filters and enriches a dataset by specified columns.

    Args:
        data (list): List of dictionaries representing the dataset.
        columns_of_interest (list): List of column names to check in each entry.

    Returns:
        list: Filtered and enriched dataset.
    """
    removed_count = 0  # Track the number of removed entries
    filtered_data = []  # List to store the remaining entries

    for entry_idx, entry in enumerate(data):
        try:
            # Check if all specified columns are empty
            if all(
                not str(entry.get(col, "")).strip() for col in columns_of_interest
            ):
                removed_count += 1  # Count this entry as removed
                logger.debug(f"Removed entry #{entry_idx}: All columns empty: {entry}")
            else:
                # Create a new key 'player_response' with concatenated text from specified columns
                player_response = " ".join(
                    str(entry.get(col, "")).strip()
                    for col in columns_of_interest
                    if str(entry.get(col, "")).strip()
                )
                
                # Check if player_response has more than 3 words
                if len(player_response.split()) > 3:
                    entry["player_response"] = player_response
                    filtered_data.append(entry)
                else:
                    removed_count += 1  # Count this entry as removed
                    logger.debug(f"Removed entry #{entry_idx}: player_response too short: {entry}")
        except Exception as e:
            logger.error(f"Error processing entry #{entry_idx}: {e}")
            raise

    # Log and print the number of removed entries
    logger.info(f"Total entries removed: {removed_count}")
    print(f"Total entries removed: {removed_count}")

    return filtered_data


In [88]:
columns_of_interest = [
    "24_[OPTIONAL] Is there anything you'd like to share about the game (good, bad, frustrating, improvement, wishlist, etc.)?"
]


In [89]:
filtered_data = filter_and_enrich_data(data_cleaned, columns_of_interest)


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

2024-11-19 16:55:21,282 - INFO - Total entries removed: 4397
IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



Total entries removed: 4397


In [90]:
filtered_data[10]

{'Unnamed: 0': 31,
 'Respondent ID': 114588000000,
 'Collector ID': 431039728,
 'Start Date': '2024-04-29 13:26:55',
 'End Date': '2024-04-29 13:40:08',
 'IP Address': '99.251.136.92',
 'Email Address': '',
 'First Name': '',
 'Last Name': '',
 'Custom Data 1': '',
 '1_Rate_Overall Rival Stars Horse Racing': '5 (Strongly Like)',
 '1_Rate_Breeding': 4,
 '1_Rate_Training my horses': 4,
 '1_Rate_Racing - Story': 4,
 '1_Rate_Steeplechase': 4,
 '1_Rate_Cross Country': 4,
 '1_Rate_Free Roam': 4,
 '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': 4,
 '1_Rate_Completing Story Goals': 4,
 '1_Rate_Foal Caring': '3 (Neutral)',
 '1_Rate_Pasture': 2,
 '1_Rate_Foaling season event': '5 (Strongly Like)',
 '1_Rate_Steeplechase Stars': 4,
 '1_Rate_Arabian Days': '5 (Strongly Like)',
 '1_Rate_Empowering my horse with skills': '3 (Neutral)',
 '1_Rate_Customizing my horse with tack': '5 (Strongly Like)',
 '1_Rate_Teams': '5 (Strongly Like)',
 '2_LeastEnjoy_Breeding horses for Coats': '',


In [48]:
len(filtered_data)

2760

In [91]:
def save_to_json(data, output_path):
    """
    Saves data to a JSON file with proper encoding.
    
    Args:
        data (list): Data to save.
        output_path (str): Path to the output JSON file.
    """
    try:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        logger.info(f"Data successfully saved to {output_path}")
    except Exception as e:
        logger.error(f"Error saving JSON: {e}")
        raise

In [92]:
save_to_json(filtered_data, path_db_prepared)


2024-11-19 16:56:00,914 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_prepared.json


In [93]:
import random

def get_random_sample(data, sample_size, seed=None):
    """
    Returns a random sample of the specified size from the dataset.
    """
    if seed is not None:
        random.seed(seed)  # Set the random seed for reproducibility
    
    if sample_size > len(data):
        raise ValueError(f"Sample size ({sample_size}) cannot exceed the dataset size ({len(data)}).")
    
    return random.sample(data, sample_size)


In [94]:
sample_size = 100
seed = 42
sample_data = get_random_sample(filtered_data, sample_size, seed=seed)

In [96]:
sample_data[0]

{'Unnamed: 0': 1520,
 'Respondent ID': 114587000000,
 'Collector ID': 431039728,
 'Start Date': '2024-04-26 21:01:16',
 'End Date': '2024-04-26 21:09:56',
 'IP Address': '172.97.49.205',
 'Email Address': '',
 'First Name': '',
 'Last Name': '',
 'Custom Data 1': '',
 '1_Rate_Overall Rival Stars Horse Racing': '5 (Strongly Like)',
 '1_Rate_Breeding': '1 (Strongly Dislike)',
 '1_Rate_Training my horses': '1 (Strongly Dislike)',
 '1_Rate_Racing - Story': '3 (Neutral)',
 '1_Rate_Steeplechase': '3 (Neutral)',
 '1_Rate_Cross Country': '3 (Neutral)',
 '1_Rate_Free Roam': '3 (Neutral)',
 '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': 2,
 '1_Rate_Completing Story Goals': 4,
 '1_Rate_Foal Caring': '1 (Strongly Dislike)',
 '1_Rate_Pasture': 2,
 '1_Rate_Foaling season event': '5 (Strongly Like)',
 '1_Rate_Steeplechase Stars': '3 (Neutral)',
 '1_Rate_Arabian Days': '1 (Strongly Dislike)',
 '1_Rate_Empowering my horse with skills': 4,
 '1_Rate_Customizing my horse with tack': 4,

In [97]:
save_to_json(sample_data, path_db_prepared)

2024-11-19 16:56:39,212 - INFO - Data successfully saved to C:\Users\fbohm\Desktop\Projects\DataScience\cluster_analysis\Data\db_prepared.json


## Translate

In [98]:
from lingua import Language, LanguageDetectorBuilder


def detect_player_language(data):
    """
    Detects the language of the 'player_response' field for each JSON tuple
    and adds a new key 'player_language' with the detected language.

    Args:
        data (list): List of JSON-like dictionaries.

    Returns:
        list: Updated list of dictionaries with 'player_language' key.
    """
    # Initialize the language detector
    detector = LanguageDetectorBuilder.from_languages(
        Language.ENGLISH, Language.SPANISH, Language.CHINESE, Language.GERMAN, Language.FRENCH
    ).build()

    for entry in data:
        player_response = entry.get("player_response", "")
        if player_response.strip():  # Check if 'player_response' is not empty
            language = detector.detect_language_of(player_response)
            entry["player_language"] = language.name.lower()  # Store language name in lowercase
        else:
            entry["player_language"] = None  # No language detected for empty responses
    return data


In [99]:
import json

def read_json(file_path):
    """
    Reads a JSON file and returns its contents as a Python object.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        object: The contents of the JSON file as a Python data structure (e.g., dict or list).
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        raise
    except json.JSONDecodeError as e:
        print(f"Error: Failed to decode JSON from file '{file_path}': {e}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred while reading the file '{file_path}': {e}")
        raise


In [100]:
data = read_json(path_db_prepared)

In [101]:
language = detect_player_language(data)

In [102]:
language[3]

{'Unnamed: 0': 3046,
 'Respondent ID': 114585000000,
 'Collector ID': 431039728,
 'Start Date': '2024-04-25 07:47:24',
 'End Date': '2024-04-25 08:15:36',
 'IP Address': '68.229.99.20',
 'Email Address': '',
 'First Name': '',
 'Last Name': '',
 'Custom Data 1': '',
 '1_Rate_Overall Rival Stars Horse Racing': '5 (Strongly Like)',
 '1_Rate_Breeding': 4,
 '1_Rate_Training my horses': '5 (Strongly Like)',
 '1_Rate_Racing - Story': '5 (Strongly Like)',
 '1_Rate_Steeplechase': '5 (Strongly Like)',
 '1_Rate_Cross Country': '5 (Strongly Like)',
 '1_Rate_Free Roam': '3 (Neutral)',
 '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': '3 (Neutral)',
 '1_Rate_Completing Story Goals': '5 (Strongly Like)',
 '1_Rate_Foal Caring': '3 (Neutral)',
 '1_Rate_Pasture': 4,
 '1_Rate_Foaling season event': 4,
 '1_Rate_Steeplechase Stars': '5 (Strongly Like)',
 '1_Rate_Arabian Days': '5 (Strongly Like)',
 '1_Rate_Empowering my horse with skills': '3 (Neutral)',
 '1_Rate_Customizing my horse wit

In [103]:
# Initialize token counters
prompt_tokens = 0
completion_tokens = 0

# Global API settings
api_settings = {"client": None, "model": None}

def configure_api(api_client, model_name):
    """
    Configures the global API client and model.
    Args:
        api_client: The initialized OpenAI client.
        model_name (str): The model name to use.
    """
    global api_settings
    api_settings["client"] = api_client
    api_settings["model"] = model_name


def track_tokens(response):
    """
    Updates the global token counters based on the response.
    """
    global prompt_tokens, completion_tokens
    prompt_tokens += response.usage.prompt_tokens
    completion_tokens += response.usage.completion_tokens

In [104]:
configure_api(client, chat_model_name)

In [105]:
from langchain.prompts import PromptTemplate

# Define the translation prompt template with examples
prompt_template_translation = PromptTemplate.from_template(
'''You are a professional translator. Translate the following text into English if it is not already in English.

[h0]==================================================================[\h0]
TEXT: "兄弟们，我把星空退款的钱拿来买这个了，我做的对吗"

TRANSLATION: "Brothers, I used the refund money from the stars to buy this. Did I do the right thing?"

[h0]==================================================================[\h0]
TEXT: "Me toma demasiado tiempo entrenar caballos."

TRANSLATION: "It takes too much time to train horses."

[h0]==================================================================[\h0]
TEXT: "Grinding just to get good tack, grain etc. Itâ€™s very time consuming sadly"

TRANSLATION: "Grinding just to get good tack, grain etc. It's very time consuming sadly"

[h0]==================================================================[\h0]
TEXT: "{text}"

TRANSLATION:
'''
)


In [106]:
def translate_player_responses(data, api_settings):
    """
    Translates the 'player_response' field for entries where 'player_language' is not 'english'.

    Args:
        data (list): List of JSON-like dictionaries.
        api_settings (dict): Dictionary with API settings, including the client and model.

    Returns:
        list: Updated list with translated 'player_response' fields.
    """
    for entry_idx, entry in enumerate(data):
        try:
            detected_language = entry.get("player_language", "none")
            player_response = entry.get("player_response", "")

            # Skip translation for English or empty responses
            if detected_language in ["english", "none"] or not player_response.strip():
                continue

            logger.info(f"Translating entry #{entry_idx} (Language: {detected_language})")
            
            # Format the prompt
            prompt_translation = prompt_template_translation.format(
                text=player_response
            )

            # Make API call to translate
            response = api_settings["client"].chat.completions.create(
                model=api_settings["model"],
                messages=[
                    {"role": "system", "content": "You are a helpful assistant for translation."},
                    {"role": "user", "content": prompt_translation},
                ],
                max_tokens=1024
            )

            # Track tokens if needed
            if "track_tokens" in api_settings:
                api_settings["track_tokens"](response)

            # Extract translation from the response
            translation_text = response.choices[0].message.content.strip()

            # Replace the 'player_response' with the translated text
            entry["player_response"] = translation_text

        except Exception as e:
            logger.error(f"Error translating entry #{entry_idx}: {e}")
            raise

    return data

In [107]:
translated_data = translate_player_responses(data, api_settings)

2024-11-19 16:57:08,086 - INFO - Translating entry #23 (Language: german)
2024-11-19 16:57:08,745 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [108]:
translated_data[10]

{'Unnamed: 0': 428,
 'Respondent ID': 114588000000,
 'Collector ID': 431039728,
 'Start Date': '2024-04-28 16:34:49',
 'End Date': '2024-04-28 16:46:12',
 'IP Address': '172.226.36.83',
 'Email Address': '',
 'First Name': '',
 'Last Name': '',
 'Custom Data 1': '',
 '1_Rate_Overall Rival Stars Horse Racing': '3 (Neutral)',
 '1_Rate_Breeding': '3 (Neutral)',
 '1_Rate_Training my horses': 2,
 '1_Rate_Racing - Story': '3 (Neutral)',
 '1_Rate_Steeplechase': '3 (Neutral)',
 '1_Rate_Cross Country': '3 (Neutral)',
 '1_Rate_Free Roam': 2,
 '1_Rate_Live Events & Leaderboards (Flat racing & Cross Country)': '3 (Neutral)',
 '1_Rate_Completing Story Goals': '3 (Neutral)',
 '1_Rate_Foal Caring': 2,
 '1_Rate_Pasture': '3 (Neutral)',
 '1_Rate_Foaling season event': '3 (Neutral)',
 '1_Rate_Steeplechase Stars': '3 (Neutral)',
 '1_Rate_Arabian Days': '3 (Neutral)',
 '1_Rate_Empowering my horse with skills': 2,
 '1_Rate_Customizing my horse with tack': '3 (Neutral)',
 '1_Rate_Teams': 2,
 '2_LeastEnjoy_B

## Topic extraction