In [1]:
# general imports
import pandas as pd
import numpy as np
import os
import json

# my imports
from helper.utils import configure_api, read_json, save_to_json, logger
from helper.data_pipeline import gather_data, translate_data, analyse_data, embed_data

# --- Models -------------------------------------------------------------
# chat_model_name is handed to configure_api below. The two embedding model
# names are only declared here; no cell visible in this notebook reads them.
chat_model_name = "gpt-4o-mini"
openai_embedding_model = "text-embedding-3-small"
local_embedding_model = "all-MiniLM-L6-v2"
configure_api(chat_model_name)

# --- Storage ------------------------------------------------------------
# root_dir is the project-specific data folder: <shared data root>\<project>.
# Every cell below joins its own data_source name onto this path.
project = "HRC"
root_dir = os.path.join(
    r"S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data",
    project,
)

In [7]:
# Steam Reviews
from helper.redshift_conector_standalone import fetch_query_results

# Source identity: data_source is also the sub-folder name under root_dir.
data_source = "Steam"
longname = "com.pikpok.hrc"

# Column names of the Steam review table that are passed to gather_data.
id_column = "recommendationid"
text_column = "review_text"
timestamp_column = "timestamp_updated"

# Declared for completeness; not passed to any pipeline call in this cell.
language_column = "language"

# Embedding key handed to embed_data: "topic" or "sentence".
embed_key = "sentence"


def steam_query_function():
    """Query Redshift for Steam reviews and return the JSON part of the result.

    The query selects up to 10 rows of ``steam_review`` for the app
    '1166860_Rival_Stars_Horse_Racing_Desktop_Edition'. There is no ORDER BY,
    so which 10 rows come back is up to the database.

    Returns:
        The first element of the pair returned by ``fetch_query_results``
        (named ``results_json`` here). The second element, a DataFrame, is
        only used to log its shape.

    Raises:
        Any exception raised while fetching; it is logged and re-raised unchanged.
    """
    sql_query = """
    SELECT *
    FROM steam_review
    where app_id_name = '1166860_Rival_Stars_Horse_Racing_Desktop_Edition'
    LIMIT 10
    """
    logger.info(f"Query Redshift with: {sql_query}")

    try:
        fetched = fetch_query_results(sql_query)
        results_json, results_df = fetched
        # Log the result size (rows, columns) as a quick sanity check
        logger.info("Successfully fetched query results, with shape: %s", results_df.shape)
        return results_json
    except Exception as e:
        # Record the failure, then let the caller see the original exception
        logger.error(f"Error fetching query results: {e}")
        raise


# Ensure the per-source output folder exists. makedirs(exist_ok=True) also
# creates a missing root_dir and is safe to re-run, unlike exists() + mkdir(),
# which fails when the parent is missing and can race with another process.
path_dir = os.path.join(root_dir, data_source)
os.makedirs(path_dir, exist_ok=True)

# Run the pipeline stages in order: gather -> translate -> analyse -> embed.
# Each stage is addressed only by (root_dir, data_source).
gather_data(root_dir, data_source,
            query_function=steam_query_function,
            id_column=id_column,
            text_column=text_column,
            timestamp_column=timestamp_column,
            longname=longname)
translate_data(root_dir, data_source)
analyse_data(root_dir, data_source)
embed_data(root_dir, data_source, embed_key)

2025-03-21 12:00:29,538 - INFO - Query Redshift with: 
    SELECT *
    FROM steam_review
    where app_id_name = '1166860_Rival_Stars_Horse_Racing_Desktop_Edition'
    LIMIT 10
    
2025-03-21 12:00:40,206 - INFO - Successfully fetched query results, with shape: (10, 14)
2025-03-21 12:00:40,286 - INFO - Data successfully saved to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Steam\db_prepared.json
2025-03-21 12:00:40,296 - INFO - Loading existing translated reviews from: S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Steam\db_translated.json
2025-03-21 12:00:40,326 - INFO - Found 0 new reviews to check for translation.
2025-03-21 12:00:40,326 - INFO - No new reviews to add. All IDs already exist.
2025-03-21 12:00:40,326 - INFO - ###### Translation completed. Total reviews translated: 0 ######

2025-03-21 12:00:40,510 - INFO - Skipping entry 0 (ID: 184973192) - already analysed.
2025-0

KeyboardInterrupt: 

In [8]:
# Google play review
import datetime
import google_play_scraper as gps

# Source identity: data_source is also the sub-folder name under root_dir.
data_source = "Google Play"

# Keys of the review records that are passed to gather_data.
id_column = "reviewId"
text_column = "content"
timestamp_column = "at"

# Declared for completeness; not passed to any pipeline call in this cell.
language_column = "language"

# Embedding key handed to embed_data: "topic" or "sentence".
embed_key = "sentence"


def google_play_query_function():
    """Fetch the 5 newest Google Play reviews of the app and return them as JSON.

    Every review dict is extended with a 'longname' key holding the first three
    dot-separated parts of the app id ('com.pikpok.hrc'). Values that are
    ``datetime.datetime`` instances are replaced by integer epoch seconds,
    because ``json.dumps`` cannot serialise datetime objects.

    Returns:
        str: JSON-encoded list of review dicts.
    """
    app_id = "com.pikpok.hrc.play"
    longname = '.'.join(app_id.split('.')[:3])

    # Optional filters, currently disabled: lang="ger", country="us"
    reviews, _continuation_token = gps.reviews(
        app_id,
        count=5,  # number of reviews to fetch
        sort=gps.Sort.NEWEST,
    )

    for review in reviews:
        review['longname'] = longname
        # Snapshot the items so values can be reassigned while looping
        for field, value in list(review.items()):
            if isinstance(value, datetime.datetime):
                review[field] = int(value.timestamp())

    return json.dumps(reviews)


# Ensure the per-source output folder exists. makedirs(exist_ok=True) also
# creates a missing root_dir and is safe to re-run, unlike exists() + mkdir(),
# which fails when the parent is missing and can race with another process.
path_dir = os.path.join(root_dir, data_source)
os.makedirs(path_dir, exist_ok=True)

# Run the pipeline stages in order: gather -> translate -> analyse -> embed.
# No longname is passed here: google_play_query_function writes a 'longname'
# key into every record itself.
gather_data(root_dir, data_source,
            query_function=google_play_query_function,
            id_column=id_column,
            text_column=text_column,
            timestamp_column=timestamp_column)
translate_data(root_dir, data_source)
analyse_data(root_dir, data_source)
embed_data(root_dir, data_source, embed_key)

2025-03-20 13:29:08,893 - INFO - Data successfully saved to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Google Play\db_prepared.json
2025-03-20 13:29:08,908 - INFO - Loading existing translated reviews from: S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Google Play\db_translated.json
2025-03-20 13:29:08,908 - INFO - Found 0 new reviews to check for translation.
2025-03-20 13:29:08,918 - INFO - No new reviews to add. All IDs already exist.
2025-03-20 13:29:08,918 - INFO - ###### Translation completed. Total reviews translated: 0 ######

2025-03-20 13:29:08,928 - INFO - Skipping entry 0 (ID: 3bf20350-5d28-4bd5-b04d-1f9324e713ee) - already analysed.
2025-03-20 13:29:08,928 - INFO - Skipping entry 1 (ID: 7aff65b6-1729-4514-be0a-fbd4bdf835b4) - already analysed.
2025-03-20 13:29:08,928 - INFO - Skipping entry 2 (ID: ca1b711b-9c27-4142-8d9a-6629bb65b236) - already analysed.
2025-03-20 13:

In [6]:
# Zendesk CS tickets (temporarily using subcategory as the context)
import psycopg2

# Source identity: data_source is also the sub-folder name under root_dir.
data_source = "Zendesk"

# Column names of the query result that are passed to gather_data.
# 'content' is the alias computed in zendesk_query_function's SELECT.
id_column = "ticket_id"
text_column = "content"
timestamp_column = "created_at"

# Declared for completeness; not passed to any pipeline call in this cell.
language_column = "language"

# Embedding key handed to embed_data: "topic" or "sentence".
embed_key = "sentence"


def zendesk_query_function():
    """Fetch the 5 most recent HRC Zendesk tickets and return them as a JSON string.

    Connection settings are read from the environment variables
    POSTGRESQL_HOST, POSTGRESQL_DATABASE, POSTGRESQL_USER, POSTGRESQL_PASSWORD
    and POSTGRESQL_PORT.

    The ``content`` column is the most specific non-null intent available:
    intent_subcategory2, else intent_subcategory1, else intent_primary. Rows
    where all three are null are excluded by the WHERE clause.

    Post-processing on the result:
      * ``created_at`` is converted to integer epoch seconds.
      * ``longname`` is added as 'com.pikpok.' + lower-cased ``product``
        (None when ``product`` is None).

    The cursor and the connection are always closed, including when the query
    or the fetch raises.

    Returns:
        str: records-oriented JSON (one object per ticket).
    """
    # TODO: please fill in the query and replace the date filter
    s_query = """
    select coalesce(intent_subcategory2, coalesce(intent_subcategory1, intent_primary)) as content,
           * 
    from zendesk 
    where product = 'HRC' 
    and coalesce(intent_subcategory2, coalesce(intent_subcategory1, intent_primary)) is not null
    order by created_at desc
    limit 5
    """
    # NOTE(review): a no-op for the query above (it contains no '>>' or '<<');
    # presumably kept for comparison operators in a future date filter - confirm.
    s_query = s_query.replace('>>', '>').replace('<<', '<')

    conn = psycopg2.connect(host=os.getenv("POSTGRESQL_HOST"),
                            database=os.getenv("POSTGRESQL_DATABASE"),
                            user=os.getenv("POSTGRESQL_USER"),
                            password=os.getenv("POSTGRESQL_PASSWORD"),
                            port=os.getenv("POSTGRESQL_PORT"))
    try:
        # The cursor context manager closes the cursor on exit
        with conn.cursor() as curr:
            curr.execute(s_query)
            df = pd.DataFrame(curr.fetchall(), columns=[i[0] for i in curr.description])
    finally:
        # `with conn:` would only end the transaction, so close explicitly
        conn.close()

    df['created_at'] = df['created_at'].apply(lambda x: int(x.timestamp()))
    df['longname'] = ['com.pikpok.' + str(x).lower() if x is not None else None for x in df['product']]
    return df.to_json(orient='records')


# Ensure the per-source output folder exists. makedirs(exist_ok=True) also
# creates a missing root_dir and is safe to re-run, unlike exists() + mkdir(),
# which fails when the parent is missing and can race with another process.
path_dir = os.path.join(root_dir, data_source)
os.makedirs(path_dir, exist_ok=True)

# Run the pipeline stages in order: gather -> translate -> analyse -> embed.
# No longname is passed here: zendesk_query_function adds a 'longname'
# column to the result itself.
gather_data(root_dir, data_source,
            query_function=zendesk_query_function,
            id_column=id_column,
            text_column=text_column,
            timestamp_column=timestamp_column)
translate_data(root_dir, data_source)
analyse_data(root_dir, data_source)
embed_data(root_dir, data_source, embed_key)

2025-03-19 14:26:46,028 - INFO - Data successfully saved to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Zendesk\db_prepared.json
2025-03-19 14:26:46,048 - INFO - No existing file found. Starting fresh.
2025-03-19 14:26:46,048 - INFO - Found 60 new reviews to process.
2025-03-19 14:26:46,076 - INFO - Updated file saved to: S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Zendesk\db_translated.json
2025-03-19 14:26:46,076 - INFO - Translation completed. Total reviews translated: 0


In [2]:
# Surveys
# Concatenate several free-text answer columns into one 'combined_text' column
import pandas as pd

# Excel workbook to read; point this at the survey export that holds the columns below
file = r'S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Zendesk_csv\HRC_tickets_export.xlsx'
df = pd.read_excel(file)

# Specify the columns you want to combine
cols_to_concat = ['Add',
                  'Change',
                  'Remove'
                  ]

# Fail early with an actionable message. The recorded run of this cell raised
# a bare pandas KeyError because the workbook had none of these columns.
missing_cols = [col for col in cols_to_concat if col not in df.columns]
if missing_cols:
    raise KeyError(
        f"Columns {missing_cols} not found in '{file}'. "
        f"Available columns: {list(df.columns)}"
    )

# Cells that are NaN, empty or whitespace-only are skipped; the rest are joined with a space
df['combined_text'] = df[cols_to_concat].apply(
    lambda row: ' '.join(str(x) for x in row if pd.notnull(x) and str(x).strip() != ''),
    axis=1
)
# Sequential 1-based row ID
df['ID'] = range(1, len(df) + 1)

# Save the DataFrame to an Excel file (written to the current working directory)
df.to_excel('output.xlsx', index=False)

KeyError: "None of [Index(['Add', 'Change', 'Remove'], dtype='object')] are in the [columns]"

In [2]:
# Surveys / tabular data
# Source identity: data_source is also the sub-folder name under root_dir.
data_source = "Zendesk_csv"
longname = "com.pikpok.hrc"

# Input file handed to survey_query_function through query_function_args.
file = r"S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Zendesk_csv\HRC_tickets_export.xlsx"

# Column names of the input file that are passed to gather_data.
id_column = "id"
# Alternative text columns used for survey exports:
# ["If you had a magic wand and you could add, change or remove anything from the game, what would it be and why?", "Unnamed: 19", "Unnamed: 20"]
text_column = "description"
timestamp_column = "created_at"

# Declared for completeness; not passed to any pipeline call in this cell.
language_column = "language"

# Embedding key handed to embed_data: "topic" or "sentence".
embed_key = "sentence"


def survey_query_function(file):
    """Load a tabular or JSON export and return its records as a JSON string.

    The loader is chosen from the file extension, matched case-insensitively:
      * ``.csv``             - ``pandas.read_csv``
      * ``.xls`` / ``.xlsx`` - ``pandas.read_excel``
      * ``.txt`` / ``.json`` - parsed with ``json.load``; the content must be
        a list of record dicts

    Records that have both 'Respondent ID' and 'pcubed_id' but no 'review_id'
    get a synthetic ``review_id`` of the form "<Respondent ID>:<pcubed_id>".

    Args:
        file: path of the file to load.

    Returns:
        str: JSON-encoded list of record dicts.

    Raises:
        ValueError: if the file extension is not one of the supported ones.
    """
    _, file_extension = os.path.splitext(file)
    file_extension = file_extension.lower()

    if file_extension == '.csv':
        # Round-trip through to_json so NaN / timestamps become JSON-safe values
        js = json.loads(pd.read_csv(file).to_json(orient='records'))
    elif file_extension in ('.xls', '.xlsx'):
        js = json.loads(pd.read_excel(file).to_json(orient='records'))
    elif file_extension in ('.txt', '.json'):
        # NOTE(review): opened with the platform default encoding, as before
        with open(file, 'r') as f:
            js = json.load(f)
    else:
        # Previously this fell through and failed later with UnboundLocalError
        raise ValueError(
            f"Unsupported file extension '{file_extension}' for '{file}'; "
            "expected one of: .csv, .xls, .xlsx, .txt, .json"
        )

    for e in js:
        if 'Respondent ID' in e and 'pcubed_id' in e and 'review_id' not in e:
            e['review_id'] = str(e['Respondent ID']) + ':' + str(e['pcubed_id'])
    return json.dumps(js)


# Ensure the per-source output folder exists. makedirs(exist_ok=True) also
# creates a missing root_dir and is safe to re-run, unlike exists() + mkdir(),
# which fails when the parent is missing and can race with another process.
path_dir = os.path.join(root_dir, data_source)
os.makedirs(path_dir, exist_ok=True)

# Run the pipeline stages in order: gather -> translate -> analyse -> embed.
# NOTE(review): `longname` is defined in this cell's configuration but is not
# passed to gather_data, and survey_query_function does not add it to the
# records either - confirm whether `longname=longname` is intended here.
gather_data(root_dir, data_source,
            query_function=survey_query_function,
            query_function_args=[file,],
            id_column=id_column,
            text_column=text_column,
            timestamp_column=timestamp_column)
translate_data(root_dir, data_source)
analyse_data(root_dir, data_source)
embed_data(root_dir, data_source, embed_key)

2025-03-26 15:06:51,221 - INFO - Data successfully saved to S:\SID\Analytics\Working Files\Individual\Florian\Projects\DataScience\cluster_analysis\Data\HRC\Zendesk_csv\db_prepared.json
2025-03-26 15:06:51,292 - INFO - No existing translation file found. Starting fresh.
2025-03-26 15:06:51,311 - INFO - Found 1964 new reviews to check for translation.
2025-03-26 15:06:51,762 - INFO - Translating review ID: 103349 (Detected Language: portuguese)
2025-03-26 15:06:53,712 - INFO - Total Tokens used: Prompt: 220, Completion: 24
2025-03-26 15:06:53,715 - INFO - Translating review ID: 103345 (Detected Language: spanish)
2025-03-26 15:06:54,464 - INFO - Total Tokens used: Prompt: 439, Completion: 43
2025-03-26 15:06:54,465 - INFO - Translating review ID: 103344 (Detected Language: german)
2025-03-26 15:06:57,233 - INFO - Total Tokens used: Prompt: 650, Completion: 54
2025-03-26 15:06:57,237 - INFO - Translating review ID: 103343 (Detected Language: spanish)
2025-03-26 15:06:57,930 - INFO - Tota

KeyboardInterrupt: 