# (Part of the) Quantitative Analysis of Themes in ASAQ

In [None]:
import math
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
from prettytable import PrettyTable
from scipy.stats import spearmanr
from textblob import TextBlob

# Never truncate columns, total width or cell contents when a DataFrame is displayed.
for display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

print('Reading in the data...', end='', flush=True)

# Spreadsheet path and the sheet inside it that is loaded.
data_string = 'data/data.ods'
sheet_string = 'Thematic Analysis + ASAQ'

# columns: 'agentName', 'EXP', 'AGE', 'AGEGROUP', 'SEX', 'REGION.OF.COUNTRY', 'EDUCATION'
df = pd.read_excel(data_string, sheet_name=sheet_string)

print('Done! ✅', flush=True)

In [None]:
import subprocess


def print_licenses_from_requirements():
    """Print the licenses of the project's dependencies via ``pip-licenses``.

    Runs ``pip-licenses -p <packages...>`` as a subprocess and prints its
    standard output. Problems are reported on stdout instead of being raised,
    so a missing or failing tool does not abort the rest of the notebook.

    Returns:
        None
    """
    # From requirements.txt: Lmstudio has MIT
    packages = [
        "matplotlib",
        "numpy",
        "pandas",
        "seaborn",
        "spacy",
        "textblob",
        "scipy",
        "prettytable",
        "lmstudio",
        "pip-licenses",
    ]
    cmd = ['pip-licenses', '-p', *packages]

    # Keep the try body limited to the call that can actually fail.
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=True
        )
    except FileNotFoundError:
        # The executable itself is missing (not installed / not on PATH).
        print("Error while retrieving licenses: 'pip-licenses' executable not found.")
        return
    except subprocess.CalledProcessError as e:
        # pip-licenses ran but exited with a non-zero status.
        print("Error while retrieving licenses:", e.stderr)
        return

    print("Licenses for packages in requirements.txt:\n")
    print(result.stdout)


# Print the license overview for the listed dependencies, then report that the cell finished.
print_licenses_from_requirements()
print('Done! ✅', flush=True)

In [None]:
print('Creating the metadata...', end='', flush=True)

# Central registry that later cells read their inputs from.
metadata = {}

# Demographic columns plus the coded themes and theme directions of every response.
df_demographics = df[['agentName', 'EXP', 'AGE', 'AGEGROUP', 'SEX', 'REGION.OF.COUNTRY', 'EDUCATION', 'Themes', 'Themes (Peer)', 'Direction']]

# Questionnaire items: every column from 'ASAQ_HLA2' up to and including 'ASAQ_UAI4'.
# The explicit .copy() makes this an independent frame, so the sign flip below neither
# triggers pandas' SettingWithCopyWarning nor depends on whether the slice happens to be
# a view of `df`.
# NOTE: `df` itself keeps the raw, un-negated values; code that needs the sign-flipped
# 'R_' items must read them from `df_questionnaire` (stored as metadata['dfQ']).
df_questionnaire = df.iloc[:, df.columns.get_loc('ASAQ_HLA2'):df.columns.get_loc('ASAQ_UAI4') + 1].copy()
columns_to_negate = [col for col in df_questionnaire.columns if col.startswith('R_')]
df_questionnaire[columns_to_negate] = df_questionnaire[columns_to_negate] * -1

# Only the first 100 rows are used for the inter-coder agreement analysis.
df_intercoder = df[['Themes', 'Themes (Peer)']][0:100]

# Display shorthands: each dictionary maps a label as it is spelled in the data
# to the short label used for plot columns / table headers. The keys must match
# the spreadsheet values exactly (including their spelling).

# Age group label -> short label.
shorthand_age = {
    '18 - 25': '18-25',
    '26 - 35': '26-35',
    '36 - 45': '36-45',
    '46 - 55': '46-55',
    '56 - 65': '56-65',
    'Older than 65': '65+'
}

# Sex label -> short label (identity mapping).
shorthand_sex = {
    'Male': 'Male',
    'Female': 'Female'
}

# Region label -> short label.
shorthand_region = {
    'Western Europe': 'W. Europe',
    'Latin America - Carribean': 'Latin A.',
    'Africa': 'Africa',
    'Asia Pacific': 'Asia P.',
    'Eastern Europe': 'E. Europe',
    'USA': 'USA'
}

# Education label -> short label.
shorthand_education = {
    'Graduate degree (MA / MSc / MPhil / other)': 'Grad.',
    'Undergraduate degree (BA / BSc / other)': 'UGrad.',
    'High school diploma / A-Level': 'H. Sch.',
    'Secondary education (GED / GCSE)': 'Sec.',
    'Technical / Community College': 'Com. Col.',
    'No formal qualification': 'No Edu.',
    'Doctorate degree (PhD / other)': 'PhD'
}

# Agent name -> short label (identity mapping).
shorthand_agent = {
    'CoPilot': 'CoPilot',
    'Bard': 'Bard',
    'Alexa': 'Alexa',
    'Bing': 'Bing',
    'ChatGPT': 'ChatGPT',
    'Google Assistant': 'Google Assistant',
    'Siri': 'Siri',
    'Roomba': 'Roomba',
    'Eliza': 'Eliza',
    'the dog': 'the dog',
}

# Theme name -> three-letter code (used as tick labels on the radar chart).
shorthand_all_themes = {
    "Agent's Cognition": 'COG',
    "Agent's Coherence": 'COH',
    "Agent's Creativeness": 'CRE',
    "Agent's Efficiency": 'EFF',
    "Agent's Emotional Presence": 'EMP',
    "Agent's Enjoyability": 'ENJ',
    "Agent's Helpfulness": 'HLP',
    "Agent's Intentionality": 'INT',
    "Agent's Interestingness": 'INS',
    "Agent's Intuitiveness": 'ITU',
    "Agent's Limitation": 'LIM',
    "Agent's Personality": 'PER',
    "Agent's Quickness": 'QCK',
    "Agent's Reliability": 'REL',
    "Agent's Sociability": 'SOC',
    "Agent's Usability": 'USA',
    "Attitude": 'ATT',
    "Ease of Life": 'EOL',
    "Emotional Experience": 'EMX',
    "Human-like Behaviour": 'HLB',
    "Performance": 'PRF',
    "Potential": 'POT',
    "Productivity": 'PRO',
    "Unopinionated": 'UNO',
    "User Acceptance": 'UAC',
    "User's Autonomy": 'AUT',
    "User's Emotional Presence": 'UEP',
    "User's Engagement": 'ENG',
    "User's Trust": 'TRU',
    "User-Agent Alliance": 'UAA',
    "User-Agent Interplay": 'UAI',
    "Limitations": 'LIP'
}

# Theme -> labels in the 'Themes (Peer)' column that count as the same theme.
# Themes with an empty list are skipped by the Cohen's kappa computation.
mapping_peer = {
    "Agent's Cognition": [],
    "Agent's Coherence": ["Accuracy"],
    "Agent's Creativeness": ["Creativity"],
    "Agent's Efficiency": ["Efficiency"],
    "Agent's Emotional Presence": [],
    "Agent's Enjoyability": ["Enjoyability"],
    "Agent's Helpfulness": ["Helpfulness"],
    "Agent's Intentionality": [],
    "Agent's Interestingness": ["Interestingness"],
    "Agent's Intuitiveness": [],
    "Agent's Limitation": [],
    "Agent's Personality": [],
    "Agent's Quickness": [],
    "Agent's Reliability": [],
    "Agent's Sociability": [],
    "Agent's Usability": ["Usability", "Accessibility", "Convenience"],
    "Attitude": ["Entertainment"],
    "Ease of Life": [],
    "Emotional Experience": ["Emotional Connection"],
    "Human-like Behaviour": ["Human-like Behavior"],
    "Performance": [],
    "Potential": ["Potential"],
    "Productivity": ["Productivity"],
    "User Acceptance": [],
    "User's Autonomy": [],
    "User's Emotional Presence": [],
    "User's Engagement": ["Engagement"],
    "User's Trust": ["Trust"],
    "User-Agent Alliance": [],
    "User-Agent Interplay": [],
    "Limitations": []
}

# Theme -> questionnaire columns whose row-wise mean forms the theme's ASAQ score.
# Column names starting with 'R_' are the ones negated in the questionnaire frame.
mapping_asaq = {
    "Agent's Cognition": ['R_ASAQ_AC1', 'R_ASAQ_AI3'],
    "Agent's Coherence": ['R_ASAQ_AC2', 'R_ASAQ_AC3'],
    "Agent's Creativeness": [],
    "Agent's Efficiency": [],
    "Agent's Emotional Presence": ['ASAQ_AEI1', 'ASAQ_AEI2', 'R_ASAQ_AEI3', 'ASAQ_AEI4', 'R_ASAQ_AEI5'],
    "Agent's Enjoyability": ['ASAQ_AL2', 'R_ASAQ_AL3', 'ASAQ_AEI1', 'R_ASAQ_AEI3', "ASAQ_AE3", "R_ASAQ_AE4"],
    "Agent's Helpfulness": ['ASAQ_UAL4'],
    "Agent's Intentionality": ['ASAQ_AI1', 'ASAQ_AI2', 'R_ASAQ_AI3', 'ASAQ_AI4'],
    "Agent's Interestingness": ['ASAQ_AE2'],
    "Agent's Intuitiveness": ['ASAQ_AU1', 'ASAQ_AU2'],
    "Agent's Limitation": [],
    "Agent's Personality": ['ASAQ_APP1', 'R_ASAQ_APP2'],
    "Agent's Quickness": ['ASAQ_AU3'],
    "Agent's Reliability": ['ASAQ_UT3'],
    "Agent's Sociability": ['ASAQ_AS1', 'ASAQ_AS2', 'ASAQ_AS3'],
    "Agent's Usability": ['ASAQ_AU1', 'ASAQ_AU2'],
    "Attitude": ['R_ASAQ_AE4', 'ASAQ_AT1', 'ASAQ_AT2', 'R_ASAQ_AT3'],
    "Ease of Life": [],
    "Emotional Experience": ['ASAQ_AEI1', 'ASAQ_AEI2', 'R_ASAQ_AEI3', 'ASAQ_AEI4', 'R_ASAQ_AEI5', 'ASAQ_AL5', 'ASAQ_UEP1', 'ASAQ_UEP2', 'ASAQ_UEP3', 'ASAQ_UEP4'],
    "Human-like Behaviour": ['ASAQ_HLB1', 'ASAQ_HLB2', 'ASAQ_HLB3', 'ASAQ_HLB4', 'ASAQ_HLB5', 'ASAQ_NB2'],
    "Performance": ['ASAQ_PF1'],
    "Potential": [],
    "Productivity": [],
    "User Acceptance": ['ASAQ_UAA1', 'ASAQ_UAA2', 'R_ASAQ_UAA3'],
    "User's Autonomy": [],
    "User's Emotional Presence": ['ASAQ_AL5'],
    "User's Engagement": ['ASAQ_UE1', 'ASAQ_UE2', 'ASAQ_UE3'],
    "User's Trust": ['ASAQ_UT1', 'ASAQ_UT2'],
    "User-Agent Alliance": ['ASAQ_AL4', 'ASAQ_AL5', 'ASAQ_UAL1', 'ASAQ_UAL2', 'ASAQ_UAL3'],
    "User-Agent Interplay": ['ASAQ_UEP2', 'ASAQ_UAI1', 'ASAQ_UAI4'],
    "Limitations": []
}

# Second (peer) version of the theme -> questionnaire column mapping; it is compared
# against mapping_asaq with percent_agreement(). It must contain every key of mapping_asaq.
mapping_asaq_peer = {
    "Agent's Cognition": [],
    "Agent's Coherence": ['R_ASAQ_AC1', 'R_ASAQ_AC2', 'R_ASAQ_AC3', 'R_ASAQ_AC4'],
    "Agent's Creativeness": [],
    "Agent's Efficiency": [],
    "Agent's Emotional Presence": ['ASAQ_AEI1', 'ASAQ_AEI2', 'R_ASAQ_AEI3', 'ASAQ_AEI4', 'R_ASAQ_AEI5'],
    "Agent's Enjoyability": ['ASAQ_AL2', 'R_ASAQ_AL3', 'R_ASAQ_AE1', 'ASAQ_AE3', 'R_ASAQ_AE4'],
    "Agent's Helpfulness": [],
    "Agent's Intentionality": ['ASAQ_AI1', 'ASAQ_AI2', 'R_ASAQ_AI3', 'ASAQ_AI4'],
    "Agent's Interestingness": ['ASAQ_AE2'],
    "Agent's Intuitiveness": [],
    "Agent's Limitation": [],
    "Agent's Personality": ['ASAQ_APP1', 'R_ASAQ_APP2', 'ASAQ_APP3'],
    "Agent's Quickness": [],
    "Agent's Reliability": ['ASAQ_UT3'],
    "Agent's Sociability": ['ASAQ_AS1', 'ASAQ_AS2', 'ASAQ_AS3'],
    "Agent's Usability": ['ASAQ_AU1', 'ASAQ_AU2', 'ASAQ_AU3'],
    "Attitude": ['ASAQ_AT1', 'ASAQ_AT2', 'R_ASAQ_AT3'],
    "Ease of Life": [],
    "Emotional Experience": ['ASAQ_UEP1', 'ASAQ_UEP2', 'ASAQ_UEP3', 'ASAQ_UEP4', 'ASAQ_UAL6'],
    "Human-like Behaviour": ['ASAQ_HLB1', 'ASAQ_HLB3', 'ASAQ_HLB4', 'ASAQ_NB2', 'ASAQ_NB3'],
    "Performance": ['ASAQ_PF1', 'ASAQ_PF2', 'ASAQ_PF3'],
    "Potential": [],
    "Productivity": [],
    "User Acceptance": ['ASAQ_UAA1', 'ASAQ_UAA2', 'R_ASAQ_UAA3'],
    "User-Agent Alliance": ['ASAQ_AL4', 'ASAQ_UAL1', 'ASAQ_UAL2', 'ASAQ_UAL3', 'ASAQ_UAL4', 'ASAQ_UAL5'],
    "User-Agent Interplay": ['ASAQ_UAI1', 'ASAQ_UAI2', 'ASAQ_UAI3', 'ASAQ_UAI4'],
    "User's Autonomy": [],
    "User's Emotional Presence": [],
    "User's Engagement": ['ASAQ_UE1', 'ASAQ_UE2', 'ASAQ_UE3'],
    "User's Trust": ['ASAQ_UT1', 'ASAQ_UT2', 'ASAQ_UT3'],
    "Limitations": []
}

def _split_theme_cell(cell):
    """Split a comma-separated theme cell into trimmed names, dropping blanks and 'nan'."""
    trimmed = (part.strip() for part in cell.split(','))
    return [name for name in trimmed if name != '' and name.lower() != 'nan']


# Theme frequencies over all responses, and per demographic group / agent.
global_ctr = Counter()
age_themes, sex_themes, region_themes, education_themes, agent_themes = {}, {}, {}, {}, {}

# (source column, per-group counters filled from that column)
grouped_counters = (
    ('AGEGROUP', age_themes),
    ('SEX', sex_themes),
    ('REGION.OF.COUNTRY', region_themes),
    ('EDUCATION', education_themes),
    ('agentName', agent_themes),
)

for _, record in df_demographics.iterrows():
    own_cell = record['Themes']
    # Empty cells are not strings and carry no themes.
    if isinstance(own_cell, str):
        row_counts = Counter(_split_theme_cell(own_cell))
        global_ctr.update(row_counts)
        for column, per_group in grouped_counters:
            per_group.setdefault(record[column], Counter()).update(row_counts)

# Give every group an explicit entry (0) for each theme it never mentioned,
# so that all group counters share the same set of keys.
for _, per_group in grouped_counters:
    for group_counter in per_group.values():
        for theme_name in global_ctr:
            group_counter.setdefault(theme_name, 0)

# Theme frequencies as assigned by the peer coder.
peer_counter = Counter()
for _, record in df_demographics.iterrows():
    peer_cell = record['Themes (Peer)']
    if isinstance(peer_cell, str):
        peer_counter.update(_split_theme_cell(peer_cell))

def _without_group(per_group, excluded):
    """Return a copy of *per_group* without the entry whose key equals *excluded*."""
    return {group: counts for group, counts in per_group.items() if group != excluded}


def _group_sizes(column, excluded):
    """Count the respondents per value of *column*, leaving out the *excluded* value."""
    return Counter(df_demographics.loc[df_demographics[column] != excluded, column])


# Register everything later cells need under descriptive keys.
metadata.update({
    # Data frames and plotting colours.
    'dfD': df_demographics,
    'dfQ': df_questionnaire,
    'dfi': df_intercoder,
    'custom cmap': LinearSegmentedColormap.from_list("custom_blue_yellow", ["#003366", "#DAE8FC", "#FFD700"]),

    # Own coding: theme frequencies and summary numbers.
    'global counter': global_ctr,
    'all themes': list(global_ctr),
    'all themes sorted': sorted(global_ctr),
    'all themes count': len(global_ctr),
    'all themes sum': sum(global_ctr.values()),

    # Peer coding: theme frequencies and summary numbers.
    'peer counter': peer_counter,
    'peer themes': sorted(peer_counter),
    'peer themes count': len(peer_counter),
    'peer themes sum': sum(peer_counter.values()),
    'mapping peer': mapping_peer,

    'mapping asaq': mapping_asaq,
    'mapping asaq peer': mapping_asaq_peer,

    # Theme counters per group, without the catch-all groups.
    'age themes': _without_group(age_themes, 'Unknown'),
    'sex themes': _without_group(sex_themes, 'Other'),
    'region themes': _without_group(region_themes, 'Unknown'),
    'education themes': _without_group(education_themes, "Don't know / not applicable"),
    'agent themes': agent_themes,

    # Number of respondents per group, without the catch-all groups.
    'age count': _group_sizes('AGEGROUP', 'Unknown'),
    'sex count': _group_sizes('SEX', 'Other'),
    'region count': _group_sizes('REGION.OF.COUNTRY', 'Unknown'),
    'education count': _group_sizes('EDUCATION', "Don't know / not applicable"),
    'agent count': Counter(df_demographics['agentName']),

    # Display shorthands, merged and per category.
    'shorthand': {**shorthand_age, **shorthand_sex, **shorthand_region, **shorthand_education, **shorthand_agent},
    'shorthand age': shorthand_age,
    'shorthand sex': shorthand_sex,
    'shorthand region': shorthand_region,
    'shorthand education': shorthand_education,
    'shorthand agent': shorthand_agent,
    'shorthand all themes': shorthand_all_themes,
})

def _theme_percentages(theme_counters, group_sizes):
    """Express theme counts as a percentage of the respondents in each group.

    Example: the share of 'Female' respondents with 'Attitude' as a theme is the
    'Attitude' count of that group divided by the number of 'Female' respondents.

    Args:
        theme_counters: group -> Counter(theme -> count).
        group_sizes: group -> number of respondents in that group.

    Returns:
        group -> {theme -> percentage}; groups without any theme entry are omitted.
    """
    return {
        group: {theme: (hits / group_sizes[group]) * 100 for theme, hits in counter.items()}
        for group, counter in theme_counters.items()
        if counter
    }


age_themes_percentages = _theme_percentages(metadata['age themes'], metadata['age count'])
sex_themes_percentages = _theme_percentages(metadata['sex themes'], metadata['sex count'])
region_themes_percentages = _theme_percentages(metadata['region themes'], metadata['region count'])
education_themes_percentages = _theme_percentages(metadata['education themes'], metadata['education count'])
agent_themes_percentages = _theme_percentages(metadata['agent themes'], metadata['agent count'])

metadata.update({
    'percentages age': age_themes_percentages,
    'percentages sex': sex_themes_percentages,
    'percentages region': region_themes_percentages,
    'percentages education': education_themes_percentages,
    'percentages agent': agent_themes_percentages,
})

print('Done! ✅', flush=True)

In [None]:
# Result table of the inter-coder agreement: one row per theme with its kappa ('k'),
# the verbal interpretation, and the four contingency counts a, b, c, d.
table = PrettyTable(['Theme', 'k', 'interpret(k)', 'a', 'b', 'c', 'd'])


def interpret_agreement(v):
    """Translate an agreement coefficient into a verbal label.

    Args:
        v: the coefficient (e.g. a kappa value).

    Returns:
        "Poor agreement" for v <= 0, one of the graded labels for 0 < v <= 1,
        and "Invalid value" for anything else (values above 1, NaN).
    """
    if v <= 0:
        return "Poor agreement"

    # Bands in ascending order; each upper bound is exclusive.
    graded_bands = (
        (0.20, "Slight agreement"),
        (0.40, "Fair agreement"),
        (0.60, "Moderate agreement"),
        (0.80, "Substantial agreement"),
    )
    for upper_bound, label in graded_bands:
        if v < upper_bound:
            return label

    if v <= 1.00:
        return "Almost perfect agreement"
    return "Invalid value"


# Cohen's kappa per theme between the own coding ('Themes') and the peer coding
# ('Themes (Peer)'). Only themes that have at least one peer label are compared.
Cohen = {}
for theme, peer_theme_lst in metadata['mapping peer'].items():
    if not peer_theme_lst:
        continue

    theme_key = theme.lower()
    peer_keys = [peer_theme.lower() for peer_theme in peer_theme_lst]

    # 2x2 contingency counts:
    #   a: both coders assigned the theme      b: only the own coding did
    #   c: only the peer coding did            d: neither did
    a, b, c, d = 0, 0, 0, 0
    for index, row in metadata['dfi'].iterrows():
        # str() keeps empty (NaN) cells from crashing the case-insensitive substring test.
        own_cell = str(row['Themes']).lower()
        peer_cell = str(row['Themes (Peer)']).lower()

        theme_present = theme_key in own_cell
        peer_theme_present = any(peer_key in peer_cell for peer_key in peer_keys)

        if theme_present and peer_theme_present:
            a += 1
        elif theme_present:
            b += 1
        elif peer_theme_present:
            c += 1
        else:
            d += 1

    N = (a + b + c + d)
    if N == 0:
        # No rows to compare: kappa is undefined.
        kappa = float('nan')
    else:
        P_o = (a + d) / N
        P_e = ((a + b) / N) * ((a + c) / N) + ((c + d) / N) * ((b + d) / N)
        # P_e == 1 happens when both coders put every row in the same category
        # (e.g. the theme never occurs); kappa is then 0/0 and reported as NaN
        # instead of raising ZeroDivisionError.
        kappa = (P_o - P_e) / (1.0 - P_e) if P_e != 1.0 else float('nan')

    Cohen[theme] = kappa
    table.add_row([theme, round(kappa, 2), interpret_agreement(kappa), a, b, c, d])

# Verbal interpretation per theme.
detailed_results = {}
for k, v in Cohen.items():
    detailed_results[k] = interpret_agreement(v)

print(table)

In [None]:
def percent_agreement(mapping, peer_mapping):
    """Compare two theme -> items mappings theme by theme.

    Args:
        mapping: theme -> iterable of items; its keys drive the comparison.
        peer_mapping: theme -> iterable of items; must contain every key of
            ``mapping`` (a missing key raises ``KeyError``).

    Returns:
        A tuple ``(jaccard_results, overlap_results, overall_agreement)``:
        - jaccard_results: theme -> |A ∩ B| / |A ∪ B| rounded to 2 decimals
          (1.0 when both item sets are empty).
        - overlap_results: theme -> |A ∩ B| / min(|A|, |B|) as a truncated integer
          percentage (100 when both sets are empty, 0 when exactly one is empty).
        - overall_agreement: summed intersections over summed unions, rounded to
          2 decimals (1.0 when there are no items at all).
    """
    jaccard_results = {}
    overlap_results = {}
    shared_total = 0
    combined_total = 0

    for theme_name, own_items in mapping.items():
        own = set(own_items)
        peer = set(peer_mapping[theme_name])
        shared = own & peer
        combined = own | peer

        if combined:
            jaccard = len(shared) / len(combined)
        else:
            jaccard = 1.0
        jaccard_results[theme_name] = round(jaccard, 2)

        smaller_size = min(len(own), len(peer))
        if smaller_size:
            coefficient = len(shared) / smaller_size
        elif combined:
            # Exactly one side is empty: nothing can overlap.
            coefficient = 0
        else:
            # Both sides are empty: treated as full agreement.
            coefficient = 1
        overlap_results[theme_name] = int(coefficient * 100)

        shared_total += len(shared)
        combined_total += len(combined)

    if combined_total:
        overall_agreement = round(shared_total / combined_total, 2)
    else:
        overall_agreement = 1.0
    return jaccard_results, overlap_results, overall_agreement


def _short_item_list(items):
    """Join questionnaire column names without their 'ASAQ_' / 'R_' prefixes."""
    return ', '.join(item.replace('ASAQ_', '').replace('R_', '') for item in items)


# Agreement between the own and the peer theme -> ASAQ item mapping.
own_asaq_mapping = metadata['mapping asaq']
peer_asaq_mapping = metadata['mapping asaq peer']
jres, ores, total = percent_agreement(own_asaq_mapping, peer_asaq_mapping)

table_asaq_ica = PrettyTable(['Theme', 'Mapping ASAQ', 'Mapping ASAQ (Peer)', 'jaccard', 'overlap (%)'])
for theme_name, jaccard_value in jres.items():
    table_asaq_ica.add_row([
        theme_name,
        _short_item_list(own_asaq_mapping[theme_name]),
        _short_item_list(peer_asaq_mapping[theme_name]),
        jaccard_value,
        ores[theme_name],
    ])

# Summary row: overall Jaccard agreement and the mean of the per-theme overlap percentages.
mean_overlap = sum(ores.values()) / len(ores)
table_asaq_ica.add_row(['---Total---', '------', '------', total, mean_overlap])
print(table_asaq_ica)

In [None]:
# Theme name -> per-respondent ASAQ score; filled further below with the row-wise
# mean of the questionnaire columns mapped to that theme.
theme_scores = {}


def interpret_spearman(rho, p=None):
    """Describe a Spearman correlation (and optionally its p-value) in words.

    Args:
        rho: correlation coefficient, expected within [-1, 1].
        p: optional p-value; when given, a significance phrase is appended
            after ", ".

    Returns:
        "Invalid rho value" when |rho| is not within [0, 1] (this includes NaN);
        otherwise "<Strength> <direction> correlation" or, for rho == 0,
        "No correlation (ρ = 0.00)", followed by ", <Significance>" if ``p``
        was supplied.
    """
    magnitude = abs(rho)
    # Written as a negation so that NaN is rejected as well.
    if not magnitude <= 1.00:
        return "Invalid rho value"

    # Strength bands in ascending order; each upper bound is exclusive.
    strength = "very strong"
    for upper_bound, label in ((0.30, "weak"), (0.60, "moderate"), (0.90, "strong")):
        if magnitude < upper_bound:
            strength = label
            break

    if rho == 0:
        msg = f"No correlation (ρ = {rho:.2f})"
    else:
        direction = "positive" if rho > 0 else "negative"
        msg = f"{strength.capitalize()} {direction} correlation"

    if p is None:
        return msg

    if p > 0.05:
        significance = "no statistical significance"
    elif p > 0.01:
        significance = "statistically significant"
    elif p > 0.001:
        significance = "strong statistical significance"
    else:
        significance = "very strong statistical significance"

    return msg + f", {significance.capitalize()}"


# Note: the R_ columns are multiplied by -1 only in the questionnaire frame stored as
# metadata['dfQ'] (the raw `df` is not guaranteed to carry that sign flip), so the theme
# scores below are computed from metadata['dfQ'].
questionnaire = metadata['dfQ']

# 'Theme' : averaged 'ASAQ Themes'
for theme, cols in metadata['mapping asaq'].items():
    if cols:
        valid_cols = [col for col in cols if col in questionnaire.columns]
        if cols != valid_cols:
            # Some mapped columns are missing from the questionnaire frame.
            print("[ERROR] ", cols, valid_cols)
        if valid_cols:
            theme_scores[theme] = questionnaire[valid_cols].mean(axis=1)

themes_names = sorted(theme_scores)

user_matrix = []
asaq_matrix = []
for index, row in metadata['dfD'].iterrows():
    text = row['Direction']
    if pd.isna(text):
        text = ''
    matches = re.findall(r"(.*?=[\s]*-?\d+)", text)

    # result: {'Theme': v, ...} with v = [-1,0,1] for the user at row [index]
    result = {}
    for match in matches:
        # Split on the last '=' so a key that itself contains '=' cannot break the unpacking.
        key, value = match.rsplit('=', 1)
        result[key.strip()] = int(value)  # Key = [-1,0,1]

    # Themes the user did not mention stay NaN and are ignored by the correlation.
    user_matrix.append([result.get(tn, np.nan) for tn in themes_names])
    asaq_matrix.append([float(theme_scores[tn][index]) for tn in themes_names])

# user_matrix:
# - Matrix from the thematic analysis scores:
#             T1 .......... Tn
# User row-0 [1 0 1 -1 ...  1]
# User row-1 [1 1 1 -1 ... -1]
# etc...

# asaq_matrix:
# - Matrix from the ASAQ Questionnaire score:
#             T1 .......... Tn
# User row-0 [3 2 1 -3 ...  1]
# User row-1 [0 1 1 -3 ... -1]
# etc...


def format_p(p, decimals=3, threshold=1e-3):
    """Format a p-value: scientific notation below *threshold* (except 0), rounded otherwise."""
    if p < threshold and p != 0:
        return np.format_float_scientific(p, precision=1)
    else:
        return round(p, decimals)


corr_table = PrettyTable(['Theme', 'rho', 'p', 'CI (95%)', 'correlation', 'significance'])

theme_correlations = {}
for idx_tn, tn in enumerate(themes_names):
    # Column idx_tn of both matrices: one value per respondent.
    compareA = [user_row[idx_tn] for user_row in user_matrix]
    compareB = [asaq_row[idx_tn] for asaq_row in asaq_matrix]

    # Number of respondents for whom both values are available.
    count = np.sum(~np.isnan(compareA) & ~np.isnan(compareB))

    if count >= 2:
        rho, p = spearmanr(compareA, compareB, nan_policy='omit')
    else:
        # A rank correlation needs at least two complete pairs.
        rho, p = float('nan'), float('nan')

    # 95% Confidence Interval: https://stackoverflow.com/questions/75044240/correlation-coefficient-confidence-intervals
    if count > 3 and abs(rho) < 1.0:
        stderr = 1.0 / math.sqrt(count - 3)
        delta = 1.96 * stderr
        lower = math.tanh(math.atanh(rho) - delta)
        upper = math.tanh(math.atanh(rho) + delta)
    else:
        # Too few pairs, |rho| == 1 or undefined rho: no interval can be computed.
        lower, upper = float('nan'), float('nan')

    theme_correlations[tn] = (float(rho), float(p))

    # An undefined rho yields a single message without the ", <significance>" part.
    interpretation = interpret_spearman(rho, p)
    if ', ' in interpretation:
        inter_corr, inter_sign = interpretation.split(', ', 1)
    else:
        inter_corr, inter_sign = interpretation, 'n/a'

    corr_table.add_row([tn, round(rho, 2), format_p(p), f"[{round(lower, 2)}, {round(upper, 2)}]", inter_corr, inter_sign])

# https://stats.stackexchange.com/questions/55288/understanding-the-p-value-in-spearmans-rank-correlation
print(corr_table)

In [None]:
def _stacked_theme_chart(frame, image_path, title=None, sort_legend=False):
    """Draw one horizontal stacked bar chart of theme counts, save it and show it.

    Args:
        frame: themes as rows, groups as columns.
        image_path: where the figure is written.
        title: optional figure title.
        sort_legend: when True, the legend entries are ordered alphabetically.

    Returns:
        The axes the chart was drawn on.
    """
    axis = frame.plot(kind='barh', stacked=True, figsize=(10, 6), colormap=metadata['custom cmap'], width=0.8)

    if sort_legend:
        # Re-create the legend with labels (and matching handles) in alphabetical order.
        handles, labels = axis.get_legend_handles_labels()
        ordered_pairs = sorted(zip(labels, handles), key=lambda pair: pair[0])
        ordered_labels, ordered_handles = zip(*ordered_pairs)
        axis.legend(ordered_handles, ordered_labels)

    if title is not None:
        plt.title(title)
    plt.xlabel("Count")
    plt.ylabel("Themes")
    plt.tight_layout()
    plt.savefig(image_path, dpi=600)
    plt.show()
    return axis


# Age: drop 'Unopinionated' and order the themes by their total count (largest first).
# This ordering is reused for all other charts.
df_to_plot_age = pd.DataFrame(metadata['age themes']).drop(index='Unopinionated', errors='ignore')
custom_sort_order = df_to_plot_age.sum(axis=1).sort_values(ascending=False).index
df_to_plot_age = df_to_plot_age.loc[custom_sort_order].rename(columns=metadata['shorthand age'])
ax = _stacked_theme_chart(df_to_plot_age, 'data/img/themes_by_agegroup.png', sort_legend=True)

# ---------------------------------------------------------------------------------------------------------------
df_to_plot_sex = pd.DataFrame(metadata['sex themes']).loc[custom_sort_order].rename(columns=metadata['shorthand sex'])
ax = _stacked_theme_chart(df_to_plot_sex, 'data/img/themes_by_gender.png')

# ---------------------------------------------------------------------------------------------------------------
df_to_plot_region = pd.DataFrame(metadata['region themes']).loc[custom_sort_order].rename(columns=metadata['shorthand region'])
ax = _stacked_theme_chart(df_to_plot_region, 'data/img/themes_by_region.png')

# ---------------------------------------------------------------------------------------------------------------
df_to_plot_education = pd.DataFrame(metadata['education themes']).loc[custom_sort_order].rename(columns=metadata['shorthand education'])
ax = _stacked_theme_chart(df_to_plot_education, 'data/img/themes_by_education.png')

# ---------------------------------------------------------------------------------------------------------------
df_to_plot_agent = pd.DataFrame(metadata['agent themes']).loc[custom_sort_order].rename(columns=metadata['shorthand agent'])
ax = _stacked_theme_chart(df_to_plot_agent, 'data/img/themes_by_agent.png', title="Themes by Agent")

# ---------------------------------------------------------------------------------------------------------------
# Combined figure: the four demographic charts side by side.
fig, axes = plt.subplots(1, 4, figsize=(25, 6))
axes = axes.flatten()

combined_panels = (
    (df_to_plot_age, "Themes by Age Group"),
    (df_to_plot_sex, "Themes by Sex"),
    (df_to_plot_region, "Themes by Region"),
    (df_to_plot_education, "Themes by Education"),
)
for position, (frame, panel_title) in enumerate(combined_panels):
    panel = axes[position]
    frame.plot(kind='barh', stacked=True, ax=panel, colormap=metadata['custom cmap'], width=0.8)
    panel.set_title(panel_title)
    panel.set_xlabel("Count")
    if position == 0:
        panel.set_ylabel("Themes")
    else:
        # Only the leftmost panel shows the theme names.
        panel.set_ylabel("")
        panel.set_yticklabels([])

plt.tight_layout()

plt.savefig('data/img/themes_by.png', dpi=600)

plt.show()

In [None]:
percentages_age = metadata['percentages age']
percentages_sex = metadata['percentages sex']
percentages_reg = metadata['percentages region']
percentages_edu = metadata['percentages education']

# Rows: all themes (alphabetical). Columns: age groups (sorted), then sex, region, education.
themes = sorted({theme for age_dict in percentages_age.values() for theme in age_dict})
groups = [*sorted(percentages_age.keys()), *percentages_sex.keys(), *percentages_reg.keys(), *percentages_edu.keys()]

groups_shorthand = [metadata['shorthand'][key] for key in groups]


def _percentages_of(group):
    """Return the theme percentages of *group*, looked up in age, sex, region, then education."""
    for source in (percentages_age, percentages_sex, percentages_reg):
        if group in source:
            return source[group]
    return percentages_edu[group]


def _cell_text(value):
    """Format one percentage: blank for exactly 0, otherwise rounded to 2 decimals (whole numbers as int)."""
    if value == 0.0:
        return ''
    rounded = round(value, 2)
    return int(rounded) if rounded.is_integer() else rounded  # e.g. 100.0 -> 100


# Build the data for the table
table_data = [
    [_cell_text(_percentages_of(group).get(theme_name, 0.0)) for group in groups]
    for theme_name in themes
]

# Create the plot
# fig, ax = plt.subplots(figsize=(len(groups_shorthand) * 2, len(themes) * 0.4))
# fig, ax = plt.subplots(figsize=(40, 20))
fig, ax = plt.subplots()
ax.axis('off')

# Create the table
table = ax.table(
    cellText=table_data,
    rowLabels=themes,
    colLabels=groups_shorthand,
    cellLoc='center',
    loc='center'
)

table.auto_set_font_size(False)
table.set_fontsize(12)

table.scale(5, 1.5)

# table.scale(1, 1.5)
# plt.tight_layout()

# Style the cells: bold blue headers, yellow body, red for values above 30.
for (row_pos, col_pos), cell in table.get_celld().items():
    if row_pos == 0 or col_pos == -1:  # column header row / row label column
        cell.set_text_props(weight='bold')
        cell.set_facecolor('#DAE8FC')
        continue

    cell.set_facecolor('#FFF2CC')
    value = cell.get_text().get_text()
    if value.strip() != '' and float(value) > 30:
        cell.set_facecolor('#FF9999')

plt.savefig('data/img/table_perc.png', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
def check_subjective(x):
    """Label a subjectivity score: [0, 0.4) "Objective", (0.4, 1] "Subjective", else "Neutral"."""
    if 0 <= x < 0.4:
        return "Objective"
    if 0.4 < x <= 1:
        return "Subjective"
    return "Neutral"


def check_polarity(x):
    """Label a polarity score: > 0.1 "Positive", < -0.1 "Negative", else "Neutral"."""
    if x > 0.1:
        return "Positive"
    if x < -0.1:
        return "Negative"
    return "Neutral"


def _mean_direction(entry):
    """Return polarity / count rounded to 2 decimals, or NaN when nothing was counted."""
    if entry['count'] == 0:
        return float('nan')
    return round(entry['polarity'] / entry['count'], 2)


# theme_directions: {'Theme': {polarity : x, count: x} }
theme_directions = {}
theme_directions_excluding_eliza = {}

for index, row in metadata['dfD'].iterrows():
    text = row['Direction']
    if pd.isna(text):
        text = ''
    matches = re.findall(r"(.*?=[\s]*-?\d+)", text)

    for match in matches:
        # Split on the last '=' so a key that itself contains '=' cannot break the unpacking.
        key, value = match.rsplit('=', 1)
        key = key.strip()
        direction = int(value)

        overall = theme_directions.setdefault(key, {'polarity': 0, 'count': 0})
        overall['polarity'] += direction
        overall['count'] += 1

        # The entry is created for every theme (so both dictionaries share their keys),
        # but only rows of agents other than 'Eliza' contribute to it.
        without_eliza = theme_directions_excluding_eliza.setdefault(key, {'polarity': 0, 'count': 0})
        if row['agentName'] != 'Eliza':
            without_eliza['polarity'] += direction
            without_eliza['count'] += 1

# Average direction per theme. A theme that only occurs in 'Eliza' rows has a count of 0
# in the second dictionary; its average is reported as NaN instead of dividing by zero.
for entry in theme_directions.values():
    entry['pol'] = _mean_direction(entry)

for entry in theme_directions_excluding_eliza.values():
    entry['pol'] = _mean_direction(entry)

total_polarity = sum(v['polarity'] for v in theme_directions.values())
total_count = sum(v['count'] for v in theme_directions.values())
# NaN when no direction was coded at all.
avg = total_polarity / total_count if total_count else float('nan')


def analyze_row(text):
    """Run TextBlob sentiment analysis on one text.

    Returns:
        pd.Series of [subjectivity label, polarity label, raw polarity score].
    """
    # Evaluate the sentiment once and reuse both components.
    sentiment = TextBlob(str(text)).sentiment
    return pd.Series([
        check_subjective(sentiment.subjectivity),
        check_polarity(sentiment.polarity),
        sentiment.polarity,
    ])


df[['subjectivity', 'polarity', 'actual_polarity']] = df['EXP'].apply(analyze_row)

score = df['actual_polarity'].mean()
# ~48% Accuracy:
# - https://kth.diva-portal.org/smash/get/diva2:1890072/FULLTEXT02.pdf
# ~70% Accuracy:
# - https://www.sciencedirect.com/science/article/pii/S0950705122009017
# abs() keeps the margin non-negative, so lower_bound <= upper_bound also for a negative score.
error_margin = 0.48 * abs(score)
lower_bound = score - error_margin
upper_bound = score + error_margin

sentiment_table = PrettyTable(['Sentiment Score', 'Polarity', 'Error Margin'])
sentiment_table.add_row([score, avg, f"[{lower_bound:.2f}, {upper_bound:.2f}]"])
print(sentiment_table)

In [None]:
# Radar chart of the averaged theme directions, with and without the 'Eliza' rows.
categories = [cat for cat in metadata['all themes'] if cat != 'Unopinionated']

print('Amount of themes: ', len(categories))
N = len(categories)

# Averaged direction per theme for both variants
Manual_Scores = [theme_directions[cat]['pol'] for cat in categories]
Eliza_Scores = [theme_directions_excluding_eliza[cat]['pol'] for cat in categories]

# Repeat the first value at the end so that each outline closes
Manual_Scores = Manual_Scores + Manual_Scores[:1]
Eliza_Scores = Eliza_Scores + Eliza_Scores[:1]

# One angle per theme, plus the closing angle
angles = np.linspace(0, 2 * np.pi, N, endpoint=False).tolist()
angles = angles + angles[:1]

# Plot setup
fig, ax = plt.subplots(figsize=(7, 7), subplot_kw=dict(polar=True))

# Dashed red reference circles at -1, 0 and 1
full_circle = np.linspace(0, 2 * np.pi, 100)
for level in (-1, 0, 1):
    ax.plot(full_circle, [level] * 100, color='red', linewidth=0.5, linestyle='--')

# Outline and fill for each variant
for series_scores, series_label in ((Manual_Scores, 'Manual'), (Eliza_Scores, 'Excl. Eliza')):
    ax.plot(angles, series_scores, label=series_label, linewidth=1.5, linestyle='solid')
    ax.fill(angles, series_scores, alpha=0.3)

ax.set_xticks(angles[:-1])
ax.set_xticklabels([metadata['shorthand all themes'][cat] for cat in categories])

# Re-draw every tick label as a free text rotated to its own angle ...
for angle, tick_label in zip(angles, ax.get_xticklabels()):
    x, y = tick_label.get_position()
    rotated = ax.text(x, y - 0.05, tick_label.get_text(), transform=tick_label.get_transform(),
                      ha='center', va='center')
    rotated.set_rotation(np.degrees(angle))

# ... and hide the original, unrotated tick labels.
ax.set_xticklabels([])

# Range setup
ax.set_rlabel_position(30)
plt.yticks([-1, 0, 1], ["-1", "0", "1"], color="red", size=18)
plt.ylim(-1.1, 1.5)

# Legend and title
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize=10)
# plt.title('Averaged Direction of User themes', size=15, y=1.08)

plt.tight_layout()
plt.savefig('data/img/circular_directions.png', dpi=600, bbox_inches='tight')
plt.show()