In [None]:
import json
import pandas as pd
from config import selected_features

# Helper functions that shorten feature names for display
def get_main_category(feat):
    """
    Return the lower-cased top-level category of a dotted feature name.

    The text before the first '.' is taken, every occurrence of
    '_analysis' and '_metrics' is removed from it, and the result is
    lower-cased.
    Example: 'coherence_analysis.best_num_topics' -> 'coherence'.
    """
    head, _, _ = feat.partition('.')
    # Removal is case-sensitive and happens before lower-casing.
    for marker in ('_analysis', '_metrics'):
        head = head.replace(marker, '')
    return head.lower()

def shorten_feature_name(feat):
    """
    Build a short display label for a full dotted feature name.

    The label has the form '<Category>: <Metric>'. The category is the
    first dotted component with '_analysis' / '_metrics' removed and
    title-cased. The metric is the last dotted component with '_mean'
    removed, looked up in a table of abbreviations; metrics not in the
    table fall back to the key with underscores turned into spaces,
    title-cased.
    Example: 'coherence_analysis.best_num_topics' -> 'Coherence: Topics'
    """
    short_labels = dict([
        # coherence / originality / readability
        ("best_num_topics", "Topics"),
        ("entity_coherence", "Entity"),
        ("local_coherence_embeddings", "Embeddings"),
        ("overall_coherence", "Overall"),
        ("log_likelihood", "LogLike"),
        ("semantic_distance", "Distance"),
        ("smog_index", "SMOG"),
        # sentiment
        ("average_sentiment", "Average"),
        ("emotional_persistence", "Persistence"),
        ("emotional_richness", "Richness"),
        ("emotional_volatility", "Volatility"),
        ("sentiment_distribution_negative", "Negative"),
        ("sentiment_distribution_positive", "Positive"),
        ("sentiment_volatility", "Volatility"),
        ("variance_sentiment", "Variance"),
        # style
        ("average_sentence_length", "Sentence Len"),
        ("average_tree_depth", "Tree Depth"),
        ("dependency_variety", "Dependency"),
        ("MTLD", "MTLD"),
        ("max_subordination_depth", "Subordination"),
        ("total_count", "Devices Count"),
        ("variety", "Devices Var"),
        ("std_dev_sentence_length", "Sentence Rhythm"),
        # themes
        ("entropy_score", "Entropy"),
        ("graph_density", "Density"),
        ("inter_theme_similarity", "Similarity"),
    ])

    first_part = feat.partition('.')[0]
    last_part = feat.rpartition('.')[2]

    category = first_part.replace('_analysis', '').replace('_metrics', '').title()
    metric_key = last_part.replace('_mean', '')

    if metric_key in short_labels:
        short_metric = short_labels[metric_key]
    else:
        # Unknown metric: derive a readable label from the key itself.
        short_metric = metric_key.replace('_', ' ').title()

    return f"{category}: {short_metric}"

# Metrics file for each dataset, keyed by dataset name.
# Every file follows the same naming pattern; the "poetry" dataset is excluded.
datasets = {
    dataset_key: f"datasets/1_metrics/{dataset_key}_short_stories_metrics.json"
    for dataset_key in ("confederacy", "ttcw", "slm", "pronvsprompt", "hanna")
}

def load_and_process_dataset(filepath):
    """
    Load a JSON metrics file and flatten it into a DataFrame.

    Args:
        filepath: Path to the JSON file to read.

    Returns:
        The DataFrame produced by ``pd.json_normalize`` on the parsed JSON.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform locale, which can garble or reject non-ASCII text in the file.
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return pd.json_normalize(data)

def is_human_story(story_id, dataset_name):
    """
    Tell whether a story id denotes a human-written story.

    Each known dataset marks human stories with a specific substring in
    the story id (the match is case-sensitive). Unknown dataset names
    always yield False.
    """
    human_markers = {
        "ttcw": "_NewYorker",
        "slm": "_human",
        "confederacy": "_human",
        "pronvsprompt": "_patricio",
        "hanna": "_Human",
    }
    marker = human_markers.get(dataset_name)
    if marker is None:
        return False
    return marker in story_id

# Load all datasets into a single DataFrame, tagging every row with the
# dataset it came from and whether the story is human-written.
dataframes = []
for name, path in datasets.items():
    df = load_and_process_dataset(path)
    df['dataset_name'] = name
    # `ds=name` binds the current dataset name when the lambda is created.
    # astype(bool) keeps the column boolean so that `~mask` below is always a
    # logical NOT (on an object-dtype column `~` would act element-wise).
    df['is_human'] = (
        df['story_id']
        .apply(lambda story_id, ds=name: is_human_story(story_id, ds))
        .astype(bool)
    )
    dataframes.append(df)
df_all = pd.concat(dataframes, ignore_index=True)


def _feature_means(frame):
    """Return the mean of each selected feature in `frame`, ordered like `selected_features`."""
    return (
        frame[selected_features]
        # Compute only the mean instead of the full describe() summary.
        .mean(numeric_only=True)
        # Non-numeric features are skipped by the mean; reindex puts them back
        # as NaN so the result always lines up with `selected_features`.
        .reindex(selected_features)
        .rename('mean')
    )


# Per dataset, compute the feature means separately for human and AI stories.
dataset_means = {}
for dataset in datasets:
    df_dataset = df_all[df_all['dataset_name'] == dataset]
    human_mask = df_dataset['is_human']
    human_means = _feature_means(df_dataset[human_mask])
    ai_means = _feature_means(df_dataset[~human_mask])
    dataset_means[dataset] = (human_means, ai_means)

# Build the final table: one row per feature, two columns per dataset.
final_table = pd.DataFrame({'Feature': selected_features})
for dataset in datasets:
    human_means, ai_means = dataset_means[dataset]
    # dataset.capitalize() is used to display it with the first letter capitalised
    dataset_label = dataset.capitalize()
    # Both Series are already ordered like `selected_features`, so a
    # positional assignment is row-aligned with the 'Feature' column.
    final_table[f"{dataset_label} Human Mean"] = human_means.to_numpy()
    final_table[f"{dataset_label} AI Mean"] = ai_means.to_numpy()
final_table

Unnamed: 0,Feature,Confederacy Human Mean,Confederacy AI Mean,Ttcw Human Mean,Ttcw AI Mean,Slm Human Mean,Slm AI Mean,Pronvsprompt Human Mean,Pronvsprompt AI Mean,Hanna Human Mean,Hanna AI Mean
0,coherence_analysis.best_num_topics,6.2,5.733333,7.083333,8.027778,5.147541,7.098361,4.733333,4.383333,5.739583,5.427083
1,coherence_analysis.entity_coherence,0.079462,0.062428,0.120571,0.05328,0.095179,0.146245,0.084262,0.038277,0.121616,0.213899
2,coherence_analysis.local_coherence_embeddings,0.267482,0.340106,0.306447,0.374305,0.262804,0.283286,0.323751,0.476358,0.295991,0.3284
3,coherence_analysis.overall_coherence,0.257782,0.273985,0.203106,0.184943,0.187901,0.147363,0.325916,0.359487,0.267261,0.324331
4,originality_analysis.log_likelihood,14323.450781,8907.275391,20184.984131,26808.033583,1059.537701,1113.793649,7451.960567,10557.330872,6960.695377,2995.863646
5,readability_metrics.smog_index,8.12,9.995,7.575,9.897222,6.306557,3.803279,12.966667,13.845833,7.008333,6.076458
6,sentiment_analysis.average_sentiment,-0.273332,0.418737,-0.119286,0.422075,0.221952,0.555197,0.054113,0.776896,-0.1253,-0.16197
7,sentiment_analysis.emotional_volatility,0.342616,0.35548,0.303197,0.48561,0.260171,0.285508,0.283022,0.336509,0.440616,0.461597
8,sentiment_analysis.variance_sentiment,0.848599,0.648825,0.911253,0.71117,0.620634,0.47816,0.908254,0.36015,0.837937,0.773657
9,stylistic_analysis.linguistic_metrics.average_...,12.61017,16.958951,16.204102,18.23849,24.996289,28.223198,34.585304,25.966291,16.00441,13.956722


In [2]:
# Build the LaTeX table
num_datasets = len(datasets)
# One column for the feature label plus two columns (Human, AI) per dataset
col_format = "l" + " c" * (2 * num_datasets)

# First header row: each dataset name spans its Human/AI column pair.
header_line = (
    "        \\textbf{Feature}"
    + "".join(
        f" & \\multicolumn{{2}}{{c}}{{\\textbf{{{dataset.capitalize()}}}}}"
        for dataset in datasets
    )
    + r" \\"
)
# Second header row: Human / AI labels under every dataset.
sub_header = "        " + " & Human & AI" * num_datasets + r" \\"

lines = [
    r"\begin{table}[h]",
    r"    \centering",
    r"    \renewcommand{\arraystretch}{1.2}",
    r"    \begin{tabular}{" + col_format + r"}",
    r"        \toprule",
    header_line,
    sub_header,
    r"        \midrule",
]

# One table row per feature: short label followed by the means, two decimals each.
for _, row in final_table.iterrows():
    cells = [shorten_feature_name(row['Feature'])]
    for dataset in datasets:
        dataset_label = dataset.capitalize()
        cells.append(f"{row[f'{dataset_label} Human Mean']:.2f}")
        cells.append(f"{row[f'{dataset_label} AI Mean']:.2f}")
    lines.append("        " + " & ".join(cells) + r" \\")

lines.extend([
    r"        \bottomrule",
    r"    \end{tabular}",
    r"    \caption{Mean Comparison per Dataset for Human vs AI}",
    r"    \label{tab:dataset_means}",
    r"\end{table}",
])

latex_table = "\n".join(lines)
print(latex_table)

\begin{table}[h]
    \centering
    \renewcommand{\arraystretch}{1.2}
    \begin{tabular}{l c c c c c c c c c c}
        \toprule
        \textbf{Feature} & \multicolumn{2}{c}{\textbf{Confederacy}} & \multicolumn{2}{c}{\textbf{Ttcw}} & \multicolumn{2}{c}{\textbf{Slm}} & \multicolumn{2}{c}{\textbf{Pronvsprompt}} & \multicolumn{2}{c}{\textbf{Hanna}} \\
         & Human & AI & Human & AI & Human & AI & Human & AI & Human & AI \\
        \midrule
        Coherence: Topics & 6.20 & 5.73 & 7.08 & 8.03 & 5.15 & 7.10 & 4.73 & 4.38 & 5.74 & 5.43 \\
        Coherence: Entity & 0.08 & 0.06 & 0.12 & 0.05 & 0.10 & 0.15 & 0.08 & 0.04 & 0.12 & 0.21 \\
        Coherence: Embeddings & 0.27 & 0.34 & 0.31 & 0.37 & 0.26 & 0.28 & 0.32 & 0.48 & 0.30 & 0.33 \\
        Coherence: Overall & 0.26 & 0.27 & 0.20 & 0.18 & 0.19 & 0.15 & 0.33 & 0.36 & 0.27 & 0.32 \\
        Originality: LogLike & 14323.45 & 8907.28 & 20184.98 & 26808.03 & 1059.54 & 1113.79 & 7451.96 & 10557.33 & 6960.70 & 2995.86 \\
        Readabil