# Constructing Paradigmatic Text Profiles for Readers

To characterize each reader $r_j$’s overall preferences within a single dataset, we aggregate the corpus features weighted by the reader’s preference intensities. Specifically, for reader $r_j$, we define a vector
$$
\mathbf{x}_j^{*} 
\;=\;
\frac{\sum_{i=1}^t\,\rho_j(x_i)\,\mathbf{x}_i}
     {\sum_{i=1}^t\,\rho_j(x_i)},
$$
where $\rho_j(x_i)$ indicates how strongly $r_j$ prefers text $x_i$. Texts that $r_j$ highly favors contribute more to $\mathbf{x}_j^{*}$, whereas texts receiving negligible scores have limited impact. The resulting paradigmatic vector $\mathbf{x}_j^{*}$ can be viewed as an  “ideal text” representation for the given reader, distilled from the corpus under study. Plotting these vectors (e.g., via PCA) reveals clusters of readers who place emphasis 
on similar sets of attributes.

In [2]:
import os
import json
import glob
import pandas as pd
from config import selected_features

def get_nested_value(d, key):
    """Extracts the value of a nested dictionary given a key in the format 'level1.level2...'."""
    keys = key.split('.')
    for k in keys:
        if isinstance(d, dict):
            d = d.get(k, None)
        else:
            return None
    return d

def process_dataset(prefix, metrics_filepath, rankings_filepath):
    # Load and transform the metrics JSON
    with open(metrics_filepath, 'r') as f:
        metrics_data = json.load(f)

    metrics_flat = []
    for row in metrics_data:
        flat_row = {}
        for col in selected_features:
            flat_row[col] = get_nested_value(row, col)
        flat_row['story_id'] = row.get('story_id')
        metrics_flat.append(flat_row)

    df_metrics = pd.DataFrame(metrics_flat)

    df_rankings = pd.read_csv(rankings_filepath)

    # Merge of the two sources using story_id
    df_merged = df_rankings.merge(df_metrics, on='story_id', how='inner')

    # Convert feature columns to numeric
    for col in selected_features:
        df_merged[col] = pd.to_numeric(df_merged[col], errors='coerce')

    # Multiplicar cada feature por ranking_value_norm
    for col in selected_features:
        df_merged[col] = df_merged[col] * df_merged['ranking_value_norm']

    # Function to calculate weighted average per user
    def weighted_average(group):
        weight_sum = group['ranking_value_norm'].sum()
        result = {}
        for col in selected_features:
            result[col] = group[col].sum() / weight_sum if weight_sum != 0 else None
        return pd.Series(result)

    df_weighted = df_merged.groupby('user_id', as_index=False, group_keys=False).apply(weighted_average)
    # Add dataset identifier column
    df_weighted['dataset'] = prefix
    return df_weighted


In [3]:
metrics_dir = 'datasets/1_metrics'
rankings_dir = 'datasets/1_2_rankings'

# Names are assumed to be prefixed with _short_stories_metrics.json for metrics.
# and the same prefix followed by _rankings.csv for rankings.
metrics_files = glob.glob(os.path.join(metrics_dir, '*_short_stories_metrics.json'))

results = []
for metrics_filepath in metrics_files:
    # Extract the filename prefix
    base_metrics = os.path.basename(metrics_filepath)
    prefix = base_metrics.split('_')[0]
    # Build the filepath from the CSV in rankings using the same prefix
    rankings_filepath = os.path.join(rankings_dir, f'{prefix}_rankings.csv')
    if os.path.exists(rankings_filepath):
        print(f'Processing dataset: {prefix}')
        df_res = process_dataset(prefix, metrics_filepath, rankings_filepath)
        results.append(df_res)
    else:
        print(f'No rankings file was found for the dataset: {prefix}')

# Concatenate the results and save as final CSV
if results:
    df_final = pd.concat(results, ignore_index=True)
    output_filepath = 'outputs/paradigmatic_texts_by_user.csv'
    df_final.to_csv(output_filepath, index=False)
    print(f'Resultados guardados en: {output_filepath}')
else:
    print('No datasets were processed.')

Processing dataset: confederacy
Processing dataset: ttcw
Processing dataset: slm
Processing dataset: hanna


  df_weighted = df_merged.groupby('user_id', as_index=False, group_keys=False).apply(weighted_average)
  df_weighted = df_merged.groupby('user_id', as_index=False, group_keys=False).apply(weighted_average)
  df_weighted = df_merged.groupby('user_id', as_index=False, group_keys=False).apply(weighted_average)


Processing dataset: pronvsprompt
Resultados guardados en: outputs/paradigmatic_texts_by_user.csv


  df_weighted = df_merged.groupby('user_id', as_index=False, group_keys=False).apply(weighted_average)
  df_weighted = df_merged.groupby('user_id', as_index=False, group_keys=False).apply(weighted_average)
