## Explanation of the Code for Selecting Representative and Uncorrelated Metrics

The main objective of the code is to select the most representative and least correlated metrics from various textual datasets. Below are the steps and formulas used in the process:

### 1. Filtering Columns by Variance

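The variance of each column is calculated, and columns with variance less than or equal to a threshold (`variance_threshold`) are removed. Among the columns that pass this filter, those whose values all lie in the range [0, 1] are set aside: they skip steps 2–4 and are always kept in the final selection.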

### 2. Filtering Columns by Correlation

The correlation matrix of the remaining columns is calculated, and for every pair of columns whose absolute correlation exceeds a threshold (`correlation_threshold`), one column of the pair (the one that appears later in the table) is removed. We place a high emphasis on correlation because strongly correlated features share, and therefore dilute, their importance scores in methods like Random Forest.

### 3. Principal Component Analysis (PCA)

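The remaining columns are standardized, and PCA is applied to reduce dimensionality and identify the most representative variables. PCA transforms the original variables into a new set of uncorrelated variables called principal components; the number of components kept is the smallest one whose cumulative explained variance reaches a target (`explained_variance_target`).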

### 4. Variable Selection Based on Loadings

Variables with an absolute loading greater than or equal to a threshold (`loading_threshold`) in at least one principal component are selected.

### 5. Combining Results

Finally, the selected variables from all datasets are combined and the results are saved.

### Process Summary

1. **Variance Filtering**: Remove columns with low variance.
2. **Correlation Filtering**: Remove highly correlated columns, since strongly correlated features dilute each other's importance scores in methods like Random Forest.
3. **PCA**: Reduce dimensionality and select variables based on loadings.
4. **Combination**: Combine selected variables from all datasets.

This process ensures that the selected metrics are representative and non-redundant, thereby improving the quality of subsequent analysis.

In [1]:
import pandas as pd
import json

In [2]:
def load_json(file_path):
    """Load a JSON file and flatten it into a pandas DataFrame.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The DataFrame produced by ``pd.json_normalize`` on the parsed
        content; nested keys become dot-separated column names.
    """
    # JSON text is UTF-8 by specification, so do not depend on the
    # platform's locale encoding when decoding the file.
    with open(file_path, 'r', encoding='utf-8') as file:
        raw_data = json.load(file)
    # Normalize and flatten JSON structure
    return pd.json_normalize(raw_data)

In [3]:
# Dtypes that count as numerical when picking metric columns.
_numeric_dtypes = ['float64', 'int64']

# Load every metrics dump into a flattened DataFrame.
pron = load_json('datasets/1_metrics/pronvsprompt_short_stories_metrics.json')
slm = load_json('datasets/1_metrics/slm_short_stories_metrics.json')
ttcw = load_json('datasets/1_metrics/ttcw_short_stories_metrics.json')
hanna = load_json('datasets/1_metrics/hanna_short_stories_metrics.json')
confederacy = load_json('datasets/1_metrics/confederacy_short_stories_metrics.json')

# Labels of the numerical columns of each dataset.
pron_numerical_cols = pron.select_dtypes(include=_numeric_dtypes).columns
slm_numerical_cols = slm.select_dtypes(include=_numeric_dtypes).columns
ttcw_numerical_cols = ttcw.select_dtypes(include=_numeric_dtypes).columns
hanna_numerical_cols = hanna.select_dtypes(include=_numeric_dtypes).columns
confederacy_numerical_cols = confederacy.select_dtypes(include=_numeric_dtypes).columns

In [4]:
import pandas as pd
import json
import numpy as np
import os

# Destination of the per-dataset reduced tables; created up front.
output_folder = 'outputs/metric_correlations'
os.makedirs(output_folder, exist_ok=True)

# (output file name, source frame, numerical column labels) per dataset.
_dataset_table = (
    ('representative_pron.csv', pron, pron_numerical_cols),
    ('representative_slm.csv', slm, slm_numerical_cols),
    ('representative_ttcw.csv', ttcw, ttcw_numerical_cols),
    ('representative_hanna.csv', hanna, hanna_numerical_cols),
    ('representative_confederacy.csv', confederacy, confederacy_numerical_cols),
)

# Map each output file name to the numerical part of its dataset.
datasets = {
    filename: frame[numeric_cols]
    for filename, frame, numeric_cols in _dataset_table
}

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import os

# function to select the most representative metrics
def select_representative_variables(df, variance_threshold=0.05, correlation_threshold=0.8, explained_variance_target=0.95, loading_threshold=0.5, dataset_name="dataset"):
    """Select representative, weakly correlated metric columns from ``df``.

    Pipeline:
      1. Discard columns whose name starts with
         ``sentiment_analysis.emotion_transitions_map``.
      2. Discard columns whose variance is <= ``variance_threshold``.
      3. Set aside the surviving columns whose values all lie in [0, 1];
         they skip the remaining filters and are always part of the result.
      4. For every pair of remaining columns whose absolute correlation is
         > ``correlation_threshold``, discard the column that appears later.
      5. Standardise what is left (columns containing NaN are left out),
         fit a PCA that keeps enough components to reach
         ``explained_variance_target`` and keep the variables whose absolute
         loading is >= ``loading_threshold`` on at least one component.

    Side effects:
        Creates ``outputs/metric_correlations/reports`` if needed and writes
        ``<dataset_name>_*.csv`` reports there (low-variance columns,
        high-correlation columns, correlation pairs, variances and PCA
        loadings). Also prints a summary of each filtering step.

    Args:
        df: DataFrame of metric columns (the callers pass numerical columns).
        variance_threshold: Columns with variance <= this value are removed.
        correlation_threshold: Absolute correlation above which the later
            column of a pair is removed.
        explained_variance_target: Cumulative explained-variance ratio the
            retained principal components must reach.
        loading_threshold: Minimum absolute loading for a variable to be
            selected by the PCA step.
        dataset_name: Prefix used for the report file names.

    Returns:
        ``df`` restricted to the selected columns, in their original order.
    """
    # Keep only the columns that do NOT belong to the emotion-transition map
    df = df[df.columns[~df.columns.str.startswith("sentiment_analysis.emotion_transitions_map")]]

    # Calculate the variance of each column
    variance = df.var()
    low_variance_columns = variance[variance <= variance_threshold].index.tolist()
    high_variance_columns = variance[variance > variance_threshold].index

    # Identify variables with reduced range between 0 and 1
    normalized_columns = [col for col in high_variance_columns
                          if df[col].min() >= 0 and df[col].max() <= 1]
    normalized_lookup = set(normalized_columns)

    # Exclude variables of reduced range from the correlation/PCA filtering
    final_high_variance_columns = [col for col in high_variance_columns
                                   if col not in normalized_lookup]

    # Save data from columns deleted due to low variance
    output_folder = 'outputs/metric_correlations/reports'
    os.makedirs(output_folder, exist_ok=True)
    df[low_variance_columns].to_csv(f'{output_folder}/{dataset_name}_low_variance_columns.csv', index=False)

    # Filter columns with high variance
    filtered_df = df[final_high_variance_columns].copy()

    # Calculate the (absolute) correlation matrix
    correlation_matrix = filtered_df.corr().abs()
    corr_values = correlation_matrix.to_numpy()
    corr_columns = correlation_matrix.columns

    # Identify columns with high correlation. An ordered list is kept next to
    # the lookup set so reports do not depend on set iteration order.
    # NOTE(review): a column is dropped even when its correlated partner was
    # itself dropped earlier; confirm this greedy behaviour is intended.
    to_drop = []
    flagged = set()
    correlation_report = []
    for i in range(1, len(corr_columns)):
        colname = corr_columns[i]
        for j in range(i):
            if corr_values[i, j] > correlation_threshold:
                if colname not in flagged:
                    flagged.add(colname)
                    to_drop.append(colname)
                correlation_report.append((corr_columns[j], colname, corr_values[i, j]))

    # Save data from columns removed due to high correlation
    df[to_drop].to_csv(f'{output_folder}/{dataset_name}_high_correlation_columns.csv', index=False)

    # Save correlation report
    correlation_df = pd.DataFrame(correlation_report, columns=['Variable 1', 'Variable 2', 'Correlation'])
    correlation_df.to_csv(f'{output_folder}/{dataset_name}_correlation_report.csv', index=False)

    # Save variance report
    variance_report_df = variance.to_frame(name='Variance')
    variance_report_df.to_csv(f'{output_folder}/{dataset_name}_variance_report.csv')

    print(f"Columns eliminated due to low variance: {low_variance_columns}")
    print(f"Columns removed due to high correlation: {to_drop}")
    print(f"Columns with rank 0-1 that were maintained: {normalized_columns}")

    # Initial column selection after filtering high correlation
    remaining_cols = [col for col in filtered_df.columns if col not in flagged]
    filtered_df = filtered_df[remaining_cols]

    # --- Start of the integration of PCA ---
    # Columns containing NaN cannot be fed to PCA; they are left out here.
    pca_input = filtered_df.dropna(axis=1, how='any')

    selected_vars = set()
    if pca_input.empty:
        # Nothing survived the filters: skip PCA instead of failing inside
        # the scaler, and fall back to the [0, 1] columns only.
        print("No columns available for PCA; skipping PCA-based selection.")
    else:
        # Standardise data
        scaled_data = StandardScaler().fit_transform(pca_input)

        # Apply PCA initially to determine the number of components required.
        pca_full = PCA()
        pca_full.fit(scaled_data)
        cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)

        # Smallest number of components reaching the target. If the target is
        # never reached (floating-point rounding, or a target >= 1), keep all
        # components rather than silently falling back to a single one.
        reached = np.flatnonzero(cumulative_variance >= explained_variance_target)
        if reached.size:
            num_components = int(reached[0]) + 1
        else:
            num_components = len(cumulative_variance)
        print(f"Number of components needed to explain {explained_variance_target*100}% of variance: {num_components}")

        # Apply PCA with the chosen number of components
        pca = PCA(n_components=num_components)
        pca.fit(scaled_data)

        # Obtain loadings for each variable in each principal component.
        loadings = pd.DataFrame(pca.components_.T,
                                index=pca_input.columns,
                                columns=[f'PC{i+1}' for i in range(num_components)])
        loadings.to_csv(f'{output_folder}/{dataset_name}_pca_loadings.csv')

        # Select variables that have high loadings in at least one component
        high_loading_mask = (loadings.abs() >= loading_threshold).any(axis=1)
        selected_vars = set(loadings.index[high_loading_mask])
    # --- End of PCA integration ---

    # Combine the PCA-selected variables with the [0, 1] columns, walking the
    # original column order so the result is reproducible across runs and
    # only contains columns that exist in the dataframe.
    chosen = selected_vars | normalized_lookup
    final_selected_columns = [col for col in df.columns if col in chosen]

    # Return the final dataframe with the selected variables
    return df[final_selected_columns]

# Run the selection on every dataset and save each reduced table
selected_columns_sets = []
for output_filename, df in datasets.items():
    representative_df = select_representative_variables(df, dataset_name=output_filename.replace('.csv', ''))
    selected_columns_sets.append(set(representative_df.columns))
    representative_df.to_csv(f'{output_folder}/{output_filename}', index=False)
    print(f"Archivo guardado: {output_folder}/{output_filename}")

# Union of the columns that were selected in at least one dataset
all_selected_columns = set().union(*selected_columns_sets)
print(f"Columns present in at least one dataset: {all_selected_columns}")

# Write the names sorted: set iteration order changes between runs, which
# would otherwise make the file content non-reproducible.
with open(f'{output_folder}/all_selected_columns.txt', 'w', encoding='utf-8') as file:
    file.write("\n".join(sorted(all_selected_columns)))


Columns eliminated due to low variance: ['coherence_analysis.local_coherence_embeddings', 'coherence_analysis.entity_coherence', 'coherence_analysis.overall_coherence', 'coherence_analysis.coherence_values.2', 'coherence_analysis.coherence_values.3', 'coherence_analysis.coherence_values.4', 'coherence_analysis.coherence_values.5', 'coherence_analysis.coherence_values.6', 'coherence_analysis.coherence_values.7', 'coherence_analysis.coherence_values.8', 'coherence_analysis.coherence_values.9', 'coherence_analysis.coherence_values.10', 'originality_analysis.semantic_distance', 'stylistic_analysis.linguistic_metrics.lexical_diversity.TTR', 'stylistic_analysis.linguistic_metrics.lexical_diversity.log_TTR', 'stylistic_analysis.linguistic_metrics.average_subordination_depth', 'stylistic_analysis.rhetorical_devices.counts.onomatopoeia', 'stylistic_analysis.rhetorical_devices.counts.epiphora', 'stylistic_analysis.rhetorical_devices.counts.paronomasia', 'stylistic_analysis.semantic_density.cosin