# Part I - Prepare Base Datasets

## 1. Set up Environment and Load Data

In [None]:
! pip install  sentencepiece accelerate safetensors datasets torchvision scikit-optimize openai ratelimiter scikits.bootstrap arch

In [2]:
import time
import json
import openai
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import pipeline
from google.colab import drive
from urllib.parse import urlparse

In [None]:
# Load the pickled base datasets into `full_data`; later cells iterate it with .items()
# and index it by the keys 'covid' and 'climate'.
# NOTE: pickle.load can execute arbitrary code -- only open files from a trusted source.
with open('file_path/file.pkl', 'rb') as file:
    full_data = pickle.load(file)

Mounted at /content/drive


## 2. Extract Domains and Label Domain Credibility

### 2.1 Extract Domains from URLs

In [None]:
def extract_domain(df):
    """Explode the ';'-separated `urls` column and add a `domain` column.

    Args:
        df: DataFrame with a `urls` column holding zero or more URLs joined by ';'.
            Missing values are treated as an empty string.

    Returns:
        A new DataFrame with one row per URL (other columns repeated, original index
        labels kept) and a `domain` column containing the URL's network location
        without a leading 'www.'. Rows without a parseable URL get an empty domain.
        The input frame is left untouched.
    """
    def _netloc(url):
        # Non-string cells and malformed URLs (urlparse raises ValueError, e.g. on an
        # unbalanced '[') yield an empty domain instead of aborting the whole run.
        if not isinstance(url, str):
            return ''
        try:
            return urlparse(url.strip()).netloc
        except ValueError:
            return ''

    # Build the split column from a derived Series so the caller's frame is not mutated.
    url_lists = df['urls'].fillna('').astype(str).str.split(';')
    exploded = df.assign(urls=url_lists).explode('urls')
    exploded['domain'] = exploded['urls'].apply(_netloc)
    # Strip only a leading "www." -- a plain replace would also rewrite hosts that merely
    # contain that text (and, on older pandas, treat the dot as a regex wildcard).
    exploded['domain'] = exploded['domain'].str.replace(r'^www\.', '', regex=True)

    return exploded

# Replace every frame in `full_data` with its URL-exploded, domain-annotated version.
# Only existing keys are re-assigned, so the dict's key set does not change while iterating.
for df_name, dataframe in full_data.items():
    full_data[df_name] = extract_domain(dataframe)

### 2.2 Add Credibility Scores to Original Data

In [None]:
import pandas as pd

# Read the domain credibility table and keep only the two columns used downstream:
# `url` (the join key matched against `domain`) and `rating`.
credibility_ratings = (
    pd.read_csv('file_path/file.csv')
    .loc[:, ['url', 'rating']]
    .reset_index(drop=True)
)

def merge_credibility_rating(df, credibility_ratings):
    """Attach a credibility `rating` to every row of `df` by matching on domain.

    Args:
        df: DataFrame with a `domain` column.
        credibility_ratings: DataFrame with `url` (a domain name) and `rating` columns.

    Returns:
        A left-merged copy of `df` with an added `rating` column; rows whose domain is
        not listed get NaN.

    NOTE(review): if `credibility_ratings` lists the same `url` more than once, the
    left merge repeats the matching rows of `df` -- confirm the table is unique per url.
    """
    # Rename the key on the ratings side so the merge neither adds a temporary `url`
    # column nor collides with a `url` column that `df` may already carry (the previous
    # merge-then-drop raised KeyError in that case because of the `_x`/`_y` suffixes).
    ratings = credibility_ratings[['url', 'rating']].rename(columns={'url': 'domain'})
    return pd.merge(df, ratings, on='domain', how='left')

# Add the `rating` column to the two datasets; each entry is replaced by the merged frame.
full_data['covid'] = merge_credibility_rating(full_data['covid'], credibility_ratings)
full_data['climate'] = merge_credibility_rating(full_data['climate'], credibility_ratings)


### 2.3 Split Datasets into Low-Credibility and High-Credibility

In [None]:
import pickle

def separate_by_trust_ratings(data, name, low_threshold=0.4, high_threshold=0.6):
    """Split a rated dataset into low- and high-credibility subsets.

    Args:
        data: DataFrame with a numeric `rating` column.
        name: Label used in the printed summary.
        low_threshold: Rows with `rating <= low_threshold` are low credibility.
        high_threshold: Rows with `rating >= high_threshold` are high credibility.

    Returns:
        Tuple `(low_cred, high_cred)` of independent DataFrame copies. Rows with a
        rating strictly between the thresholds, or without a rating, are in neither.
    """
    original_data_length = len(data)
    # Copies (not slices of `data`) so later column assignments on the subsets are
    # unambiguous and never touch the source frame.
    low_cred = data[data['rating'] <= low_threshold].copy()
    high_cred = data[data['rating'] >= high_threshold].copy()

    print(f"Number of rows in {name} low credibility: {len(low_cred)}")
    print(f"Number of rows in {name} high credibility: {len(high_cred)}")

    # Guard the share computation: an empty input used to raise ZeroDivisionError.
    low_cred_percentage = (len(low_cred) / original_data_length) * 100 if original_data_length else 0.0
    print(f"Percentage of {name} data that is low credibility: {low_cred_percentage:.2f}%")

    return low_cred, high_cred

# Split both topics by credibility rating (prints row counts and the low-credibility share).
covid_low_cred, covid_high_cred = separate_by_trust_ratings(full_data['covid'], 'COVID-19')
climate_low_cred, climate_high_cred = separate_by_trust_ratings(full_data['climate'], 'Climate Change')

# Keys follow the '<topic>_<low|high>_cred' pattern; later cells select frames by these
# names and by the 'covid' / 'climate' substring.
data_dict = {
    'covid_low_cred': covid_low_cred,
    'covid_high_cred': covid_high_cred,
    'climate_low_cred': climate_low_cred,
    'climate_high_cred': climate_high_cred
}

# Persist the four subsets as one pickle.
# NOTE(review): the path is the same placeholder string as the input pickle above --
# confirm the real paths differ so the source data is not overwritten.
with open('file_path/file.pkl', 'wb') as f:
    pickle.dump(data_dict, f)

# Part II  - Stratification Variables

In [None]:
# Load the dict of frames used throughout Part II into `annotated_data`.
# Presumably this is the '<topic>_<low|high>_cred' dict saved in Part I -- TODO confirm.
# NOTE: pickle.load can execute arbitrary code -- only open files from a trusted source.
with open('file_path/file.pkl', 'rb') as file:
    annotated_data = pickle.load(file)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 3.1 Engagement Levels and Followers Clustering with Quantile Based Discretization

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

def qb_discretization(annotated_data, process_col, n_bins, strategy='quantile', groups=('climate', 'covid')):
    """Bin a numeric column into ordinal clusters, fitted per topic.

    All frames whose key contains a group name are pooled, `process_col` is coerced to
    numeric (non-numeric and missing values become 0), log1p-transformed, and
    discretised with `KBinsDiscretizer`. Bin edges are therefore shared by every
    dataset of a topic. The input frames are not modified.

    Args:
        annotated_data: Mapping of dataset name to DataFrame; each frame needs `id`
            and `process_col`.
        process_col: Column to discretise.
        n_bins: Number of bins requested from `KBinsDiscretizer`.
        strategy: Binning strategy passed to `KBinsDiscretizer`.
        groups: Substrings that identify a topic in the dataset names.

    Returns:
        Dict mapping each dataset name to a list of records with `id`, the cluster
        label (named `<first token of process_col>_cluster`) and the log1p-transformed
        `process_col` value. Also prints the per-cluster mean for every dataset.
    """
    label_name = process_col.split('_')[0] + '_cluster'

    result_dict = {}
    for group_name in groups:
        member_keys = [key for key in annotated_data if group_name in key]
        if not member_keys:
            # pd.concat raises on an empty list; a topic without datasets is skipped.
            continue

        # `assign` copies each frame, so tagging the source leaves the inputs untouched.
        group_df = pd.concat([annotated_data[key].assign(source=key) for key in member_keys])
        numeric_values = pd.to_numeric(group_df[process_col], errors='coerce').fillna(0)
        group_df[process_col] = np.log1p(numeric_values)

        kbins = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy=strategy)
        group_df[label_name] = kbins.fit_transform(group_df[[process_col]]).ravel().astype(int)

        for key in member_keys:
            key_df = group_df[group_df['source'] == key]
            result_dict[key] = key_df[['id', label_name, process_col]].to_dict(orient='records')
            cluster_means = key_df.groupby(label_name)[process_col].mean()
            print(f"Mean value of '{process_col}' in each cluster for '{key}':\n{cluster_means}\n")

    return result_dict

In [None]:
## Apply to Engagement Data
# Total engagement per tweet = retweets + replies + likes + quotes; values that cannot be
# parsed as numbers count as 0. This adds the column to the frames in `annotated_data` in place.
for key in annotated_data:
    annotated_data[key]['engagement_merged'] = annotated_data[key][['retweet_count', 'reply_count', 'like_count', 'quote_count']].apply(pd.to_numeric, errors='coerce').fillna(0).sum(axis=1)

# Nine bins; the resulting label column is named 'engagement_cluster'.
results_engagement = qb_discretization(annotated_data, process_col='engagement_merged', n_bins=9)
with open('file_path/file.json', 'w') as f:
    json.dump(results_engagement, f, indent=4)

In [None]:
## Apply to Followers Data
# Four bins; the resulting label column is named 'followers_cluster'.
results_followers = qb_discretization(annotated_data, process_col='followers_count',n_bins=4)

# NOTE(review): the path is the same placeholder string as the engagement output above --
# confirm the real paths differ so the two result files do not overwrite each other.
with open('file_path/file.json', 'w') as f:
    json.dump(results_followers, f, indent=4)

## 3.2 Toxicity Scoring With Perspective API

In [None]:
import aiohttp
import asyncio
import nest_asyncio
nest_asyncio.apply()

def set_perspective_key():
    """Read the Perspective API key from the local key file.

    Returns:
        The value stored under "PERSPECTIVE_API_KEY" in "path_to_key/key.json",
        or None when that entry is absent.
    """
    with open("path_to_key/key.json") as key_file:
        credentials = json.load(key_file)
    return credentials.get("PERSPECTIVE_API_KEY")

async def classify_text(session, api_key, text, bucket, max_retries=5):
    """Request a TOXICITY score for `text` from the Perspective API.

    Args:
        session: Open HTTP client session exposing `post(...)` as an async context manager.
        api_key: Perspective API key, sent as the `key` query parameter.
        text: Comment text to score.
        bucket: One-element list used as a shared token bucket; `bucket[0]` is the
            number of requests still allowed in the current window.
        max_retries: Maximum number of failed attempts before giving up.

    Returns:
        The decoded JSON response, or None once `max_retries` attempts have failed.
    """
    initial_wait_time, retry_count = 1, 0
    payload = {
        "comment": {"text": text},
        "languages": ["en"],
        "requestedAttributes": {"TOXICITY": {}}
    }
    # Only failed attempts count against max_retries. The previous condition
    # (`bucket[0] <= 0 or retry_count < max_retries`) kept looping -- and issued further
    # requests -- after the retries were used up whenever the bucket happened to be empty.
    while retry_count < max_retries:
        if bucket[0] <= 0:
            # No token available: wait for the refill task without consuming a retry.
            await asyncio.sleep(1)
            continue
        bucket[0] -= 1
        try:
            async with session.post("https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze", params={"key": api_key}, json=payload, timeout=None) as response:
                # Throttling and server errors are transient: retry them with backoff
                # rather than handing the error body back as if it were a result.
                if response.status == 429 or response.status >= 500:
                    raise RuntimeError(f"HTTP {response.status}")
                return await response.json()
        except Exception as e:
            retry_count += 1
            print(f"Request failed, waiting for {initial_wait_time * (2 ** retry_count)} seconds before retrying. Exception: {e}")
            await asyncio.sleep(initial_wait_time * (2 ** retry_count))
    print(f"Failed to get response after {max_retries} retries.")
    return None

async def refill_bucket(bucket, fill_rate=700):
    """Reset the shared token bucket to `fill_rate` once every 60 seconds, forever.

    Runs until the surrounding task is cancelled. `bucket` is a one-element list whose
    first item is overwritten (not incremented) on every refill.
    """
    refill_interval_seconds = 60
    while True:
        bucket[0] = fill_rate
        await asyncio.sleep(refill_interval_seconds)

async def main(df_name, dataframe, json_data):
    """Score every tweet of one dataset and store the results in `json_data`.

    Args:
        df_name: Dataset name; used as the key in `json_data` and in log output.
        dataframe: DataFrame with `id` and `text` columns.
        json_data: Dict that receives `df_name -> [{"id": ..., "toxicity": ...}, ...]`
            in the row order of `dataframe`. Tweets without a usable score keep
            `toxicity = None`.
    """
    print(f"Processing DataFrame: {df_name}")
    api_key, token_bucket = set_perspective_key(), [700]
    dataframe_results = [{"id": tweet_id, "toxicity": None} for tweet_id in dataframe['id']]
    async with aiohttp.ClientSession() as session:
        refill_task = asyncio.create_task(refill_bucket(token_bucket))
        try:
            # gather() returns results in submission order. Enumerating
            # asyncio.as_completed() yields them in *completion* order, which attached
            # scores to the wrong tweet ids.
            results = await asyncio.gather(
                *(classify_text(session, api_key, text, token_bucket) for text in dataframe['text'].to_list())
            )
        finally:
            # Stop the refill loop even when scoring fails part-way through.
            refill_task.cancel()

    for record, result in zip(dataframe_results, results):
        try:
            record["toxicity"] = result['attributeScores']['TOXICITY']['summaryScore']['value']
        except (KeyError, TypeError):
            # Covers a None result (all retries failed) and error payloads without scores.
            print(f"No 'attributeScores' in result for tweet_id {record['id']}. Full result: {result}")
    json_data[df_name] = dataframe_results

async def process_dataframes(dataframes):
    """Score each dataset in turn, then write all toxicity results to one JSON file.

    Args:
        dataframes: Mapping of dataset name to DataFrame, passed one by one to `main`.
    """
    collected = {}
    for df_name in dataframes:
        await main(df_name, dataframes[df_name], collected)
    with open('file_path/file.json', 'w') as out_file:
        json.dump(collected, out_file, indent=4)

# Drive the scoring coroutine to completion from the notebook cell. `nest_asyncio.apply()`
# is called in this cell's import block, presumably so run_until_complete can be used on
# the kernel's already-running event loop -- TODO confirm.
loop = asyncio.get_event_loop()
loop.run_until_complete(process_dataframes(annotated_data))

In [None]:
# Use kmeans clustering to divide toxicity in clusters
from sklearn.cluster import KMeans

with open('file_path/file.json', 'r') as f:
    data_json = json.load(f)

# Pool the per-dataset records by topic so both credibility groups of a topic share the
# same cluster boundaries. Frames are collected first and concatenated once.
topic_frames = {'climate': [], 'covid': []}
for key, records in data_json.items():
    topic_frames['climate' if 'climate' in key else 'covid'].append(pd.DataFrame(records))

datasets = {}
for name, frames in topic_frames.items():
    pooled = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    if 'toxicity' not in pooled.columns:
        pooled['toxicity'] = np.nan
    datasets[name] = pooled

kmeans = KMeans(n_clusters=3, random_state=18)
cluster_lookup = {}  # topic -> {toxicity score -> ordered cluster id}
for name, df in datasets.items():
    cluster_lookup[name] = {}
    # Tweets whose request failed have toxicity None; KMeans rejects NaN, so only the
    # scored tweets are clustered.
    toxicity = pd.to_numeric(df['toxicity'], errors='coerce')
    scored = toxicity[toxicity.notna()]
    if len(scored) < kmeans.n_clusters:
        print(f"Skipping toxicity clustering for '{name}': only {len(scored)} scored tweets.")
        continue
    raw_labels = pd.Series(kmeans.fit_predict(scored.to_frame()), index=scored.index)
    # Relabel clusters by ascending mean toxicity: 0 = lowest, 2 = highest.
    cluster_map = {old: new for new, old in enumerate(scored.groupby(raw_labels).mean().sort_values().index)}
    ordered_labels = raw_labels.map(cluster_map)
    df['toxicity_cluster'] = ordered_labels  # index-aligned; unscored rows stay NaN
    # Equal scores always fall into the same cluster, so a score-keyed dict replaces the
    # per-record scan of the whole frame.
    cluster_lookup[name] = dict(zip(scored, ordered_labels))

for key, records in data_json.items():
    lookup = cluster_lookup['climate' if 'climate' in key else 'covid']
    for item in records:
        cluster = lookup.get(item.get('toxicity'))
        # Unscored tweets get an explicit null cluster instead of raising IndexError.
        item['toxicity_cluster'] = None if cluster is None else int(cluster)

with open('file_path/file.json', 'w') as f:
    json.dump(data_json, f, indent=4)

## 3.3 Political Bias with GPT-4 API

In [None]:
import ast
from concurrent.futures import ThreadPoolExecutor, as_completed
from ratelimiter import RateLimiter

def delayed_chat_completion(delay_in_seconds: float = 1, **kwargs):
    """Pause for `delay_in_seconds`, then issue a chat completion request.

    All keyword arguments are forwarded unchanged to `openai.ChatCompletion.create`,
    whose response is returned.
    """
    time.sleep(delay_in_seconds)
    completion = openai.ChatCompletion.create(**kwargs)
    return completion

def set_openai_api_key():
    """Configure the `openai` module from the local key file.

    Reads "path_to_key/key.json" and assigns its "OPENAI_API_KEY" and "ORGANIZATION"
    entries to `openai.api_key` and `openai.organization`; a missing entry is set to None.
    """
    with open("path_to_key/key.json") as key_file:
        credentials = json.load(key_file)
    api_key = credentials.get("OPENAI_API_KEY")
    organization = credentials.get("ORGANIZATION")
    openai.api_key = api_key
    openai.organization = organization

def get_label_constructor(system_message, user_message, default_response_dict):
    """Build a `get_label(domain, example)` function bound to one prompt.

    Args:
        system_message: System prompt sent with every request.
        user_message: User prompt template with `{domain}` and `{example}` placeholders.
        default_response_dict: Callable `(domain, reply) -> dict` used when the model's
            reply cannot be parsed into a dict.

    Returns:
        A function `get_label(domain, example)` returning `(domain, reply_dict)`.
    """
    def parse_reply(domain, reply):
        # The example in the prompt is rendered as a Python dict literal, while the
        # prompt text asks for JSON, so both notations are accepted. literal_eval raises
        # ValueError (not only SyntaxError) on tokens such as null/true, which used to
        # escape the handler and fail the whole task.
        for parser in (json.loads, ast.literal_eval):
            try:
                parsed = parser(reply)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                continue
            if isinstance(parsed, dict):
                return parsed
        # Unparseable, or parsed to something other than a dict (e.g. a bare -1).
        return default_response_dict(domain, reply)

    def get_label(domain, example):
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message.format(domain=domain, example=example)}
        ]
        rate_limit_per_minute = 60 ### Change this according to your rate limit
        delay = 60.0 / rate_limit_per_minute

        response = delayed_chat_completion(
            delay_in_seconds=delay,
            model="gpt-4",
            messages=messages,
            max_tokens=120,
            temperature=0
        )
        reply = response['choices'][0]['message']['content'].replace("Assistant:", "").strip()

        return domain, parse_reply(domain, reply)
    return get_label

def classify_domain(df1, df2, system_message, user_message, example, default_response_dict, output_file):
    """Label every unique, non-empty domain of two frames and save the labels.

    Args:
        df1, df2: DataFrames with a `domain` column.
        system_message, user_message, default_response_dict: Forwarded to
            `get_label_constructor`.
        example: Example response inserted into the user prompt.
        output_file: Path of the JSON file that receives `{domain: label_dict}`.

    Returns:
        Dict mapping each successfully labelled domain to its label dict. Domains whose
        request raised are reported on stdout and left out.
    """
    set_openai_api_key()

    merged_df = pd.concat([df1, df2])
    merged_df = merged_df[pd.notnull(merged_df['domain']) & (merged_df['domain'] != '')]
    unique_domains = merged_df['domain'].unique()

    label_dict = {}
    failed_domains = []
    rate_limiter = RateLimiter(max_calls=55, period=60)

    get_label = get_label_constructor(system_message, user_message, default_response_dict)

    with ThreadPoolExecutor(max_workers=55) as executor:
        future_to_domain = {}
        for domain in unique_domains:
            # The limiter throttles how fast tasks are submitted.
            with rate_limiter:
                future_to_domain[executor.submit(get_label, domain, example)] = domain

        for future in as_completed(future_to_domain):
            domain = future_to_domain[future]
            # Handle failures per task: a single failed request used to abort the loop
            # and discard every label that had not been collected yet.
            try:
                _, label = future.result()
            except Exception as e:
                failed_domains.append(domain)
                print(f"Exception occurred: {str(e)}")
                continue
            label_dict[domain] = label

    if failed_domains:
        print(f"{len(failed_domains)} domain(s) could not be labelled: {failed_domains}")

    with open(output_file, 'w') as json_file:
        json.dump(label_dict, json_file, indent=4)

    return label_dict

In [None]:
def default_response_dict_bias(domain, reply):
    """Fallback label for a reply that could not be parsed.

    Returns a dict with the domain under 'url', a 'political_bias' of -1 and the raw
    reply text under 'reason'.
    """
    fallback = dict(url=domain, political_bias=-1, reason=reply)
    return fallback

# Prompt pieces for the political-bias labelling run.
system_message_bias = "You are an assistant to help the user determine the political bias of websites."
# NOTE(review): the label list below spells the neutral label 'no bias', while the example
# dict (and the later filtering cell) use 'no_bias' -- confirm which spelling the model returns.
user_message_bias = "Rate the political bias of the website '{domain}' using the labels 'far-right', 'right', 'no bias', 'left', 'far-left'. If the assistant has no knowledge of the website, it will return a rating of -1. Assistant should only provide the result in JSON format, do not provide anything else. Here is an example: {example}"
# Inserted into the prompt via str.format, i.e. rendered as a Python dict literal.
example_bias = {
    'url': 'example.com',
    'political_bias': 'no_bias',
    'reason': 'The example website is known to have no political bias.'
}
output_file_bias = 'file_path/file.json'

# Label every unique domain of the covid and climate datasets and save the result.
# Relies on `full_data` (with its `domain` column) from Part I still being in the kernel.
bias_ratings = classify_domain(full_data['covid'], full_data['climate'], system_message_bias, user_message_bias, example_bias, default_response_dict_bias, output_file_bias)

# Part III - Amplification Analysis

## 4. Amplification Analysis with Bootstrapping

### 4.1 Checkpoint - Reload Annotated Data

In [12]:
# Checkpoint: reload `annotated_data` so Part III can start from a fresh kernel.
# NOTE: pickle.load can execute arbitrary code -- only open files from a trusted source.
with open('file_path/file.pkl', 'rb') as file:
    annotated_data = pickle.load(file)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 4.2 Hydrate Datasets with Stratification Variables

In [14]:
def hydrate_data(full_data, json_file_path, target_col, match_col='id', match_func=None):
    """Add a stratification column to every frame in `full_data` (in place).

    Two modes:
      * `match_func` given: `target_col` is `df[match_col].map(match_func)` cast to str
        (so a missing value becomes the string 'None'). `json_file_path` is not read.
      * otherwise: `json_file_path` must hold `{dataset_name: [records...]}`; for each
        dataset the records' `match_col -> target_col` pairs are mapped onto the frame.

    For `target_col == 'political_bias'` the labels are normalised afterwards:
    'None' and 'no bias' become 'no_bias', and '-1' becomes 'unknown'.

    Args:
        full_data: Mapping of dataset name to DataFrame; the frames are modified.
        json_file_path: JSON file with the per-dataset records (second mode only).
        target_col: Name of the column to add.
        match_col: Column used as the lookup key.
        match_func: Optional callable mapping a `match_col` value to the target value.

    Returns:
        The same `full_data` mapping, for chaining.
    """
    if match_func is not None:
        for df in full_data.values():
            df[target_col] = df[match_col].map(match_func).astype(str)
    else:
        # The file is only needed in this mode, so it is no longer opened otherwise.
        with open(json_file_path) as file:
            data = json.load(file)
        grouping_var = {key: pd.DataFrame(value) for key, value in data.items()}
        for key, df in full_data.items():
            other_df = grouping_var.get(key)
            if other_df is not None and {match_col, target_col}.issubset(other_df.columns):
                mapping = other_df.set_index(match_col)[target_col].to_dict()
                df[target_col] = df[match_col].map(mapping)
            else:
                # Previously skipped silently, surfacing later as a KeyError far from the cause.
                print(f"Warning: no '{target_col}' values for '{key}' in {json_file_path}; column not added.")

    if target_col == 'political_bias':
        # The labelling prompt spells the neutral label 'no bias' while the rest of the
        # notebook filters on 'no_bias', so both spellings are folded together.
        for df in full_data.values():
            df[target_col] = df[target_col].replace({'None': 'no_bias', 'no bias': 'no_bias', '-1': 'unknown'})

    return full_data

# Load the political bias data
with open("file_path/file.json") as f:
    political_bias_data = json.load(f)


def _lookup_political_bias(domain):
    """Return the stored bias label for `domain`, or None when there is none."""
    entry = political_bias_data.get(domain)
    if not isinstance(entry, dict):
        return None
    # The labelling step stores the label under 'political_bias' (see the prompt example
    # and the fallback dict). Reading only 'rating' returned None for such entries, which
    # turned every domain into 'no_bias'. 'rating' is still honoured first, so files that
    # do use that key behave exactly as before.
    return entry.get('rating', entry.get('political_bias'))


# Hydrate the data with different stratification variables
# Each tuple is (json_file_path, target_col[, match_col, match_func]).
hydrations = [
    ('file_path/file.json', 'toxicity_cluster'),
    ('file_path/file.json', 'engagement_cluster'),
    ('file_path/file.json', 'followers_cluster'),
    ('file_path/file.json', 'political_bias', 'domain', _lookup_political_bias)
]

for hydration in hydrations:
    annotated_data = hydrate_data(annotated_data, *hydration)

### 4.3 Run Baseline Comparisons With Bootstrapping

#### 4.3.1  Define Baseline Stratification with Bootstrapping

In [5]:
from arch.bootstrap import IIDBootstrap
from sklearn.utils import resample
from scipy.stats import iqr

def create_slices(data, additional_var=None):
    """Partition `impressions_count` by stratum.

    A stratum is every combination of the distinct `engagement_cluster` and
    `followers_cluster` values, optionally crossed with the distinct values of
    `additional_var`. Combinations that match no row are kept as empty Series.

    Returns:
        Dict keyed by `(engagement, followers)` or `(engagement, followers, extra)`
        whose values are the matching `impressions_count` Series.
    """
    engagement_levels = data["engagement_cluster"].unique()
    follower_levels = data["followers_cluster"].unique()
    slices = {}

    if additional_var is None:
        for engagement in engagement_levels:
            for followers in follower_levels:
                mask = (data["engagement_cluster"] == engagement) & (data["followers_cluster"] == followers)
                slices[(engagement, followers)] = data[mask]["impressions_count"]
        return slices

    extra_levels = data[additional_var].unique()
    for engagement in engagement_levels:
        for followers in follower_levels:
            for new_var in extra_levels:
                mask = (data["engagement_cluster"] == engagement) & (data["followers_cluster"] == followers) & (data[additional_var] == new_var)
                slices[(engagement, followers, new_var)] = data[mask]["impressions_count"]
    return slices

def calculate_difference(sample_1, sample_2, percentage=False):
    """Difference between the means of two samples.

    Returns `mean(sample_1) - mean(sample_2)`. With `percentage=True` the difference is
    expressed as a percentage of `mean(sample_2)`, unless that mean is close to zero,
    in which case the plain difference is returned.
    """
    first_mean = np.mean(sample_1)
    second_mean = np.mean(sample_2)
    gap = first_mean - second_mean
    if not percentage or np.isclose(second_mean, 0):
        return gap
    return gap / second_mean * 100

def bootstrap_iteration(data_slices, iteration, keys, stratum_boot_differences, percentage=False):
    """Run one stratified bootstrap draw and return the pooled difference.

    For every stratum of the first group, both groups are resampled to the size of the
    smaller of the two slices. The per-stratum difference (first minus second group) is
    appended to `stratum_boot_differences[stratum]`, and the difference over all
    resampled values pooled across strata is returned.

    Args:
        data_slices: `{group_key: {stratum: Series}}` as built with `create_slices`.
        iteration: Index of the current draw (not used in the computation).
        keys: Two group keys; differences are `keys[0]` minus `keys[1]`.
        stratum_boot_differences: `{stratum: list}` that is extended in place.
        percentage: Forwarded to `calculate_difference`.
    """
    first_key, second_key = keys[0], keys[1]
    pooled = {cred_key: [] for cred_key in keys}

    for stratum, first_slice in data_slices[first_key].items():
        draw_size = min(len(first_slice), len(data_slices[second_key][stratum]))

        draws = {}
        for cred_key in keys:
            draws[cred_key] = resample(data_slices[cred_key][stratum], n_samples=draw_size)
        for cred_key in keys:
            pooled[cred_key].extend(draws[cred_key])

        stratum_boot_differences[stratum].append(
            calculate_difference(draws[first_key], draws[second_key], percentage)
        )

    return calculate_difference(pooled[first_key], pooled[second_key], percentage)

def stratified_bootstrapping(annotated_data, high_cred_key, low_cred_key, n_iterations, main_var_name, seed, ci=0.95, percentage=False, additional_var=None):
    """Bootstrap the low-minus-high credibility difference in impressions, by stratum.

    Args:
        annotated_data: Mapping of dataset name to DataFrame holding the cluster columns
            and `impressions_count`.
        high_cred_key, low_cred_key: Names of the two datasets to compare.
        n_iterations: Number of bootstrap draws.
        main_var_name: Column name used for the difference in both result frames.
        seed: Seed for NumPy's global random state (used by the resampling step).
        ci: Coverage of the per-stratum confidence interval.
        percentage: Report differences as a percentage of the high-credibility mean.
        additional_var: Optional third stratification column.

    Returns:
        `(results_df, stratum_results_df)`: one overall difference per draw, and one row
        per stratum with the mean, median, IQR and confidence bounds of its differences
        plus the combined sample size.

    Raises:
        ValueError: if no stratum has observations in both datasets.
    """
    np.random.seed(seed)
    all_slices = {cred_key: create_slices(annotated_data[cred_key], additional_var) for cred_key in [high_cred_key, low_cred_key]}

    def is_usable(stratum):
        # A stratum can be compared only when both groups contain it and neither slice is
        # empty; anything else made the resampling step fail.
        return all(stratum in slices and len(slices[stratum]) > 0 for slices in all_slices.values())

    # Filter each group separately so its original stratum order is preserved.
    data_slices = {cred_key: {stratum: values for stratum, values in slices.items() if is_usable(stratum)}
                   for cred_key, slices in all_slices.items()}
    if not data_slices[high_cred_key]:
        raise ValueError(f"No stratum has observations in both '{high_cred_key}' and '{low_cred_key}'.")
    n_candidates = len({stratum for slices in all_slices.values() for stratum in slices})
    n_skipped = n_candidates - len(data_slices[high_cred_key])
    if n_skipped:
        print(f"Skipping {n_skipped} strata without observations in both '{high_cred_key}' and '{low_cred_key}'.")

    stratum_boot_differences = {key: [] for key in data_slices[high_cred_key].keys()}

    boot_differences = []
    for iteration in range(n_iterations):
        # Group order [low, high] makes every difference "low minus high".
        overall_difference = bootstrap_iteration(data_slices, iteration, [low_cred_key, high_cred_key], stratum_boot_differences, percentage)
        boot_differences.append(overall_difference)

    results_df = pd.DataFrame({main_var_name: boot_differences})

    stratum_results = []
    for key, differences in stratum_boot_differences.items():
        # conf_int returns a 2 x 1 array (lower row, upper row); flatten it to two floats
        # so the CI columns hold numbers rather than 1-element arrays. `ci` is now passed
        # through -- it used to be accepted but ignored.
        # NOTE(review): IIDBootstrap appears to draw from its own random state, so these
        # bounds may not be reproducible from `seed` alone -- TODO confirm.
        bounds = IIDBootstrap(np.array(differences)).conf_int(np.mean, 1000, method='percentile', size=ci)
        ci_lower, ci_upper = (float(bound) for bound in np.ravel(bounds))
        stratum_results.append({'stratum': key,
                                main_var_name: np.mean(differences),
                                'median': np.median(differences),
                                'iqr': iqr(differences),
                                'ci_lower': ci_lower,
                                'ci_upper': ci_upper,
                                'total_sample_size': len(data_slices[high_cred_key][key]) + len(data_slices[low_cred_key][key])})

    stratum_results_df = pd.DataFrame(stratum_results)

    return results_df, stratum_results_df

In [6]:
### Baseline Results - Absolute Values
# Stratified by engagement and follower clusters only; 1000 draws per topic, seed 18.
# Differences are low-credibility minus high-credibility mean impressions.
covid_results, covid_results_stratum = stratified_bootstrapping(annotated_data, 'covid_high_cred', 'covid_low_cred', n_iterations=1000,main_var_name='mean_difference', seed=18)
climate_results, climate_results_stratum = stratified_bootstrapping(annotated_data, 'climate_high_cred', 'climate_low_cred', n_iterations=1000, main_var_name='mean_difference', seed=18)

## Baseline Results - Percentage Values
# Same runs with the difference expressed as a percentage of the high-credibility mean.
covid_results_percentage, covid_results_stratum_percentage = stratified_bootstrapping(annotated_data, 'covid_high_cred', 'covid_low_cred', n_iterations=1000, main_var_name='mean_difference_perc',seed=18,percentage=True)
climate_results_percentage, climate_results_stratum_percentage = stratified_bootstrapping(annotated_data, 'climate_high_cred', 'climate_low_cred', n_iterations=1000, seed=18,main_var_name='mean_difference_perc', percentage=True)

In [None]:
import pickle

# Bundle the eight baseline result frames (overall and per-stratum, absolute and
# percentage, for both topics) under their variable names.
results_dict = {
    'covid_results': covid_results,
    'covid_results_stratum': covid_results_stratum,
    'climate_results': climate_results,
    'climate_results_stratum': climate_results_stratum,
    'covid_results_percentage': covid_results_percentage,
    'covid_results_stratum_percentage': covid_results_stratum_percentage,
    'climate_results_percentage': climate_results_percentage,
    'climate_results_stratum_percentage': climate_results_stratum_percentage
}

# NOTE(review): the path is the same placeholder string as the annotated-data pickle --
# confirm the real paths differ so the input data is not overwritten.
with open('file_path/file.pkl', 'wb') as file:
    pickle.dump(results_dict, file)

### 4.4 Estimate Impact of Additional Stratification Variables

#### 4.4.1 Define Function to Compute the Effect of Additional Stratification Variables

In [9]:
import numpy as np
import pandas as pd

def estimate_stratification_effect(stratum_results, stratum_variable_results, stratum_variable, percentage=True):
    """Average shift in the stratum difference caused by one extra stratification variable.

    Args:
        stratum_results: Baseline per-stratum results; `stratum` holds
            `(engagement, followers)` tuples.
        stratum_variable_results: Per-stratum results of the extended run; `stratum`
            holds `(engagement, followers, extra)` tuples.
        stratum_variable: Name of the extra variable, used to label the output columns.
        percentage: Use the `mean_difference_perc` column if True, else `mean_difference`.

    Returns:
        DataFrame with `<stratum_variable>_stratum` and `impact_of_<stratum_variable>`:
        for each level of the extra variable, the mean over all engagement/followers
        strata of (extended difference minus baseline difference).
    """
    value_col = 'mean_difference_perc' if percentage else 'mean_difference'
    variable_col = stratum_variable + '_stratum'
    impact_col = 'impact_of_' + stratum_variable
    base_keys = ['engagement_stratum', 'followers_stratum']

    def component(df, position, col_name='stratum'):
        # Pick one element of each stratum tuple; NaN when the tuple is too short.
        return df[col_name].apply(lambda parts: parts[position] if position < len(parts) else np.nan)

    # Baseline: one value per (engagement, followers) stratum.
    baseline = stratum_results.copy()
    for position, column in enumerate(base_keys):
        baseline[column] = component(baseline, position)
    baseline_avg_diff = (
        baseline.drop(['stratum', 'total_sample_size'], axis=1)
        .groupby(base_keys)
        .agg({value_col: 'mean'})
        .reset_index()
        .rename(columns={value_col: 'baseline_mean_difference'})
    )

    # Extended run: one value per (engagement, followers, extra) stratum.
    detailed_keys = base_keys + [variable_col]
    detailed = stratum_variable_results.copy()
    for position, column in enumerate(detailed_keys):
        detailed[column] = component(detailed, position)
    avg_diff = (
        detailed.drop(['stratum', 'total_sample_size'], axis=1)
        .groupby(detailed_keys)
        .agg({value_col: 'mean'})
        .reset_index()
    )

    # Shift relative to the baseline of the same engagement/followers stratum.
    result = pd.merge(avg_diff, baseline_avg_diff, on=base_keys)
    result[impact_col] = result[value_col] - result['baseline_mean_difference']

    return result.groupby(variable_col)[impact_col].mean().reset_index()

#### 4.4.2 Additional Stratifications - Toxicity


In [10]:
## Toxicity Results - Absolute Values
# Same settings as the baseline runs, with `toxicity_cluster` as a third stratification variable.
covid_results_toxicity, covid_results_toxicity_stratum = stratified_bootstrapping(annotated_data, 'covid_high_cred', 'covid_low_cred', n_iterations=1000,main_var_name='mean_difference', seed=18,additional_var="toxicity_cluster")
climate_results_toxicity, climate_results_toxicity_stratum = stratified_bootstrapping(annotated_data, 'climate_high_cred', 'climate_low_cred', n_iterations=1000, main_var_name='mean_difference', seed=18,additional_var="toxicity_cluster")

## Toxicity Results - Percentage Values
covid_results_toxicity_percentage, covid_results_toxicity_stratum_percentage = stratified_bootstrapping(annotated_data, 'covid_high_cred', 'covid_low_cred', n_iterations=1000, main_var_name='mean_difference_perc',seed=18,percentage=True,additional_var="toxicity_cluster")
climate_results_toxicity_percentage, climate_results_toxicity_stratum_percentage = stratified_bootstrapping(annotated_data, 'climate_high_cred', 'climate_low_cred', n_iterations=1000, seed=18,main_var_name='mean_difference_perc', percentage=True,additional_var="toxicity_cluster")

In [11]:
# Compute Effect of Toxicity Stratification
# Compares the percentage-based per-stratum results with and without the toxicity variable
# (`percentage` defaults to True in estimate_stratification_effect).
covid_effects_tox = estimate_stratification_effect(covid_results_stratum_percentage, covid_results_toxicity_stratum_percentage, "toxicity")
climate_effects_tox = estimate_stratification_effect(climate_results_stratum_percentage, climate_results_toxicity_stratum_percentage, "toxicity")

#### 4.4.3 Additional Stratifications - Political Bias


In [None]:
# The political_bias labels need cleaning before they can be used as a stratum:
#   * 'far-right' does not occur in the high-credibility data (there are no
#     high-credibility far-right domains), so 'far-right' is merged with 'right' and,
#     symmetrically, 'far-left' with 'left';
#   * 'no_bias' sources are only ~0.2% of the low-credibility data, so they are dropped,
#     together with the 'unknown' rows.

annotated_data_reduced = {}

# Label -> merged group; labels not listed here are left as they are.
bias_groups = {
    'right': 'right_group',
    'far-right': 'right_group',
    'left': 'left_group',
    'far-left': 'left_group',
}

for key in annotated_data:
    # Work on a copy so the original frames keep the fine-grained labels.
    df = annotated_data[key].copy()
    df['political_bias'] = df['political_bias'].replace(bias_groups)

    # Keep every row whose label is neither 'unknown' nor 'no_bias'.
    df = df[~df['political_bias'].isin(['unknown', 'no_bias'])]

    annotated_data_reduced[key] = df

In [None]:
## Political Bias Results - Absolute Values
# Run on `annotated_data_reduced` (merged left/right groups, no 'unknown' / 'no_bias' rows)
# with `political_bias` as a third stratification variable.
covid_results_bias, covid_results_bias_stratum= stratified_bootstrapping(annotated_data_reduced, 'covid_high_cred', 'covid_low_cred', n_iterations=1000,main_var_name='mean_difference', seed=18,additional_var="political_bias")
climate_results_bias, climate_results_bias_stratum = stratified_bootstrapping(annotated_data_reduced, 'climate_high_cred', 'climate_low_cred', n_iterations=1000, main_var_name='mean_difference', seed=18,additional_var="political_bias")

## Political Bias - Percentage Values
covid_results_bias_percentage, covid_results_bias_stratum_percentage = stratified_bootstrapping(annotated_data_reduced, 'covid_high_cred', 'covid_low_cred', n_iterations=1000, main_var_name='mean_difference_perc',seed=18,percentage=True,additional_var="political_bias")
climate_results_bias_percentage, climate_results_bias_stratum_percentage = stratified_bootstrapping(annotated_data_reduced, 'climate_high_cred', 'climate_low_cred', n_iterations=1000, seed=18,main_var_name='mean_difference_perc', percentage=True,additional_var="political_bias")

In [None]:
# Compute Effect of Bias Stratification
# NOTE(review): the baseline frames come from the full `annotated_data`, while the bias
# frames come from `annotated_data_reduced` -- confirm this comparison is intended.
covid_effects_bias = estimate_stratification_effect(covid_results_stratum_percentage, covid_results_bias_stratum_percentage,'political_bias')
climate_effects_bias = estimate_stratification_effect(climate_results_stratum_percentage, climate_results_bias_stratum_percentage,'political_bias')

#### 4.4.4 Additional Stratifications - Verified Status

In [None]:
## Verified Results - Absolute Values
# Same settings as the baseline runs, with the `verified` column as a third stratification variable.
covid_results_verified, covid_results_verified_stratum= stratified_bootstrapping(annotated_data, 'covid_high_cred', 'covid_low_cred', n_iterations=1000,main_var_name='mean_difference', seed=18,additional_var="verified")
climate_results_verified, climate_results_verified_stratum = stratified_bootstrapping(annotated_data, 'climate_high_cred', 'climate_low_cred', n_iterations=1000, main_var_name='mean_difference', seed=18,additional_var="verified")

## Verified Results - Percentage Values
covid_results_verified_percentage, covid_results_verified_stratum_percentage = stratified_bootstrapping(annotated_data, 'covid_high_cred', 'covid_low_cred', n_iterations=1000, main_var_name='mean_difference_perc',seed=18,percentage=True,additional_var="verified")
climate_results_verified_percentage, climate_results_verified_stratum_percentage = stratified_bootstrapping(annotated_data, 'climate_high_cred', 'climate_low_cred', n_iterations=1000, seed=18,main_var_name='mean_difference_perc', percentage=True,additional_var="verified")

In [None]:
# Compute Effect of Verified-Status Stratification
covid_effects_verified = estimate_stratification_effect(covid_results_stratum_percentage, covid_results_verified_stratum_percentage,'verified',percentage=True)
climate_effects_verified = estimate_stratification_effect(climate_results_stratum_percentage, climate_results_verified_stratum_percentage,'verified',percentage=True)

#### 4.4.5 Export Stratification Results

In [None]:
import pandas as pd
import pickle

def prepare_stratification_data(prefix):
    """Stack the three effect tables of one topic into a single long table.

    Looks up the notebook globals `<prefix>_effects_bias`, `<prefix>_effects_verified`
    and `<prefix>_effects_tox`, renames their columns to the common `stratum` /
    `impact` pair, tags each with a `stratum_type`, concatenates them and replaces the
    raw stratum values by descriptive labels.

    Args:
        prefix: Topic prefix of the global variable names, e.g. 'covid'.

    Returns:
        DataFrame with columns `stratum` (str), `impact` and `stratum_type`.
    """
    # (global-name suffix, column renames, stratum type) for each effect table, in
    # output order.
    sources = [
        ('bias', {"political_bias_stratum": "stratum", "impact_of_political_bias": "impact"}, 'political_bias'),
        ('verified', {"verified_stratum": "stratum", "impact_of_verified": "impact"}, 'verified'),
        ('tox', {"toxicity_stratum": "stratum", "impact_of_toxicity": "impact"}, 'toxicity'),
    ]

    parts = []
    for suffix, renames, stratum_type in sources:
        part = globals()[f'{prefix}_effects_{suffix}'].rename(columns=renames)
        part['stratum_type'] = stratum_type
        parts.append(part)

    df = pd.concat(parts)
    df['stratum'] = df['stratum'].astype(str)  # labels below are matched as strings

    # Raw stratum value -> descriptive label
    label_mapping = {
        "True": "verified_true",
        "False": "verified_false",
        "right_group": "political_bias_right",
        "left_group": "political_bias_left",
        "0": "toxicity_low",
        "1": "toxicity_mid",
        "2": "toxicity_high"
    }
    df['stratum'] = df['stratum'].replace(label_mapping)

    return df

# Build the long-format effect tables; this needs the `<topic>_effects_bias`,
# `<topic>_effects_verified` and `<topic>_effects_tox` variables from the cells above.
covid_stratifications = prepare_stratification_data('covid')
climate_stratifications = prepare_stratification_data('climate')

# NOTE: this rebinds `results_dict`, which earlier held the baseline results.
results_dict = {
    'covid_stratifications': covid_stratifications,
    'climate_stratifications': climate_stratifications
}


# NOTE(review): the path is the same placeholder string as earlier pickles -- confirm the
# real paths differ so previous outputs are not overwritten.
with open('file_path/file.pkl', 'wb') as file:
    pickle.dump(results_dict, file)
