# Calculation of Bias Variables 

In [1]:
# Import relevant libraries
import pandas as pd
from Dbias.bias_classification import classify
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm
TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at d4data/bias-detection-model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Device set to use 0


In [2]:
# Load the first sentiment file (data/sentiment_data_0.parquet) using the pyarrow engine
df0_cleaned = pd.read_parquet('data/sentiment_data_0.parquet', engine="pyarrow")

# Display the first two rows as a quick look at the loaded columns
df0_cleaned.head(2)

Unnamed: 0,ID,publishedAt,instances,source-name,location_code,location,category,year,month,new_title,neg,neu,pos,compound,sentiment_category
0,12436,2020-08-06T09:21:27Z,"[{'category': 'general', 'collectedAt': '2020-...",Albidda.net,ae,United Arab Emirates,general,2020,8,A doctor warns of new symptoms of “Corona” tha...,0.149,0.851,0.0,-0.2023,negative
1,12541,2020-08-06T15:45:39Z,"[{'category': 'general', 'collectedAt': '2020-...",Middle East Online,ae,United Arab Emirates,general,2020,8,Foldable phones lead Samsung to climb the top ...,0.0,0.893,0.107,0.2023,positive


In [3]:
# add bias category and probability to each dataframe based on the 'new_title' column

def add_bias_cols(df, text_col="new_title", classifier=None):
    """Return a copy of ``df`` with the bias classifier's output columns appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    text_col : str, default "new_title"
        Name of the column whose text is classified.
    classifier : callable, optional
        Called once with the list of texts and expected to return one
        dict per text (the notebook's results show ``label`` and ``score``
        keys). Defaults to the ``classify`` function imported at the top
        of the notebook.

    Returns
    -------
    pandas.DataFrame
        ``df`` with a fresh 0..n-1 index plus one column per key of the
        classifier output.

    Raises
    ------
    ValueError
        If the classifier returns a different number of results than
        there are rows, since the columns could not be aligned row by row.
    """
    if classifier is None:
        # Looked up at call time so the notebook-level import is used.
        classifier = classify

    # reset_index returns a new frame, so the caller's df stays untouched
    # and the positional concat below lines up row for row.
    base = df.reset_index(drop=True)

    # Missing titles become empty strings so every row yields a str.
    texts = base[text_col].fillna("").astype(str).tolist()

    if not texts:
        # Nothing to classify: skip the model call but keep the output columns.
        return base.assign(
            label=pd.Series(dtype="object"),
            score=pd.Series(dtype="float64"),
        )

    # Batch inference: one call for the whole list of texts.
    outputs = classifier(texts)

    # Convert model output (list of dicts) to a dataframe.
    bias_df = pd.DataFrame(outputs)

    if len(bias_df) != len(base):
        # concat(axis=1) would silently pad the shorter side with NaN.
        raise ValueError(
            f"classifier returned {len(bias_df)} results for {len(base)} rows"
        )

    return pd.concat([base, bias_df], axis=1)

In [8]:
# Try add_bias_cols on the first 20 rows and show the first two results
subset = df0_cleaned.head(20)
subset_w_bias = add_bias_cols(subset)
subset_w_bias.head(2)

Unnamed: 0,ID,publishedAt,instances,source-name,location_code,location,category,year,month,new_title,neg,neu,pos,compound,sentiment_category,label,score
0,12436,2020-08-06T09:21:27Z,"[{'category': 'general', 'collectedAt': '2020-...",Albidda.net,ae,United Arab Emirates,general,2020,8,A doctor warns of new symptoms of “Corona” tha...,0.149,0.851,0.0,-0.2023,negative,Non-biased,0.781007
1,12541,2020-08-06T15:45:39Z,"[{'category': 'general', 'collectedAt': '2020-...",Middle East Online,ae,United Arab Emirates,general,2020,8,Foldable phones lead Samsung to climb the top ...,0.0,0.893,0.107,0.2023,positive,Biased,0.581067


In [5]:
# turn score into signed score 
def add_sign(row):
    """Return the row's score signed according to its bias label.

    Parameters
    ----------
    row : object exposing ``label`` and ``score`` attributes
        For example one row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    ``row.score`` when the label is "Biased", ``-row.score`` when it is
    "Non-biased", and ``np.nan`` for any other label.
    """
    label = row.label

    # Each recognised label returns immediately with its sign applied.
    if label == "Biased":
        return row.score
    if label == "Non-biased":
        return -row.score

    # Any other label has no defined sign.
    return np.nan

In [9]:
# Apply add_sign to every row of the subset and store the result as "bias_score".
# Note: this writes the new column into subset_w_bias in place.
subset_w_bias["bias_score"] = subset_w_bias.apply(add_sign, axis=1)
subset_w_bias.head(2)

Unnamed: 0,ID,publishedAt,instances,source-name,location_code,location,category,year,month,new_title,neg,neu,pos,compound,sentiment_category,label,score,bias_score
0,12436,2020-08-06T09:21:27Z,"[{'category': 'general', 'collectedAt': '2020-...",Albidda.net,ae,United Arab Emirates,general,2020,8,A doctor warns of new symptoms of “Corona” tha...,0.149,0.851,0.0,-0.2023,negative,Non-biased,0.781007,-0.781007
1,12541,2020-08-06T15:45:39Z,"[{'category': 'general', 'collectedAt': '2020-...",Middle East Online,ae,United Arab Emirates,general,2020,8,Foldable phones lead Samsung to climb the top ...,0.0,0.893,0.107,0.2023,positive,Biased,0.581067,0.581067


In [10]:
# Rename "label" to "bias_category", then drop the raw score and the
# "instances" column, all in one chain
subset_w_bias = (
    subset_w_bias
    .rename(columns={"label": "bias_category"})
    .drop(columns=["score", "instances"])
)
subset_w_bias.head(2)

Unnamed: 0,ID,publishedAt,source-name,location_code,location,category,year,month,new_title,neg,neu,pos,compound,sentiment_category,bias_category,bias_score
0,12436,2020-08-06T09:21:27Z,Albidda.net,ae,United Arab Emirates,general,2020,8,A doctor warns of new symptoms of “Corona” tha...,0.149,0.851,0.0,-0.2023,negative,Non-biased,-0.781007
1,12541,2020-08-06T15:45:39Z,Middle East Online,ae,United Arab Emirates,general,2020,8,Foldable phones lead Samsung to climb the top ...,0.0,0.893,0.107,0.2023,positive,Biased,0.581067


In [11]:
# function to put it all together 

def add_and_clean_bias(df):
    """Classify ``df`` for bias and return a cleaned frame with a signed score.

    Steps:
      1. ``add_bias_cols`` appends the classifier's ``label`` and ``score``.
      2. ``bias_score`` is derived with the same mapping as ``add_sign``:
         ``+score`` for "Biased", ``-score`` for "Non-biased", NaN otherwise.
      3. ``label`` is renamed to ``bias_category``; ``score`` and
         ``instances`` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``add_bias_cols`` that also has an ``instances``
        column.

    Returns
    -------
    pandas.DataFrame
        New frame with ``bias_category`` and ``bias_score`` columns.

    Raises
    ------
    KeyError
        If ``instances`` (or the classifier's ``label``/``score``) is missing.
    """
    # add bias cols to df
    new_df = add_bias_cols(df)
    print("Bias columns added!")

    # convert to signed score; vectorized over the whole column instead of
    # calling a Python function once per row via apply(axis=1)
    labels = new_df["label"]
    scores = new_df["score"]
    new_df["bias_score"] = np.select(
        [labels == "Biased", labels == "Non-biased"],
        [scores, -scores],
        default=np.nan,  # labels outside the two known values get no score
    )

    # rename and drop
    new_df = (
        new_df
        .rename(columns={"label": "bias_category"})
        .drop(columns=["score", "instances"])
    )
    print("Cleaning finished!")

    return new_df

In [None]:
# Extract selected countries into their own dataframes,
# then run the bias model on each country (or on chunks of a country's rows)

# ar	Argentina	        159139
# in	India	            145536
# us	United States	    144800
# ca	Canada	            143928
# it	Italy	            129005
# ru	Russian Federation	113395
# cn	China	            91561

In [None]:
# get specific countries as separate files

import pyarrow.dataset as ds
import glob

# glob returns matches in arbitrary order; sort so the file list handed to
# the dataset is the same on every run.
paths = sorted(glob.glob("data/sentiment_data_*.parquet"))
if not paths:
    # Fail here with a clear message rather than building a dataset from no files.
    raise FileNotFoundError("No files matched data/sentiment_data_*.parquet")

# One logical dataset spanning all matched parquet files
dataset = ds.dataset(paths, format="parquet")

def get_country_data(location_code, country_name):
    """Extract one country's rows from the notebook-level ``dataset``.

    Rows whose ``location_code`` equals the given code are read into a
    pandas DataFrame, written to ``data/{country_name}_data.parquet``
    (without the index), and returned.

    Parameters
    ----------
    location_code : str
        Value to match in the ``location_code`` column (e.g. "cn").
    country_name : str
        Used to build the output file name.

    Returns
    -------
    pandas.DataFrame
        The filtered rows.
    """
    # Build the filter expression first, then materialise only matching rows.
    country_filter = ds.field("location_code") == location_code
    country_df = dataset.to_table(filter=country_filter).to_pandas()

    # Persist the extract next to the source files.
    country_df.to_parquet(f"data/{country_name}_data.parquet", index=False)

    return country_df

In [16]:
# Extract the rows with location_code "cn"; get_country_data also writes
# them to data/china_data.parquet
df_china = get_country_data("cn", "china")
df_china.head(2)

Unnamed: 0,ID,publishedAt,instances,source-name,location_code,location,category,year,month,new_title,neg,neu,pos,compound,sentiment_category
0,246,2020-08-07T07:33:39Z,"[{'category': 'general', 'collectedAt': '2020-...",Sina.com.cn,cn,China,general,2020,8,Xiaopao’s smart hero Hongfunu’s 12+5 plan capt...,0.0,0.695,0.305,0.7644,positive
1,384,2020-08-07T08:48:00Z,"[{'category': 'general', 'collectedAt': '2020-...",Sohu.com,cn,China,general,2020,8,Early surrender? Blues 6 will miss the Champio...,0.287,0.588,0.125,-0.5267,negative


In [17]:
import pyarrow.parquet as pq

def process_parquet_in_chunks(path, batch_size=5000):
    """Run ``add_and_clean_bias`` over a parquet file one batch at a time.

    Parameters
    ----------
    path : str
        Parquet file to read.
    batch_size : int, default 5000
        Batch size passed to ``ParquetFile.iter_batches``.

    Returns
    -------
    pandas.DataFrame
        All processed batches concatenated with a fresh index.
    """
    source = pq.ParquetFile(path)
    processed = []

    # enumerate from 1 so the progress messages are human-numbered
    for batch_no, record_batch in enumerate(
        source.iter_batches(batch_size=batch_size), start=1
    ):
        print(f"Running batch {batch_no}:")
        processed.append(add_and_clean_bias(record_batch.to_pandas()))
        print(f"Finished batch {batch_no}!")

    return pd.concat(processed, ignore_index=True)

In [18]:
# Run the bias pipeline over the saved China file, reading it in batches of 10,000
china_bias = process_parquet_in_chunks("data/china_data.parquet", batch_size=10000)

# Display the first two processed rows
china_bias.head(2)

Running batch 1:
Bias columns added!
Cleaning finished!
Finished batch 1!
Running batch 2:
Bias columns added!
Cleaning finished!
Finished batch 2!
Running batch 3:
Bias columns added!
Cleaning finished!
Finished batch 3!
Running batch 4:
Bias columns added!
Cleaning finished!
Finished batch 4!
Running batch 5:
Bias columns added!
Cleaning finished!
Finished batch 5!
Running batch 6:
Bias columns added!
Cleaning finished!
Finished batch 6!
Running batch 7:
Bias columns added!
Cleaning finished!
Finished batch 7!
Running batch 8:
Bias columns added!
Cleaning finished!
Finished batch 8!
Running batch 9:
Bias columns added!
Cleaning finished!
Finished batch 9!
Running batch 10:
Bias columns added!
Cleaning finished!
Finished batch 10!


Unnamed: 0,ID,publishedAt,source-name,location_code,location,category,year,month,new_title,neg,neu,pos,compound,sentiment_category,bias_category,bias_score
0,246,2020-08-07T07:33:39Z,Sina.com.cn,cn,China,general,2020,8,Xiaopao’s smart hero Hongfunu’s 12+5 plan capt...,0.0,0.695,0.305,0.7644,positive,Biased,0.943532
1,384,2020-08-07T08:48:00Z,Sohu.com,cn,China,general,2020,8,Early surrender? Blues 6 will miss the Champio...,0.287,0.588,0.125,-0.5267,negative,Biased,0.551895


In [19]:
# Write the China rows with their bias columns to a new parquet file (index not stored)
china_bias.to_parquet("data/china_data_final.parquet", index=False)

In [None]:
# Process sentiment files 1 through 4 one at a time and save each result

for i in range(1, 5):
    print(i)

    # Input file and its matching output file for this index
    filename, new_filename = (
        f"data/sentiment_data_{i}.parquet",
        f"data/final_data_{i}.parquet",
    )

    # Classify the file batch by batch
    print(f"Processing {filename}:")
    df_bias = process_parquet_in_chunks(filename, batch_size=5000)

    # Write the processed frame and confirm
    df_bias.to_parquet(new_filename, index=False)
    print(f"Successfully saved {new_filename}!")