In [28]:
import pandas as pd
import tensorflow as tf
from transformers import (
    DistilBertTokenizer, TFDistilBertForSequenceClassification,
    RobertaTokenizer, TFRobertaForSequenceClassification,
)
from openai import OpenAI
import re
from Levenshtein import distance as levenshtein_distance

In [29]:
# Load the fine-tuned DistilBERT model and tokenizer
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('./distilbert_finetuned')
distilbert_model = TFDistilBertForSequenceClassification.from_pretrained('./distilbert_finetuned')

# Load the fine-tuned RoBERTa model and tokenizer for the first variant
roberta_tokenizer_v1 = RobertaTokenizer.from_pretrained('./roberta_v1_finetuned')
roberta_model_v1 = TFRobertaForSequenceClassification.from_pretrained('./roberta_v1_finetuned')

# Load the fine-tuned RoBERTa model and tokenizer for the second variant
roberta_tokenizer_v2 = RobertaTokenizer.from_pretrained('./roberta_v2_finetuned')
roberta_model_v2 = TFRobertaForSequenceClassification.from_pretrained('./roberta_v2_finetuned')

def ensemble_classify_news_and_evaluate_accuracy(df, column):
    # Lists to store individual model predictions
    distilbert_predictions = []
    roberta_v1_predictions = []
    roberta_v2_predictions = []
    ensemble_predictions = []
    
    for _, row in df.iterrows():
        text_input = row[column]

        # Prepare inputs and get probabilities for DistilBERT
        distilbert_inputs = distilbert_tokenizer(text_input, return_tensors="tf", truncation=True, padding='max_length', max_length=512)
        distilbert_outputs = distilbert_model(distilbert_inputs)
        distilbert_probabilities = tf.nn.softmax(distilbert_outputs.logits, axis=-1)
        distilbert_predicted_class_index = tf.argmax(distilbert_probabilities, axis=-1).numpy()[0]
        distilbert_predictions.append(True if distilbert_predicted_class_index == 1 else False)

        # Prepare inputs and get probabilities for RoBERTa variant 1
        roberta_inputs_v1 = roberta_tokenizer_v1(text_input, return_tensors="tf", truncation=True, padding='max_length', max_length=512)
        roberta_outputs_v1 = roberta_model_v1(roberta_inputs_v1)
        roberta_probabilities_v1 = tf.nn.softmax(roberta_outputs_v1.logits, axis=-1)
        roberta_v1_predicted_class_index = tf.argmax(roberta_probabilities_v1, axis=-1).numpy()[0]
        roberta_v1_predictions.append(True if roberta_v1_predicted_class_index == 1 else False)

        # Prepare inputs and get probabilities for RoBERTa variant 2
        roberta_inputs_v2 = roberta_tokenizer_v2(text_input, return_tensors="tf", truncation=True, padding='max_length', max_length=512)
        roberta_outputs_v2 = roberta_model_v2(roberta_inputs_v2)
        roberta_probabilities_v2 = tf.nn.softmax(roberta_outputs_v2.logits, axis=-1)
        roberta_v2_predicted_class_index = tf.argmax(roberta_probabilities_v2, axis=-1).numpy()[0]
        roberta_v2_predictions.append(True if roberta_v2_predicted_class_index == 1 else False)

        # Ensemble: Average the probabilities from all models
        avg_probabilities = (distilbert_probabilities + roberta_probabilities_v1 + roberta_probabilities_v2) / 3
        predicted_class_index = tf.argmax(avg_probabilities, axis=-1).numpy()[0]
        ensemble_predictions.append(True if predicted_class_index == 1 else False)

    # Adding predictions to the DataFrame
    df[f'DistilBERTPrediction_{column}'] = distilbert_predictions
    df[f'RoBERTaV1Prediction_{column}'] = roberta_v1_predictions
    df[f'RoBERTaV2Prediction_{column}'] = roberta_v2_predictions
    df[f'EnsemblePrediction_{column}'] = ensemble_predictions
    
    # Calculate and print the accuracy for the ensemble predictions
    # correct_predictions = (df[f'EnsemblePrediction_{column}'] == df['Label']).sum()
    # total_predictions = len(df)
    # accuracy = correct_predictions / total_predictions
    # print(f"Accuracy: {accuracy:.4f}")
    
    return df

Some layers from the model checkpoint at ./distilbert_finetuned were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at ./distilbert_finetuned and are newly initialized: ['dropout_175']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
All model checkpoint layers were used when initializing TFRob

In [30]:
# # Ensure the get_gpt4_response function is adjusted if necessary
def get_gpt4_response(client, prompt):
    chat_completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return chat_completion.choices[0].message.content

key_path = './apiKey.txt'
with open(key_path, 'r') as file:
    key = file.readline().strip()

client = OpenAI(
    api_key = key
)

def get_counterfactual(df, client):
    # Iterate over the DataFrame using .iterrows() for reading; however, to modify the original df, use .loc for assignment
    for index, row in df.iterrows():
        input_text, label = row['text'], row['label']
        
        # Your existing prompt creation and GPT-4 querying logic
        # Simplified task description without source information
        task_description = "classifying tweets on COVID19 as misinformation or reliable information"
        
        prompt1 = f"""
        You are an oracle explanation module in a machine learning pipeline. In the task of {task_description},
        a trained black-box classifier correctly predicted the label
        {label} for the following headline. Think about why the model
        predicted the {label} label and identify the latent features
        that caused the label. List ONLY the latent features
        as a comma separated list, without any explanation.
        Examples of latent features are ‘credibility’, ‘tone’, ‘ambiguity in text’, etc.
        —
        Headline: {input_text}
        —
        Begin!
        """
        latent_features = get_gpt4_response(client, prompt1)
        
        prompt2 = f"""
        Original headline: {input_text}
        Label: {label}

        Identify the words in the headline that are associated
        with the latent features: {latent_features}, and output the
        identified words as a comma separated list.
        """
        identified_words = get_gpt4_response(client, prompt2)
        
        prompt3 = f"""
        Original headline: {input_text}
        Label: {label}

        Identified words associated with latent features: {identified_words}.
        Generate a minimally edited version of the original headline
        by ONLY changing a minimal set of the words you identified, in order to change the label. It is okay if the semantic meaning of the original headline is altered. Make sure the
        generated text makes sense and is plausible. Enclose the
        generated text within <new>tags.
        """
        counterfactual = get_gpt4_response(client, prompt3)
        
        # Update the original DataFrame directly
        df.loc[index, 'Latent Features'] = latent_features
        df.loc[index, 'Identified Words'] = identified_words
        df.loc[index, 'Counterfactual Text'] = counterfactual

    # No need to return a new DataFrame; the original df has been updated
    return df

In [31]:
val_data = pd.read_csv("./Data/Processed/Constraint_Val_Labeled.csv").drop(columns=["Unnamed: 0"])
display(val_data)

Unnamed: 0,text,label,DistilBERTPrediction_Text,RoBERTaV1Prediction_Text,RoBERTaV2Prediction_Text,EnsemblePrediction_Text
0,Chinese converting to Islam after realising th...,False,False,False,False,False
1,11 out of 13 people (from the Diamond Princess...,False,False,False,False,False
2,"COVID-19 Is Caused By A Bacterium, Not Virus A...",False,False,False,False,False
3,Mike Pence in RNC speech praises Donald Trump’...,False,False,False,False,False
4,6/10 Sky's @EdConwaySky explains the latest #C...,True,True,True,True,True
...,...,...,...,...,...,...
2135,Donald Trump wrongly claimed that New Zealand ...,False,False,False,False,False
2136,Current understanding is #COVID19 spreads most...,True,True,True,True,True
2137,Nothing screams “I am sat around doing fuck al...,False,False,False,False,False
2138,Birx says COVID-19 outbreak not under control ...,False,False,True,False,False


In [32]:
test = val_data.sample(500) # change to sample as many rows as needed
test = get_counterfactual(test, client)
display(test)

Unnamed: 0,text,label,DistilBERTPrediction_Text,RoBERTaV1Prediction_Text,RoBERTaV2Prediction_Text,EnsemblePrediction_Text,Latent Features,Identified Words,Counterfactual Text
2024,With financial support from @WorldBank WHO pro...,True,True,True,True,True,"credibility, official source, positive tone, f...","financial support, @WorldBank, WHO, technical ...",<new>With financial aid from @WorldBank WHO pr...
1725,#CoronaVirusUpdates #IndiaFightsCorona Total r...,True,True,True,True,True,"positive tone, statistics, high recovery rate,...","high recovery rate, low case fatality rate",<new> #CoronaVirusUpdates #IndiaFightsCorona T...
156,_A video has been viewed tens of thousands of ...,False,False,False,False,False,"sensationalism, lack of evidence, misleading c...","sensationalism, misleading, false",<new>A video has been viewed tens of times in ...
1309,It will be difficult to put an exact date on t...,True,True,True,False,True,"credibility, timeline, official statement, pot...","Union Health Minister, India's first vaccine, ...",<new>It will be easy to put an exact date on t...
399,@DallasBarnett3 Kia ora Dallas yes. If someone...,True,True,True,True,True,"informative, clear communication, adherence to...","refuses, test, 14 days, managed isolation, cos...",<new> If someone accepts the test at the end o...
...,...,...,...,...,...,...,...,...,...
1660,Self-medicate COVID-19 using a home remedy wit...,False,False,False,False,False,"credibility, lack of scientific evidence","Self-medicate, home remedy, aspirin, lemon, honey",<new>Self-care COVID-19 using a natural remedy...
955,Use the Coronavirus Self-Checker to help decid...,True,True,True,True,True,"credibility, urgency, informative","Coronavirus Self-Checker, testing, medical car...",<new>Use the Coronavirus Self-Checker to help ...
383,We've built out a tool CAN Compare for decisio...,True,True,True,False,True,"credibility, informative tone, data-driven con...","tool, decision makers, everyday people, compar...",<new>We've developed a tool CAN Analyze for de...
942,#CoronaVirusUpdates #IndiaFightsCorona More th...,True,True,True,True,True,"credibility, factual information, statistics","samples, tested, COVID19, Tests Per Million (T...",<new>#CoronaVirusUpdates #IndiaFightsCorona Mo...


In [36]:
test = ensemble_classify_news_and_evaluate_accuracy(test, 'Counterfactual Text')
display(test)

Unnamed: 0,text,label,DistilBERTPrediction_Text,RoBERTaV1Prediction_Text,RoBERTaV2Prediction_Text,EnsemblePrediction_Text,Latent Features,Identified Words,Counterfactual Text,DistilBERTPrediction_Counterfactual Text,RoBERTaV1Prediction_Counterfactual Text,RoBERTaV2Prediction_Counterfactual Text,EnsemblePrediction_Counterfactual Text
2024,With financial support from @WorldBank WHO pro...,True,True,True,True,True,"credibility, official source, positive tone, f...","financial support, @WorldBank, WHO, technical ...",<new>With financial aid from @WorldBank WHO pr...,True,True,True,True
1725,#CoronaVirusUpdates #IndiaFightsCorona Total r...,True,True,True,True,True,"positive tone, statistics, high recovery rate,...","high recovery rate, low case fatality rate",<new> #CoronaVirusUpdates #IndiaFightsCorona T...,True,True,True,True
156,_A video has been viewed tens of thousands of ...,False,False,False,False,False,"sensationalism, lack of evidence, misleading c...","sensationalism, misleading, false",<new>A video has been viewed tens of times in ...,False,False,False,False
1309,It will be difficult to put an exact date on t...,True,True,True,False,True,"credibility, timeline, official statement, pot...","Union Health Minister, India's first vaccine, ...",<new>It will be easy to put an exact date on t...,True,True,True,True
399,@DallasBarnett3 Kia ora Dallas yes. If someone...,True,True,True,True,True,"informative, clear communication, adherence to...","refuses, test, 14 days, managed isolation, cos...",<new> If someone accepts the test at the end o...,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1660,Self-medicate COVID-19 using a home remedy wit...,False,False,False,False,False,"credibility, lack of scientific evidence","Self-medicate, home remedy, aspirin, lemon, honey",<new>Self-care COVID-19 using a natural remedy...,False,False,False,False
955,Use the Coronavirus Self-Checker to help decid...,True,True,True,True,True,"credibility, urgency, informative","Coronavirus Self-Checker, testing, medical car...",<new>Use the Coronavirus Self-Checker to help ...,True,True,True,True
383,We've built out a tool CAN Compare for decisio...,True,True,True,False,True,"credibility, informative tone, data-driven con...","tool, decision makers, everyday people, compar...",<new>We've developed a tool CAN Analyze for de...,True,True,True,True
942,#CoronaVirusUpdates #IndiaFightsCorona More th...,True,True,True,True,True,"credibility, factual information, statistics","samples, tested, COVID19, Tests Per Million (T...",<new>#CoronaVirusUpdates #IndiaFightsCorona Mo...,True,True,True,True


In [37]:
# Function to strip tags
def strip_tags(text):
    return re.sub(r"<new>|</new>", "", text)

# Apply the function to strip tags from the Counterfactual Text
test['Counterfactual Text'] = test['Counterfactual Text'].apply(strip_tags)
display(test)

Unnamed: 0,text,label,DistilBERTPrediction_Text,RoBERTaV1Prediction_Text,RoBERTaV2Prediction_Text,EnsemblePrediction_Text,Latent Features,Identified Words,Counterfactual Text,DistilBERTPrediction_Counterfactual Text,RoBERTaV1Prediction_Counterfactual Text,RoBERTaV2Prediction_Counterfactual Text,EnsemblePrediction_Counterfactual Text
2024,With financial support from @WorldBank WHO pro...,True,True,True,True,True,"credibility, official source, positive tone, f...","financial support, @WorldBank, WHO, technical ...",With financial aid from @WorldBank WHO provide...,True,True,True,True
1725,#CoronaVirusUpdates #IndiaFightsCorona Total r...,True,True,True,True,True,"positive tone, statistics, high recovery rate,...","high recovery rate, low case fatality rate",#CoronaVirusUpdates #IndiaFightsCorona Total ...,True,True,True,True
156,_A video has been viewed tens of thousands of ...,False,False,False,False,False,"sensationalism, lack of evidence, misleading c...","sensationalism, misleading, false",A video has been viewed tens of times in multi...,False,False,False,False
1309,It will be difficult to put an exact date on t...,True,True,True,False,True,"credibility, timeline, official statement, pot...","Union Health Minister, India's first vaccine, ...",It will be easy to put an exact date on the av...,True,True,True,True
399,@DallasBarnett3 Kia ora Dallas yes. If someone...,True,True,True,True,True,"informative, clear communication, adherence to...","refuses, test, 14 days, managed isolation, cos...",If someone accepts the test at the end of the...,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1660,Self-medicate COVID-19 using a home remedy wit...,False,False,False,False,False,"credibility, lack of scientific evidence","Self-medicate, home remedy, aspirin, lemon, honey",Self-care COVID-19 using a natural remedy with...,False,False,False,False
955,Use the Coronavirus Self-Checker to help decid...,True,True,True,True,True,"credibility, urgency, informative","Coronavirus Self-Checker, testing, medical car...",Use the Coronavirus Self-Checker to help decid...,True,True,True,True
383,We've built out a tool CAN Compare for decisio...,True,True,True,False,True,"credibility, informative tone, data-driven con...","tool, decision makers, everyday people, compar...",We've developed a tool CAN Analyze for decisio...,True,True,True,True
942,#CoronaVirusUpdates #IndiaFightsCorona More th...,True,True,True,True,True,"credibility, factual information, statistics","samples, tested, COVID19, Tests Per Million (T...",#CoronaVirusUpdates #IndiaFightsCorona More th...,True,True,True,True


In [38]:
test['Levenshtein Distance'] = test.apply(lambda row: levenshtein_distance(row['text'], row['Counterfactual Text']), axis=1)

# Display the modified DataFrame
display(test[['text', 'Counterfactual Text', 'Levenshtein Distance']])
average_levenshtein_distance = test['Levenshtein Distance'].mean()
print(f"Average Levenshtein Distance: {average_levenshtein_distance:.2f}")

Unnamed: 0,text,Counterfactual Text,Levenshtein Distance
2024,With financial support from @WorldBank WHO pro...,With financial aid from @WorldBank WHO provide...,39
1725,#CoronaVirusUpdates #IndiaFightsCorona Total r...,#CoronaVirusUpdates #IndiaFightsCorona Total ...,4
156,_A video has been viewed tens of thousands of ...,A video has been viewed tens of times in multi...,41
1309,It will be difficult to put an exact date on t...,It will be easy to put an exact date on the av...,14
399,@DallasBarnett3 Kia ora Dallas yes. If someone...,If someone accepts the test at the end of the...,56
...,...,...,...
1660,Self-medicate COVID-19 using a home remedy wit...,Self-care COVID-19 using a natural remedy with...,26
955,Use the Coronavirus Self-Checker to help decid...,Use the Coronavirus Self-Checker to help decid...,6
383,We've built out a tool CAN Compare for decisio...,We've developed a tool CAN Analyze for decisio...,53
942,#CoronaVirusUpdates #IndiaFightsCorona More th...,#CoronaVirusUpdates #IndiaFightsCorona More th...,17


Average Levenshtein Distance: 37.17


In [39]:
# Assuming `df` is your DataFrame

# Columns for predictions on original text and counterfactual text
original_pred_columns = ['DistilBERTPrediction_Text', 'RoBERTaV1Prediction_Text', 'RoBERTaV2Prediction_Text', 'EnsemblePrediction_Text']
counterfactual_pred_columns = ['DistilBERTPrediction_Counterfactual Text', 'RoBERTaV1Prediction_Counterfactual Text', 'RoBERTaV2Prediction_Counterfactual Text', 'EnsemblePrediction_Counterfactual Text']

# Calculate label flip score for each model
for original_col, counterfactual_col in zip(original_pred_columns, counterfactual_pred_columns):
    # Count the number of label flips
    label_flips = (test[original_col] != test[counterfactual_col]).sum()
    total_predictions = len(test)
    flip_score = label_flips / total_predictions
    print(f"Label flip score for {original_col.split('_')[0]}: {flip_score:.4f}")


Label flip score for DistilBERTPrediction: 0.0940
Label flip score for RoBERTaV1Prediction: 0.1040
Label flip score for RoBERTaV2Prediction: 0.0660
Label flip score for EnsemblePrediction: 0.0540
