In [1]:
import tensorflow as tf
import pandas as pd
from transformers import TFBertForSequenceClassification, BertTokenizer, DistilBertTokenizer, TFDistilBertForSequenceClassification
import numpy as np




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the previously saved model from the "bert_fine_tuned_model_2" path
# (relative to the notebook's working directory).
# NOTE(review): the path name says "bert", while the tokenizer configured later
# in this notebook is 'distilbert-base-uncased' — confirm that the tokenizer
# matches the checkpoint this model was fine-tuned from.
model = tf.keras.models.load_model("bert_fine_tuned_model_2")





In [14]:
# Token length passed to the tokenizer as `max_length`; tokenize_headlines
# combines it with padding='max_length' and truncation=True.
max_length = 128
# Pretrained tokenizer name handed to DistilBertTokenizer.from_pretrained.
disbert_tr = 'distilbert-base-uncased'
# Module-level tokenizer; tokenize_headlines reads it as a global.
tokenizer = DistilBertTokenizer.from_pretrained(disbert_tr)

def tokenize_headlines(text_data, source_data):
    """Tokenize headline/source pairs with the module-level ``tokenizer``.

    Every headline and every source is cast to ``str`` and the two are joined
    element-wise with the literal separator ``" [SEP] "``. The joined strings
    are then encoded in one call, truncated and padded to the module-level
    ``max_length``.

    Args:
        text_data: Headlines; must support ``.astype(str)`` and element-wise
            ``+`` (callers in this notebook pass a ``pd.Series``).
        source_data: Sources, aligned element-wise with ``text_data``.

    Returns:
        The tokenizer's output for the joined strings, requested as
        TensorFlow tensors (``return_tensors='tf'``).
    """
    joined = text_data.astype(str) + " [SEP] " + source_data.astype(str)

    # Encoding options are collected in one place so they are easy to audit.
    encode_options = dict(
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='tf',
    )
    return tokenizer(joined.tolist(), **encode_options)


def prepare_new_input(new_headline, new_source):
    """Build tensor inputs for a single headline/source pair.

    The two values are wrapped in one-element ``pd.Series`` objects so that
    ``tokenize_headlines`` can be reused unchanged, and every field of the
    resulting encoding is passed through ``tf.convert_to_tensor``.

    Args:
        new_headline: Headline text to encode.
        new_source: Source attributed to the headline.

    Returns:
        dict mapping each encoding field name to a TensorFlow tensor.
    """
    headline_series = pd.Series([new_headline])
    source_series = pd.Series([new_source])
    encoding = tokenize_headlines(headline_series, source_series)

    tensors = {}
    for field_name, field_values in encoding.items():
        tensors[field_name] = tf.convert_to_tensor(field_values)
    return tensors

def get_predicted_class(new_headline, new_source, model, tokenizer):
    """Predict the class index for a single headline/source pair.

    Input preparation is delegated to ``prepare_new_input`` so that the
    tokenization logic lives in exactly one place.

    Args:
        new_headline: Headline text to classify.
        new_source: Source attributed to the headline.
        model: Object whose ``predict(inputs)`` returns a mapping that
            contains a ``'logits'`` entry.
        tokenizer: Accepted for backward compatibility but NOT used here;
            tokenization always goes through the module-level ``tokenizer``
            read by ``tokenize_headlines``.

    Returns:
        NumPy array of arg-max class indices taken over the last logits axis
        (one entry per encoded input, so a one-element array for this
        single-pair call).
    """
    inputs = prepare_new_input(new_headline, new_source)
    predictions = model.predict(inputs)
    logits = predictions['logits']
    probabilities = tf.nn.softmax(logits, axis=-1)
    return np.argmax(probabilities, axis=-1)

In [15]:
# Smoke-test the loaded classifier on one headline/source pair.
predicted_class = get_predicted_class(
    new_headline="Says Osama bin Laden endorsed Joe Biden",
    new_source="Donald Trump Jr.",
    model=model,
    tokenizer=tokenizer,
)

print("Predicted class:", predicted_class)

Predicted class: [1]


In [19]:
import pandas as pd
from openai import OpenAI

def process_dataframe_with_source_and_headline(df, client):
    """Generate latent features, salient words and a counterfactual per row.

    For every row of ``df`` three prompts are sent, in order, through
    ``get_gpt4_response``:

    1. ask for the latent features behind the row's label,
    2. ask which headline words relate to those features,
    3. ask for a minimally edited headline intended to change the label.

    Each reply is stored verbatim; in particular the counterfactual text is
    kept exactly as returned (any ``<new>`` tags are not stripped).

    Args:
        df: DataFrame with ``'Headline'``, ``'Source'`` and ``'Label'``
            columns. The label value is interpolated into the prompts as-is.
        client: Client object forwarded unchanged to ``get_gpt4_response``.

    Returns:
        DataFrame with one row per input row and the columns ``Source``,
        ``Headline``, ``Label``, ``Latent Features``, ``Identified Words``
        and ``Counterfactual Text``. An empty ``df`` yields an empty frame
        that still carries these columns.

    Raises:
        KeyError: if a required column is missing from ``df``.
    """
    # Declared explicitly so an empty input still yields the full schema.
    output_columns = [
        "Source",
        "Headline",
        "Label",
        "Latent Features",
        "Identified Words",
        "Counterfactual Text",
    ]

    # Loop-invariant: the task description is the same for every row.
    task_description = "classifying text as misinformation or reliable information based on both the source and headline"

    results = []
    for _, row in df.iterrows():
        input_text, source, label = row['Headline'], row['Source'], row['Label']

        # NOTE: the indentation inside the prompt strings below is part of the
        # text sent to the model; keep it unchanged.
        prompt1 = f"""
        You are an oracle explanation module in a machine learning pipeline. In the task of {task_description},
        a trained black-box classifier correctly predicted the label
        {label} for the following source and headline. Think about why the model
        predicted the {label} label and identify the latent features
        that caused the label. List ONLY the latent features
        as a comma separated list, without any explanation.
        Examples of latent features are ‘source credibility’, ‘tone’, ‘ambiguity in
        text’, etc.
        —
        Source: {source}
        Headline: {input_text}
        —
        Begin!
        """

        latent_features = get_gpt4_response(client, prompt1)

        prompt2 = f"""
        Original source: {source}
        Original headline: {input_text}
        Label: {label}

        Identify the words in the headline that are associated
        with the latent features: {latent_features}, and output the
        identified words as a comma separated list.
        """

        identified_words = get_gpt4_response(client, prompt2)

        prompt3 = f"""
        Original source: {source}
        Original headline: {input_text}
        Label: {label}

        Identified words associated with latent features: {identified_words}.
        Generate a minimally edited version of the original headline
        by ONLY changing a minimal set of the words you identified, in order to change the label. It is okay if the semantic meaning of the original headline is altered. Make sure the
        generated text makes sense and is plausible. Enclose the
        generated text within <new>tags.
        """

        counterfactual = get_gpt4_response(client, prompt3)

        results.append({
            "Source": source,
            "Headline": input_text,
            "Label": label,
            "Latent Features": latent_features,
            "Identified Words": identified_words,
            "Counterfactual Text": counterfactual
        })

    return pd.DataFrame(results, columns=output_columns)

def get_gpt4_response(client, prompt, model="gpt-3.5-turbo"):
    """Send ``prompt`` as a single user message and return the reply text.

    Despite the function's name, the default model is ``"gpt-3.5-turbo"``
    (kept for backward compatibility); pass ``model`` to use another one.

    Args:
        client: Object exposing ``chat.completions.create(model=...,
            messages=...)``, e.g. the ``OpenAI`` client created in this
            notebook.
        prompt: Text sent as the content of the single ``user`` message.
        model: Model name forwarded to ``chat.completions.create``.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    chat_completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return chat_completion.choices[0].message.content

# The API key is read from the first line of a local text file so that it
# never appears in the notebook itself.
key_path = './apiKey.txt'
with open(key_path) as key_file:
    key = key_file.readline().strip()

client = OpenAI(api_key=key)

           Source                                           Headline  Label  \
0     Source Name  Trump administration just FIRED 54 scientists ...      1   
1  Another Source  Says before he planned a rally on June 19 “nob...      0   

                                     Latent Features  \
0  sensationalism, negative tone, political bias,...   
1  source credibility, date mentioned in headline...   

                                    Identified Words  \
0                             FIRED, Communist China   
1  source credibility, June 19, Juneteenth, histo...   

                                 Counterfactual Text  
0  <new>Scientists fined 54 grants associated wit...  
1  <new>Says before he organized a rally on Junet...  


In [49]:
# Keep only the columns the pipeline reads, rename the headline column to the
# name used by the functions in this notebook, and take the first five rows.
val_data = (
    pd.read_csv("./Data/Raw/Validation.csv")[["News_Headline", "Source", "Label"]]
    .rename(columns={"News_Headline": "Headline"})
    .head(5)
)

In [52]:
# Run the explanation/counterfactual pipeline on the validation sample; this
# issues three chat-completion requests per row.
processed_df = process_dataframe_with_source_and_headline(
    df=val_data,
    client=client,
)
print(processed_df)


                        Source  \
0                Charlie Crist   
1              Stephen Sweeney   
2                  Viral image   
3             Occupy Democrats   
4            Verified Politics   
5                 Ron DeSantis   
6   Progressive Choice Florida   
7                    Ro Khanna   
8                 Scott Walker   
9                   Gery Chico   
10                    Bloggers   
11                Donald Trump   
12                Donald Trump   
13               Wendell Young   
14                    Bloggers   
15             Michael Needham   
16                Donald Trump   
17                      Tweets   
18              Alcee Hastings   
19                 Viral image   

                                             Headline  Label  \
0   Says Rick Scott has "teamed up with a felon co...  False   
1   "People want the minimum wage, they want marri...   True   
2   Says a photo of a young girl crying alongside ...  False   
3   "House Republicans just p

In [53]:
# Rich notebook rendering of the generated table (easier to read than the
# plain-text print of the same frame).
display(processed_df)

Unnamed: 0,Source,Headline,Label,Latent Features,Identified Words,Counterfactual Text
0,Charlie Crist,"Says Rick Scott has ""teamed up with a felon co...",False,"lack of source credibility, negative tone, ass...","lack of source credibility, negative tone, ass...","<new>Says Rick Scott has ""partnered with a fel..."
1,Stephen Sweeney,"""People want the minimum wage, they want marri...",True,"tone, social issues, support of women's health...","tone, social issues, support of women's health...","<new>People want the maximum wage, they want m..."
2,Viral image,Says a photo of a young girl crying alongside ...,False,"emotional content, sensationalism, religious c...","crying, forced into sexual slavery, child brid...",<new>Says a photo of a young boy laughing alon...
3,Occupy Democrats,"""House Republicans just passed a bill that mak...",False,"biased source, sensationalism, misinformation","biased source, sensationalism, misinformation",<new>House Republicans just passed a bill that...
4,Verified Politics,Says Marco Rubio tweeted a verse from the New ...,False,"source credibility, political bias, religious ...","Verified Politics, Marco Rubio, New Testament,...",<new>Says Marco Rubio tweeted a verse from the...
5,Ron DeSantis,"""This particular pandemic is one where I dont...",False,"sensationalism, misleading statistics","sensationalism, misleading statistics",<new>This specific pandemic is one where I bel...
6,Progressive Choice Florida,"Says Charlie Crist ""implemented Jeb Bushs A+ ...",True,source credibility,"Progressive Choice Florida, Charlie Crist, Jeb...",<new>Progressive Choice Florida claims Charlie...
7,Ro Khanna,"""Most Americans dont own stocks.""",True,"source credibility, statistical analysis","Ro Khanna, most Americans, stocks",<new>Most Americans own stocks.</new>
8,Scott Walker,"""Our last budget committed the highest level o...",True,"source credibility, historical context, financ...","Scott Walker, highest level, need-based financ...",<new>Our last budget committed the lowest leve...
9,Gery Chico,"""In Springfield, Susana Mendoza voted to hit w...",True,"negative tone, specific accusation, political ...","voted, hit working families, massive new, soda...","<new>In Springfield, Susana Mendoza voted for ..."


In [44]:
# Classify every validation row, one model call per row. Each call returns the
# array produced by get_predicted_class, not a bare scalar.
val_data['PredictedClass'] = val_data.apply(
    lambda row: get_predicted_class(
        new_headline=row['Headline'],
        new_source=row['Source'],
        model=model,
        tokenizer=tokenizer,
    ),
    axis=1,
)

