In [88]:
import tensorflow as tf
import pandas as pd
from transformers import TFBertForSequenceClassification, BertTokenizer, DistilBertTokenizer, TFDistilBertForSequenceClassification
import numpy as np

In [89]:
# Load the previously saved model
model = tf.keras.models.load_model("bert_3")

In [90]:
max_length = 128
disbert_tr = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(disbert_tr)

def tokenize_headlines(text_data, source_data):
    # Ensure all entries are converted to strings and concatenate them
    combined_data = text_data.astype(str) + " [SEP] " + source_data.astype(str)
    
    return tokenizer(
        combined_data.tolist(),  # Convert the combined data into a list of strings directly
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='tf'
    )


def prepare_new_input(new_headline, new_source):
    # Tokenize the new input
    tokenized_input = tokenize_headlines(pd.Series([new_headline]), pd.Series([new_source]))
    
    # Convert BatchEncoding to standard tensors
    inputs = {key: tf.convert_to_tensor(value) for key, value in tokenized_input.items()}
    
    return inputs

def get_predicted_class(new_headline, new_source, model, tokenizer):
    tokenized_input = tokenize_headlines(pd.Series([new_headline]), pd.Series([new_source]))
    inputs = {key: tf.convert_to_tensor(value) for key, value in tokenized_input.items()}
    predictions = model.predict(inputs)
    logits = predictions['logits']
    probabilities = tf.nn.softmax(logits, axis=-1)
    predicted_class = np.argmax(probabilities, axis=-1)
    return predicted_class

In [91]:
predicted_class = get_predicted_class("Says Osama bin Laden endorsed Joe Biden", "Donald Trump Jr.", model, tokenizer)

print("Predicted class:", predicted_class)

Predicted class: [0]


In [92]:
import pandas as pd
from openai import OpenAI

def process_dataframe_with_source_and_headline(df, client):
    results = []
    for index, row in df.iterrows():
        input_text, source, label = row['Headline'], row['Source'], row['Label']
        
        # Adjusted task description and prompts to include source information
        task_description = "classifying text as misinformation or reliable information based on both the source and headline"
        
        prompt1 = f"""
        You are an oracle explanation module in a machine learning pipeline. In the task of {task_description},
        a trained black-box classifier correctly predicted the label
        {label} for the following source and headline. Think about why the model
        predicted the {label} label and identify the latent features
        that caused the label. List ONLY the latent features
        as a comma separated list, without any explanation.
        Examples of latent features are ‘source credibility’, ‘tone’, ‘ambiguity in
        text’, etc.
        —
        Source: {source}
        Headline: {input_text}
        —
        Begin!
        """
        
        latent_features = get_gpt4_response(client, prompt1)
        
        prompt2 = f"""
        Original source: {source}
        Original headline: {input_text}
        Label: {label}

        Identify the words in the headline that are associated
        with the latent features: {latent_features}, and output the
        identified words as a comma separated list.
        """
        
        identified_words = get_gpt4_response(client, prompt2)
        
        prompt3 = f"""
        Original source: {source}
        Original headline: {input_text}
        Label: {label}

        Identified words associated with latent features: {identified_words}.
        Generate a minimally edited version of the original headline
        by ONLY changing a minimal set of the words you identified, in order to change the label. It is okay if the semantic meaning of the original headline is altered. Make sure the
        generated text makes sense and is plausible. Enclose the
        generated text within <new>tags.
        """
        
        counterfactual = get_gpt4_response(client, prompt3)
        
        results.append({
            "Source": source,
            "Headline": input_text,
            "Label": label,
            "Latent Features": latent_features,
            "Identified Words": identified_words,
            "Counterfactual Text": counterfactual
        })
    
    return pd.DataFrame(results)

# Ensure the get_gpt4_response function is adjusted if necessary
def get_gpt4_response(client, prompt):
    chat_completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return chat_completion.choices[0].message.content

key_path = './apiKey.txt'
with open(key_path, 'r') as file:
    key = file.readline().strip()

client = OpenAI(
    api_key = key
)

In [93]:
val_data = pd.read_csv("./Data/Raw/Validation.csv")[["News_Headline", "Source", "Label"]].rename(columns={"News_Headline": "Headline"}).head(10)

# add this once we get a better working model
# val_data['PredictedClass'] = val_data.apply(lambda x: get_predicted_class(x['Headline'], x['Source'], model, tokenizer), axis=1) 

In [94]:
processed_df = process_dataframe_with_source_and_headline(val_data, client)
display(processed_df)


Unnamed: 0,Source,Headline,Label,Latent Features,Identified Words,Counterfactual Text
0,Charlie Crist,"Says Rick Scott has ""teamed up with a felon co...",False,"sensationalism, negative tone, guilt by associ...","sensationalism, negative tone, guilt by associ...","<new>Says Rick Scott has ""collaborated with a ..."
1,Stephen Sweeney,"""People want the minimum wage, they want marri...",True,"tone, support for social issues, contrast with...","support, social issues, contrast, governor's a...","<new>""People want the minimum wage, they want ..."
2,Viral image,Says a photo of a young girl crying alongside ...,False,"sensationalism, lack of evidence, emotional ma...","sensationalism, lack of evidence, emotional ma...",<new>Says a photo of a young girl crying along...
3,Occupy Democrats,"""House Republicans just passed a bill that mak...",False,"biased source, sensational headline","House Republicans, single mothers, fired, legal",<new>House Republicans just passed a bill that...
4,Verified Politics,Says Marco Rubio tweeted a verse from the New ...,False,"sensationalized language,false accusation,bias...","sensationalized language, false accusation, bi...",<new>Says Marco Rubio tweeted a verse from the...
5,Ron DeSantis,"""This particular pandemic is one where I dont...",False,"source credibility, sensationalism, false info...","source credibility, false information","<new>""This specific pandemic is one where I be..."
6,Progressive Choice Florida,"Says Charlie Crist ""implemented Jeb Bushs A+ ...",True,"source credibility, political leaning, mention...","Progressive Choice Florida, Charlie Crist, Jeb...","<new>Says Jeb Bush ""implemented Charlie Crist'..."
7,Ro Khanna,"""Most Americans dont own stocks.""",True,"source credibility, factual statement, lack of...","Most Americans, don't own stocks",<new>Most Americans own stocks.</new>
8,Scott Walker,"""Our last budget committed the highest level o...",True,"source credibility, positive tone, mention of ...","Scott Walker, highest level, need-based financ...","<new>""Our last budget committed the lowest lev..."
9,Gery Chico,"""In Springfield, Susana Mendoza voted to hit w...",True,"negative tone, specific mention of action by S...","voted, hit, working families, massive, new sod...","<new>In Springfield, Susana Mendoza voted to s..."


In [67]:
display(val_data)

Unnamed: 0,Headline,Source,Label,PredictedClass
0,"Says Rick Scott has ""teamed up with a felon co...",Charlie Crist,False,[0]
1,"""People want the minimum wage, they want marri...",Stephen Sweeney,True,[0]
2,Says a photo of a young girl crying alongside ...,Viral image,False,[0]
3,"""House Republicans just passed a bill that mak...",Occupy Democrats,False,[0]
4,Says Marco Rubio tweeted a verse from the New ...,Verified Politics,False,[0]
5,"""This particular pandemic is one where I dont...",Ron DeSantis,False,[0]
6,"Says Charlie Crist ""implemented Jeb Bushs A+ ...",Progressive Choice Florida,True,[0]
7,"""Most Americans dont own stocks.""",Ro Khanna,True,[0]
8,"""Our last budget committed the highest level o...",Scott Walker,True,[0]
9,"""In Springfield, Susana Mendoza voted to hit w...",Gery Chico,True,[0]


In [87]:
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification

from transformers import pipeline
import tensorflow as tf


# Load the tokenizer and model outside the function to avoid reloading them on each function call
# tokenizer = DistilBertTokenizer.from_pretrained('jojo0616/my_Misinformation_distilbert_model')
# Load the DistilBERT model and tokenizer
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('jojo0616/my_Misinformation_distilbert_model')
distilbert_model = TFDistilBertForSequenceClassification.from_pretrained('jojo0616/my_Misinformation_distilbert_model', from_pt=True)

# Load the RoBERTa model and tokenizer
roberta_tokenizer = RobertaTokenizer.from_pretrained('vikram71198/distilroberta-base-finetuned-fake-news-detection')
roberta_model = TFRobertaForSequenceClassification.from_pretrained('vikram71198/distilroberta-base-finetuned-fake-news-detection', from_pt=True)


def ensemble_classify_news_and_evaluate_accuracy(df):
    predictions = []
    
    for _, row in df.iterrows():
        text_input = f"{row['Headline']} [SEP] {row['Source']}"
        
        # Tokenize and prepare inputs for DistilBERT
        distilbert_inputs = distilbert_tokenizer(text_input, return_tensors="tf", truncation=True, padding='max_length', max_length=512)
        distilbert_outputs = distilbert_model(distilbert_inputs)
        distilbert_probabilities = tf.nn.softmax(distilbert_outputs.logits, axis=-1)
        
        # Tokenize and prepare inputs for RoBERTa
        roberta_inputs = roberta_tokenizer(text_input, return_tensors="tf", truncation=True, padding='max_length', max_length=512)
        roberta_outputs = roberta_model(roberta_inputs)
        roberta_probabilities = tf.nn.softmax(roberta_outputs.logits, axis=-1)
        
        # Ensemble: Average the probabilities from both models
        avg_probabilities = (distilbert_probabilities + roberta_probabilities) / 2
        predicted_class_index = tf.argmax(avg_probabilities, axis=-1).numpy()[0]
        
        # Map the predicted class index to boolean True (real) or False (fake)
        prediction = True if predicted_class_index == 1 else False
        predictions.append(prediction)
    
    df['EnsemblePrediction'] = predictions
    
    # Calculate accuracy
    correct_predictions = (df['EnsemblePrediction'] == df['Label']).sum()
    total_predictions = len(df)
    accuracy = correct_predictions / total_predictions
    
    print(f"Accuracy: {accuracy:.4f}")
    
    return df


# Example usage
df_updated = ensemble_classify_news_and_evaluate_accuracy(val_data)
display(df_updated)



All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClas

Accuracy: 0.4000


Unnamed: 0,Headline,Source,Label,EnsemblePrediction
0,"Says Rick Scott has ""teamed up with a felon co...",Charlie Crist,False,True
1,"""People want the minimum wage, they want marri...",Stephen Sweeney,True,False
2,Says a photo of a young girl crying alongside ...,Viral image,False,False
3,"""House Republicans just passed a bill that mak...",Occupy Democrats,False,False
4,Says Marco Rubio tweeted a verse from the New ...,Verified Politics,False,True
5,"""This particular pandemic is one where I dont...",Ron DeSantis,False,False
6,"Says Charlie Crist ""implemented Jeb Bushs A+ ...",Progressive Choice Florida,True,True
7,"""Most Americans dont own stocks.""",Ro Khanna,True,False
8,"""Our last budget committed the highest level o...",Scott Walker,True,False
9,"""In Springfield, Susana Mendoza voted to hit w...",Gery Chico,True,False


In [85]:
display(val_data)

Unnamed: 0,Headline,Source,Label,PredictedClass,EnsemblePrediction
0,"Says Rick Scott has ""teamed up with a felon co...",Charlie Crist,,[0],True
1,"""People want the minimum wage, they want marri...",Stephen Sweeney,,[0],False
2,Says a photo of a young girl crying alongside ...,Viral image,,[0],False
3,"""House Republicans just passed a bill that mak...",Occupy Democrats,,[0],False
4,Says Marco Rubio tweeted a verse from the New ...,Verified Politics,,[0],True
5,"""This particular pandemic is one where I dont...",Ron DeSantis,,[0],False
6,"Says Charlie Crist ""implemented Jeb Bushs A+ ...",Progressive Choice Florida,,[0],True
7,"""Most Americans dont own stocks.""",Ro Khanna,,[0],False
8,"""Our last budget committed the highest level o...",Scott Walker,,[0],False
9,"""In Springfield, Susana Mendoza voted to hit w...",Gery Chico,,[0],False


In [74]:
for i in df_updated['Prediction']:
    print(i)

('fake', <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.996154  , 0.00384604]], dtype=float32)>, 0)
('fake', <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.9929248 , 0.00707523]], dtype=float32)>, 0)
('fake', <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.9987834 , 0.00121657]], dtype=float32)>, 0)
('fake', <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.9951952 , 0.00480478]], dtype=float32)>, 0)
('fake', <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.9985203 , 0.00147974]], dtype=float32)>, 0)
('fake', <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.99774903, 0.00225091]], dtype=float32)>, 0)
('fake', <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.9967681 , 0.00323185]], dtype=float32)>, 0)
('fake', <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.9942538 , 0.00574613]], dtype=float32)>, 0)
('real', <tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.02600778, 0.97399217]], dtype=float32)>, 1)
('fake', <