In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)


In [None]:
!pip install openai -q

In [None]:
import os
from tqdm import tqdm
from openai import OpenAI
import pandas as pd
import json

In [None]:
from kaggle_secrets import UserSecretsClient
# Fetch the OpenAI API key through the Kaggle secrets client instead of hard-coding it.
user_secrets = UserSecretsClient()
# "GPT_key" is the label the secret is looked up under.
gpt_key = user_secrets.get_secret("GPT_key")


In [None]:
# Shared OpenAI client; classify_fake() sends its chat-completion requests through it.
client = OpenAI(api_key=gpt_key)

In [None]:
def classify_fake(system, data):
    """Send one batch of samples to gpt-4o and return the raw reply text.

    Args:
        system: System prompt describing the classification task.
        data: User message holding the samples to classify.

    Returns:
        The content string of the first completion choice. The request pins a
        strict JSON schema, so the content is expected to be a JSON object
        with a "samples" list of {"id": integer, "is_fake": boolean} entries.
    """
    # Build the schema from the inside out: one sample -> list of samples -> request format.
    sample_schema = {
        "type": "object",
        "properties": {
            "id": {"type": "integer"},
            "is_fake": {"type": "boolean"},
        },
        "required": ["id", "is_fake"],
        "additionalProperties": False,
    }
    envelope_schema = {
        "type": "object",
        "properties": {
            "samples": {"type": "array", "items": sample_schema},
        },
        "required": ["samples"],
        "additionalProperties": False,
    }
    structured_output = {
        "type": "json_schema",
        "json_schema": {
            "name": "fake_news_detection_response",
            "strict": True,
            "schema": envelope_schema,
        },
    }
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": data},
    ]

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        response_format=structured_output,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Output-format instruction appended to each base prompt; the leading space is part of the text.
output_prompt = " You will output in JSON, where the samples are gonna be in a list."

### bangla prompt

In [None]:
# System prompt for plain Bengali input: task definition, analysis steps,
# the kinds of content treated as fake news, and the expected input format.
# NOTE(review): step 4 asks for a brief explanation, but the JSON schema sent by
# classify_fake() only permits "id" and "is_fake" per sample - confirm this is intended.
base_prompt = """
You are an expert in natural language processing and fake news detection. Your task is to analyze Bengali text and determine whether it is likely to be fake news. Fake news refers to false or misleading information or satire content presented as news, often with the intent to deceive or manipulate.
For each of the sample you will receive the headline followed by the content.
Follow these steps:
1. Read the input text carefully.
2. Analyze the content for signs of fake news, such as:
   - Sensational or exaggerated claims.
   - Lack of credible sources or evidence.
   - Contradictions with known facts or reliable information.
   - Emotional or manipulative language.
3. Classify the text into one of the following categories:
   - "Fake News": The text is likely to be false or misleading.
   - "Not Fake News": The text appears to be credible and factual.
4. Provide a brief explanation for your classification.

The following types of news are considered as fake news:

• Misleading/False Context: Any news with unreliable information or contains facts that can mislead audiences.
• Clickbait: News that uses sensitive headlines to grab attention and drive click-throughs to the publisher’s website.
• Satire/Parody: News stories that are intended for entertainment and parody

You will receive an array of objects, each containing an 'id' and 'text'.
"""

# Full system prompt = task description followed by the output-format instruction.
# NOTE(review): nothing in the visible notebook passes bangla_system_prompt to classify_fake().
bangla_system_prompt = base_prompt + output_prompt

### codemixed prompt

In [None]:
# System prompt for code-mixed Bengali input. The text matches the Bengali
# prompt except for the word "codemixed" in the task sentence.
# This rebinds base_prompt; when cells run top to bottom, bangla_system_prompt
# has already been built from the earlier value and is not affected.
# NOTE(review): step 4 asks for a brief explanation, but the JSON schema sent by
# classify_fake() only permits "id" and "is_fake" per sample - confirm this is intended.
base_prompt = """
You are an expert in natural language processing and fake news detection. Your task is to analyze codemixed Bengali text and determine whether it is likely to be fake news. Fake news refers to false or misleading information or satire content presented as news, often with the intent to deceive or manipulate.
For each of the sample you will receive the headline followed by the content.
Follow these steps:
1. Read the input text carefully.
2. Analyze the content for signs of fake news, such as:
   - Sensational or exaggerated claims.
   - Lack of credible sources or evidence.
   - Contradictions with known facts or reliable information.
   - Emotional or manipulative language.
3. Classify the text into one of the following categories:
   - "Fake News": The text is likely to be false or misleading.
   - "Not Fake News": The text appears to be credible and factual.
4. Provide a brief explanation for your classification.

The following types of news are considered as fake news:

• Misleading/False Context: Any news with unreliable information or contains facts that can mislead audiences.
• Clickbait: News that uses sensitive headlines to grab attention and drive click-throughs to the publisher’s website.
• Satire/Parody: News stories that are intended for entertainment and parody

You will receive an array of objects, each containing an 'id' and 'text'.
"""

# Full system prompt used by llm_prediction(): task description plus output-format instruction.
codemixed_system_prompt = base_prompt + output_prompt

In [None]:
# CSV to evaluate, read from the Kaggle input mount; filepath is also used later to name the outputs.
filepath = '/kaggle/input/codemixed-further-experiments-dataset/further_experiments/fake_news_80.csv'
df = pd.read_csv(filepath)
# For a quick smoke test, restrict the run to the first few rows, e.g. df = df[:5].

In [None]:
# Display the loaded frame for a visual check.
df

In [None]:
# List the column names; later cells read 'headline', 'label' and the perturbed_text_* columns.
df.columns

In [None]:
# Renumber rows from zero, then expose a 1-based 'id' column; the prompts and the
# prediction merge both key on this column.
df = df.reset_index(drop=True).assign(id=lambda frame: frame.index + 1)

In [None]:
# Directory (relative to the working directory) that receives the merged result CSVs.
output_dir = 'output/gpt-4o'
# exist_ok=True keeps re-runs from failing when the directory is already there.
os.makedirs(output_dir, exist_ok=True)

### helper functions

In [None]:
def process_text(text):
    """Flatten ``text`` onto a single line by turning every newline into a space."""
    return " ".join(text.split("\n"))

In [None]:
# Number of samples sent to the model in a single request.
chunk_size = 10

def process_chunks(column):
    """Split the notebook-level ``df`` into consecutive batches of ``chunk_size`` rows.

    As a side effect, ``df`` gains (or has overwritten) a 'headline_content'
    column holding the headline, a space, and the text taken from ``column``.

    Args:
        column: Name of the text column to append to the headline.

    Returns:
        List of DataFrame slices in row order; the last one may be shorter.
    """
    combined_text = df['headline'] + " " + df[column]
    df['headline_content'] = combined_text

    batch_starts = range(0, len(df), chunk_size)
    return [df[start:start + chunk_size] for start in batch_starts]

In [None]:
def generate_chunk_list(chunks):
    """Turn DataFrame batches into named user prompts.

    Args:
        chunks: Iterable of non-empty DataFrame slices that carry 'id' and
            'headline_content' columns.

    Returns:
        One dict per batch with:
          - 'chunk_name': "<first>-<last>", taken from the batch's first and
            last index labels plus one.
          - 'user_prompt': one "<id>: <text>" line per row, newline-joined,
            with each text flattened by process_text().
    """
    def _describe(chunk):
        first_id = chunk.index[0] + 1
        last_id = chunk.index[-1] + 1

        prompt_lines = []
        for _, sample in chunk.iterrows():
            prompt_lines.append(f"{sample['id']}: {process_text(sample['headline_content'])}")

        return {
            'chunk_name': f"{first_id}-{last_id}",
            'user_prompt': "\n".join(prompt_lines),
        }

    return [_describe(chunk) for chunk in chunks]

In [None]:
def llm_prediction(chunk_list, system_prompt=None):
    """Classify every chunk with the LLM and collect the per-sample predictions.

    Args:
        chunk_list: Dicts with 'chunk_name' and 'user_prompt' keys, as built by
            generate_chunk_list().
        system_prompt: System prompt sent with every chunk. Defaults to
            codemixed_system_prompt when omitted.

    Returns:
        DataFrame with one row per entry of the "samples" lists returned by
        the model. Chunks whose request or parsing fails are reported and
        skipped, so the frame may cover fewer samples than were sent; it is an
        empty DataFrame when no chunk succeeded.
    """
    if system_prompt is None:
        system_prompt = codemixed_system_prompt

    # Collect per-chunk frames and concatenate once instead of re-copying the
    # accumulated frame on every iteration.
    frames = []
    for chunk in chunk_list:
        try:
            response = classify_fake(system_prompt, chunk['user_prompt'])
            samples = json.loads(response)['samples']
            frames.append(pd.DataFrame(samples))
        except Exception as e:
            # Best effort: one failed chunk must not discard the others.
            print(f"An error occurred in chunk {chunk['chunk_name']}: {e}")

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [None]:
def process_prediction(final_df, source_df=None):
    """Join model predictions onto the dataset and derive a numeric 'pred' column.

    Args:
        final_df: Predictions with 'id' and boolean 'is_fake' columns.
        source_df: Frame holding the samples and an 'id' column. Defaults to
            the notebook-level ``df``.

    Returns:
        Inner join of the two frames on 'id' with a 'pred' column: 1 where the
        model answered is_fake == False, 0 otherwise. Rows with a missing
        value in any column are dropped.
    """
    if source_df is None:
        source_df = df

    result_df = pd.merge(source_df, final_df, on='id', how='inner')

    # Every row needs a prediction. Setting only the "not fake" rows would
    # leave NaN for the fake ones, and dropna() below would then discard every
    # sample the model flagged as fake before the metrics are computed.
    # NOTE(review): 0 for fake assumes 'label' encodes 1 = authentic, 0 = fake -
    # confirm against the dataset.
    result_df['pred'] = 0
    result_df.loc[result_df['is_fake'] == False, 'pred'] = 1

    return result_df.dropna()

In [None]:
from sklearn.metrics import classification_report
def classification(y_true, y_pred):
    """Print the classification report for the given labels, with metrics shown to four digits.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
    """
    print(classification_report(y_true, y_pred, digits=4))

## choose column

In [None]:
# Text columns to evaluate, one full prediction run per column.
columns = [
    'perturbed_text_words',
    'perturbed_text_sentences',
    'perturbed_text_salient',
]

In [None]:
# Input file name without directory or extension; used as the stem of the output CSV names.
# splitext() strips only the final extension, so a name with extra dots is not cut short.
output_filepath = os.path.splitext(os.path.basename(filepath))[0]

In [None]:
for column in columns:
    print(column)
    # Batch the dataset for this text column and build one user prompt per batch.
    chunks = process_chunks(column)
    chunk_list = generate_chunk_list(chunks)
    prediction_df = llm_prediction(chunk_list)

    # llm_prediction() skips failed chunks; if every chunk failed there is no
    # 'id' column to merge on, so skip this column instead of aborting the run.
    if prediction_df.empty:
        print(f"No predictions returned for {column}; skipping.")
        continue

    # Raw model output, written to the working directory.
    # NOTE(review): "preditcion" is misspelled; left unchanged so file names stay the same.
    prediction_df.to_csv(f"preditcion_{output_filepath}_{column}.csv", index=False)

    result_df = process_prediction(prediction_df)
    if result_df.empty:
        print(f"No rows left to evaluate for {column}; skipping.")
        continue
    classification(result_df['label'], result_df['pred'])

    # Dataset rows merged with predictions, one CSV per evaluated column.
    final_output_file = os.path.join(output_dir, f"{output_filepath}_{column}.csv")
    result_df.to_csv(final_output_file, index=False)
    print(f"Final DataFrame saved to {final_output_file}")
    

In [None]:
# result_df