# Vector Data Store for entailment

## Preprocessing

In [None]:
import pandas as pd
import re

In [None]:
def text_preprocessing(text):
    """Normalise a raw text string before it is embedded.

    Steps, applied in order:
      1. lower-case the text;
      2. drop anything enclosed in square brackets;
      3. drop URLs (``http://``, ``https://`` or ``www.`` prefixes);
      4. drop HTML-like tags;
      5. drop newline characters;
      6. drop every word that contains at least one digit;
      7. drop all non-ASCII characters (emoji, accented letters, ...);
      8. if the result starts with two single quotes, trim one character
         from each end.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Whitespace left behind by removed tokens is
        not collapsed.
    """
    text = text.lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being parsed
    # as (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # The encode/decode round-trip returns a new string; it has to be
    # assigned back, otherwise the non-ASCII characters are never removed.
    text = text.encode('ascii', 'ignore').decode('ascii')
    # NOTE(review): the check is for TWO leading quotes but only ONE character
    # is trimmed from each side (and the last character is trimmed even if it
    # is not a quote). Behaviour kept as-is; confirm the intent.
    if text.startswith("''"):
        text = text[1:-1]
    return text

In [None]:
# Load the first CSV export into `data`.
# NOTE(review): the file name "prova" has no extension; it is read as CSV.
data = pd.read_csv("/kaggle/input/fake-news-covid/prova")

# Preview the `id` column, which is the join key used when building `data_final`.
display(data["id"])

0        1344763058888048643
1        1344756661567823872
2        1344733112454942721
3        1344770503014297602
4        1344719068520095744
                ...         
11425    1413112332385927169
11426    1413102602410156032
11427    1413142807024336909
11428    1413143486568681478
11429    1413129569662554117
Name: id, Length: 11430, dtype: int64

In [None]:
# Load the VaxMisinfoData CSV into `data_mis`.
data_mis = pd.read_csv("/kaggle/input/fake-news-covid/VaxMisinfoData.csv")

# Preview the `id` column, which is the join key used when building `data_final`.
display(data_mis["id"])

0        1344795424855642112
1        1344794858133860353
2        1344794822691983360
3        1344794752819077123
4        1344792070507134977
                ...         
15068    1413087751474397186
15069    1413087030578401283
15070    1413085793397186565
15071    1413085519710363648
15072    1413085365745774593
Name: id, Length: 15073, dtype: int64

In [None]:
# Join the two tables on `id`. `merge` defaults to an inner join, so only ids
# present in both frames are kept.
# NOTE(review): no uniqueness check is done on `id`; duplicated ids on either
# side would multiply rows. Consider asserting uniqueness before merging.
data_final = data_mis.merge(data, on="id")

# Preview the ids that survived the join.
display(data_final["id"])

0        1344795424855642112
1        1344794858133860353
2        1344794822691983360
3        1344794752819077123
4        1344790296119422980
                ...         
11425    1413088663886573569
11426    1413087751474397186
11427    1413087030578401283
11428    1413085519710363648
11429    1413085365745774593
Name: id, Length: 11430, dtype: int64

In [None]:
#data_final["text"].head()

In [None]:
# Clean every entry of the "text" column: each value is first converted to
# `str` and then passed through `text_preprocessing`. The column is
# overwritten in place.
data_final["text"] = data_final["text"].map(lambda raw: text_preprocessing(str(raw)))

#data_final["text"].head()

In [None]:
# Pull the cleaned texts and their misinformation labels out of the merged
# frame as arrays, printing the number of entries of each as a sanity check.
text_data = data_final.loc[:, "text"].values
print(text_data.shape[0])

labels = data_final.loc[:, "is_misinfo"].values
print(labels.shape[0])

11430
11430


Check how balanced the labels are (number of samples per class).

In [None]:
# Split the label array by class with boolean masks and report the class sizes
# (first line: label 0, second line: label 1).
labels_0 = list(labels[labels == 0])
labels_1 = list(labels[labels == 1])

print(len(labels_0), len(labels_1), sep="\n")

7631
3799


## Preparing vectorstore

- Label 0: real news
- Label 1: fake news

In [None]:
%%capture --no-stderr
# Install the LangChain / AI21 / Pinecone packages into the kernel's environment.
# NOTE(review): `-U` with no version pins means a re-run may pull different
# releases; consider pinning versions for reproducibility.
%pip install -U langchain_experimental langchain-ai21 langchain-pinecone langchain_community langchainhub langchain langchain-text-splitters

In [None]:
import os
from getpass import getpass

# Credentials must never be stored in the notebook itself: anything written
# here ends up in version control and in every shared copy of the file.
# Each key is taken from the environment when already set; otherwise the user
# is prompted (input is not echoed and is not saved in the notebook).
# NOTE(review): the keys that were previously hardcoded in this cell should be
# treated as compromised and rotated in the Pinecone and AI21 consoles.
for secret_name in ("PINECONE_API_KEY", "AI21_API_KEY"):
    if not os.environ.get(secret_name):
        os.environ[secret_name] = getpass(f"Enter {secret_name}: ")

In [None]:
from langchain.schema import Document

# Wrap every row of the merged frame in a LangChain `Document`: the cleaned
# text becomes the page content and the `is_misinfo` value is stored in the
# metadata under the key 'label'.
documents = [
    Document(page_content=body, metadata={'label': misinfo_flag})
    for body, misinfo_flag in zip(data_final["text"], data_final["is_misinfo"])
]

print(len(documents))

11430


In [None]:
import pandas as pd
from langchain_pinecone import PineconeVectorStore
from langchain_ai21 import AI21Embeddings

# Name of the Pinecone index that receives the documents.
index_name = "entailment-test"

# Build the vector store from `documents`, embedding them with AI21Embeddings
# and writing them to the Pinecone index named above.
# NOTE(review): presumably every run of this cell re-embeds and re-uploads all
# documents — confirm, and consider guarding it so a "Run All" does not
# duplicate entries or spend API quota.
vectorstore = PineconeVectorStore.from_documents(
    documents=documents,
    # Alternative embedding backend, currently disabled:
    #embedding=NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local", device="cuda"),
    embedding=AI21Embeddings(),
    index_name=index_name
)

# Entailment evaluation

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Grouped bar chart of the `f1_0` metric (title: "Real Documents"), one group
# per number of shots and one bar per acceptance-type / neutrality combination.
shots = [0, 3, 6, 12]
metric = "f1_0"

df = pd.read_excel("/content/drive/MyDrive/predictions_entailment/test_entailment_labels_X.xlsx")

# One entry per bar series, in left-to-right order inside each group:
# (legend label, `type_acceptance` value, `neutral` value, bar colour).
series_spec = [
    ("Ass. Contr. No-Neutral", "Skeptical", "No-Neutral", '#4CAF50'),
    ("Ass. Contr. Neutral", "Skeptical", "Neutral", '#FF9800'),
    ("Ent. dir. No-Neutral", "Credulous", "No-Neutral", '#81C784'),
    ("Ent. dir. Neutral", "Credulous", "Neutral", '#FFB74D'),
]

# Metric values of each series, taken in the row order of the spreadsheet.
# NOTE(review): assumes each combination has exactly len(shots) rows, already
# ordered like `shots` — nothing here sorts or checks that; confirm.
f1_scores = {}
for series_label, acceptance, neutrality, _ in series_spec:
    row_mask = (df["type_acceptance"] == acceptance) & (df["neutral"] == neutrality)
    f1_scores[series_label] = df[row_mask][metric].values

# Group positions and bar width; the four bars are centred on each tick.
bar_width = 0.2
x = np.arange(len(shots))

fig, ax = plt.subplots(figsize=(10, 6))

for slot, (series_label, _, _, colour) in enumerate(series_spec):
    ax.bar(x + (slot - 1.5) * bar_width, f1_scores[series_label], bar_width, label=series_label, color=colour)

# Axis labels, title, ticks and a two-column legend below the plot.
ax.set_xlabel('Shots')
ax.set_ylabel('F1 Score')
ax.set_title('F1 Scores for Acceptance Types and Neutrality Levels Across Real Documents')
ax.set_xticks(x)
ax.set_xticklabels(shots)
ax.legend(loc='lower center', bbox_to_anchor=(0.5, -0.25), ncols=2)

plt.show()

In [None]:
# Grouped bar chart of the `f1_1` metric (title: "Fake Documents"), one group
# per number of shots and one bar per acceptance-type / neutrality combination.
shots = [0, 3, 6, 12]
metric = "f1_1"

df = pd.read_excel("/content/drive/MyDrive/predictions_entailment/test_entailment_labels_X.xlsx")

# One entry per bar series, in left-to-right order inside each group:
# (legend label, `type_acceptance` value, `neutral` value, bar colour).
series_spec = [
    ("Ass. Contr. No-Neutral", "Skeptical", "No-Neutral", '#4CAF50'),
    ("Ass. Contr. Neutral", "Skeptical", "Neutral", '#FF9800'),
    ("Ent. dir. No-Neutral", "Credulous", "No-Neutral", '#81C784'),
    ("Ent. dir. Neutral", "Credulous", "Neutral", '#FFB74D'),
]

# Metric values of each series, taken in the row order of the spreadsheet.
# NOTE(review): assumes each combination has exactly len(shots) rows, already
# ordered like `shots` — nothing here sorts or checks that; confirm.
f1_scores = {}
for series_label, acceptance, neutrality, _ in series_spec:
    row_mask = (df["type_acceptance"] == acceptance) & (df["neutral"] == neutrality)
    f1_scores[series_label] = df[row_mask][metric].values

# Group positions and bar width; the four bars are centred on each tick.
bar_width = 0.2
x = np.arange(len(shots))

fig, ax = plt.subplots(figsize=(10, 6))

for slot, (series_label, _, _, colour) in enumerate(series_spec):
    ax.bar(x + (slot - 1.5) * bar_width, f1_scores[series_label], bar_width, label=series_label, color=colour)

# Axis labels, title, ticks and a two-column legend below the plot.
ax.set_xlabel('Shots')
ax.set_ylabel('F1 Score')
ax.set_title('F1 Scores for Acceptance Types and Neutrality Levels Across Fake Documents')
ax.set_xticks(x)
ax.set_xticklabels(shots)
ax.legend(loc='lower center', bbox_to_anchor=(0.5, -0.25), ncols=2)

plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Grouped bar chart of the per-aspect F1 columns (`f1_Health`,
# `f1_Governmental`, `f1_Society`), one group per aspect and one bar per
# acceptance-type / neutrality combination.
aspects = ["Health", "Governmental", "Society"]

df = pd.read_excel("/content/drive/MyDrive/predictions_entailment/test_entailment_aspects_X.xlsx")

# One entry per bar series, in left-to-right order inside each group:
# (legend label, `type_acceptance` value, `neutral` value, bar colour).
series_spec = [
    ("Ass. Contr. No-Neutral", "Skeptical", "No-Neutral", '#4CAF50'),
    ("Ass. Contr. Neutral", "Skeptical", "Neutral", '#FF9800'),
    ("Ent. dir. No-Neutral", "Credulous", "No-Neutral", '#81C784'),
    ("Ent. dir. Neutral", "Credulous", "Neutral", '#FFB74D'),
]

# Only the rows with shots == 6 are used; for each series the first matching
# row supplies the value of every aspect.
# NOTE(review): the original comment on this cell said "12 shots" while the
# filter selects 6 — confirm which one is intended.
f1_scores = {}
for series_label, acceptance, neutrality, _ in series_spec:
    selected = df[(df["type_acceptance"] == acceptance) & (df["neutral"] == neutrality) & (df["shots"] ==6)]
    f1_scores[series_label] = [selected[f"f1_{aspect}"].values[0] for aspect in aspects]

# Group positions and bar width; the four bars are centred on each tick.
bar_width = 0.2
x = np.arange(len(aspects))

fig, ax = plt.subplots(figsize=(10, 6))

for slot, (series_label, _, _, colour) in enumerate(series_spec):
    ax.bar(x + (slot - 1.5) * bar_width, f1_scores[series_label], bar_width, label=series_label, color=colour)

# Axis label, title, ticks and a two-column legend below the plot.
ax.set_ylabel('F1 Score')
ax.set_title('F1 Scores for Acceptance Types and Neutrality Levels For Aspect')
ax.set_xticks(x)
ax.set_xticklabels(aspects)
ax.legend(loc='lower center', bbox_to_anchor=(0.5, -0.20), ncols=2)

plt.show()