In [None]:
import requests

def get_wikipedia_pageviews(article_title, start_date='20230101', end_date='20231231', timeout=30):
    """Return the total number of user pageviews of an English Wikipedia article.

    Queries the Wikimedia per-article pageviews endpoint (daily granularity,
    all access methods, user agents only) and sums the ``views`` field of every
    returned item.

    Args:
        article_title: Title of the article.
        start_date: First day of the range, formatted ``YYYYMMDD``.
        end_date: Last day of the range, formatted ``YYYYMMDD``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The summed views, or 0 when the request fails, the status code is not
        200, or the payload has no ``items`` key (a message is printed in each
        of these cases).
    """
    from urllib.parse import quote

    # The endpoint expects underscores instead of spaces and a URI-encoded
    # title; without this a title containing "/" or "?" changes the route.
    encoded_title = quote(str(article_title).replace(' ', '_'), safe='')
    url = (
        "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
        f"en.wikipedia/all-access/user/{encoded_title}/daily/{start_date}/{end_date}"
    )
    headers = {'User-Agent': 'WikiPageviewsAnalyzer/1.0'}

    try:
        # A timeout keeps one stalled connection from hanging a whole df.apply run.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Same best-effort policy as for HTTP errors below: report and return 0.
        print(f"Error fetching data for {article_title}: {exc}")
        return 0

    if response.status_code != 200:
        print(f"Error fetching data for {article_title}, status code: {response.status_code}")
        return 0

    data = response.json()
    if 'items' not in data:
        print(f"No 'items' found for {article_title}")
        return 0

    return sum(item['views'] for item in data['items'])

In [None]:
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Shared TF-IDF vectorizer instance; calculate_total_popularity_and_closest_article
# re-fits it on every (mini-fact, document) pair.
vectorizer = TfidfVectorizer()

def calculate_total_popularity_and_closest_article(docs, output_mini_fact):
    """Pick the entry of ``docs`` most similar to a mini-fact and look up its pageviews.

    Each entry of ``docs`` is compared with ``output_mini_fact`` by TF-IDF
    cosine similarity (the shared ``vectorizer`` is re-fitted on each pair).
    The best-scoring entry is then passed to ``get_wikipedia_pageviews`` as
    the article title.

    Args:
        docs: Sequence of strings to compare against the mini-fact.
        output_mini_fact: The mini-fact text.

    Returns:
        ``(pageviews, closest_article)``. When ``docs`` is ``None`` or empty
        there is nothing to compare, so ``(0, None)`` is returned instead of
        failing on ``max()`` of an empty list.
    """
    if docs is None or len(docs) == 0:
        return 0, None

    cosine_scores = []
    for doc in docs:
        tfidf_matrix = vectorizer.fit_transform([output_mini_fact, doc])
        # Row 0 is the mini-fact, row 1 the candidate document.
        cosine_scores.append(cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0, 0])

    # On ties, list.index returns the first best-scoring document.
    closest_article = docs[cosine_scores.index(max(cosine_scores))]
    pageviews = get_wikipedia_pageviews(closest_article)
    return pageviews, closest_article


Create Popularity Column for Train

In [None]:
# Load the training split.
df = pd.read_pickle('train/train.pkl')

# For every row, find the document closest to the mini-fact and its pageviews;
# the returned (pageviews, article) pair is expanded into two new columns.
df[['popularity', 'closest_article']] = df.apply(
    lambda record: calculate_total_popularity_and_closest_article(
        docs=record['docs'],
        output_mini_fact=record['output_mini_fact'],
    ),
    result_type='expand',
    axis=1,
)

# Persist the enriched (still unbalanced) training split.
df.to_pickle('train/train_with_popularity_unbalanced.pkl')

Create Popularity Column for Test/Dev

In [None]:
# Load the dev split.
df = pd.read_pickle('train/dev.pkl')

# For every row, find the document closest to the mini-fact and its pageviews;
# the returned (pageviews, article) pair is expanded into two new columns.
df[['popularity', 'closest_article']] = df.apply(
    lambda record: calculate_total_popularity_and_closest_article(
        docs=record['docs'],
        output_mini_fact=record['output_mini_fact'],
    ),
    result_type='expand',
    axis=1,
)

# Persist the enriched (still unbalanced) dev split.
df.to_pickle('train/dev_with_popularity_unbalanced.pkl')

Get real samples

In [None]:
import unicodedata2
import sqlite3

def query_wiki(doc_title, db_path='wiki_wo_links.db'):
    """Fetch the stored text of a document from the local SQLite wiki dump.

    The title is NFD-normalised before it is matched against the ``id``
    column of the ``documents`` table.

    Args:
        doc_title: Title of the document to look up.
        db_path: Path of the SQLite database file.

    Returns:
        The ``text`` column of the matching row.

    Raises:
        IndexError: If no row has the normalised title as its id. The
            exception type matches what the previous ``[0][0]`` lookup raised,
            but the message now names the missing title.
    """
    normalized_title = unicodedata2.normalize('NFD', doc_title)

    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT text FROM documents WHERE id = ?", (normalized_title,)
        ).fetchone()
    finally:
        # Close the connection even when the query itself raises.
        conn.close()

    if row is None:
        raise IndexError(f"No document with id {normalized_title!r} found in {db_path}")
    return row[0]

In [None]:
# System instruction for the mini-fact decomposition prompt (used as the system
# message in get_prompt_mini_facts).
# NOTE(review): the f-prefix has no placeholders, and "independant" is a typo;
# both are left untouched because this text is sent verbatim to the LLM.
mini_facts_instruction = f"""Your task is to breakdown claims/sentences into independant and verifiable statements (maximum 4). 
You must NEVER correct or comment the original claims/sentences even if something of the original claims/sentences is incorrect.
Do NEVER generate statements that are not in the original claims/sentences. Every statement must start with an entity that specifies the topic (e.g. **The Fox Broadcasting Company** and not **The company**)."""
        
# Few-shot user inputs: three example claims fed to the model as user turns.
# NOTE(review): "TonÃ" looks like mojibake (presumably of "Toné") - confirm
# whether it is intentional before changing it, since it alters the prompt.
mini_facts_samples = ["The Hunger Games is a 2012 American science fiction film directed by John Peter and based on the novel of the same name by Suzanne Collins. Matt Lucena is an American former professional tennis player.",
"""Owen Wilson starred in the film "The Karate Kid" (2010) alongside martial arts expert Tom Wu. Owen Wilson voiced Lightning McQueen in the "Cars" franchise, not "The Royal Tenenbaums" franchise.""",
"Feels So Good is a song by the American R&B group Tony! Toni! TonÃ. The song was written by the group's lead vocalist Raphael Saadiq and producer Tony! Toni! TonÃ's lead vocalist Dwayne Wimberly."]
        
        
# Few-shot assistant outputs, index-aligned with mini_facts_samples: one
# "- **Entity** ..." bullet per extracted statement.
mini_facts_sample_outputs = ["""- **The Hunger Games** is a 2012 American science fiction film.
- **The Hunger Games** was directed by John Peter.
- **The Hunger Games** is based on a novel of the same name by Suzanne Collins.
- **Matt Lucena** is an American former professional tennis player.""",
"""- **Owen Wilson** starred in the film The Karate Kid (2010) alongside martial arts expert Tom Wu.
- **Owen Wilson** voiced Lightning McQueen in the Cars franchise.
- **Owen Wilson** did not voice Lightning McQueen in the The Royal Tenenbaums franchise.""",
"""- **Feels So Good** is a song by the American R&B group Tony! Toni! TonÃ.
- **Feels So Good** was written by the group's lead vocalist Raphael Saadiq and producer Tony! Toni! TonÃ's lead vocalist Dwayne Wimberly."""]

In [None]:
import spacy
import os
from openai import OpenAI
import torch


# spaCy pipeline used by convert_text_to_sentences for sentence segmentation.
nlp = spacy.load("en_core_web_sm")

# The OpenAI key lives in a file outside the notebook and is exported through
# the environment; call_llm builds OpenAI() without arguments, so it presumably
# picks the key up from there.
with open("../api.key", "r") as file:
    api_key = file.read().strip()
os.environ["OPENAI_API_KEY"] = api_key

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def convert_text_to_sentences(text):
    """Split ``text`` into sentences using the module-level spaCy pipeline ``nlp``.

    Returns:
        A list holding the text of each detected sentence, in document order.
    """
    return [sentence.text for sentence in nlp(text).sents]

def call_llm(messages, response_format):
    """Send a chat completion request to ``gpt-4o-mini`` and return the reply text.

    Args:
        messages: Chat messages in the OpenAI chat-completions format.
        response_format: Value used for the ``type`` field of the request's
            ``response_format`` (the notebook passes ``"text"``).

    Returns:
        The ``content`` of the first choice's message.
    """
    # Greedy decoding settings (temperature 0, no penalties), capped at 256 tokens.
    request = dict(
        model="gpt-4o-mini",
        messages=messages,
        response_format={"type": response_format},
        temperature=0.0,
        top_p=1.0,
        max_tokens=256,
        frequency_penalty=0,
        presence_penalty=0,
    )
    completion = OpenAI().chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content
    

def get_prompt_mini_facts(gen_evidence):
    """Build the few-shot chat prompt that asks the model to split text into mini-facts.

    The prompt is: the system instruction, the first three sample/answer pairs
    as alternating user/assistant turns, then ``gen_evidence`` as the final
    user turn.

    Returns:
        A list of eight message dicts in chat-completions format.
    """
    def text_message(role, text):
        # Every message carries a single text part.
        return {"role": role, "content": [{"type": "text", "text": f"{text}"}]}

    messages = [text_message("system", mini_facts_instruction)]
    for shot in range(3):
        messages.append(text_message("user", mini_facts_samples[shot]))
        messages.append(text_message("assistant", mini_facts_sample_outputs[shot]))
    messages.append(text_message("user", gen_evidence))
    return messages


Add 150 real documents (each document yields 3-4 mini-facts) to each popularity bin of the test set

In [None]:
import pandas as pd

df_test = pd.read_pickle('test/test_llm_generations.pkl')

# Define labels and create quantile-based bins using qcut
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
df_test['popularity_bin'] = pd.qcut(df_test['popularity'], q=5, labels=labels)

# Collect one record per mini-fact and build the DataFrame once at the end.
# (Previously each concat result was assigned to a different variable, so
# df_real_test_samples stayed empty and an empty frame was pickled.)
real_test_records = []

# Iterate over each bin group
for bin_label, bin_group in df_test.groupby('popularity_bin'):
    count_in_bin = 0

    # Process each row within the bin
    for _, row in bin_group.iterrows():
        popularity = row['popularity']
        closest_article = row['closest_article']

        # Query wiki, split into sentences, and prompt the LLM with the first sentence only
        text = query_wiki(closest_article)
        sentences = convert_text_to_sentences(text)
        messages = get_prompt_mini_facts(sentences[0])
        output = call_llm(messages, "text")

        # Strip the markdown bullets/bold markers and drop blank lines.
        # NOTE(review): replace("-", "") also removes hyphens inside words
        # (e.g. "well-known") - confirm this is intended.
        response = output.replace("**", "").replace("-", "")
        response = [item.strip() for item in response.split("\n") if item.strip()]

        # One record per mini-fact; real samples are always labelled 1
        for mini_fact in response:
            real_test_records.append({
                'output_mini_fact': mini_fact,
                'label_mini_fact': 1,
                'closest_article': [closest_article],
                'popularity': popularity
            })

        count_in_bin += 1
        # Stop after processing 150 documents for the current bin
        if count_in_bin >= 150:
            break

    print(f"Completed {count_in_bin} entries for popularity bin {bin_label}")

df_real_test_samples = pd.DataFrame(
    real_test_records,
    columns=['output_mini_fact', 'label_mini_fact', 'closest_article', 'popularity'],
)

# Final count for df_real_test_samples
print("Total entries added:", len(df_real_test_samples))
df_real_test_samples.to_pickle('test_folder/mini_fact_hover_test_with_popularity_real_samples.pkl')

Add real train samples with low popularity (popularity < 5000, which corresponds to documents in the 'Very Low' and 'Low' bins)

In [None]:
# Collect one record per mini-fact and build the DataFrame once at the end,
# instead of re-concatenating a growing frame for every mini-fact.
real_train_records = []
count_low_popularity = 0

df_with_popularity_unbalanced = pd.read_pickle('train/train_with_popularity_unbalanced.pkl')

for index, row in df_with_popularity_unbalanced.iterrows():
    # Progress report: number of low-popularity documents processed so far
    if index % 100 == 0:
        print(count_low_popularity)

    # Stop once 2000 low-popularity documents have been processed
    if count_low_popularity == 2000:
        break
    popularity = row['popularity']
    if popularity < 5000:
        count_low_popularity += 1
        closest_article = row['closest_article']

        # Prompt the LLM with the first sentence of the article only
        text = query_wiki(closest_article)
        sentences = convert_text_to_sentences(text)
        messages = get_prompt_mini_facts(sentences[0])
        output = call_llm(messages, "text")

        # Strip the markdown bullets/bold markers and drop blank lines, so a
        # blank line in the reply does not become an empty mini-fact row.
        # NOTE(review): replace("-", "") also removes hyphens inside words
        # (e.g. "well-known") - confirm this is intended.
        response = output.replace("**", "").replace("-", "")
        response = [item.strip() for item in response.split("\n") if item.strip()]

        # One record per mini-fact; real samples are always labelled 1
        for mini_fact in response:
            real_train_records.append({
                'output_mini_fact': mini_fact,
                'label_mini_fact': 1,
                'closest_article': [closest_article],
                'popularity': popularity
            })

df_real_train_samples = pd.DataFrame(
    real_train_records,
    columns=['output_mini_fact', 'label_mini_fact', 'closest_article', 'popularity'],
)



Real Samples need embeddings

In [None]:
import transformers
import torch


# When True, get_embeddings strips trailing periods/spaces from each prompt.
remove_period = True

# Local Hugging Face cache snapshot of Meta-Llama-3-8B-Instruct.
# NOTE(review): machine-specific absolute path - adjust it on other machines.
model_path = "C:/Users/droeh/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e5e23bbe8e749ef0efcf16cad411a7d23bd23298"

# Both loaders read from disk only (local_files_only=True).
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_path,
    local_files_only=True,
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_path,
    local_files_only=True,
    device_map="auto",
    load_in_4bit=True,
    torch_dtype=torch.bfloat16,
)

In [None]:
import pandas as pd
from transformers import logging
# Limit transformers logging to error-level messages.
logging.set_verbosity_error()

def get_embeddings(df_real_samples, layer=-16):
    """Attach a hidden-state embedding of every mini-fact to the frame.

    For each row, ``output_mini_fact`` is tokenized and run through the
    module-level ``model`` for a single generation step. The hidden state of
    the last prompt token at ``layer`` is stored as a list of floats in the
    column ``embeddings{layer}_mini_fact`` (``embeddings-16_mini_fact`` for
    the default layer).

    Args:
        df_real_samples: DataFrame with an ``output_mini_fact`` text column.
            It is modified in place.
        layer: Index into the per-layer hidden states (negative values count
            from the last layer).

    Returns:
        The same DataFrame object, with the embedding column added.
    """
    embeddings_name = f"embeddings{layer}_mini_fact"
    # Object dtype so that each cell can hold a Python list.
    df_real_samples[embeddings_name] = None
    df_real_samples[embeddings_name] = df_real_samples[embeddings_name].astype(object)

    def process_row(prompt, layer):
        """Return the chosen layer's last-token hidden state of ``prompt`` as a list."""
        if remove_period:
            prompt = prompt.rstrip(". ")
        # Send the inputs to wherever the model was placed, not a hard-coded
        # "cuda", so this also runs when device_map="auto" chose another device.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                output_hidden_states=True,
                return_dict_in_generate=True,
                max_new_tokens=1,
                min_new_tokens=1,
            )
        # hidden_states[0] -> first generation step; [layer] -> chosen layer;
        # [0] -> first batch item; [-1] -> last token position.
        last_hidden_state = outputs.hidden_states[0][layer][0][-1]
        # Cast to float32 before the numpy conversion (the model is loaded in bfloat16).
        return last_hidden_state.to(torch.float32).cpu().numpy().tolist()

    # Progress is reported by position so a non-integer index does not break it.
    for position, (index, row) in enumerate(df_real_samples.iterrows()):
        if position % 100 == 0:
            print(position)
        df_real_samples.at[index, embeddings_name] = process_row(row['output_mini_fact'], layer)

    return df_real_samples

# Add the embedding column to the real train and test samples. Both frames must
# already exist in the kernel (they are built by the cells above).
df_real_train_samples = get_embeddings(df_real_train_samples)
df_real_test_samples = get_embeddings(df_real_test_samples)

Balance all dfs

In [None]:
import pandas as pd

# Reload the unbalanced train split and the real low-popularity samples from disk.
# NOTE(review): no visible cell writes 'train_injection_low_popularity_with_embeddings.pkl';
# presumably df_real_train_samples was pickled manually after embedding - confirm.
df_with_popularity_unbalanced = pd.read_pickle('train/train_with_popularity_unbalanced.pkl')
df_real_train_samples = pd.read_pickle('train/train_injection_low_popularity_with_embeddings.pkl')

In [None]:
from sklearn.utils import resample

def balance_df_by_popularity(df):
    """Balance positive and negative mini-facts within each popularity quintile.

    ``popularity`` is cut into five quantile bins. Within each bin, the larger
    of the two ``label_mini_fact`` classes (1 and 0) is down-sampled without
    replacement to the size of the smaller one.

    Args:
        df: DataFrame with ``popularity`` and ``label_mini_fact`` columns.
            It is not modified.

    Returns:
        A new DataFrame containing the balanced rows of all bins, with an added
        ``popularity_bin`` column. The original index labels are kept.
    """
    labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
    # assign() returns a new frame, so the caller's frame does not silently
    # gain a 'popularity_bin' column.
    binned = df.assign(popularity_bin=pd.qcut(df['popularity'], q=5, labels=labels))

    balanced_dfs = []
    # observed=False pins the current behaviour of iterating every category.
    for bin_label, group in binned.groupby('popularity_bin', observed=False):
        positive_class = group[group['label_mini_fact'] == 1]
        negative_class = group[group['label_mini_fact'] == 0]
        minority_size = min(len(positive_class), len(negative_class))
        print(f"Balancing bin '{bin_label}' with {minority_size} samples in each class")
        # Fixed random_state keeps the down-sampling reproducible.
        positive_class_balanced = resample(positive_class, replace=False, n_samples=minority_size, random_state=42)
        negative_class_balanced = resample(negative_class, replace=False, n_samples=minority_size, random_state=42)
        balanced_dfs.append(pd.concat([positive_class_balanced, negative_class_balanced]))

    return pd.concat(balanced_dfs)

# Append the real low-popularity samples to the unbalanced train split, then
# balance the label classes within each popularity bin.
df_balanced_with_popularity_balanced_train_injections = pd.concat([df_with_popularity_unbalanced, df_real_train_samples], ignore_index=True)
df_balanced_with_popularity_balanced_train_injections = balance_df_by_popularity(df_balanced_with_popularity_balanced_train_injections)

# Saving is currently disabled.
# NOTE(review): the first line refers to df_balanced_with_popularity_balanced,
# which is not defined in any visible cell - confirm before re-enabling.
#df_balanced_with_popularity_balanced.to_pickle('train/train_with_popularity_balanced.pkl')
#df_balanced_with_popularity_balanced_train_injections.to_pickle('train/train_with_popularity_balanced_train_injections.pkl')

In [None]:
# Display the balanced frame as the cell output.
df_balanced_with_popularity_balanced_train_injections