# Neural NLP Representation Learning Approach

### CLEF 2025 - CheckThat! Lab - Task 4 Scientific Web Discourse - Subtask 4b (Scientific Claim Source Retrieval)

This notebook implements an improved neural approach using sentence transformers with:
- Enhanced text preprocessing for scientific content
- Multi-query retrieval with domain-specific augmentation
- Semantic term matching boosts

This remains a pure neural representation learning approach.

In [None]:
!pip install sentence-transformers scikit-learn

# 1) Importing data and packages

In [None]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re
import warnings
warnings.filterwarnings('ignore')

## 1.a) Import the collection set

In [None]:
# Location of the pickled collection file that is loaded into df_collection below.
PATH_COLLECTION_DATA = 'subtask4b_collection_data.pkl' #MODIFY PATH

In [None]:
# Load the collection into a DataFrame.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary code;
# only load a collection file obtained from a trusted source.
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)

In [None]:
# Summarize the collection's columns, dtypes and non-null counts.
df_collection.info()

In [None]:
# Preview the first rows of the collection.
df_collection.head()

## 1.b) Import the query set

In [None]:
# Locations of the tab-separated query files (train and dev splits) read below.
PATH_QUERY_TRAIN_DATA = 'subtask4b_query_tweets_train.tsv' #MODIFY PATH
PATH_QUERY_DEV_DATA = 'subtask4b_query_tweets_dev.tsv' #MODIFY PATH

In [None]:
# Read both query splits; the files are tab-separated, hence sep='\t'.
df_query_train = pd.read_csv(PATH_QUERY_TRAIN_DATA, sep='\t')
df_query_dev = pd.read_csv(PATH_QUERY_DEV_DATA, sep='\t')

In [None]:
# Preview the first rows of the train queries.
df_query_train.head()

In [None]:
# Preview the first rows of the dev queries.
df_query_dev.head()

# 2) Text preprocessing functions

In [None]:
def clean_tweet_text(text):
    """Clean tweet text while preserving scientific information.

    The substitutions are applied in a fixed order; later steps rely on the
    output of earlier ones (e.g. hashtags are unwrapped before the COVID
    spelling is normalized).

    Args:
        text: Raw tweet text. Any value for which ``pd.isna`` is true
            (``None``, ``NaN``) is treated as missing.

    Returns:
        The cleaned tweet as a single-spaced, stripped string, or ``""`` for
        missing input.
    """
    if pd.isna(text):
        return ""

    text = str(text)

    # Decode the HTML entities handled here ('&amp;' is spelled out as 'and').
    text = re.sub(r'&amp;', 'and', text)
    text = re.sub(r'&lt;', '<', text)
    text = re.sub(r'&gt;', '>', text)

    # Drop user mentions and URLs.
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+', '', text)

    # Map COVID-related hashtags to canonical names, then strip the leading
    # '#' from every remaining hashtag while keeping its word.
    text = re.sub(r'#covid19', 'COVID-19', text, flags=re.IGNORECASE)
    text = re.sub(r'#sarscov2', 'SARS-CoV-2', text, flags=re.IGNORECASE)
    text = re.sub(r'#(covid|coronavirus)', 'COVID-19', text, flags=re.IGNORECASE)
    text = re.sub(r'#(\w+)', r'\1', text)

    # Normalize spelling variants in running text. The negative lookahead on
    # the bare 'covid' rule keeps it from rewriting an existing 'COVID-19'.
    text = re.sub(r'\bcovid-?19\b', 'COVID-19', text, flags=re.IGNORECASE)
    text = re.sub(r'\bsars-?cov-?2\b', 'SARS-CoV-2', text, flags=re.IGNORECASE)
    text = re.sub(r'\bcovid\b(?![\d-])', 'COVID-19', text, flags=re.IGNORECASE)

    # Upper-case a few acronyms.
    text = re.sub(r'\bnih\b', 'NIH', text, flags=re.IGNORECASE)
    text = re.sub(r'\bicu\b', 'ICU', text, flags=re.IGNORECASE)
    text = re.sub(r'\bppe\b', 'PPE', text, flags=re.IGNORECASE)
    text = re.sub(r'\busa\b', 'USA', text, flags=re.IGNORECASE)

    # Spell out statistical shorthand ('p < 0.05', '95% CI').
    text = re.sub(r'\bp\s*[<>=]\s*0\.(\d+)', r'p-value 0.\1', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(\d+)%\s*ci\b', r'\1% confidence interval', text, flags=re.IGNORECASE)

    # Remove the listed emoji / symbol code points.
    text = re.sub(r'[💃🚨▶️👍📈📊🔥✅❌🎯🧵👇🏻🔴☑️⬇️➡️]+', '', text)
    # Fold typographic double quotes into the ASCII quote. The previous
    # pattern listed only the ASCII quote itself, so it never changed anything.
    text = re.sub(r'[\u201c\u201d\u201e\u201f]', '"', text)

    # Collapse whitespace runs left behind by the removals above.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def clean_scientific_text(text):
    """Normalize whitespace in scientific text.

    Missing values (as judged by ``pd.isna``) become an empty string; anything
    else is converted to ``str``, has every whitespace run collapsed to a
    single space, and is stripped at both ends.
    """
    if not pd.isna(text):
        collapsed = re.sub(r'\s+', ' ', str(text))
        return collapsed.strip()
    return ""

def create_enhanced_document_text(row):
    """Build the labelled text representation of one collection row.

    The result always starts with the cleaned title and abstract; the authors
    and journal fields are appended only when they are present and non-empty.

    Args:
        row: Mapping-like record with 'title', 'abstract', 'authors' and
            'journal' entries.

    Returns:
        A single string of the form
        "Title: ... Abstract: ...[ Authors: ...][ Journal: ...]".
    """
    def _text_or_empty(value):
        # Missing metadata is represented by an empty string.
        return "" if pd.isna(value) else str(value)

    title = clean_scientific_text(row['title'])
    abstract = clean_scientific_text(row['abstract'])
    authors = _text_or_empty(row['authors'])
    journal = _text_or_empty(row['journal'])

    segments = [f"Title: {title}. Abstract: {abstract}"]
    if authors:
        segments.append(f" Authors: {authors}")
    if journal:
        segments.append(f" Journal: {journal}")

    return "".join(segments)

# 3) Load sentence transformer model

In [None]:
# Load the sentence transformer model. No device is forced: when `device` is
# left unset, sentence-transformers uses a GPU if one is available and falls
# back to CPU otherwise, so the notebook also runs on machines without CUDA.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# 4) Prepare document representations

In [None]:
# Create the enhanced text representation of every document, row by row,
# and store it as a new 'enhanced_text' column on the collection.
df_collection['enhanced_text'] = df_collection.apply(create_enhanced_document_text, axis=1)

# Prepare corpus and IDs as parallel lists: position i in `corpus` and in
# `cord_uids` refer to the same collection row.
corpus = df_collection['enhanced_text'].tolist()
cord_uids = df_collection['cord_uid'].tolist()

# 5) Encode all documents

In [None]:
# Encode all documents once up front; the resulting matrix is reused for every query.
# Embeddings are requested as a NumPy array and normalized, matching how the
# query embeddings are produced at retrieval time.
doc_embeddings = model.encode(
    corpus,
    show_progress_bar=True,
    convert_to_numpy=True,
    batch_size=32,
    normalize_embeddings=True
)

# 6) Advanced retrieval functions

In [None]:
def create_augmented_queries(tweet_text):
    """Expand a tweet into one or more query strings.

    The cleaned tweet always comes first. Up to three prefixed copies follow,
    each added only when the cleaned tweet contains the matching cue
    (study-related terms, COVID-related terms, statistical expressions).

    Args:
        tweet_text: Raw tweet text.

    Returns:
        List of query strings, the cleaned tweet first.
    """
    base_query = clean_tweet_text(tweet_text)
    lowered = base_query.lower()

    def mentions(*terms):
        # Plain substring test against the lower-cased query.
        return any(term in lowered for term in terms)

    variants = [base_query]

    # Scientific framing for tweets that talk about studies.
    if mentions('study', 'research', 'trial', 'analysis', 'findings'):
        variants.append(f"scientific research {base_query}")

    # Pandemic framing for COVID-related tweets.
    if mentions('covid', 'coronavirus', 'pandemic', 'vaccine', 'mask'):
        variants.append(f"COVID-19 pandemic study {base_query}")

    # Statistical framing when percentages or statistical terms appear.
    has_statistics = re.search(
        r'\d+%|\bp-value|\bconfidence interval|\bodds ratio|\brisk',
        base_query,
        re.IGNORECASE,
    )
    if has_statistics:
        variants.append(f"statistical research findings {base_query}")

    return variants

def compute_semantic_boost(query_text, corpus_texts):
    """Compute additive score boosts from domain-specific term matching.

    A document earns a boost for each feature group that fires in BOTH the
    query and the document (case-insensitive):

    * ``+0.06`` — a percentage (``\\d+%``) or the word ``p-value``
    * ``+0.05`` — ``covid`` / ``coronavirus``
    * ``+0.03`` — ``study`` / ``trial`` / ``research``
    * ``+0.04`` — ``vaccine`` / ``mask`` / ``treatment``

    Term groups are matched as plain substrings.

    Args:
        query_text: Query string.
        corpus_texts: Sequence of document strings.

    Returns:
        ``np.ndarray`` of shape ``(len(corpus_texts),)`` holding the summed
        boost for each document.
    """
    stat_pattern = re.compile(r'\d+%|\bp-value')
    covid_terms = ('covid', 'coronavirus')
    study_terms = ('study', 'trial', 'research')
    topic_terms = ('vaccine', 'mask', 'treatment')

    query_lower = query_text.lower()

    # The query-side checks do not depend on the document, so evaluate them
    # once here instead of once per corpus entry.
    query_has_stats = stat_pattern.search(query_lower) is not None
    query_has_covid = any(term in query_lower for term in covid_terms)
    query_has_study = any(term in query_lower for term in study_terms)
    query_has_topic = any(term in query_lower for term in topic_terms)

    boosts = np.zeros(len(corpus_texts))

    # No query feature fired: every boost is zero, skip the corpus scan.
    if not (query_has_stats or query_has_covid or query_has_study or query_has_topic):
        return boosts

    for i, doc_text in enumerate(corpus_texts):
        doc_lower = doc_text.lower()
        boost = 0.0

        # Additions stay in this fixed order so the floating-point sums are
        # the same as when each group was checked inline.
        if query_has_stats and stat_pattern.search(doc_lower):
            boost += 0.06

        if query_has_covid and any(term in doc_lower for term in covid_terms):
            boost += 0.05

        if query_has_study and any(term in doc_lower for term in study_terms):
            boost += 0.03

        if query_has_topic and any(term in doc_lower for term in topic_terms):
            boost += 0.04

        boosts[i] = boost

    return boosts

def retrieve_papers_fast_boosting(query_text, k=5):
    """Rank collection documents for one tweet and return the best ``k`` ids.

    Every augmented query variant is embedded, scored against the precomputed
    document embeddings by cosine similarity, and shifted by its term-matching
    boost. With several variants the score vectors are fused by a weighted
    average: 0.6 for the first (cleaned tweet) and the remaining 0.4 split
    evenly across the others.

    Args:
        query_text: Raw tweet text.
        k: Number of document ids to return.

    Returns:
        List of ``cord_uid`` values, highest fused score first.
    """
    query_variants = create_augmented_queries(query_text)

    per_variant_scores = []
    for variant in query_variants:
        variant_embedding = model.encode(
            [variant],
            convert_to_numpy=True,
            normalize_embeddings=True,
        )
        neural_scores = cosine_similarity(variant_embedding, doc_embeddings).flatten()
        term_boosts = compute_semantic_boost(variant, corpus)
        per_variant_scores.append(neural_scores + term_boosts)

    n_variants = len(per_variant_scores)
    if n_variants == 1:
        fused_scores = per_variant_scores[0]
    else:
        n_extra = n_variants - 1
        weights = [0.6] + [0.4 / n_extra] * n_extra
        fused_scores = np.average(per_variant_scores, axis=0, weights=weights)

    # argsort is ascending, so reverse it before taking the first k.
    ranked = np.argsort(fused_scores)[::-1]
    return [cord_uids[idx] for idx in ranked[:k]]

# 7) Running the improved neural model

In [None]:
# Retrieve top-k candidates using the improved neural model.
# Ten candidates are kept because the evaluation reports MRR@10: with only
# five retrieved, MRR@10 could never differ from MRR@5. The submission export
# still keeps just the first five predictions.
RETRIEVAL_TOP_K = 10
df_query_train['improved_neural_topk'] = df_query_train['tweet_text'].apply(
    lambda tweet: retrieve_papers_fast_boosting(tweet, k=RETRIEVAL_TOP_K)
)
df_query_dev['improved_neural_topk'] = df_query_dev['tweet_text'].apply(
    lambda tweet: retrieve_papers_fast_boosting(tweet, k=RETRIEVAL_TOP_K)
)

# 8) Evaluating the improved neural model

In [None]:
# Evaluate retrieved candidates using MRR@k
def get_performance_mrr(data, col_gold, col_pred, list_k=(1, 5, 10)):
    """Compute Mean Reciprocal Rank at several cut-offs.

    For each row, the reciprocal rank is ``1 / position`` (1-based) of the
    gold id within the first ``k`` predictions, or ``0`` if it is not among
    them. MRR@k is the mean of these values over all rows.

    The input DataFrame is not modified.

    Args:
        data: DataFrame with one row per query.
        col_gold: Name of the column holding the gold id.
        col_pred: Name of the column holding the ranked list of predicted ids.
        list_k: Iterable of cut-offs to evaluate.

    Returns:
        Dict mapping each ``k`` to its MRR@k (``NaN`` when ``data`` has no rows).
    """
    def _reciprocal_rank(gold, preds, k):
        # Reciprocal of the 1-based position of `gold` within the first k predictions.
        top_k = list(preds[:k])
        return 1 / (top_k.index(gold) + 1) if gold in top_k else 0

    gold_ids = list(data[col_gold])
    predictions = list(data[col_pred])

    d_performance = {}
    for k in list_k:
        reciprocal_ranks = [
            _reciprocal_rank(gold, preds, k)
            for gold, preds in zip(gold_ids, predictions)
        ]
        d_performance[k] = pd.Series(reciprocal_ranks, dtype=float).mean()
    return d_performance

In [None]:
# Score the retrieved candidates against the gold 'cord_uid' on both splits.
results_train = get_performance_mrr(df_query_train, 'cord_uid', 'improved_neural_topk')
results_dev = get_performance_mrr(df_query_dev, 'cord_uid', 'improved_neural_topk')
# Results are printed in the following format: {k: MRR@k}
print(f"Results on the train set: {results_train}")
print(f"Results on the dev set: {results_dev}")

# 9) Exporting results to prepare the submission

In [None]:
# Keep at most the first five retrieved ids per dev query as the predictions to export.
df_query_dev['preds'] = df_query_dev['improved_neural_topk'].apply(lambda x: x[:5])

In [None]:
# Write the dev predictions as a tab-separated file with columns post_id and preds (no index column).
df_query_dev[['post_id', 'preds']].to_csv('predictions_improved_neural.tsv', index=None, sep='\t')