# CLEF 2025 - CheckThat! Lab
## Task 4 Scientific Web Discourse - Subtask 4b (Scientific Claim Source Retrieval)
## EDA

## 1. Import data

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from wordcloud import WordCloud

## 1.a) Import CORD Dataset
The data contains metadata of CORD-19 academic papers.

The preprocessed and filtered CORD-19 dataset is available here: https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/tree/main/task4?ref_type=heads

In [None]:
# Relative path to the pickled CORD-19 collection file; resolved against the
# notebook's working directory.
PATH_COLLECTION_DATA = 'subtask4b_collection_data.pkl'

In [None]:
# Load the paper collection (used as a DataFrame by the cells below).
# NOTE(review): unpickling can execute arbitrary code - only load this file
# when it comes from the trusted official source.
df_cord = pd.read_pickle(PATH_COLLECTION_DATA)

In [None]:
# Print the collection's columns with their non-null counts and dtypes.
df_cord.info()

In [None]:
# Export the full collection to CSV first: a notebook only renders the value
# of a cell's last expression, so the preview has to come last to be shown.
df_cord.to_csv('cord.csv')
df_cord.head()

## 1.b) Import the Tweets Dataset
The query set contains tweets with implicit references to academic papers from the collection set.

The preprocessed query set is available here: https://gitlab.com/checkthat_lab/clef2025-checkthat-lab/-/tree/main/task4?ref_type=heads

In [None]:
# Relative path to the tab-separated query tweets file; resolved against the
# notebook's working directory.
PATH_QUERY_DATA = 'subtask4b_query_tweets.tsv'

In [None]:
# Read the query tweets; the file is tab-separated, hence sep='\t'.
df_tweets = pd.read_csv(PATH_QUERY_DATA, sep = '\t')

In [None]:
# Preview the first rows of the query tweets.
df_tweets.head()

In [None]:
# Print the tweet table's columns with their non-null counts and dtypes.
df_tweets.info()

## 2. CORD Dataset

### 2.a) Description

In [None]:
# Summarise only the object-typed (text) columns of the collection.
df_cord.describe(include=[object])

### 2.b) Bar Charts

### Sources (source_x)

In [None]:
# Keep only the first entry when a paper lists several ';'-separated sources.
source_x = [string.split(";")[0].strip() for string in df_cord['source_x'].to_list()]
unique_source_x = list(set(source_x))

# Count papers per source and draw one bar per category. A histogram over
# string categories places its bin edges at non-integer positions, so the bars
# drift away from their tick labels; a bar chart keeps each bar centred on its
# label. Counter preserves first-appearance order, matching the category order
# matplotlib assigns to string data.
source_counts = Counter(source_x)
plt.bar(list(source_counts), list(source_counts.values()), width=0.5)

### Journals (journal)

In [None]:
# Count papers per journal, keep the twenty most frequent, and chart them.
journal_counts = df_cord['journal'].value_counts()
journals = journal_counts.nlargest(20)
journals.plot.bar()

### Authors

In [None]:
# Split the ';'-separated author strings into one column per author position.
df1 = df_cord['authors'].str.split(';', expand=True)

# Walk the columns in order, trimming whitespace in place and collecting
# every cell (missing positions included for now).
author_list = []
for position in df1.columns:
    df1[position] = df1[position].str.strip()
    author_list.extend(df1[position].values)

# Drop the missing entries, recognised by their string form 'nan'.
author_list = [name for name in author_list if str(name) != 'nan']

In [None]:
# Tally how many times each author name is credited across the collection.
credits_per_author = Counter(author_list)
num_credits = len(author_list)
num_authors = len(credits_per_author)

# Share of distinct authors that are credited more than once. The ratio
# num_authors / num_credits is "unique authors per credit", which is not what
# the label below describes. Guard against an empty author list.
num_repeat_authors = sum(1 for count in credits_per_author.values() if count > 1)
share_repeat_authors = num_repeat_authors / num_authors if num_authors else 0.0

print("Number of authors: {}".format(num_authors))
print("Number of credits: {}".format(num_credits))
print("Percent of authors with multiple publications: {0:.2%}".format(share_repeat_authors))

In [None]:
# Rank author names by number of credits and chart the top twenty.
credit_counts = pd.DataFrame(author_list)[0].value_counts()
authors = credit_counts.nlargest(20)
authors.plot.bar()

## 3. Tweets Dataset

### 3.a) Description

In [None]:
# Summarise only the object-typed (text) columns of the tweets.
df_tweets.describe(include=[object])

In [None]:
# Tokens to discard, compared against the lower-cased token. Built once as a
# frozenset so each membership test is O(1) instead of a scan of the list.
# Includes the empty string (produced when splitting on single spaces), a
# bare dash, and the HTML-escaped ampersand.
_STOPWORDS = frozenset([
    "", "-", "&amp;", "likely", "new", "may", "i", "me", "my", "myself", "we",
    "our", "ours", "ourselves", "you", "your", "yours", "yourself",
    "yourselves", "he", "him", "his", "himself", "she", "her", "hers",
    "herself", "it", "its", "itself", "they", "them", "their", "theirs",
    "themselves", "what", "which", "who", "whom", "this", "that", "these",
    "those", "am", "is", "are", "was", "were", "be", "been", "being", "have",
    "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the",
    "and", "but", "if", "or", "because", "as", "until", "while", "of", "at",
    "by", "for", "with", "about", "against", "between", "into", "through",
    "during", "before", "after", "above", "below", "to", "from", "up", "down",
    "in", "out", "on", "off", "over", "under", "again", "further", "then",
    "once", "here", "there", "when", "where", "why", "how", "all", "any",
    "both", "each", "few", "more", "most", "other", "some", "such", "no",
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
    "t", "can", "will", "just", "don", "should", "now",
])


def remove_stopwords(text):
    """Return the tokens of *text* that are not stop words.

    Args:
        text: An iterable of string tokens (not a raw string).

    Returns:
        A list of the tokens whose lower-cased form is not a stop word, in
        their original order and with their original casing.
    """
    return [token for token in text if token.lower() not in _STOPWORDS]

In [None]:
def word_frequencies(words):
    """Count how often each non-stop-word token occurs in a text column.

    Args:
        words: A pandas Series of strings.

    Returns:
        A Counter mapping each remaining token (case preserved) to its
        number of occurrences across the whole Series.
    """
    text = words.str.cat(sep=' ')
    # split() without an argument breaks on any run of whitespace, so tokens
    # separated by newlines or tabs are not glued together and no empty
    # tokens are produced.
    return Counter(remove_stopwords(text.split()))

In [None]:
# Build one token-frequency table per text source, in this order:
# tweet texts, paper abstracts, paper titles.
tweet_frequencies, cord_abstract_frequencies, cord_title_frequencies = [
    word_frequencies(text_column)
    for text_column in (df_tweets['tweet_text'], df_cord['abstract'], df_cord['title'])
]

In [None]:
def _show_wordcloud(frequencies, title):
    """Render and display a word cloud for a token -> count mapping.

    Args:
        frequencies: Mapping from token to its occurrence count.
        title: Text shown above the figure.

    Returns:
        The generated WordCloud object.
    """
    cloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(frequencies)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(title, fontsize=16)
    plt.show()
    return cloud


# Display word clouds: one figure per text source, each kept under its own
# name so that no variable ends up holding a different source's cloud.
tweets_wordcloud = _show_wordcloud(tweet_frequencies, "Word Cloud for Tweets")
abstracts_wordcloud = _show_wordcloud(cord_abstract_frequencies, "Word Cloud for CORD Abstracts")
titles_wordcloud = _show_wordcloud(cord_title_frequencies, "Word Cloud for CORD Titles")