# Data Loading/Prep

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
import string
from tqdm.notebook import tqdm
# stopwords
nltk.download("stopwords")

In [None]:
# Relative paths to the FNC-1 body/stance CSV files and to the per-model
# prediction files that the cells below load.
TEST_BODIES_PATH = "data/fnc-1/competition_test_bodies.csv"
TEST_STANCES_PATH = "data/fnc-1/competition_test_stances.csv"
TRAIN_BODIES_PATH = "data/fnc-1/train_bodies.csv"
TRAIN_STANCES_PATH = "data/fnc-1/train_stances.csv"
# NOTE(review): the file is named "golden_labels" but is loaded below as the
# ALBERT predictions -- confirm this is the intended file.
ALBERT_PREDICTIONS = "data/fnc-1/golden_labels_2.csv"
BASELINE_PREDICTIONS = "data/fnc-1/baseline_output.csv"


## Loading Bodies and Stances for both models

In [None]:
def create_merged_df(bodies_path, stances_path):
    """Read a bodies CSV and a stances CSV and join them on 'Body ID'.

    The header row of each file is discarded and replaced by fixed column
    names: ('Body ID', 'articleBody') for the bodies and
    ('Headline', 'Body ID', 'Stance') for the stances.

    Returns the merged DataFrame (pandas' default inner join).
    """
    body_columns = ['Body ID', 'articleBody']
    stance_columns = ['Headline', 'Body ID', 'Stance']
    body_frame = pd.read_csv(bodies_path, names=body_columns, header=0)
    stance_frame = pd.read_csv(stances_path, names=stance_columns, header=0)
    return body_frame.merge(stance_frame, on='Body ID')


# Model predictions: one row per (Headline, Body ID) pair with the predicted stance.
prediction_columns = ['Headline', 'Body ID', 'Stance']
albert = pd.read_csv(ALBERT_PREDICTIONS, names=prediction_columns, header=0)
baseline = pd.read_csv(BASELINE_PREDICTIONS, names=prediction_columns, header=0)
baseline = baseline.rename(columns={'Stance': 'Stance_baseline'})

# Attach both models' predictions to the gold test pairs. After the first
# merge the gold stance is 'Stance_true' and ALBERT's is 'Stance_albert'.
pair_key = ['Headline', 'Body ID']
test_res = create_merged_df(TEST_BODIES_PATH, TEST_STANCES_PATH)
test_res = test_res.merge(albert, on=pair_key, suffixes=['_true', '_albert'])
test_res = test_res.merge(baseline, on=pair_key)

train = create_merged_df(TRAIN_BODIES_PATH, TRAIN_STANCES_PATH)
# display(test_res)
# display(train)

In [None]:
# Spot-check a few cells of the confusion matrices of both models.
print("A few sanity checks:")
gold_is_agree = test_res['Stance_true'] == 'agree'
gold_is_unrelated = test_res['Stance_true'] == 'unrelated'

correct_agree_albert = len(test_res.loc[gold_is_agree & (test_res['Stance_albert'] == 'agree')])
correct_agree_base = len(test_res.loc[gold_is_agree & (test_res['Stance_baseline'] == 'agree')])
print(f"Amount of agrees, predicted by ALBERT as agree: {correct_agree_albert}")
print(f"Amount of agrees, predicted by baseline as agree: {correct_agree_base}")

agree_disagree_albert = len(test_res.loc[gold_is_agree & (test_res['Stance_albert'] == 'disagree')])
print(f"Amount of agrees, predicted by ALBERT as disagree: {agree_disagree_albert}")

correct_unrelated = len(test_res.loc[gold_is_unrelated & (test_res['Stance_baseline'] == 'unrelated')])
print(f"Amount of unrelated pairs, correctly predicted by baseline as unrelated: {correct_unrelated}")


def get_confusion_value(true_label, predicted_label):
    """Count test pairs with gold stance `true_label` that ALBERT predicted as `predicted_label`.

    Reads the module-level `test_res` frame ('Stance_true' / 'Stance_albert'
    columns) and returns the number of matching rows as an int.
    """
    matches = (test_res['Stance_true'] == true_label) & (test_res['Stance_albert'] == predicted_label)
    return test_res.loc[matches].shape[0]


# Rows are the gold stance, columns the ALBERT prediction; both follow this
# label order. Every cell goes through the helper instead of 12 hand-copied
# filters, so a single typo can no longer corrupt one cell.
stance_labels = ['agree', 'disagree', 'discuss', 'unrelated']
confusion_matrix_albert = [
    [get_confusion_value(true_label, predicted_label) for predicted_label in stance_labels]
    for true_label in stance_labels
]
print(f"confusion matrix for ALBERT: {confusion_matrix_albert}")

In [None]:
# How many headline occurrences are shared between the train and test splits?
test_headlines = set(test_res['Headline'].values)
train_headlines = set(train['Headline'].values)

# Train rows whose headline also appears somewhere in the test set.
train_count = sum(1 for headline in train['Headline'].values if headline in test_headlines)
train_length = len(train['Headline'].values)
print(f"There are {train_count} train headlines that occur in the test set out of {train_length}")

# Test rows whose headline also appears somewhere in the train set.
test_count = sum(1 for headline in test_res['Headline'].values if headline in train_headlines)
test_length = len(test_res['Headline'].values)
print(f'There are {test_count} test headlines that occur in the train set out of {test_length}')

print(f'We have {len(train_headlines)} unique train headlines and {len(test_headlines)}, the intersection has size {len(train_headlines.intersection(test_headlines))}')

## Discard Unrelated Instances and Map Model Stances to Correctness

In [None]:
# Keep only pairs whose gold stance is not 'unrelated', then flag per model
# whether its prediction equals the gold stance.
is_related = test_res['Stance_true'] != 'unrelated'
test_res_rel = test_res.loc[is_related]
correct = test_res_rel.assign(
    correct_albert=test_res_rel['Stance_true'] == test_res_rel['Stance_albert'],
    correct_base=test_res_rel['Stance_true'] == test_res_rel['Stance_baseline'],
)
correct = correct[['articleBody', 'Headline', 'Stance_true', 'correct_albert', 'correct_base']]
display(correct)

## Aggregate Dataframes by Headlines and Bodies

In [None]:
# Number of correctly predicted pairs per headline for both models,
# ordered by ALBERT's count (highest first).
per_headline_hits = correct[['Headline', 'correct_albert', 'correct_base']].groupby(['Headline']).sum()
correct_count = per_headline_hits.sort_values('correct_albert', ascending=False).reset_index(level=0)
display(correct_count)

# Figure 2: Performance on Aggregated Bodies 

In [None]:
# Number of correctly predicted pairs per article body for both models,
# ordered by ALBERT's count (highest first).
grouped_bodies = (
    correct[['articleBody', 'correct_albert', 'correct_base']]
    .groupby(['articleBody'])
    .sum()
    .sort_values('correct_albert', ascending=False)
)
grouped_bodies.reset_index(level=0, inplace=True)

# Related pairs per body, counted in a single pass instead of re-filtering
# the whole frame once per body.
pair_count = test_res_rel['articleBody'].value_counts().to_dict()
body_totals = grouped_bodies['articleBody'].map(pair_count)

# Columns are added in the same order as before so the displayed table is unchanged.
grouped_bodies['missed_albert'] = body_totals - grouped_bodies['correct_albert']
grouped_bodies['missed_base'] = body_totals - grouped_bodies['correct_base']
grouped_bodies['total'] = body_totals
grouped_bodies['prop_albert'] = grouped_bodies['correct_albert'] / grouped_bodies['total']
grouped_bodies['prop_base'] = grouped_bodies['correct_base'] / grouped_bodies['total']

# Positive values mean ALBERT got more pairs right than the baseline for this body.
grouped_bodies['difference'] = grouped_bodies['correct_albert'] - grouped_bodies['correct_base']
display(grouped_bodies)

# Distribution of ALBERT's per-body accuracy; `hist_range` is reused by the next cell.
hist_range = [0,1]
plt.hist(grouped_bodies['prop_albert'], bins=20, range=hist_range)
plt.grid()
plt.title("Correctly predicted headlines by ALBERT model")
plt.ylabel('Frequency')
plt.xlabel('Proportion of correctly predicted headlines per body')
plt.savefig('prop_albert_bodies.pdf')
plt.show()
print(f"Mean props: {grouped_bodies['prop_albert'].mean()}")
print(f"std props: {grouped_bodies['prop_albert'].std()}")

In [None]:
# Distribution of the baseline's per-body accuracy (same binning as the ALBERT plot).
base_body_props = grouped_bodies['prop_base']
plt.hist(base_body_props, bins=20, range=hist_range)
plt.title("Correctly predicted headlines by the Baseline model")
plt.xlabel('Proportion of correctly predicted headlines per body')
plt.ylabel('Frequency')
plt.grid()
plt.savefig('prop_base_bodies.pdf')
plt.show()
print(f"Mean props: {base_body_props.mean()}")
print(f"std props: {base_body_props.std()}")

In [None]:
# Average number of correctly predicted pairs per headline, per model.
mean_correct_albert = correct_count['correct_albert'].mean()
mean_correct_base = correct_count['correct_base'].mean()
print(f"Mean correct albert: {mean_correct_albert}")
print(f"Mean correct base: {mean_correct_base}")

# Aggregate by Headlines and Data Exploration

In [None]:
# Related pairs per headline, counted in a single pass instead of
# re-filtering the whole frame once per headline.
pair_count = test_res_rel['Headline'].value_counts().to_dict()

grouped_res = correct_count.copy()
headline_totals = grouped_res['Headline'].map(pair_count)

# Columns are added in the same order as before so the displayed table is unchanged.
grouped_res['missed_albert'] = headline_totals - grouped_res['correct_albert']
grouped_res['missed_base'] = headline_totals - grouped_res['correct_base']
grouped_res['total'] = headline_totals

grouped_res['prop_albert'] = grouped_res['correct_albert'] / grouped_res['total']
grouped_res['prop_base'] = grouped_res['correct_base'] / grouped_res['total']

# Positive values mean ALBERT got more pairs right than the baseline for this headline.
grouped_res['difference'] = grouped_res['correct_albert'] - grouped_res['correct_base']
display(grouped_res)
# display(grouped_res.loc[grouped_res['total'] > 20].sort_values('prop_albert', ascending=False))


In [None]:
# Headlines ordered by how many more pairs ALBERT got right than the baseline.
display(grouped_res[['Headline','difference','correct_albert','correct_base']].sort_values('difference', ascending=False))

In [None]:
# First 10 headlines by lowest ALBERT proportion; ties broken by larger pair count.
display(grouped_res.sort_values(['prop_albert', 'total'], ascending=[True, False])[:10])

In [None]:
# First 10 headlines by lowest baseline proportion; ties broken by larger pair count.
display(grouped_res.sort_values(['prop_base', 'total'], ascending=[True, False])[:10])

In [None]:
# Mean and standard deviation of the per-headline proportion correct, per model.
albert_headline_props = grouped_res['prop_albert']
base_headline_props = grouped_res['prop_base']
print(f"Average score for albert: {albert_headline_props.mean()} (sigma = {albert_headline_props.std()})")
print(f"Average score for baseline: {base_headline_props.mean()} (sigma = {base_headline_props.std()})")

In [None]:
# Count headlines on which a model got (almost) nothing or (almost) everything right.
albert_props = grouped_res['prop_albert']
base_props = grouped_res['prop_base']

failed_count_albert = int((albert_props < 0.001).sum())
failed_count_base = int((base_props < 0.001).sum())
print(f"Amount of headlines where models failed completely: albert: {failed_count_albert}, base: {failed_count_base}")

succ_count_albert = int((albert_props > 0.999).sum())
succ_count_base = int((base_props > 0.999).sum())
print(f"Amount of headlines where models scored perfectly: albert: {succ_count_albert}, base: {succ_count_base}")

# Appendix: Related Instances per body/headline

In [None]:
# Histogram of the number of related bodies per headline; headlines with
# more pairs than the upper bound are only reported as a count.
hist_range = [0, 50]
headline_totals = grouped_res['total']
plt.hist(headline_totals, bins=20, range=hist_range)
outlier_headlines = grouped_res.loc[headline_totals > hist_range[1]]
print(f"Amount of headlines that are outliers (larger than {hist_range[1]}): {outlier_headlines.shape[0]}")
plt.xlabel("Amount of bodies for headline")
plt.ylabel("Frequency")
plt.grid()
plt.savefig('bodycount.pdf')
plt.show()

In [None]:
# Histogram of the number of related headlines per body; bodies with more
# pairs than the upper bound are only reported as a count.
hist_range = [0, 50]
plt.hist(grouped_bodies['total'], bins=20, range=hist_range)
# The mask is built from grouped_bodies, so it must index grouped_bodies too
# (it previously indexed grouped_res, a frame with a different set of rows).
outlier_bodies = grouped_bodies.loc[grouped_bodies['total'] > hist_range[1]]
print(f"Amount of bodies that are outliers (larger than {hist_range[1]}): {outlier_bodies.shape[0]}")
plt.grid()
plt.ylabel("Frequency")
plt.xlabel("Amount of headlines for a body")
plt.savefig('headlinecount.pdf')
plt.show()

# Figure 1: Performance per Aggregated Headline

In [None]:
# Distribution of ALBERT's per-headline accuracy.
hist_range = [0,1]
albert_headline_props = grouped_res['prop_albert']
plt.hist(albert_headline_props, bins=20, range=hist_range)
plt.title("Correctly predicted bodies by ALBERT model")
plt.xlabel('Proportion of correctly predicted bodies per headline')
plt.ylabel('Frequency')
plt.grid()
plt.savefig('prop_albert.pdf')
plt.show()
print(f"Mean ALBERT: {albert_headline_props.mean()}, std: {albert_headline_props.std()}")

In [None]:
# Distribution of the baseline's per-headline accuracy.
hist_range = [0,1]
base_headline_props = grouped_res['prop_base']
plt.hist(base_headline_props, bins=20, range=hist_range)
plt.title("Correctly predicted bodies by baseline model")
plt.xlabel('Proportion of correctly predicted bodies per headline')
plt.ylabel('Frequency')
plt.grid()
plt.savefig('prop_base.pdf')
plt.show()
print(f"Mean base: {base_headline_props.mean()}, std: {base_headline_props.std()}")

# Result Analysis (Similarity between successful/unsuccessful groups)

In [None]:
# Sets of headlines each model got (almost) entirely right or entirely wrong.
# NOTE(review): the cut-offs here are 0.99 / 0.01, while other cells use
# 0.999 / 0.001 -- confirm the difference is intended.
albert_prop = grouped_res['prop_albert']
base_prop = grouped_res['prop_base']
correct_headlines_albert = set(grouped_res.loc[albert_prop > 0.99, 'Headline'].values)
correct_headlines_base = set(grouped_res.loc[base_prop > 0.99, 'Headline'].values)
failed_headlines_albert = set(grouped_res.loc[albert_prop < 0.01, 'Headline'].values)
failed_headlines_base = set(grouped_res.loc[base_prop < 0.01, 'Headline'].values)

shared_correct = correct_headlines_albert.intersection(correct_headlines_base)
shared_failed = failed_headlines_albert.intersection(failed_headlines_base)
print(f"Intersection size of set of correct headlines for albert ({len(correct_headlines_albert)}) and baseline ({len(correct_headlines_base)}): {len(shared_correct)}")
print(f"Intersection size of set of failed headlines for albert ({len(failed_headlines_albert)}) and baseline ({len(failed_headlines_base)}): {len(shared_failed)}")


In [None]:
# Jaccard similarity between the pairs belonging to ALBERT's failed headlines
# and the pairs belonging to ALBERT's failed bodies.
failed_bodies_albert = set(grouped_bodies.loc[grouped_bodies['prop_albert'] < 0.001, 'articleBody'].values)
pair_columns = ['Headline', 'articleBody']
in_failed_headline = test_res_rel['Headline'].isin(failed_headlines_albert)
in_failed_body = test_res_rel['articleBody'].isin(failed_bodies_albert)
failed_instances_headlines_albert = test_res_rel.loc[in_failed_headline, pair_columns]
failed_instances_bodies_albert = test_res_rel.loc[in_failed_body, pair_columns]

intersection = failed_instances_headlines_albert.merge(failed_instances_bodies_albert, how='inner', on=pair_columns)
union = failed_instances_headlines_albert.merge(failed_instances_bodies_albert, how='outer', on=pair_columns)
n_shared, n_total = intersection.shape[0], union.shape[0]
print(f"Body instances size: {failed_instances_bodies_albert.shape[0]}, headline instances size: {failed_instances_headlines_albert.shape[0]}")
print(f"Intersection: {n_shared}, union: {n_total}")
print(f"Jaccard similarity: {n_shared / n_total}")

In [None]:
# Jaccard similarity between the pairs belonging to ALBERT's fully correct
# headlines and the pairs belonging to ALBERT's fully correct bodies.
correct_bodies_albert = set(grouped_bodies.loc[grouped_bodies['prop_albert'] > 0.99, 'articleBody'].values)
pair_columns = ['Headline', 'articleBody']
in_correct_headline = test_res_rel['Headline'].isin(correct_headlines_albert)
in_correct_body = test_res_rel['articleBody'].isin(correct_bodies_albert)
correct_instances_headlines_albert = test_res_rel.loc[in_correct_headline, pair_columns]
correct_instances_bodies_albert = test_res_rel.loc[in_correct_body, pair_columns]

intersection = correct_instances_headlines_albert.merge(correct_instances_bodies_albert, how='inner', on=pair_columns)
union = correct_instances_headlines_albert.merge(correct_instances_bodies_albert, how='outer', on=pair_columns)
n_shared, n_total = intersection.shape[0], union.shape[0]
print(f"Body instances size: {correct_instances_bodies_albert.shape[0]}, headline instances size: {correct_instances_headlines_albert.shape[0]}")
print(f"Intersection: {n_shared}, union: {n_total}")
print(f"Jaccard similarity: {n_shared / n_total}")

# (Discarded) Lexical Overlap between successful and unsuccessful groups

In [None]:
# Shared state for tokenize_corpus: tokens to drop and the stemmer to apply.
punct = set(string.punctuation)
# A set, not a list: tokenize_corpus tests every token for membership, and the
# whole corpus of article bodies is pushed through it.
sw = set(stopwords.words('english'))
# Extra quote-like tokens that string.punctuation does not cover.
punct.update(['`', '\'', '\'s', '’', '‘', '“', '”', '``', "''"])
stemmer = PorterStemmer()
def tokenize_corpus(corpus):
    """Tokenize, filter and stem every passage of `corpus`.

    Each passage is split with `word_tokenize`; tokens are lower-cased,
    dropped if they are in `sw` or `punct`, and stemmed with `stemmer`.

    Returns a tuple `(stemmed_passages, stem_to_token, stem_counts)`:
      - stemmed_passages: one list of stems per passage, in input order
      - stem_to_token: stem -> the last lower-cased token that produced it
      - stem_counts: stem -> number of occurrences over the whole corpus
    """
    stem_to_token = {}
    stem_counts = {}
    stemmed_passages = []
    for passage in corpus:
        stems = []
        for raw_token in word_tokenize(passage):
            lowered = raw_token.lower()
            if lowered in sw or lowered in punct:
                continue
            stem = stemmer.stem(lowered)
            stems.append(stem)
            stem_to_token[stem] = lowered
            stem_counts[stem] = stem_counts.get(stem, 0) + 1
        stemmed_passages.append(stems)
    return (stemmed_passages, stem_to_token, stem_counts)


In [None]:
def sort_dict(x):
    """Return the (key, value) pairs of `x` as a list, largest value first.

    Pairs with equal values keep their original insertion order.
    """
    return sorted(x.items(), key=lambda pair: pair[1], reverse=True)

# Most frequent words in the headlines each model got entirely right,
# reported with a surface form instead of the raw stem.
tokenized_set, token_dict1, tf1 = tokenize_corpus(correct_headlines_albert)
word_dict_albert = sort_dict({token_dict1[stem]: count for stem, count in tf1.items()})

tokenized_set, token_dict2, tf2 = tokenize_corpus(correct_headlines_base)
word_dict_base = sort_dict({token_dict2[stem]: count for stem, count in tf2.items()})

t=15

word_freq_albert = word_dict_albert[:t]
word_freq_base = word_dict_base[:t]
print(f"Top {t} frequent words in successfull headlines of ALBERT: {word_freq_albert}")
print(f"Top {t} frequent words in successfull headlines of Baseline: {word_freq_base}")

# Share of the top-t words that both models have in common.
top_words_albert = {word for word, _ in word_freq_albert}
top_words_base = {word for word, _ in word_freq_base}
overlap = len(top_words_albert.intersection(top_words_base)) / t
print(f"The overlap of this top {t} is {overlap}")

In [None]:

# Most frequent words in the headlines each model got entirely wrong,
# reported with a surface form instead of the raw stem.
tokenized_set, token_dict1, tf1 = tokenize_corpus(failed_headlines_albert)
word_dict_albert = sort_dict({token_dict1[stem]: count for stem, count in tf1.items()})

tokenized_set, token_dict2, tf2 = tokenize_corpus(failed_headlines_base)
word_dict_base = sort_dict({token_dict2[stem]: count for stem, count in tf2.items()})

t=15

word_freq_albert = word_dict_albert[:t]
word_freq_base = word_dict_base[:t]
print(f"Top {t} frequent words in failed headlines of ALBERT: {word_freq_albert}")
print(f"Top {t} frequent words in failed headlines of Baseline: {word_freq_base}")

# Share of the top-t words that both models have in common.
top_words_albert = {word for word, _ in word_freq_albert}
top_words_base = {word for word, _ in word_freq_base}
overlap = len(top_words_albert.intersection(top_words_base)) / t
print(f"The overlap of this top {t} is {overlap}")

In [None]:
# Headlines ALBERT failed on entirely; first 10 by most negative difference (ALBERT minus baseline).
display(grouped_res.loc[grouped_res['prop_albert'] < 0.001].sort_values('difference', ascending=True)[:10])

In [None]:
# Headlines the baseline failed on entirely; first 10 by largest difference (ALBERT minus baseline).
display(grouped_res.loc[grouped_res['prop_base'] < 0.001].sort_values('difference', ascending=False)[:10])

In [None]:
# Show all test pairs whose gold stance is not 'unrelated'.
display(test_res_rel)

In [None]:
# Tokenize every distinct body and headline of the related test pairs once,
# keyed by the raw text, so later cells can look the stems up.
bodies = list(set(test_res_rel['articleBody'].values))
headlines = list(set(test_res_rel['Headline'].values))
corpus = bodies + headlines
tokenized_corpus = {
    text: tokenize_corpus([text])[0][0]
    for text in tqdm(corpus, total=len(corpus))
}

print(len(tokenized_corpus))

In [None]:
# For every related (headline, body) pair, compute the share of distinct
# headline stems that also occur in the body.
overlap = correct.copy()
overlap_values = []
for headline, body in tqdm(overlap[['Headline', 'articleBody']].values, total=overlap.shape[0]):
    headline_token_set = set(tokenized_corpus[headline])
    body_token_set = set(tokenized_corpus[body])
    intersect = headline_token_set.intersection(body_token_set)
    if headline_token_set:
        overl = len(intersect) / len(headline_token_set)
    else:
        # A headline made up only of stop words / punctuation has no stems
        # left; report zero overlap instead of dividing by zero.
        overl = 0.0
    overlap_values.append(overl)
overlap['overlap'] = overlap_values

In [None]:
# Show the per-pair correctness flags together with the new 'overlap' column.
display(overlap)

# (Discarded) Investigation whether Lexical Overlap between headline and body is correlated to performance

In [None]:
# Overlap distribution for the pairs ALBERT classified correctly.
albert_hits = overlap.loc[overlap['correct_albert']]
plt.hist(albert_hits['overlap'], bins = 20)
plt.grid()
plt.title("Proportion of headline tokens present in the body for related correctly predicted instances (ALBERT)")
plt.show()

In [None]:
# Overlap distribution for the pairs ALBERT classified incorrectly.
albert_misses = overlap.loc[~overlap['correct_albert']]
plt.hist(albert_misses['overlap'], bins = 20)
plt.grid()
plt.title("Proportion of headline tokens present in the body for related falsely predicted instances (ALBERT)")
plt.show()

In [None]:
# falsely_agr = []
# falsely_dsc = []
# falsely_dsg = []
# for headline in grouped_res['Headline'].values:
#     instances = test_res_rel.loc[(test_res_rel['Headline'] == headline)]
#     falsely_agr.append(instances.loc[intance['Stance_albert']])

# (Discarded) Jaccard Similarity Between Successful and Unsuccessful groups

In [None]:
# Pairs that belong to a body / headline ALBERT failed on entirely, and the
# size of their overlap (merge on the shared columns).
pair_columns = ['Headline', 'articleBody']
failed_bodies = grouped_bodies.loc[grouped_bodies['prop_albert'] < 0.001, 'articleBody'].values
failed_headlines = grouped_res.loc[grouped_res['prop_albert'] < 0.001, 'Headline'].values
instances_bodies_failed = test_res_rel.loc[test_res_rel['articleBody'].isin(failed_bodies), pair_columns]
instances_headlines_failed = test_res_rel.loc[test_res_rel['Headline'].isin(failed_headlines), pair_columns]
print(instances_bodies_failed.shape)
print(instances_headlines_failed.shape)

shared_failed_instances = pd.merge(instances_bodies_failed, instances_headlines_failed)
print(shared_failed_instances.shape)

In [None]:
from itertools import chain

# Pairs whose headline AND body were both handled perfectly by ALBERT,
# flattened into one list of texts (headline, body, headline, body, ...).
pair_columns = ['Headline', 'articleBody']
success_instances_albert_headlines = test_res_rel.loc[test_res_rel['Headline'].isin(correct_headlines_albert), pair_columns]
success_instances_albert_bodies = test_res_rel.loc[test_res_rel['articleBody'].isin(correct_bodies_albert), pair_columns]
success_merged_dfs = success_instances_albert_headlines.merge(
    success_instances_albert_bodies, how='inner', on=pair_columns
).values
success_corpus = list(chain.from_iterable(success_merged_dfs))

In [None]:
# Pairs whose headline AND body were both failed entirely by ALBERT,
# flattened into one list of texts (headline, body, headline, body, ...).
pair_columns = ['Headline', 'articleBody']
failed_instances_albert_headlines = test_res_rel.loc[test_res_rel['Headline'].isin(failed_headlines_albert), pair_columns]
failed_instances_albert_bodies = test_res_rel.loc[test_res_rel['articleBody'].isin(failed_bodies_albert), pair_columns]
failed_merged_dfs = failed_instances_albert_headlines.merge(
    failed_instances_albert_bodies, how='inner', on=pair_columns
).values
failed_corpus = list(chain.from_iterable(failed_merged_dfs))

In [None]:
# Stem frequencies of the successful and failed corpora, most frequent first.
_, _, success_stem_counts = tokenize_corpus(success_corpus)
_, _, failed_stem_counts = tokenize_corpus(failed_corpus)
tf_success = sort_dict(success_stem_counts)
tf_failed = sort_dict(failed_stem_counts)
# tf_success = sort_dict(tokenize_corpus(correct_headlines_albert)[2])
# tf_failed = sort_dict(tokenize_corpus(failed_headlines_albert)[2])

In [None]:
# First 15 (stem, count) pairs of each frequency list.
print(tf_success[:15])
print(tf_failed[:15])

In [None]:
# Jaccard similarity between the top-t stems of the successful and the
# failed corpus, for increasing cut-offs t.
T = [5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 500, 1000, 5000]
print("Jaccard similarity for")
for t in T:
    success_set = {stem for stem, _ in tf_success[:t]}
    failed_set = {stem for stem, _ in tf_failed[:t]}
    shared_stems = success_set & failed_set
    all_stems = success_set | failed_set
    print(f"   - top {t} tokens: {len(shared_stems)/len(all_stems)}")

# Appendix: Difficult Headlines/Body Examples

In [None]:
# Pick one body that ALBERT failed on entirely and that has more than 5 related headlines.
# NOTE(review): 889 is a hard-coded index label of grouped_bodies; it raises a
# KeyError if that row does not pass the filter (e.g. after the input data or
# the sort order changes) -- confirm it still points at the intended body.
example_body_row = grouped_bodies.loc[(grouped_bodies['prop_albert'] < 0.001) & (grouped_bodies['total'] > 5)].loc[889]#.loc[grouped_bodies['Body ID'] == 2557]
# display(example_body_row)

In [None]:
# Print the example body followed by every related headline paired with it.
example_body = example_body_row['articleBody']
print("Example Body:")
print(example_body)

has_example_body = test_res_rel['articleBody'] == example_body
associated_headlines = test_res_rel.loc[has_example_body]
print(f"Associated headlines n={associated_headlines.shape[0]}:")
display(associated_headlines)
print(associated_headlines['Headline'].values)

In [None]:
# Randomly pick one headline that ALBERT failed on entirely and that has fewer than 10 related bodies.
# NOTE(review): sample() is called without random_state, so a different
# headline may be drawn on every run.
example_headline_row = grouped_res.loc[(grouped_res['prop_albert'] < 0.001) & (grouped_res['total'] < 10)].sample(n=1)


In [None]:
# Show the sampled headline, then every related body paired with it together
# with the gold stance and both models' predictions.
display(example_headline_row)
example_headline = example_headline_row['Headline'].values[0]
print("Example Headline:")
print(example_headline)

has_example_headline = test_res_rel['Headline'] == example_headline
associated_bodies = test_res_rel.loc[has_example_headline]
print(f"Associated bodies n={associated_bodies.shape[0]}:")
stance_columns = ['articleBody', 'Stance_true', 'Stance_albert', 'Stance_baseline']
display(associated_bodies[stance_columns])
# Only the first 100 characters of each body, to keep the output readable.
print([body[:100] for body in associated_bodies['articleBody']])

# Corpus Statistics

In [None]:
# Reload the raw (unmerged) train and test files to report corpus-level statistics.
train_bodies = pd.read_csv(TRAIN_BODIES_PATH, names=['Body ID', 'articleBody'], header=0)
train_stances = pd.read_csv(TRAIN_STANCES_PATH, names=['Headline', 'Body ID', 'Stance'], header=0)

test_bodies = pd.read_csv(TEST_BODIES_PATH, names=['Body ID', 'articleBody'], header=0)
test_stances = pd.read_csv(TEST_STANCES_PATH, names=['Headline', 'Body ID', 'Stance'], header=0)

# NOTE(review): these add the per-split unique counts, so a headline or body
# that occurs in both splits is counted twice -- confirm that is intended.
print("Num headlines:", len(set(test_stances['Headline'])) + len(set(train_stances['Headline'])))
print("Num bodies:", len(set(test_bodies['articleBody'].values)) + len(set(train_bodies['articleBody'].values)))
print("Num instances:", train_stances.shape[0] + test_stances.shape[0])

# Count every stance label over both splits. The first occurrence of a label
# must count as 1 (it was previously initialised to 0 without incrementing,
# leaving every count one too low and the frequencies not summing to 1).
stance_count = {}
all_stances = np.concatenate((train_stances['Stance'].values, test_stances['Stance'].values))
for stance in all_stances:
    stance_count[stance] = stance_count.get(stance, 0) + 1
print(stance_count)
stance_freq = [count / len(all_stances) for count in stance_count.values()]
print(stance_freq)