# Data Loading/Prep

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
import string
# Fetch the NLTK stop-word corpus used by stopwords.words('english') below.
# NOTE(review): word_tokenize (imported above) relies on separate NLTK
# tokenizer data that this cell does not download — presumably installed
# already; verify on a fresh environment.
nltk.download("stopwords")

In [None]:
# Input files, given relative to the notebook's working directory.
# Test split: article bodies and (headline, body, stance) pairs.
TEST_BODIES_PATH = "data/fnc-1/competition_test_bodies.csv"
TEST_STANCES_PATH = "data/fnc-1/competition_test_stances.csv"
# Train split, same two-file layout.
TRAIN_BODIES_PATH = "data/fnc-1/train_bodies.csv"
TRAIN_STANCES_PATH = "data/fnc-1/train_stances.csv"
# Model outputs that get joined against the test split.
# NOTE(review): despite the "golden_labels" file name, this file is loaded
# below as the ALBERT predictions — confirm it is the right file.
ALBERT_PREDICTIONS = "data/fnc-1/golden_labels_2.csv"
BASELINE_PREDICTIONS = "data/fnc-1/baseline_output.csv"


In [None]:
def create_merged_df(bodies_path, stances_path):
    """Load a bodies file and a stances file and join them on 'Body ID'.

    Both files are read with their own header row discarded and replaced by
    fixed column names.

    Args:
        bodies_path: CSV with two columns, read as 'Body ID', 'articleBody'.
        stances_path: CSV with three columns, read as 'Headline', 'Body ID',
            'Stance'.

    Returns:
        DataFrame with one row per stance row that has a matching body
        (pandas' default inner join).
    """
    body_frame = pd.read_csv(
        bodies_path,
        header=0,
        names=['Body ID', 'articleBody'],
    )
    stance_frame = pd.read_csv(
        stances_path,
        header=0,
        names=['Headline', 'Body ID', 'Stance'],
    )
    return body_frame.merge(stance_frame, on='Body ID')


# Model predictions, one row per (headline, body) pair. The baseline's label
# column gets its final name 'Stance_baseline' directly at load time.
albert = pd.read_csv(
    ALBERT_PREDICTIONS,
    header=0,
    names=['Headline', 'Body ID', 'Stance'],
)
baseline = pd.read_csv(
    BASELINE_PREDICTIONS,
    header=0,
    names=['Headline', 'Body ID', 'Stance_baseline'],
)

# Gold test pairs with both prediction sets joined on (Headline, Body ID).
# In the first join both sides carry a 'Stance' column, so the suffixes turn
# them into 'Stance_true' (gold) and 'Stance_albert' (prediction).
test_res = (
    create_merged_df(TEST_BODIES_PATH, TEST_STANCES_PATH)
    .merge(albert, on=['Headline', 'Body ID'], suffixes=('_true', '_albert'))
    .merge(baseline, on=['Headline', 'Body ID'])
)
train = create_merged_df(TRAIN_BODIES_PATH, TRAIN_STANCES_PATH)

display(test_res)
display(train)

In [None]:
print("A few sanity checks:")

# Boolean masks over the joined test results, shared by the counts below.
gold_is_agree = test_res['Stance_true'] == 'agree'
gold_is_unrelated = test_res['Stance_true'] == 'unrelated'

correct_agree_albert = int((gold_is_agree & (test_res['Stance_albert'] == 'agree')).sum())
correct_agree_base = int((gold_is_agree & (test_res['Stance_baseline'] == 'agree')).sum())
print(f"Amount of agrees, predicted by ALBERT as agree: {correct_agree_albert}")
print(f"Amount of agrees, predicted by baseline as agree: {correct_agree_base}")

agree_disagree_albert = int((gold_is_agree & (test_res['Stance_albert'] == 'disagree')).sum())
print(f"Amount of agrees, predicted by ALBERT as disagree: {agree_disagree_albert}")

correct_unrelated = int((gold_is_unrelated & (test_res['Stance_baseline'] == 'unrelated')).sum())
print(f"Amount of unrelated pairs, correctly predicted by baseline as unrelated: {correct_unrelated}")


def get_confusion_value(true_label, predicted_label):
    """Count test pairs with gold stance `true_label` that ALBERT labelled `predicted_label`.

    Reads the notebook-level `test_res` frame (columns 'Stance_true' and
    'Stance_albert').

    Returns:
        Number of matching rows as an int.
    """
    gold_matches = test_res['Stance_true'] == true_label
    prediction_matches = test_res['Stance_albert'] == predicted_label
    return test_res.loc[gold_matches & prediction_matches].shape[0]
# 4x4 confusion matrix for ALBERT as nested lists: rows are the true stance,
# columns the predicted stance, both in the order of `stance_labels`.
# Every cell goes through get_confusion_value instead of repeating the
# filter expression inline for three of the four rows.
stance_labels = ['agree', 'disagree', 'discuss', 'unrelated']
confusion_matrix_albert = [
    [get_confusion_value(true_label, predicted_label) for predicted_label in stance_labels]
    for true_label in stance_labels
]
print(f"confusion matrix for ALBERT: {confusion_matrix_albert}")
# Printed unconditionally: nothing in this cell actually checks for a discrepancy.
print("WARNING: there seems to be a discrepancy")

In [None]:
# How much do the train and test headlines overlap?
# The first two counts are per row (a headline paired with several bodies is
# counted once per pair); the last line compares the sets of unique headlines.
test_headlines = set(test_res['Headline'].values)
train_headlines = set(train['Headline'].values)

train_count = sum(headline in test_headlines for headline in train['Headline'].values)
train_length = len(train['Headline'].values)
print(f"There are {train_count} train headlines that occur in the test set out of {train_length}")

test_count = sum(headline in train_headlines for headline in test_res['Headline'].values)
test_length = len(test_res['Headline'].values)
print(f'There are {test_count} test headlines that occur in the train set out of {test_length}')

shared_headlines = train_headlines & test_headlines
# The message now says what the second number is (it was left unlabelled).
print(f'We have {len(train_headlines)} unique train headlines and {len(test_headlines)} unique test headlines, the intersection has size {len(shared_headlines)}')

In [None]:
# Keep only the pairs whose gold stance is not 'unrelated'.
test_res_rel = test_res.loc[test_res['Stance_true'] != 'unrelated']

# Per-pair correctness flag for each model, next to the text it refers to.
correct = test_res_rel.assign(
    correct_albert=test_res_rel['Stance_true'] == test_res_rel['Stance_albert'],
    correct_base=test_res_rel['Stance_true'] == test_res_rel['Stance_baseline'],
)[['articleBody', 'Headline', 'Stance_true', 'correct_albert', 'correct_base']]
display(correct)

In [None]:
# Number of correctly predicted pairs per headline for each model (summing
# the boolean flags), ordered by ALBERT's count, with 'Headline' turned back
# into an ordinary column.
hits_per_headline = correct[['Headline', 'correct_albert', 'correct_base']].groupby(['Headline']).sum()
correct_count = hits_per_headline.sort_values('correct_albert', ascending=False).reset_index()
display(correct_count)

In [None]:
# Average number of correctly predicted pairs per headline, for each model.
mean_hits_albert = correct_count['correct_albert'].mean()
mean_hits_base = correct_count['correct_base'].mean()
print(f"Mean correct albert: {mean_hits_albert}")
print(f"Mean correct base: {mean_hits_base}")

In [None]:
# Number of related (headline, body) pairs per headline, computed in a single
# pass instead of re-filtering the whole frame once for every headline.
pair_count = test_res_rel['Headline'].value_counts().to_dict()

# Per-headline table: hits from correct_count plus misses, totals and
# proportions for both models.
grouped_res = correct_count.copy()
total = grouped_res['Headline'].map(pair_count)

grouped_res['missed_albert'] = total - grouped_res['correct_albert']
grouped_res['missed_base'] = total - grouped_res['correct_base']
grouped_res['total'] = total

# Fraction of a headline's pairs that each model got right.
grouped_res['prop_albert'] = grouped_res['correct_albert'] / grouped_res['total']
grouped_res['prop_base'] = grouped_res['correct_base'] / grouped_res['total']

# Positive values: ALBERT got more pairs right than the baseline.
grouped_res['difference'] = grouped_res['correct_albert'] - grouped_res['correct_base']
display(grouped_res)
# display(grouped_res.loc[grouped_res['total'] > 20].sort_values('prop_albert', ascending=False))


In [None]:
# Headlines ordered by how many more pairs ALBERT got right than the baseline.
difference_view = grouped_res[['Headline', 'difference', 'correct_albert', 'correct_base']]
display(difference_view.sort_values('difference', ascending=False))

In [None]:
# Ten headlines with the lowest ALBERT proportion; ties go to the headline
# with more pairs.
worst_for_albert = grouped_res.sort_values(['prop_albert', 'total'], ascending=[True, False])
display(worst_for_albert.head(10))

In [None]:
# Ten headlines with the lowest baseline proportion; ties go to the headline
# with more pairs.
worst_for_base = grouped_res.sort_values(['prop_base', 'total'], ascending=[True, False])
display(worst_for_base.head(10))

In [None]:
# Mean of the per-headline proportions (every headline weighs the same,
# regardless of how many pairs it has).
avg_prop_albert = grouped_res['prop_albert'].mean()
avg_prop_base = grouped_res['prop_base'].mean()
print(f"Average score for albert: {avg_prop_albert}")
print(f"Average score for baseline: {avg_prop_base}")

In [None]:
# Headlines with a proportion of (practically) 0 or 1; the small tolerances
# stand in for exact float comparison.
failed_count_albert = int((grouped_res['prop_albert'] < 0.001).sum())
failed_count_base = int((grouped_res['prop_base'] < 0.001).sum())
print(f"Amount of headlines where models failed completely: albert: {failed_count_albert}, base: {failed_count_base}")

succ_count_albert = int((grouped_res['prop_albert'] > 0.999).sum())
succ_count_base = int((grouped_res['prop_base'] > 0.999).sum())
print(f"Amount of headlines where models scored perfectly: albert: {succ_count_albert}, base: {succ_count_base}")

In [None]:
# Distribution of the number of bodies per headline; headlines above the
# upper bound fall outside the plotted range and are reported separately.
hist_range = [0, 50]
upper_bound = hist_range[1]
outlier_count = grouped_res.loc[grouped_res['total'] > upper_bound].shape[0]

plt.hist(grouped_res['total'], bins=20, range=hist_range)
print(f"Amount of headlines that are outliers (larger than {upper_bound}): {outlier_count}")
plt.xlabel("Amount of bodies for headline")
plt.ylabel("Frequency")
plt.grid()
plt.show()

In [None]:
# Histogram of ALBERT's per-headline proportion of correctly predicted bodies.
hist_range = [0, 1]
plt.hist(grouped_res['prop_albert'], range=hist_range, bins=20)
plt.title("Correctly predicted bodies by ALBERT model")
plt.xlabel('Proportion of correctly predicted bodies per headline')
plt.ylabel('Frequency')
plt.grid()
plt.show()

In [None]:
# Histogram of the baseline's per-headline proportion of correctly predicted bodies.
hist_range = [0, 1]
plt.hist(grouped_res['prop_base'], range=hist_range, bins=20)
plt.title("Correctly predicted bodies by baseline model")
plt.xlabel('Proportion of correctly predicted bodies per headline')
plt.ylabel('Frequency')
plt.grid()
plt.show()

In [None]:
# Sets of headlines each model got (almost) entirely right or entirely wrong,
# and how much the two models agree on them.
albert_perfect = grouped_res['prop_albert'] > 0.99
base_perfect = grouped_res['prop_base'] > 0.99
albert_failed = grouped_res['prop_albert'] < 0.01
base_failed = grouped_res['prop_base'] < 0.01

correct_headlines_albert = set(grouped_res.loc[albert_perfect, 'Headline'].values)
correct_headlines_base = set(grouped_res.loc[base_perfect, 'Headline'].values)
failed_headlines_albert = set(grouped_res.loc[albert_failed, 'Headline'].values)
failed_headlines_base = set(grouped_res.loc[base_failed, 'Headline'].values)

both_correct = correct_headlines_albert & correct_headlines_base
both_failed = failed_headlines_albert & failed_headlines_base
print(f"Intersection size of set of correct headlines for albert ({len(correct_headlines_albert)}) and baseline ({len(correct_headlines_base)}): {len(both_correct)}")
print(f"Intersection size of set of failed headlines for albert ({len(failed_headlines_albert)}) and baseline ({len(failed_headlines_base)}): {len(both_failed)}")

In [None]:
# Shared text-processing resources read by tokenize_corpus below.
# Single ASCII punctuation characters, as a set.
punct = set(string.punctuation)
# English stop words from the NLTK corpus (a list).
sw = stopwords.words('english')
# Porter stemmer instance reused for every token.
stemmer = PorterStemmer()

def tokenize_corpus(corpus):
    """Tokenize, filter and stem every passage in `corpus`.

    Tokens are lower-cased before the stop-word check, so capitalised stop
    words ("The", "Of", ...) are dropped as well; NLTK's English stop-word
    list is lower-case, and the previous check on the raw token let them
    through. Single-character punctuation tokens are dropped too.

    Reads the notebook-level `sw`, `punct` and `stemmer`.

    Args:
        corpus: iterable of strings.

    Returns:
        Tuple (result, token_dict, tf):
            result: one list of stems per passage, in iteration order.
            token_dict: maps each stem to the most recent original token
                that produced it.
            tf: maps each stem to its occurrence count over the whole corpus.
    """
    stop_words = set(sw)  # built once so every membership test is O(1)
    token_dict = {}
    result = []
    tf = {}
    for passage in corpus:
        tokenized = []
        for token in word_tokenize(passage):
            lowered = token.lower()
            if lowered in stop_words or token in punct:
                continue
            stem = stemmer.stem(lowered)
            tokenized.append(stem)
            token_dict[stem] = token
            tf[stem] = tf.get(stem, 0) + 1
        result.append(tokenized)
    return (result, token_dict, tf)

In [None]:
def sort_dict(x):
    """Return the items of `x` as a list of (key, value) tuples, largest value first.

    Items with equal values keep their order from `x`.
    """
    pairs = list(x.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

# Stem frequencies over the headlines each model got entirely right,
# as (stem, count) lists sorted by descending count.
tokenized_set, token_dict1, tf1 = tokenize_corpus(correct_headlines_albert)
word_dict_albert = sort_dict(dict(tf1))

tokenized_set, token_dict2, tf2 = tokenize_corpus(correct_headlines_base)
word_dict_base = sort_dict(dict(tf2))

# How many of the most frequent stems to show.
t = 15

top_albert = word_dict_albert[:t]
top_base = word_dict_base[:t]
print(f"Top {t} frequent words in successfull headlines of ALBERT: {top_albert}")
print(f"Top {t} frequent words in successfull headlines of Baseline: {top_base}")

# Intermezzo: corpus statistics

In [None]:
from collections import Counter

TRAIN_BODIES_PATH = "data/fnc-1/train_bodies.csv"
TRAIN_STANCES_PATH = "data/fnc-1/train_stances.csv"

train_bodies = pd.read_csv(TRAIN_BODIES_PATH, names=['Body ID', 'articleBody'], header=0)
train_stances = pd.read_csv(TRAIN_STANCES_PATH, names=['Headline', 'Body ID', 'Stance'], header=0)

# NOTE(review): `bodies` and `stances` were used here without ever being
# assigned at notebook level (they only exist as locals of create_merged_df),
# so this cell failed on a fresh run. They are assumed to be the test split,
# since they are combined with the train split below — confirm.
bodies = pd.read_csv(TEST_BODIES_PATH, names=['Body ID', 'articleBody'], header=0)
stances = pd.read_csv(TEST_STANCES_PATH, names=['Headline', 'Body ID', 'Stance'], header=0)

# Sizes are added per split, so a headline or body that appears in both
# splits is counted twice.
print("Num headlines:", len(set(stances['Headline'])) + len(set(train_stances['Headline'])))
print("Num bodies:", len(set(bodies['articleBody'])) + len(set(train_bodies['articleBody'])))
print("Num instances:", train_stances.shape[0] + stances.shape[0])

# Label distribution over both splits. The previous hand-written counter set a
# label to 0 on first sight without counting that occurrence, so every count
# was one too low.
all_stances = np.concatenate((train_stances['Stance'].values, stances['Stance'].values))
stance_count = dict(Counter(all_stances))
print(stance_count)
stance_freq = [count / len(all_stances) for count in stance_count.values()]
print(stance_freq)

In [None]:

# NOTE(review): `df` is never assigned at notebook level in the visible cells
# (it is only a local inside create_merged_df) — presumably the merged test
# frame left over from an earlier session; confirm before re-running.
# NOTE(review): assigning a Series from another frame aligns on index labels,
# so these columns only match the right pairs if `df` shares its index/row
# order with the prediction files — verify (the cells above join on
# ['Headline', 'Body ID'] instead).
df['Predictions'] = albert['Stance']
# The baseline's label column is renamed to 'Stance_baseline' when it is
# loaded, so the old 'Stance' lookup raised a KeyError.
df['Baseline'] = baseline['Stance_baseline']
display(df.sample(n=5))
print(df.shape)

In [None]:
# Filter out only related pairs
# Keep only the related pairs. The explicit copy makes df_rel an independent
# frame: the error-analysis cells further down add columns to it, which on a
# bare .loc slice triggers pandas' SettingWithCopyWarning.
df_rel = df.loc[df['Stance'] != 'unrelated'].copy()

display(df_rel.sample(n=5))
print(df_rel.shape)

# Data Transformation

In [None]:
import nltk
import sklearn
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
import string
import gensim
import gensim.downloader as api

# Download if not installed already
# nltk.download('stopwords')

In [None]:
# load word2vec model
wv = api.load('word2vec-google-news-300')
# stopwords
nltk.download("stopwords")
sw = stopwords.words('english')
# punctuation
punct = set(string.punctuation)
# stemmer
stemmer = PorterStemmer()

In [None]:
# Topic modelling over the related headlines: TF-IDF features fed into LDA.
from sklearn.feature_extraction.text import TfidfVectorizer
# Vocabulary capped at 50000 terms, scikit-learn's built-in English stop words removed.
tf_vectorizer = TfidfVectorizer(stop_words='english', max_features=50000)
tfidf_matrix  = tf_vectorizer.fit_transform(df_rel['Headline'])
# LDA implementation from scikit-learn.
from sklearn.decomposition import LatentDirichletAllocation
# Fit a 100-topic model; random_state is fixed for reproducibility.
lda = LatentDirichletAllocation(n_components=100, learning_method='online', 
                                          random_state=0, verbose=0, n_jobs = -1)
lda_model = lda.fit(tfidf_matrix)
# Transform the same matrix the model was fitted on.
lda_matrix = lda_model.transform(tfidf_matrix)
# Last expression of the cell: the notebook displays the shape.
lda_matrix.shape



In [None]:
def print_topics(model, count_vectorizer, n_top_words):
    """Print the `n_top_words` highest-weighted vocabulary terms of every topic.

    Args:
        model: fitted topic model exposing `components_`, one weight row per
            topic.
        count_vectorizer: fitted vectorizer whose feature names index the
            columns of `model.components_`. It is now actually used; the
            function previously ignored it and read the global `tf_vectorizer`.
        n_top_words: number of terms printed per topic.
    """
    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(count_vectorizer, "get_feature_names_out"):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort is ascending, so walk the last n_top_words indices backwards.
        top_term_ids = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join(words[i] for i in top_term_ids))


# Print the topics found by the LDA model. The vectorizer is passed in; the
# call previously passed the headline column in its place.
print("Topics found via LDA:")
print_topics(lda_model, tf_vectorizer, 15)

In [None]:
# embeddings = np.zeros((7064, 300))
# print(embeddings.shape)
# not_found = set([])
# for i, headline in tqdm(enumerate(df_rel['Headline'].values), total=len(df_rel['Headline'].values)):
#     tokens = word_tokenize(headline)
#     # Doc vec is average of summed word vectors
#     doc_vec = np.zeros(300)
#     n = len(tokens)
#     for token in tokens or token in sw:
#         if token in punct:
#             continue
#         try: 
#             vec = wv[token]
#             doc_vec += vec                
#         except KeyError: 
#             not_found.add(token)
        
#     doc_vec /= n
#     embeddings[i] = np.array(doc_vec)
#     print(embeddings[i])
#     break
# print(f'{len(not_found)} Tokens not found: {not_found}')
        
        
### Tokenization and lemmatization    
#     new_headline = []
#     for token in tokens:
#         token = token.lower()
#         if token not in punct and token not in sw:
#             new_headline.append(stemmer.stem(token))
#     tokenized_headlines.append(new_headline)
# 'th' means tokenized headlines
# print(embeddings.shape)
# df_rel['embeddings'] = embeddings
# display(df_rel[['Headline', 'embeddings']].sample(n=5))
# from sklearn.cluster import KMeans
# print(df_rel['embeddings'].values.shape)
# vector = np.vectorize(np.float64)
# X = df_rel['embeddings'].values
# for i, x in enumerate(X):
#     X[i] = vector(x)
# # print(df_rel['embeddings'].values.reshape(7064,300).shape)
# print(embeddings.shape)
# model = KMeans(n_clusters=100, verbose=10)
# print(embeddings.shape)
# output = model.fit(embeddings)

# Error Analysis

In [None]:
# Per-pair correctness flag for each model: prediction equals the gold stance.
gold_stance = df_rel['Stance']
df_rel['Correct Albert'] = gold_stance.eq(df_rel['Predictions'])
df_rel['Correct Base'] = gold_stance.eq(df_rel['Baseline'])


In [None]:
# Spot-check five random pairs: gold label, both predictions, both flags.
review_columns = ['Stance', 'Predictions', 'Baseline', 'Correct Albert', 'Correct Base']
display(df_rel[review_columns].sample(n=5))

In [None]:
# Correct predictions per headline for both models, worst ALBERT headlines first.
# Only the two boolean flag columns are summed: a plain .sum() over the whole
# frame also "sums" every other column, and recent pandas no longer skips
# non-numeric columns there.
# NOTE(review): 'Head ID' is not produced by any loader visible in this
# notebook — confirm where it is added to df / df_rel.
grouped = (
    df_rel.groupby(['Headline', 'Head ID'])[['Correct Albert', 'Correct Base']]
    .sum()
    .sort_values('Correct Albert', ascending=True)
)

# Number of related pairs per headline, counted once instead of re-filtering
# df_rel for every group.
rows_per_headline = df_rel['Headline'].value_counts()
counts = rows_per_headline.reindex(grouped.index.get_level_values('Headline')).to_numpy()
grouped['Total'] = counts

# Fraction of a headline's pairs that each model got right.
grouped['Prop Albert'] = grouped['Correct Albert'] / grouped['Total']
grouped['Prop Base'] = grouped['Correct Base'] / grouped['Total']
display(grouped)
print('Mean proportion albert:', grouped['Prop Albert'].mean())
print('Mean proportion baseline:', grouped['Prop Base'].mean())

In [None]:
# Headlines where ALBERT got no pair right vs. more than half right.
failed = grouped.loc[grouped['Correct Albert'] == 0]
success = grouped.loc[grouped['Prop Albert'] > 0.5]

# The first index level holds the headline text.
failed_headlines = failed.index.get_level_values(0).tolist()
success_headlines = success.index.get_level_values(0).tolist()
print(success_headlines, len(success_headlines))

In [None]:
# Pair every failed headline with its number of related pairs.
# Rebinds failed_headlines from a list of strings to a list of (headline, count).
failed_headlines = [
    (headline, int((df_rel['Headline'] == headline).sum()))
    for headline in failed_headlines
]
print(failed_headlines)

In [None]:
import matplotlib.pyplot as plt

# Histogram of pair counts for the failed headlines; also saved as a PDF.
failed_sizes = [size for _, size in failed_headlines]
plt.hist(failed_sizes, bins=30, range=[0, 60])
plt.grid()
plt.savefig('hist_failed.pdf')
plt.show()


In [None]:
# (headline, number of related pairs) for every row of df_rel, so a headline
# appears once per pair. The sizes are counted in one pass; the previous
# version re-filtered the whole frame once per row.
pairs_per_headline = df_rel['Headline'].value_counts().to_dict()
all_headlines = [
    (headline, int(pairs_per_headline.get(headline, 0)))
    for headline in df_rel['Headline'].values
]

In [None]:
# Histogram of pair counts over all related rows; also saved as a PDF.
all_sizes = [size for _, size in all_headlines]
plt.hist(all_sizes, bins=30, range=[0, 60])
plt.grid()
plt.savefig('hist_all.pdf')
plt.show()


In [None]:
# Pair every successful headline with its number of related pairs, then plot
# the distribution of those counts; also saved as a PDF.
# Rebinds success_headlines from a list of strings to a list of (headline, count).
success_headlines = [
    (headline, df_rel.loc[df_rel['Headline'] == headline].shape[0])
    for headline in success_headlines
]
success_sizes = [size for _, size in success_headlines]
plt.hist(success_sizes, bins=30, range=[0, 60])
plt.grid()
plt.savefig('hist_succ.pdf')
plt.show()
