In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import pandas as pd
import numpy as np


import torch
from datasets import load_dataset, Dataset, load_metric
from collections import Counter
from sklearn.metrics import f1_score, recall_score, precision_score
from utils import numerical_df
import re

import matplotlib
import matplotlib.pyplot as plt

from nltk.corpus import stopwords # Import the stop word list
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import itertools


from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS



### Hyperparameters

In [None]:

# Load the annotated tweets and turn the averaged annotation scores into hard labels.
df_path = './annotated-dataset.csv'
df_org = pd.read_csv(df_path)

# Keep only rows with a non-empty, non-missing tweet. The explicit copy makes
# `df` an independent frame, so the label assignments below never write into a
# slice of `df_org` (which is what triggers pandas' SettingWithCopyWarning).
df = df_org[(df_org.tweet != '') & df_org.tweet.notnull()].copy()

# Binarise each label score at 0.5. Both masks are computed before writing so
# the two assignments cannot influence each other; missing scores match
# neither mask and are left untouched.
for label_column in ('argumentative', 'claim', 'evidence'):
    is_positive = df[label_column] >= 0.5
    is_negative = df[label_column] < 0.5
    df.loc[is_positive, label_column] = 1
    df.loc[is_negative, label_column] = 0

# Stance keeps three states: -1 (negative score), 0 (unchanged) and 1 (positive score).
df.loc[df.procon < 0, 'procon'] = -1
df.loc[df.procon > 0, 'procon'] = 1

num = len(df)

In [None]:
def pre(string, lang = 'en'):
    """Reduce a text to a space-joined string of stemmed, non-stop-word tokens.

    The text is lower-cased, every character that is not a letter is replaced
    by a space, the result is tokenised, stop words are dropped and the
    remaining tokens are stemmed with the Snowball stemmer.

    Args:
        string: The raw text.
        lang: 'en' selects English resources; any other value selects Danish.

    Returns:
        The stemmed tokens joined by single spaces ('' if nothing survives).
    """
    is_english = lang == 'en'
    language = 'english' if is_english else 'danish'
    stemmer = SnowballStemmer(language)
    stops = set(stopwords.words(language))

    # Danish treats ae-ligature, o-slash and a-ring as ordinary letters. With an
    # ASCII-only filter they would be replaced by spaces, splitting words into
    # fragments that match neither the stop-word list nor the stemmer rules.
    letters = 'a-zA-Z' if is_english else 'a-zA-Z\u00e6\u00f8\u00e5'
    cleaned = re.sub(f'[^{letters}]', ' ', string.lower().strip())

    return ' '.join(stemmer.stem(w) for w in word_tokenize(cleaned) if w not in stops)

def mean_topic_tweet_over_lap(df):
    """Mean Jaccard similarity between each tweet's vocabulary and its topic's vocabulary.

    Both texts are normalised with `pre` and compared as sets of tokens.

    Args:
        df: Frame with `tweet` and `topic` text columns.

    Returns:
        The mean of |tweet ∩ topic| / |tweet ∪ topic| over all rows, as a
        fraction between 0 and 1 (not a percentage). NaN for an empty frame.
    """
    tweet_vocab = [set(pre(tweet).split()) for tweet in df.tweet]
    topic_vocab = [set(pre(topic).split()) for topic in df.topic]

    scores = [
        # A row where both vocabularies are empty shares nothing; scoring it 0
        # avoids the 0/0 division.
        len(tw & tp) / len(tw | tp) if (tw or tp) else 0.0
        for tw, tp in zip(tweet_vocab, topic_vocab)
    ]
    return np.mean(scores) if scores else np.nan

def tweet_overlap(df):
    """Mean pairwise Jaccard similarity between the vocabularies of distinct tweets.

    Duplicate tweet texts are dropped first; every remaining tweet is
    normalised with `pre` and compared, as a set of tokens, with every other.

    Args:
        df: Frame with a `tweet` text column.

    Returns:
        The mean of |a ∩ b| / |a ∪ b| over all unordered tweet pairs, as a
        fraction between 0 and 1 (not a percentage). NaN when there are fewer
        than two distinct tweets.
    """
    vocabularies = [set(pre(tweet).split()) for tweet in df.tweet.drop_duplicates()]

    scores = [
        # Two tweets that both normalise to nothing share no vocabulary;
        # scoring the pair 0 avoids the 0/0 division.
        len(a & b) / len(a | b) if (a or b) else 0.0
        for a, b in itertools.combinations(vocabularies, 2)
    ]
    return np.mean(scores) if scores else np.nan

def generate_word_cloud(df, topic = None, label = None):
    """Show a word cloud built from the distinct tweets in `df`.

    The function does not filter `df`; `topic` and `label` are only used to
    build the figure title, so callers pass an already filtered frame.

    Args:
        df: Frame with a `tweet` text column.
        topic: Optional topic name shown in the title.
        label: Optional label name shown in the title.
    """
    # Deduplicate on the tweet text itself: deduplicating whole rows keeps a
    # tweet once per differing row (e.g. once per topic) and inflates its words.
    unique_tweets = df.tweet.drop_duplicates()
    text = " ".join(tweet.replace('<MENTION>', '') for tweet in unique_tweets)
    if not text.strip():
        # WordCloud.generate raises ValueError when it is given no words.
        return

    cloud_stopwords = set(STOPWORDS)
    wordcloud = WordCloud(stopwords=cloud_stopwords, background_color="white", width=1000, height=500).generate(text)

    plt.figure(figsize=(15,10))
    plt.imshow(wordcloud)

    # Only mention the qualifiers that were given, so the title has no dangling comma.
    qualifiers = []
    if label is not None:
        qualifiers.append(f'for label "{label}"')
    if topic is not None:
        qualifiers.append(f'for topic \n "{topic}"')
    title = 'Wordcloud of tweets'
    if qualifiers:
        title += ' ' + ', '.join(qualifiers)
    plt.title(title)

    plt.axis("off")
    plt.show()

def generate_word_hist(df, topic = None, label = None, top = 10):
    """Show a bar chart of the most frequent stemmed words in the distinct tweets of `df`.

    Tweets are deduplicated, the '<MENTION>' placeholder is removed and the
    text is normalised with `pre` before counting. The function does not
    filter `df`; `topic` and `label` are only used to build the title.

    Args:
        df: Frame with a `tweet` text column.
        topic: Optional topic name shown in the title.
        label: Optional label name shown in the title.
        top: Number of most frequent words to plot.
    """
    # split() without an argument yields no token for a tweet that normalises
    # to '', whereas split(' ') would count the empty string as a word.
    counts = Counter(
        word
        for tweet in df.tweet.drop_duplicates()
        for word in pre(tweet.replace('<MENTION>', '')).split()
    )
    if not counts:
        # Nothing to plot for an empty subset.
        return

    # Shares are relative to all words in the corpus, not just the plotted ones.
    total = sum(counts.values())
    ranked = counts.most_common(top)
    labels = [word for word, _ in ranked]
    values = [100 * count / total for _, count in ranked]
    indexes = np.arange(len(labels))

    plt.bar(indexes, values)

    # Bars are centred on `indexes`, so the tick labels go there as well.
    plt.xticks(indexes, labels, rotation=45)
    plt.xlabel(f'Top {top} words used in corpus')
    plt.ylabel('Proportion of corpus %')

    # Only mention the qualifiers that were given, separated by a comma.
    qualifiers = []
    if topic is not None:
        qualifiers.append(f'for topic \n "{topic}"')
    if label is not None:
        qualifiers.append(f'with label "{label}"')
    title = 'Distribution of words used in tweets'
    if qualifiers:
        title += ' ' + ', '.join(qualifiers)
    plt.title(title)
    plt.show()
    
    
def df_info(df, topic = None, gen_word_cloud = True, gen_hist = True):
    """Print corpus statistics for `df`, optionally plot it, and return the counts.

    Args:
        df: Frame with `tweet`, `topic`, `argumentative`, `claim`, `evidence`
            and `procon` columns.
        topic: Topic name forwarded to the plotting helpers for their titles.
        gen_word_cloud: Draw one word cloud per label subset when true.
        gen_hist: Draw word histograms for the full frame and each label
            subset when true.

    Returns:
        An 8-tuple: (documents, argumentative sum, rows with claim or evidence,
        claim sum, evidence sum, rows with claim and evidence, sum of positive
        procon values, rows with negative procon).
    """
    has_claim = df.claim > 0
    has_evidence = df.evidence > 0
    claim_rows = df[has_claim]
    evidence_rows = df[has_evidence]
    pro_rows = df[df.procon > 0]
    con_rows = df[df.procon < 0]

    # Every count is computed once here and reused for printing and returning.
    summary = (
        len(df),
        sum(df.argumentative),
        len(df[has_claim | has_evidence]),
        sum(df.claim),
        sum(df.evidence),
        len(df[has_claim & has_evidence]),
        sum(pro_rows.procon),
        len(con_rows),
    )
    n_docs, n_arg, n_adu, n_claim, n_evidence, n_both, n_pro, n_con = summary

    def mean_token_count(frame):
        # Average number of whitespace-separated tokens per tweet.
        return frame.tweet.map(str.split).map(len).mean()

    print('Total documents:', n_docs, 'of which is argumentative:', n_arg)
    print('ADUs', n_adu, 'claims:', n_claim, 'evidence:', n_evidence, 'claim with evidence', n_both)
    print('Support:', n_pro, 'Contest:', n_con)

    print('Mean tweet length:', mean_token_count(df))
    print('Mean claim length:', mean_token_count(claim_rows))
    print('Mean evidence length:', mean_token_count(evidence_rows))
    print()
    # The printed values are whatever the overlap helpers return.
    print('Mean topic tweet vocab share %:', mean_topic_tweet_over_lap(df))
    print('Mean claim, topic tweet vocab share %:', mean_topic_tweet_over_lap(claim_rows))
    print('Mean evidence, topic tweet vocab share %:', mean_topic_tweet_over_lap(evidence_rows))
    print()
    print('Mean claim tweet to tweet vocab overlap %:', tweet_overlap(claim_rows))
    print('Mean evidence tweet to tweet vocab overlap %:', tweet_overlap(evidence_rows))

    labelled_subsets = (
        ('argumentative', df[df.argumentative > 0]),
        ('claim', claim_rows),
        ('evidence', evidence_rows),
        ('pro', pro_rows),
        ('con', con_rows),
    )

    if gen_word_cloud:
        for subset_label, subset in labelled_subsets:
            generate_word_cloud(subset, topic, subset_label)

    if gen_hist:
        # The unfiltered frame first, then each labelled subset.
        generate_word_hist(df, topic)
        for subset_label, subset in labelled_subsets:
            generate_word_hist(subset, topic, subset_label)

    return summary

    

In [None]:
# One word cloud per topic, built from that topic's claim tweets.
# generate_word_cloud does not filter by itself (its topic/label arguments only
# feed the title), so the frame is restricted here; passing the full `df`
# would draw the same cloud for every topic.
for topic in df.topic.drop_duplicates():
    print(topic)
    topic_claims = df[(df.topic == topic) & (df.claim > 0)]
    generate_word_cloud(topic_claims, topic, 'claim')

In [None]:
# Heat map of how many `task`-labelled tweets each pair of topics shares.
topics = df.topic.drop_duplicates()
task = 'claim'

# Distinct tweets carrying the task label, per topic.
task_rows = df[df[task] > .5]
tweets_by_topic = {
    topic: set(task_rows.loc[task_rows.topic == topic, 'tweet'])
    for topic in topics
}

hex_map = []
for topicX in topics:
    inst = []
    for topicY in topics:
        if topicX == topicY:
            # A topic overlaps completely with itself. Testing the pair itself
            # (rather than "ratio == 0") keeps genuinely disjoint topic pairs at 0.
            inst.append(1.0)
            continue
        shared = tweets_by_topic[topicX] & tweets_by_topic[topicY]
        combined = tweets_by_topic[topicX] | tweets_by_topic[topicY]
        # Share of the pair's distinct tweets that occur under both topics;
        # 0 when neither topic has any labelled tweet.
        inst.append(np.round(len(shared) / len(combined), 2) if combined else 0.0)
    hex_map.append(inst)
hex_map = np.array(hex_map)

fig, ax = plt.subplots(figsize=(5,5))
im = ax.imshow(hex_map)

# Short display names, in the order the topics first appear in `df`.
labels = [
    'meat',
    'plant',
    'alternative',
    'vegan',
    'policy'
]
if len(labels) != len(topics):
    raise ValueError(f'{len(labels)} tick labels for {len(topics)} topics')

# Show all ticks and label them with the respective list entries
ax.set_xticks(np.arange(len(topics)), labels=labels)
ax.set_yticks(np.arange(len(topics)), labels=labels)

# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")

# Loop over data dimensions and create text annotations.
for i in range(len(labels)):
    for j in range(len(labels)):
        text = ax.text(j, i, hex_map[i, j],
                       ha="center", va="center", color="w")

fig.tight_layout()
plt.savefig(f'topic_overlap_{task}.pdf')
plt.show()

In [None]:
# Display the numeric topic-overlap matrix built in the heat-map cell.
hex_map

In [None]:
# Statistics for the whole corpus: histograms only, no word clouds.
df_info(df, topic=None, gen_word_cloud=False, gen_hist=True)

In [None]:
# Text-only statistics for each topic, in order of first appearance.
for topic in df.topic.drop_duplicates():
    print('Topic', topic)
    topic_rows = df[df.topic == topic]
    df_info(topic_rows, topic, gen_word_cloud=False, gen_hist=False)
    print()

In [None]:
# Backslash as a value: before Python 3.12 an f-string expression cannot
# contain a literal backslash, so LaTeX commands are spelled `{bc}name`.
bc = chr(92)


def br(x):
    """Wrap `x` in LaTeX braces."""
    return '{' + x + '}'


def r(x):
    """Round a statistic to two decimals for display."""
    return round(x, 2)


def row(topic, num_tweet, num_arg, num_adu, num_claim, num_evi, num_claim_evi, num_pro, num_con):
    """Format one table row (preceded by an \\hline) from a topic name and the df_info counts."""
    return f'''
        {bc}hline
        {topic} & ${r(num_tweet)}$ & ${r(num_arg)}$ & ${r(num_adu)}$ & ${r(num_claim)}$ & ${r(num_evi)}$ & ${r(num_claim_evi)}$ & ${r(num_pro)}$ & ${r(num_con)}$ {bc}{bc}
    '''


rows = [row('Full set', *df_info(df, None, False, False)), *[row(topic, *df_info(df[df.topic == topic], topic, False, False)) for topic in df.topic.drop_duplicates()]]

# Every row already begins and ends with a line break, so the rows are simply
# concatenated. Joining them with backslash + "n" would put a literal `\n`
# control sequence into the LaTeX source instead of a newline.
table_rows = ''.join(rows)

print(f'''
{bc}begin{br("table")}[H]
    {bc}centering
    {bc}begin{br("tabular")}{br("L|ccc|ccc|cc")}
        Topic & Tweets & Argumentative & ADUs & Claims & Evidence & Claims with evidence & Pro & Con  {bc}{bc}
        {table_rows}
    {bc}end{br("tabular")}
    {bc}caption{br("Overall statistics comparison for the different topics and the overall full set. ADUs is argument discourse units and is a union of claim or evidence and says how many argumentative tweets contains either evidence or claims. The probabilities have been rounded to the nearest integer for this comparison.")}
    {bc}label{br("table:overall_stats_of_data")}
{bc}end{br("table")}
''')

In [None]:
import ast
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})

# Sum the per-type evidence annotations over all tweets. Each non-missing
# `evidence_type` cell holds the text of a Python dict literal mapping an
# evidence type to a number; literal_eval parses it without executing code.
evi_dist = {}

for evi in df.evidence_type:
    if pd.isna(evi):
        continue
    for key, amount in ast.literal_eval(evi).items():
        evi_dist[key] = evi_dist.get(key, 0) + amount


labels, values = zip(*evi_dist.items())

# Express every evidence type as a percentage of all evidence annotations.
labels = np.array(labels)
values = np.array(values)/sum(values)*100

indexes = np.arange(len(labels))

plt.bar(indexes, values)

# Bars are centred on `indexes`; anchoring the rotated, right-aligned labels
# there makes each label end directly under its own bar.
plt.xticks(indexes, labels, rotation=45, ha="right", rotation_mode="anchor")
plt.xlabel('Evidence types used in corpus')
plt.ylabel('Proportion of evidence labels %')


plt.title('Evidence types in corpus')

plt.savefig('evidence_types.pdf')
plt.show()