In [1]:
import sys, time, json, re
from anytree import Node, RenderTree
from anytree.exporter import JsonExporter
from anytree import Node, RenderTree

# Raise Python's recursion limit well above the default so that the recursive
# helpers in this notebook do not hit RecursionError on large inputs.
sys.setrecursionlimit(100000)

In [2]:

def group_arguments(tableau):
    """Merge continuation lines into the argument line they belong to.

    A line opens a new argument when it contains a stance marker ("Pro:" or
    "Con:").  Every other line is appended, separated by a single space, to
    the argument currently being built.  The first line always opens the
    first argument, whether or not it carries a stance marker.

    Args:
        tableau: sequence of text lines (str).

    Returns:
        list of str, one entry per argument, in input order.  An empty input
        yields an empty list.
    """
    if not tableau:
        return []

    stance_marker = re.compile(r"(Con|Pro)(?::)")

    # Single pass instead of one recursive call per argument: no dependence on
    # the interpreter's recursion limit and no repeated list slicing/copying.
    groups = [tableau[0]]
    for line in tableau[1:]:
        if stance_marker.search(line) is None:
            # Continuation of the argument currently being assembled.
            groups[-1] = groups[-1] + " " + line
        else:
            groups.append(line)

    return groups


In [3]:
def rawKialo2Json(input_file):
    """Parse a raw Kialo debate export into anytree nodes keyed by tree position.

    The file is read up to (not including) the first line that starts with
    "Sources:"; blank lines are dropped.  The first four remaining lines are
    treated as header and the second of them is used as the debate subject.
    The remaining lines are merged into one string per argument by
    ``group_arguments``; each argument is then parsed for its tree position
    (e.g. "1.2.3."), its stance ("Pro" or "Con") and its text.

    Args:
        input_file: path of the text file to parse.

    Returns:
        dict mapping each tree position string to its ``Node``.  The key
        "1." is a synthetic root created with ``node_id=-1``; every other
        node carries ``tree``, ``level``, ``stance``, ``toneInput``,
        ``subject`` and ``node_id`` attributes and is attached to the node of
        its parent position.

    Raises:
        IndexError: if fewer than four non-empty lines are available for the
            header.
        AttributeError: if an argument has no leading tree position, no
            stance marker, or no text after the marker.
        KeyError: if the parent position of an argument has not been seen
            before the argument itself.
    """
    # Compiled once instead of once per argument line.
    tree_re = re.compile(r"^(\d+\.)+")                # leading position, e.g. "1.2.3."
    stance_re = re.compile(r"(Con|Pro)(?::)")         # stance marker
    content_re = re.compile(r"((Con|Pro)(?::\s))(.*)")  # text after the marker
    segment_re = re.compile(r"\d+(?=\.)")             # one number of the position

    # Collect the stripped, non-empty lines that precede the sources section.
    lines = []
    with open(input_file, 'r') as fi:
        for raw_line in fi:
            if raw_line.startswith("Sources:"):
                break
            stripped = raw_line.strip()
            if stripped:
                lines.append(stripped)

    # The first four lines are header; only the second one (subject) is used.
    header = [lines.pop(0) for _ in range(4)]
    subject = header[1]

    lines = group_arguments(lines)

    # One dictionary per parsed argument, in file order.
    result = []
    for counter, line in enumerate(lines, start=1):
        tree = tree_re.search(line)
        stance = stance_re.search(line)
        content = content_re.search(line)

        # Depth below the synthetic root: "1.1." -> 1, "1.1.1." -> 2, ...
        level = len(segment_re.findall(tree.group())) - 1

        result.append({
            "Tree": tree.group(),
            "Level": level,
            "Stance": stance.group(1),
            "ToneInput": content.group(3),
            "node_id": subject.replace(" ", "_") + "_" + str(counter)
        })

    resultAsDict = {entry["Tree"]: entry for entry in result}

    id2Node = {}
    for idNode in ['1.'] + [entry["Tree"] for entry in result]:
        if idNode == '1.':
            # Synthetic root: it has no stance/text attributes of its own.
            id2Node[idNode] = Node(idNode, node_id=-1)
            continue

        # Parent position = own position without its last segment
        # ("1.2.3." -> "1.2.").
        parentId = idNode[:idNode[:-1].rfind(".") + 1]
        entry = resultAsDict[idNode]
        id2Node[idNode] = Node(
            idNode,
            parent=id2Node[parentId],
            tree=entry["Tree"],
            level=entry["Level"],
            stance=entry["Stance"],
            toneInput=entry["ToneInput"],
            subject=subject,
            node_id=entry["node_id"],
        )

    return id2Node

In [4]:
def argumentTree2argumentPairTree(node, domains):
    """Flatten an argument tree into a list of (parent, child) argument pairs.

    The tree is walked depth-first.  For every child of ``node`` a pair
    dictionary is emitted, immediately followed by the pairs of that child's
    own subtree.  Children of the node named "1." produce no pair of their
    own, but their subtrees are still visited.

    Args:
        node: tree node exposing ``name``, ``children`` and ``toneInput``;
            its children additionally expose ``subject``, ``level`` and
            ``stance``.
        domains: value stored unchanged under the "domain" key of every pair.

    Returns:
        list of dicts with the keys "topArgument", "subArgument", "subject",
        "subArgumentLevel", "domain", "isAttack" and "isSupport".  A child
        whose stance is "Con" is an attack; any other stance is a support.
    """
    collected = []

    for sub in node.children:
        if node.name != "1.":
            record = {
                "topArgument": node.toneInput,
                "subArgument": sub.toneInput,
                "subject": sub.subject,
                "subArgumentLevel": sub.level,
                "domain": domains,
            }
            attacks = sub.stance == "Con"
            record["isAttack"] = attacks
            record["isSupport"] = not attacks
            collected.append(record)

        # Descend right after emitting the pair to keep depth-first order.
        collected.extend(argumentTree2argumentPairTree(sub, domains))

    return collected

In [5]:
import pandas as pd

# Index of the debates to process; the `tags` and `kialoUrlId` columns are
# read from each row below.
kialoUrlIds = pd.read_csv("../../data/kialo/kialo-url-ids.csv", index_col=0)

# Argument pairs collected from every debate that could be parsed.
pairs = []
# (row index, error description) for every debate that was skipped, so that
# failures stay inspectable instead of disappearing silently.
skippedDebates = []

for i, x in kialoUrlIds.iterrows():
    try:
        d = x.tags
        kialoUrlId = x.kialoUrlId

        t = rawKialo2Json("../../data/kialo/debates/en/" + kialoUrlId + ".txt")
        # extend() appends in place; `pairs = pairs + ...` re-copied the whole
        # accumulated list on every iteration.
        pairs.extend(argumentTree2argumentPairTree(t['1.'], d))
    except Exception as err:
        # Best effort: a debate that is missing or cannot be parsed is skipped.
        # Only Exception is caught so that KeyboardInterrupt still stops the loop.
        skippedDebates.append((i, repr(err)))
    

In [6]:
# Inspect the collected pairs (the whole list is rendered as the cell output).
pairs

[{'topArgument': 'Football is very popular and a critical part of many communities.',
  'subArgument': 'The institution of American football alienates many communities.',
  'subject': 'Should American Football Be Banned?',
  'subArgumentLevel': 2,
  'domain': "['Sports', 'USA', 'Football', 'American_Football', 'Entertainment']",
  'isAttack': True,
  'isSupport': False},
 {'topArgument': 'The institution of American football alienates many communities.',
  'subArgument': 'The institution of American football promotes problematic gender roles, alienating people who do not conform to them.',
  'subject': 'Should American Football Be Banned?',
  'subArgumentLevel': 3,
  'domain': "['Sports', 'USA', 'Football', 'American_Football', 'Entertainment']",
  'isAttack': False,
  'isSupport': True},
 {'topArgument': 'The institution of American football promotes problematic gender roles, alienating people who do not conform to them.',
  'subArgument': 'American football promotes a form of masculi

In [7]:
# Show the first pair as a sample of the record layout.
pairs[0]

{'topArgument': 'Football is very popular and a critical part of many communities.',
 'subArgument': 'The institution of American football alienates many communities.',
 'subject': 'Should American Football Be Banned?',
 'subArgumentLevel': 2,
 'domain': "['Sports', 'USA', 'Football', 'American_Football', 'Entertainment']",
 'isAttack': True,
 'isSupport': False}

In [8]:
# Turn the pair records into parallel columns: the sub-argument is the source
# of the relation and the argument it responds to is the target.
topic = [pair["domain"] for pair in pairs]
argSrc = [pair["subArgument"] for pair in pairs]
argTrg = [pair["topArgument"] for pair in pairs]
relations = ["attack" if pair["isAttack"] else "support" for pair in pairs]
# Constant provenance label, one per pair (not part of the frame below).
datasetSource = ["kialo"] * len(pairs)

columns = {
    "topic": topic,
    "argSrc": argSrc,
    "argTrg": argTrg,
    "relation": relations,
}
d = pd.DataFrame(columns)

d

Unnamed: 0,topic,argSrc,argTrg,relation
0,"['Sports', 'USA', 'Football', 'American_Footba...",The institution of American football alienates...,Football is very popular and a critical part o...,attack
1,"['Sports', 'USA', 'Football', 'American_Footba...",The institution of American football promotes ...,The institution of American football alienates...,support
2,"['Sports', 'USA', 'Football', 'American_Footba...",American football promotes a form of masculini...,The institution of American football promotes ...,support
3,"['Sports', 'USA', 'Football', 'American_Footba...",Jim Harbaugh [2] and Rush Limbaugh [3] argue t...,American football promotes a form of masculini...,attack
4,"['Sports', 'USA', 'Football', 'American_Footba...",A player formerly coached by Jim Harbauge [4] ...,Jim Harbaugh [2] and Rush Limbaugh [3] argue t...,attack
...,...,...,...,...
53517,"['Environment', 'Health', 'Policy', 'Government']","In 2020, 72.7% of the US population on communi...","Despite the available alternatives, fluoridati...",support
53518,"['Environment', 'Health', 'Policy', 'Government']",Hong Kong and Singapore both have 100% of thei...,"Despite the available alternatives, fluoridati...",support
53519,"['Environment', 'Health', 'Policy', 'Government']",Fluoridated water acts as a systemic preventiv...,Alternatives to fluoridated water and promotin...,attack
53520,"['Environment', 'Health', 'Policy', 'Government']",Fluoridated water is considered as pharmacolog...,Fluoridated water acts as a systemic preventiv...,attack


In [9]:
# Drop every pair whose source argument contains a cross-reference such as
# "See 1.2.3".  The optional ".N" parts use a non-capturing group: with a
# capturing group pandas warns that the pattern has match groups, although
# str.contains never returns them.
d_filtered = d[~d["argSrc"].str.contains(r'See \d+(?:\.\d+)*')]
print(d_filtered.shape)
# The DataFrame index is written as the first, unnamed CSV column.
d_filtered.to_csv("../../data/kialoPairsEnglist.csv")

  d_filtered = d[~d["argSrc"].str.contains(r'See \d+(\.\d+)*')]


(51249, 4)
