# SOUNER

Perform named-entity recognition (NER) on a text (here a SOU, a Swedish Government Official Report) and create an actor-network of the entities.

### Requirements

* Install SweNER; the source is available here: https://spraakbanken.gu.se/eng/personal/dimitrios#research
* Computer connected to the Internet, for the gendercounter API. For offline usage and higher performance, install https://github.com/christopherkullenberg/gendercounter
* Credit: Kokkinakis D., Niemi J., Hardwick S., Lindén K. and Borin L. (2014). *HFST-SweNER. A New NER Resource for Swedish. Proceedings of the 9th edition of the Language Resources and Evaluation Conference (LREC)*. Reykjavik, Iceland ([pdf](http://www.lrec-conf.org/proceedings/lrec2014/pdf/391_Paper.pdf)).


In [42]:
from subprocess import Popen, PIPE, STDOUT
from bs4 import BeautifulSoup
import json
import requests
from gexf import *
import gendercounter
import uuid

'''
Send these two files through the monad() function at the bottom of this notebook.
  - Also, please change filename in the monad() function to prevent overwriting
  - Also, please change the first string passed to the function createactornetwork() to prevent node collision
'''
#SOU files:
#soufile = open('SOU 1933_22 - Förslag till lag om sterilisering av vissa sinnessjuka, sinnesslöa eller av annan rubbning av själsverksamheten lidande personer.txt', encoding='utf-8')
#soufile = open('SOU 1978_23 - Växtförädling.txt', encoding='utf-8')
#soufile = open('SOU 1989_73 - TV-politiken.txt', encoding='utf-8')

# Read the whole report into memory. The context manager closes the file
# handle as soon as the text has been read, instead of leaving it open for
# the lifetime of the notebook kernel.
with open('SOU 1979_69 - Nya vyer.txt', encoding='utf-8') as soufile:
    soulines = soufile.read()

def sendtoswener(text):
    '''
    Run the external 'hfst-swener' program on a text.

    Input: A string. It is encoded to bytes and written to the program's
           standard input.
    Output: Everything the program printed, decoded to a string. Standard
            error is redirected into standard output, so both streams end
            up in the returned text. Expected to be SweNER XML.
    '''
    tagger = Popen(['hfst-swener'], stdin=PIPE, stdout=PIPE, stderr=STDOUT)
    # communicate() feeds the input, waits for the process to finish and
    # returns (stdout, stderr); stderr is None here because it was merged.
    raw_output, _ = tagger.communicate(input=text.encode())
    return raw_output.decode()

def parseswenerxml(swenerxml):
    '''
    Collect the named entities from SweNER output.

    Input: Swener XML output (string).
    Output: Dictionary with the entity text as key and a tuple of the
            element's 'sbt' and 'type' attributes as value. If the same
            text occurs more than once, the last occurrence wins.
    Raises: KeyError if an <enamex> element lacks 'sbt' or 'type'.
    '''
    soup = BeautifulSoup(swenerxml, "lxml")  # Change 'lxml' if other parser.
    valuedict = {}
    # The tag is matched in lower case; presumably the HTML-style parser
    # lower-cases SweNER's tag and attribute names -- confirm if the parser
    # is changed.
    for entity in soup.find_all("enamex"):
        # get_text() yields a plain str holding all text inside the element.
        # Keeping only the last child (as before) could store a nested Tag
        # object as the key, and a NavigableString key keeps a reference to
        # the parse tree.
        name = entity.get_text()
        valuedict[name] = (entity['sbt'], entity['type'])
    return valuedict

In [43]:
def gendercounterAPI(name, timeout=None):
    '''
    Look up a name through the genuskollen.se web API.

    Input: A Swedish name.
           timeout: optional number of seconds passed on to requests.get();
           the default None keeps the previous behaviour of waiting
           indefinitely.
    Output: A json-style dictionary - {'name': 'Anna', 'gender': 'female'}
    '''
    # Pass the name through 'params' so that requests URL-encodes it; gluing
    # it onto the URL by hand breaks for names containing '&', '#' or '+'.
    webcontent = requests.get(
        'http://genuskollen.se/cgi-bin/api.py',
        params={'name': name},
        timeout=timeout,
    )
    return webcontent.json()

In [44]:
# Sanity check of the locally installed gendercounter module: prints "yes"
# when element [1][1] of the lookup result for "Adam" equals 1.
# createactornetwork() treats that same element as the "male" indicator.
if gendercounter.textinput("Adam")[1][1] == 1:
    print("yes")
#gendercounter.textinput("Eva")

yes


In [45]:
def createactornetwork(sourcetext, SwenerDict):
    '''
    Turn a dictionary of SweNER entities into an actor network linked to
    one source text.

    Input: Name of source text as string
           Dictionary string as key, tuple as value: {key: (value, value)}
           Example:
           {'Anders Thelin, Svensk Handel.': ('HUM', 'PRS'), 'Staten': ('WAA', 'WRK')}
    Output: Dictionary of tuples and dictionaries: {(x,y): {key:value}}
            x is the entity text. For entities whose first tuple value is
            "HUM", y is "female", "male" or "undetermined"; for all other
            entities y is the second tuple value. The inner dictionary is
            always {sourcetext: "source"}.
            Also prints descriptive statistics.
    '''
    gender_counts = {"male": 0, "female": 0, "undetermined": 0}
    # Only these three entity types are tallied; any other type is still
    # added to the network but not counted.
    type_counts = {"ORG": 0, "LOC": 0, "WRK": 0}
    actornetwork = {}
    for key, value in SwenerDict.items():
        if value[0] == "HUM":
            # Only the first space-separated word (taken to be the given
            # name) is looked up.
            name = key.split(' ')[0].capitalize()
            # Look the name up once and reuse the result for both checks.
            lookup = gendercounter.textinput(name)
            if lookup[0][1] == 1:
                gender = "female"
            elif lookup[1][1] == 1:
                gender = "male"
            else:
                gender = "undetermined"
            gender_counts[gender] += 1
            actornetwork[(key, gender)] = {sourcetext: "source"}
        else:
            entity_type = value[1]
            actornetwork[(key, entity_type)] = {sourcetext: "source"}
            if entity_type in type_counts:
                type_counts[entity_type] += 1
    print(f"Men: {gender_counts['male']}")
    print(f"Women: {gender_counts['female']}")
    print(f"Undetermined Human: {gender_counts['undetermined']}")
    print(f"ORGanizations: {type_counts['ORG']}")
    print(f"LOCations: {type_counts['LOC']}")
    print(f"WRK: {type_counts['WRK']}")
    return actornetwork

In [46]:

def monad(name, actornetwork, filename="1979_69.gexf"):
    # requires from gexf import *
    '''
    A monad is a "point of view on all the other entities taken severally and
    not as a totality" (Latour 2012: 598).
    Input: Name of network (passed as the first argument to Gexf()),
           Dictionary of tuples and dictionaries: {(x,y): {key:value}}
           Example: ("@BjorklundVictor", "person"): {"Finwire": "employer"}
           filename: path of the Gexf file to write. Defaults to the
           previously hard-coded "1979_69.gexf"; pass another name to avoid
           overwriting an earlier export.
    Output: Gexf network file with nodes, attributes and edges.
    '''
    gexf = Gexf(name, "The whole is always smaller than its parts")
    # NOTE(review): the graph is created as "undirected" although its label
    # reads "Directed network"; the label is left unchanged here.
    graph = gexf.addGraph("undirected", "static", "Directed network")
    attribute_node = graph.addNodeAttribute("Type", "default_value", "string")
    for key, value in actornetwork.items():
        n = graph.addNode(key[0], key[0])
        n.addAttribute(attribute_node, key[1])
        for k, v in value.items():
            m = graph.addNode(k, k)
            m.addAttribute(attribute_node, v)
            # Give every edge, including the first one, a random ID to
            # prevent collisions when merging. Previously the first edge
            # was written with the empty string as its ID.
            graph.addEdge(uuid.uuid4().hex, key[0], k)
    # The context manager closes the file once the graph has been written.
    with open(filename, "wb") as output_file:
        gexf.write(output_file)

# Run the whole pipeline on the loaded SOU text: tag it with SweNER, parse the
# XML into an entity dictionary, build the actor network under the source
# label "1979_69", and write the result out as a Gexf file.
monad("SOUTEST", createactornetwork("1979_69", parseswenerxml(sendtoswener(soulines))))

Men: 25
Women: 1
Undetermined Human: 34
ORGanizations: 73
LOCations: 30
WRK: 9
Directed network undirected static  
number of nodes : 176
number of edges : 175
