In [1]:
import operator
import gb.hypergraph.symbol as sym
import gb.hypergraph.edge as ed
import gb.nlp.parser as par
import gb.tools.json as json_tools
from gb.clusters.meronomy import Meronomy

In [2]:
MAX_PROB = -12

In [3]:
parser = par.Parser()

In [5]:
# read data
edge_data = json_tools.read('../all.json')

# build full edges list
full_edges = []
for it in edge_data:
    full_edges.append(ed.without_namespaces(ed.str2edge(it['edge'])))

In [6]:
# build meronomy
mer = Meronomy(parser, full_edges)
mer.normalize_graph()

# generate synonyms
mer.generate_synonyms()

In [7]:
def rel_contains(full_edge, term):
    if sym.is_edge(full_edge) and len(full_edge) > 2 and sym.is_edge(full_edge[2]):
        rel = full_edge[0]
        if sym.is_edge(rel):
            return term in rel
        else:
            return rel == term
    return False


say_edges = []
for full_edge in full_edges:
    if rel_contains(full_edge, 'says'):
        say_edges.append(full_edge)

In [8]:
def edge2str(edge):
    s = ed.edge2str(edge, namespaces=False)
    if sym.is_edge(edge):
        return s

    if s[0] == '+':
        s = s[1:]

    if len(s) == 0:
        return None

    if not s[0].isalnum():
        return None

    word = parser.make_word(s)
    if word.prob < MAX_PROB:
        return s

    return None

def edge2syn(edge):
    atom = edge2str(edge)
    if atom:
        syn_id = mer.syn_id(atom)
        if syn_id:
            return syn_id
    return None

In [9]:
sayers = {}
sayers_and_claims = {}
for edge in say_edges:
    sayer = edge2syn(edge[1])
    if sayer not in sayers_and_claims:
        sayers[sayer] = 0
        sayers_and_claims[sayer] = []
    sayers[sayer] += 1
    sayers_and_claims[sayer].append(edge[2])
        
sorted_sayers = sorted(sayers.items(), key=operator.itemgetter(1), reverse=True)
for t in sorted_sayers[:20]:
    syn_id = t[0]
    if syn_id:
        print('%s %s' % (mer.synonym_label(syn_id), t[1]))

{(+ donald trump), trump, donald} 358
{(+ bernie sanders), bernie, sanders} 93
{hillary, (+ hillary clinton), clinton} 88
{obama} 75
{paul, ryan, (+ paul ryan)} 21
{(+ mike pence), pence, mike} 17
{(+ john kasich), john, kasich} 15
{(+ gary johnson), (+libertarian (+ gary johnson)), johnson} 14
{(+the latest)} 14
{(+ white house)} 13
{(+the fbi), fbi} 11
{rubio, marco, (+ marco rubio)} 11
{(+ chris christie), chris, christie} 9
{(+ bill clinton)} 9
{(+ trump campaign)} 9
{mitch, (+ mitch mcconnell), mcconnell} 9
{giuliani, (+ rudy giuliani), rudy} 8
{(+ newt gingrich), gingrich, newt} 8
{warren, (+ elizabeth warren), elizabeth} 7


In [10]:
concepts_by_sayer = {}


def add_concepts(targ, src):
    for key in src:
        if key in targ:
            targ[key] += src[key]
        else:
            targ[key] = src[key]


def concepts_in_claim(claim, concept_map=None):
    if not concept_map:
        concept_map = {}
    syn_id = edge2syn(claim)
    if syn_id:
        if syn_id not in concept_map:
            concept_map[syn_id] = 0
        concept_map[syn_id] += 1
        
        if sym.is_edge(claim):
            for item in claim:
                concepts_in_claim(item, concept_map)
    return concept_map


def get_concepts_by_sayer(sayer, that_include=None):
    concept_map = {}
    for claim in sayers_and_claims[sayer]:
        claim_concepts = concepts_in_claim(claim)
        if not that_include:
            add_concepts(concept_map, claim_concepts)
        elif that_include in claim_concepts.keys():
            del claim_concepts[that_include]
            add_concepts(concept_map, claim_concepts)
    return concept_map


for sayer in sayers_and_claims:
    concepts_by_sayer[sayer] = get_concepts_by_sayer(sayer)

In [108]:
for t in sorted_sayers[:20]:
    syn_id = t[0]
    if syn_id:
        print('%s %s %s %s' % (syn_id, mer.synonym_label(syn_id), t[1], len(concepts_by_sayer[syn_id])))

2 {(+ donald trump), trump, donald} 358 1480
8 {sanders, (+ bernie sanders), bernie} 93 369
20 {hillary, clinton, (+ hillary clinton)} 88 338
258779 {obama} 75 302
765 {ryan, paul, (+ paul ryan)} 21 95
453 {(+ mike pence), mike, pence} 17 70
51 {john, kasich, (+ john kasich)} 15 62
1 {johnson, (+ gary johnson), (+libertarian (+ gary johnson))} 14 44
265028 {(+the latest)} 14 58
259384 {(+ white house)} 13 55
152 {fbi, (+the fbi)} 11 39
504 {(+ marco rubio), rubio, marco} 11 39
724 {(+ chris christie), christie, chris} 9 28
265755 {(+ bill clinton)} 9 40
260928 {(+ trump campaign)} 9 38
843 {(+ mitch mcconnell), mcconnell, mitch} 9 39
811 {rudy, giuliani, (+ rudy giuliani)} 8 33
234 {gingrich, newt, (+ newt gingrich)} 8 38
209 {elizabeth, warren, (+ elizabeth warren)} 7 36


In [11]:
concepts = concepts_by_sayer[20]

sorted_concepts = sorted(concepts.items(), key=operator.itemgetter(1), reverse=True)
for t in sorted_concepts[:20]:
    syn_id = t[0]
    if syn_id:
        print('%s %s' % (mer.synonym_label(syn_id), t[1]))

{(+ donald trump), trump, donald} 28
{(+ bernie sanders), bernie, sanders} 25
{(+under (+federal (+for investigation (+her (+of use (+a (+private (+ email server)))))))), (+ email server), (+of use (+a (+private (+ email server)))), (+over (+her (+of use (+a (+private (+ email server)))))), (+federal (+for investigation (+her (+of use (+a (+private (+ email server))))))), (+private (+ email server)), (+for investigation (+her (+of use (+a (+private (+ email server)))))), (+her (+of use (+a (+private (+ email server))))), (+a (+private (+ email server)))} 6
{(+' choices), (+hard (+' choices)), (+about (+hard (+' choices))), (+but it (+about (+hard (+' choices))))} 4
{homework’, (+ his homework’), (+ bank regulation), (+on (+ his homework’) (+ bank regulation))} 4
{(+and paranoia prejudice), prejudice, (+on (+and paranoia prejudice)), paranoia} 4
{(+more (+than (+$ 200,000))), (+$ 200,000), (+just (+more (+than (+$ 200,000)))), (+in (+just (+more (+than (+$ 200,000)))) income), 200,000, 

In [12]:
# common concepts

concepts1 = set(concepts_by_sayer[2].keys())
concepts2 = set(concepts_by_sayer[258779].keys())

common = concepts1.intersection(concepts2)
for concept in common:
    print(mer.synonym_label(concept))

KeyError: 258779

In [13]:
concepts = get_concepts_by_sayer(2, that_include=8)
for concept in concepts:
    print('%s %s' % (mer.synonym_label(concept), concepts[concept]))

{(+and (+ bernie sanders) (+a communist))} 1
{(+a communist)} 1
{(+' sanders)} 2
{(will_debate he (+ bernie sanders) (+for (+$ (+10 million))))} 1
{(will_debate in), will_debate} 1
{(+for (+$ (+10 million)))} 1
{(+$ (+10 million)), (+10 million), (+than (+$ (+10 million))), (+more (+than (+$ (+10 million))))} 2
{(would_kill (+ bernie sanders) golf (+with high taxes))} 1
{(thought would_kill), ((thought would_kill) economists obamacare jobs), would_kill} 1
{(+with high taxes)} 1
{('ll_order he supporters (+to_disrupt (+rallies (+' sanders))))} 1
{(+to_disrupt (+rallies (+' sanders)))} 1
{(+ pipeline drilling), (+to_disrupt (+ pipeline drilling)), to_disrupt} 1
{(+rallies (+' sanders))} 1
{rallies} 1
{(+clinton says not_qualified’ because sanders so)} 1
{hillary, (+ hillary clinton), clinton} 1
{not_qualified’} 1
{(says sanders (+no debate) (+ interested networks))} 1
{(+no debate)} 1
{(+ interested networks)} 1
{(sold (+ bernie sanders) (+ his soul) (+to (+the devil’)))} 1
{(+ his soul)