In [None]:
import json
import pandas as pd
import re

In [None]:
class BiographyExtracter:
    """Build graph nodes and edges from crawled congress biographies.

    The input JSON is loaded into ``self.data`` and iterated as a sequence of
    mappings whose first key is a congress id and whose first value is that
    person's biography dict. For every entry, ``extract_data`` records:

    * a name node ``{congress_id: biography['nome_completo']}``;
    * one deduplicated "other" node plus one edge for each of ``'partido'``,
      ``'profissao'`` and ``'data_de_nascimento'``;
    * one node/edge per item of
      ``biography['comissoes_parlamentares_a_que_pertence']`` when that key
      is present.

    Results accumulate in ``name_nodes``, ``other_nodes`` and ``edges`` and
    are written as JSON to ``name_nodes_path``, ``other_nodes_path`` and
    ``edges_path``.
    """

    # Biography key that holds the list of parliamentary commissions.
    _COMMISSIONS_KEY = 'comissoes_parlamentares_a_que_pertence'
    # Role stored on a commission edge when the item has no "[role]" part.
    _DEFAULT_ROLE = 'membro'

    def __init__(self, json_path="./data/raw/biography.json"):
        """Load the raw biographies and set up empty result containers.

        Args:
            json_path: Location of the biography JSON file. Defaults to the
                location that was previously hard-coded.

        Raises:
            OSError: If ``json_path`` cannot be opened.
            json.JSONDecodeError: If the file does not contain valid JSON.
        """
        self.json_path = json_path
        self.name_nodes = []
        self.name_nodes_path = "./data/processed/name_nodes.json"
        self.other_nodes = []
        self.other_nodes_path = "./data/processed/other_nodes.json"
        self.edges = []
        self.edges_path = "./data/processed/edges.json"
        # Per-entry cursor: set by extract_data, read by the helper methods.
        self._congress_id = None
        self._biography = None
        # Read as UTF-8 explicitly instead of relying on the platform default
        # encoding (the biographies contain non-ASCII text).
        with open(self.json_path, encoding="utf-8") as json_file:
            self.data = json.load(json_file)

    def _create_edge(self, src, dst, edge_type='', **kwargs):
        """Append one edge dict to ``self.edges``.

        Args:
            src: Source node identifier.
            dst: Destination node identifier.
            edge_type: Label stored under ``'edge_type'``.
            **kwargs: Extra key/value pairs merged into the edge dict; a key
                that collides with ``src``/``dst``/``edge_type`` overrides it.
        """
        self.edges.append({'src': src, 'dst': dst, 'edge_type': edge_type, **kwargs})

    def _append_other_node_and_edge(self, tag, v, **kwargs):
        """Register node ``{"type": tag, "value": v}`` and link it to the current person.

        The node is appended to ``self.other_nodes`` only if an equal dict is
        not already there; the edge from ``self._congress_id`` to ``v`` is
        always created.

        Args:
            tag: Node type, also used as the edge type.
            v: Node value, also used as the edge destination.
            **kwargs: Extra attributes forwarded to the edge.
        """
        entry = {"type": tag, "value": v}
        # Linear scan (not a set) so values need not be hashable.
        if entry not in self.other_nodes:
            self.other_nodes.append(entry)
        self._create_edge(self._congress_id, v, tag, **kwargs)

    def _create_other_node_and_edge(self, tag):
        """Create the node/edge for ``self._biography[tag]``.

        Args:
            tag: Biography key to read; also the resulting node/edge type.

        Raises:
            KeyError: If the current biography has no ``tag`` key.
        """
        value = self._biography[tag]
        if tag == 'data_de_nascimento':
            # Keep the first three characters of the text before the first
            # '-'. Presumably the date is 'YYYY-MM-DD', so '1975-03-02'
            # becomes '197', i.e. the birth decade -- TODO confirm format.
            value = value.split('-')[0][:3]
        self._append_other_node_and_edge(tag, value)

    def _create_comission_nodes_edges(self, tag):
        """Create one node/edge per commission of the current biography.

        Each item is split at the first ``'['``: the stripped text before it
        is the node value. The text between the first ``'['`` and the last
        ``']'`` is stored on the edge as ``edge_info``; items without
        brackets get ``'membro'``.

        Args:
            tag: Node/edge type to record. The commissions themselves are
                always read from ``'comissoes_parlamentares_a_que_pertence'``.

        Raises:
            KeyError: If the current biography has no commissions key.
        """
        for item in self._biography[self._COMMISSIONS_KEY]:
            node = item.split('[')[0].strip()
            match = re.search(r'.*?\[(.*)].*', item)
            edge_info = match.group(1) if match else self._DEFAULT_ROLE
            self._append_other_node_and_edge(tag, node, edge_info=edge_info)

    def _save(self):
        """Write ``name_nodes``, ``other_nodes`` and ``edges`` to their JSON paths."""
        outputs = (
            (self.name_nodes_path, self.name_nodes),
            (self.other_nodes_path, self.other_nodes),
            (self.edges_path, self.edges),
        )
        for path, payload in outputs:
            with open(path, 'w') as outfile:
                json.dump(payload, outfile)

    def extract_data(self):
        """Extract nodes and edges from every loaded entry, then save them.

        Results are appended to ``name_nodes``, ``other_nodes`` and
        ``edges``, so calling this twice on the same instance records every
        name node and edge twice.

        Raises:
            KeyError: If an entry lacks ``'nome_completo'``, ``'partido'``,
                ``'profissao'`` or ``'data_de_nascimento'``.
            OSError: If an output file cannot be written.
        """
        for entry in self.data:
            # Each entry is read as {congress_id: biography}.
            self._congress_id = list(entry.keys())[0]
            self._biography = list(entry.values())[0]
            # Create main/name nodes
            self.name_nodes.append({self._congress_id: self._biography['nome_completo']})
            # Create party nodes
            self._create_other_node_and_edge('partido')
            # Create profession nodes
            self._create_other_node_and_edge('profissao')
            # Create commission nodes; the key may be absent for people who
            # belong to no commission, so check instead of swallowing errors.
            if self._COMMISSIONS_KEY in self._biography:
                self._create_comission_nodes_edges(self._COMMISSIONS_KEY)
            # Create birth decade nodes
            self._create_other_node_and_edge('data_de_nascimento')
        # Persist once, after all entries are processed, rather than
        # rewriting all three files on every iteration.
        self._save()
            
        
        


In [208]:
# Build the extracter for '../crawler/biography.json'. The path is passed
# positionally, so BiographyExtracter.__init__ must accept a path argument.
a = BiographyExtracter('../crawler/biography.json')

In [209]:
# Populate a.name_nodes, a.other_nodes and a.edges and write them to the
# JSON paths configured on the instance. Results are appended, so re-running
# this cell on the same instance adds every edge a second time.
a.extract_data()


In [210]:
# Show the accumulated edges: one dict per edge with 'src', 'dst',
# 'edge_type', plus 'edge_info' on commission edges.
a.edges

[{'src': '6906', 'dst': 'PSD', 'edge_type': 'partido'},
 {'src': '6906', 'dst': 'Engenheira Civil', 'edge_type': 'profissao'},
 {'src': '6906',
  'dst': 'Comissão de Assuntos Europeus',
  'edge_type': 'comissoes_parlamentares_a_que_pertence',
  'edge_info': 'Suplente'},
 {'src': '6906',
  'dst': 'Comissão de Cultura e Comunicação',
  'edge_type': 'comissoes_parlamentares_a_que_pertence',
  'edge_info': 'membro'},
 {'src': '6906',
  'dst': 'Comissão de Administração Pública, Modernização Administrativa, Descentralização e Poder Local',
  'edge_type': 'comissoes_parlamentares_a_que_pertence',
  'edge_info': 'membro'},
 {'src': '6906',
  'dst': 'Grupo de Trabalho - Prestações por incapacidade, decorrentes de doença ou acidentes de trabalho',
  'edge_type': 'comissoes_parlamentares_a_que_pertence',
  'edge_info': 'membro'},
 {'src': '6906',
  'dst': 'Grupo de Trabalho - Carreira Técnico Superior Diagnóstico e Terapêutica',
  'edge_type': 'comissoes_parlamentares_a_que_pertence',
  'edge_in

In [None]:
# NOTE(review): this cell depends on names that are not defined in any cell
# shown here: `data` and a free function
# `_create_other_node_and_edge(congress_id, biography, tag)`. It looks like an
# earlier, pre-class draft of BiographyExtracter.extract_data -- confirm
# whether it is still needed; as shown it cannot run on a fresh kernel.
#
# Module-level accumulators. Only `names` is appended to in this cell; the
# others are presumably filled by the helper -- TODO confirm.
edges = []
names = []
parties = []
professions = []
comissions = []
birth_decades = []
other_nodes = []
for entry in data:
    # Each entry is read as {congress_id: biography}: take its first key and
    # first value.
    congress_id = list(entry.keys())[0]
    biography = list(entry.values())[0]
    # Create main/name nodes
    names.append({congress_id: biography['nome_completo']})
    # Create party nodes
    _create_other_node_and_edge(congress_id, biography, 'partido')
    # Create profession nodes
    _create_other_node_and_edge(congress_id, biography, 'profissao')
    # Create commission nodes.
    # KeyError is ignored, presumably because some congress people belong to
    # no commission and so lack this key -- TODO confirm.
    try:
        _create_other_node_and_edge(congress_id, biography, 'comissoes_parlamentares_a_que_pertence')
    except KeyError:
        pass
    # Create birth decade nodes
    _create_other_node_and_edge(congress_id, biography, 'data_de_nascimento')

In [None]:
# NOTE(review): neither a free function `extract_data` nor `data` is defined
# in the cells shown here (the class exposes `extract_data` only as a method).
# Presumably a leftover from a pre-class draft -- confirm before keeping.
extract_data(data)