In [6]:
class Member:
    """A person described by an id and a name.

    The class attribute ``count`` is bumped once for every instance that
    goes through this initializer (subclasses calling it included).
    """

    count = 0

    def __init__(self, id, name):
        Member.count += 1
        self.id, self.name = id, name

    def get_id(self):
        """Return the id supplied at construction time."""
        return self.id

    def get_name(self):
        """Return the name supplied at construction time."""
        return self.name

In [7]:
class Author(Member):
    """A Member that additionally records the organisation (``org``) it was given."""

    def __init__(self, id, name, org):
        # The parent initializer stores id/name and increments Member.count.
        super().__init__(id, name)
        self.org = org

In [8]:
class Team:
    """A collection of members grouped under one id.

    ``count`` is a class-level tally of how many instances have passed
    through this initializer (subclasses calling it included).

    Attributes set on each instance:
        id       -- the id passed to the constructor.
        members  -- the iterable of member objects passed to the constructor;
                    each must provide ``get_id()`` and ``get_name()``.
        team_id  -- the value returned by ``set_team_id()`` at construction.
    """

    count = 0

    def __init__(self, id, members):
        Team.count += 1
        self.id = id
        self.members = members
        self.team_id = self.set_team_id()

    def set_team_id(self):
        """Return the sum of the ids of all members (0 for no members).

        Note: different member sets can add up to the same total, so this
        value is a derived key rather than a guaranteed-unique identifier.
        """
        return sum(person.get_id() for person in self.members)

    def get_members_names(self):
        """Return a list with every member's name, in member order."""
        return [person.get_name() for person in self.members]

    def get_members_ids(self):
        """Return a list with every member's id, in member order."""
        return [person.get_id() for person in self.members]

In [9]:
class Document(Team):
    """A Team whose members are a document's authors, plus document metadata.

    Besides the attributes set by ``Team``, each instance stores ``title``,
    ``doc_type``, ``year``, ``venue``, ``references``, ``fos`` and
    ``keywords`` exactly as passed in, and ``fields`` as computed by
    ``set_fields()``.
    """

    def __init__(self, id, authors, title, doc_type, year, venue, references, fos, keywords):
        super().__init__(id, authors)
        self.title = title
        self.doc_type = doc_type
        self.year = year
        self.venue = venue
        self.references = references
        self.keywords = keywords
        self.fos = fos
        # Must come last: set_fields() reads self.fos and self.keywords.
        self.fields = self.set_fields()

    def set_fields(self):
        """Build the list of field names for this document.

        Takes the ``"name"`` of every ``fos`` entry whose ``"w"`` is not
        0.0, then appends all keywords (if there are any).
        """
        selected = [entry["name"] for entry in self.fos if entry["w"] != 0.0]
        if len(self.keywords) > 0:
            selected += self.keywords
        return selected

    def get_fields(self):
        """Return the field list computed at construction time."""
        return self.fields
    

In [25]:
import json

# Tunables for this cell: how many records to load and where to read them from.
MAX_DOCS = 50
DBLP_PATH = "dblp.v12.json"

counter = 0
docs = []
all_authors = {}  # author id -> first Author object created for that id

training_input = []   # per document: its fields joined with ", "
training_output = []  # per document: its authors' names joined with ", "


# JSON text is UTF-8; pass the encoding explicitly so non-ASCII author names
# decode the same way regardless of the platform's default encoding.
with open(DBLP_PATH, "r", encoding="utf-8") as jf:
    # Skip the first line
    jf.readline()
    while counter < MAX_DOCS:
        # Read line by line to not overload the memory
        raw_line = jf.readline()
        if not raw_line:
            # End of file reached before MAX_DOCS records were read.
            break

        # Lower-case the whole record and drop the leading "," separator.
        line = raw_line.lower().lstrip(",").strip()
        if not line:
            continue
        if line == "]":
            # A bare "]" cannot be a record; presumably it closes the
            # enclosing JSON array, so treat it as end of data.
            break

        jsonline = json.loads(line)

        # Retrieve the desired attributes
        doc_id = jsonline['id']
        doc_title = jsonline['title']
        doc_year = jsonline['year']
        doc_type = jsonline['doc_type']
        doc_venue = jsonline['venue']
        doc_fos = jsonline['fos']

        # These two keys are optional in a record; default to an empty list.
        doc_references = jsonline.get('references', [])
        doc_keywords = jsonline.get('keywords', [])

        authors = []
        for auth in jsonline['authors']:
            # 'org' is optional for an author; default to an empty string.
            author = Author(auth['id'], auth['name'], auth.get('org', ""))
            authors.append(author)

            # Keep only the first Author object seen for each id.
            all_authors.setdefault(auth['id'], author)

        # Keyword arguments on purpose: Document's signature is
        # (..., title, doc_type, year, ...), and passing year/doc_type
        # positionally in the other order silently swaps the two attributes.
        doc = Document(
            doc_id,
            authors,
            title=doc_title,
            doc_type=doc_type,
            year=doc_year,
            venue=doc_venue,
            references=doc_references,
            fos=doc_fos,
            keywords=doc_keywords,
        )
        docs.append(doc)

        training_input.append(", ".join(doc.get_fields()))
        training_output.append(", ".join(doc.get_members_names()))

        counter += 1
    


In [26]:
# Preview the first ten entries of training_input (one ", "-joined string of fields per document).
training_input[:10]

['telecommunications network, computer science, mind map, human–computer interaction, multimedia, empirical research, comprehension, communications protocol',
 'discrete mathematics, combinatorics, direct product, mathematics',
 'statue, engineering drawing, computer science, visualization, polychrome, artificial intelligence',
 'autoregressive–moving-average model, computer science, support vector machine, autoregressive conditional heteroskedasticity, artificial neural network, machine learning',
 'computer vision, polygon mesh, computer graphics (images), computer science, quadric',
 'pattern recognition, computer science, correlation attack',
 "peak signal-to-noise ratio, authentication, secret sharing, computer science, computer network, image sharing, theoretical computer science, verifiable secret sharing, steganography, pattern recognition, shamir's secret sharing, homomorphic secret sharing",
 'computer science, parallel computing',
 'ubiquitous commerce, services computing, d

In [27]:
# Preview the first ten entries of training_output (one ", "-joined string of author names per document).
training_output[:10]

['makoto satoh, ryo muramatsu, mizue kayama, kazunori itoh, masami hashimoto, makoto otani, michio shimizu, masahiko sugimoto',
 'pranava k. jha',
 'g. beale, g. earl',
 'altaf hossain, faisal zaman, m. nasser, m. mufakhkharul islam',
 'rafael álvarez, leandro tortosa, josé-francisco vicent, antonio zamora',
 'jovan dj. golic, guglielmo morgari',
 'güzin ulutas, mustafa ulutas, vasif v. nabiyev',
 'pranay chaudhuri, hussein thompson',
 'phan cong vinh',
 'dominik szajerman, adam jurczyński']