In [3]:
# !/usr/bin/python3
import pandas as pd
import os
import json
import csv
import sys
from pandas import json_normalize
import requests as r
import xmltodict
import time
import json
from datetime import datetime
import string 
import requests
from tenacity import *
from matplotlib import pyplot as plt
import semanticscholar as sch
from unittest import TestCase

In [4]:


API_URL = 'http://api.semanticscholar.org/v1'

def paper(id, include_unknown_references=False) -> dict:
    
    '''Paper lookup
    :param id: S2PaperId, DOI or ArXivId.
    :param include_unknown_references : bool, (optional) include non referenced paper.
    :returns: paper data or empty :class:`dict` if not found.
    :rtype: :class:`dict`
    '''
    
    data = __get_data('paper', id, include_unknown_references)
    
    return data

def author(id) -> dict:

    '''Author lookup
    :param id: S2AuthorId.
    :returns: author data or empty :class:`dict` if not found.
    :rtype: :class:`dict`
    '''

    data = __get_data('author', id)

    return data

@retry(
    wait=wait_fixed(30),
    retry=retry_if_exception_type(ConnectionRefusedError),
    stop=stop_after_attempt(10)
    )
def __get_data(method, id, include_unknown_references=False) -> dict:

    '''Get data from Semantic Scholar API
    :param method: 'paper' or 'author'.
    :param id: :class:`str`.
    :returns: data or empty :class:`dict` if not found.
    :rtype: :class:`dict`
    '''

    data = {}

    method_types = ['paper','author']
    if method not in method_types:
        raise ValueError('Invalid method type. Expected one of: {}'.format(method_types))

    url = '{}/{}/{}'.format(API_URL, method, id)
    if include_unknown_references:
        url += '?include_unknown_references=true'
    r = requests.get(url)
    
    if r.status_code == 200:
        data = r.json()
        if len(data) == 1 and 'error' in data:
            data = {}
    elif r.status_code == 429:
        raise ConnectionRefusedError('HTTP status 429 Too Many Requests.')

    return data

In [5]:


def test_paper():
    data = sch.paper('10.1093/mind/lix.236.433')
    assert data['title'] == 'Computing Machinery and Intelligence'
    
def test_author():
    data = sch.author(2262347)
    assert data['name'] == 'Alan M. Turing'

def test_not_found():
    data = sch.paper(0)
    assert len(data) == 0

In [17]:
data.keys()

dict_keys(['aliases', 'authorId', 'influentialCitationCount', 'name', 'papers', 'url'])

In [1]:

import xmltodict
import time
import json
import string
from bio import Entrez

from .. import constants as c
from ..find_tags import add_result_tags
from datetime import datetime
from ..find_tags import add_result_tags

Entrez.email = c.pubmed.ENTREZ_EMAIL


'''Translate function looks at the dictionary placed in constants/webpage.py and pass the query to the syntax of
the webpage'''


def translate(factorized_query):
    translation_dic = c.pubmed.TRANSLATION_DIC
    for key in translation_dic.keys():
        new_query = factorized_query.replace(key, translation_dic[key])
        factorized_query = new_query

    words_and_syms = factorized_query.split(' ')
    final_query = ''
    for word_or_sym in words_and_syms:
        if '[' in word_or_sym:
            try:
                word, field = word_or_sym.split('[')
                new_field = '[' + field
                if new_field not in translation_dic.values():
                    new_field = ''
                final_query += word + new_field + ' '
            except:
                final_query += word_or_sym + ' '
        else:
            final_query += word_or_sym + ' '


    return final_query


'''For every result in the search, we take title, authors, date, publication journal and url.'''


def search_pubmed(factorized_query, factorized_query_soft, results_by_source, sort_by):

    if not results_by_source:
        results_by_source = 100
    elif results_by_source:
        results_by_source = min(100, results_by_source)
    # Pass the query to the clinical trials format
    final_query = translate(factorized_query)
    # Get ids

    if sort_by == 'date':
        sort_parameter = 'most+recent' # coincides with the results shown in pubmed with order by most recent
    elif sort_by == 'author':
        sort_parameter = 'author' # coincides with the results shown in pubmed with order by first author
    elif sort_by == 'journal':
        sort_parameter = 'journal' # coincides with the results shown in pubmed with order by journal
    else:
        sort_parameter = 'relevance' # does not coincide with the results shown in pubmed with order by best match

    handle = Entrez.esearch(db="pubmed", term=final_query, sort=sort_parameter, retmax=results_by_source, api_key=c.pubmed.KEY)

    record = Entrez.read(handle)
    handle.close()

    id_list = record['IdList']
    number_of_results = str(record['Count'])

    if len(id_list) == 0:
        return {'results': [], 'number_of_results': '0'}

    # Get detailed info including Abstract
    handle = Entrez.efetch(db="pubmed", id=';'.join(id_list), retmode='xml', rettype="abstract", api_key=c.pubmed.KEY)

    s = handle.read()

    articles_list = json.loads(json.dumps(xmltodict.parse(s))).get("PubmedArticleSet", []).get("PubmedArticle", [])

    if len(id_list) == 1:
        articles_list = [articles_list]

    returned_list = list()
    T = 0

    count = 1001

    for article in articles_list:
        result_id = 'PUB' + str(count)[1:]
        count += 1

        title = article['MedlineCitation']['Article']['ArticleTitle']
        if type(title) == dict:

            try:
                intro = title['i'] + ' '
            except:
                intro = ''

            try:
                title = intro + title['#text']
            except:
                title = title['#text']

        if title:
            if title[-1] == '.':
                title = title[:-1]

        try:
            authors = article['MedlineCitation']['Article']['AuthorList']['Author']
            authors_list = list()

            if type(authors) is not list:
                authors = [authors]
            # TODO: Collective authors: maybe better to take collective name?
            for author in authors:
                try:
                    authors_list.append(author['ForeName'] + ' ' + author['LastName'])
                except:
                    # Here: Only Enters When Collective Name
                    pass
            authors = ', '.join(authors_list)
        except:
            authors = ''
            authors_list = []

        abstract = ''
        try:
            if type(article['MedlineCitation']['Article']['Abstract']['AbstractText']) is str:
                abstract += article['MedlineCitation']['Article']['Abstract']['AbstractText']
            elif type(article['MedlineCitation']['Article']['Abstract']['AbstractText']) is dict:
                if '@Label' in article['MedlineCitation']['Article']['Abstract']['AbstractText']:
                    abstract += article['MedlineCitation']['Article']['Abstract']['AbstractText']['@Label'] + ': '
                abstract += article['MedlineCitation']['Article']['Abstract']['AbstractText']['#text']
            elif type(article['MedlineCitation']['Article']['Abstract']['AbstractText']) is list:
                for field in article['MedlineCitation']['Article']['Abstract']['AbstractText']:
                    if '@Label' in field.keys():
                        abstract += field['@Label'] + ': '
                    abstract += field['#text'] + '\n'
            else:
                abstract = 'Unknown abstract'

        except:
            abstract = 'Unknown abstract'

        try:
            date = article['MedlineCitation']['Article']['ArticleDate']
            date_str = str(date['Year'] + '-' + date['Month'] + '-' + date['Day'])
            printing_date = date_str
            date_object = datetime.strptime(date_str, "%Y-%M-%d")
        except:
            try:
                date = article['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']
                date_str = str(date['Year'] + '-' + date['Month'] + '-' + date['Day'])
                printing_date = date_str
                date_object = datetime.strptime(date_str, "%Y-%M-%D")
            except:
                try:
                    date = article['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']
                    date_str = str(date['Year'] + '-' + date['Month'])
                    printing_date = date_str
                    date_object = datetime.strptime(date_str, "%Y-%M")
                except:
                    try:
                        date = article['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']
                        date_str = str(date['Year'])
                        printing_date = date_str
                        date_object = datetime.strptime(date_str, "%Y")
                    except:
                        try:
                            date = article['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['MedlineDate']
                            try:
                                year_index = date.find('20')
                            except:
                                year_index = date.find('19')
                            date_str = str(date)[year_index:year_index+4]
                            printing_date = date_str
                            date_object = datetime.strptime(date_str, "%Y")
                        except:
                            printing_date = 'Unknown'
                            date = '1000'
                            date_object = datetime.strptime(date, "%Y")

        url = article['PubmedData']['ArticleIdList']['ArticleId']
        if type(url) is not list:
            url = [url]

        try:
            journal = article['MedlineCitation']['Article']['Journal']['Title']
        except:
            journal = ''

        try:
            keywords = list()
            keyword_list = elem['MedlineCitation']['KeywordList']['Keyword']
            for elem in keyword_list:
                keywords.append(elem['#text'])

        except:
            keywords = ['']

        returned_list.append(
            {
                'result_id': result_id,
                'title': title,
                'authors': authors,
                'abstract': abstract,
                'date': date_object.strftime("%Y-%m-%d"),
                'printing_date': printing_date,
                'publication_journal': journal,
                'url': c.pubmed.URL + url[0]['#text'],
                'keywords': keywords,
                'type': 'articles',
                'hard_or_soft': 'publications'
            }
        )
        #print(keywords)
        #print('*'*60)

    #t1 = time.time()
    final_result_list = add_result_tags(returned_list, 'pubmed')
    #t2 = time.time()
    #print('FINAL TIME, pubmed: ', t2 - t1)

    return {'results': final_result_list, 'number_of_results': number_of_results}


ImportError: cannot import name 'Entrez' from 'bio' (/Users/cristinacapdevilachoy/Documents/Git/OD_practical/env/lib/python3.7/site-packages/bio/__init__.py)

In [2]:
final_query = 'breast'

results_by_source = 200

url = 'http://export.arxiv.org/api/query?search_query='

#Call the API specifying the desired number of results
sort_parameter = 'lastUpdatedDate'
## 'lastUpdatedDate','submittedDate', 'relevance'

response = r.get(
    url + final_query + '&start=0&max_results=' + str(results_by_source) +
    '&sortBy='+str(sort_parameter)+'&sortOrder=descending'
)


In [4]:
d = json.loads(json.dumps(xmltodict.parse(response.text)))['feed']['entry']
number_of_results = json.loads(json.dumps(xmltodict.parse(response.text)))['feed']['opensearch:totalResults']['#text']


In [57]:
json_normalize(d)

Unnamed: 0,id,updated,published,title,summary,author,link,category,arxiv:doi.@xmlns:arxiv,arxiv:doi.#text,arxiv:comment.@xmlns:arxiv,arxiv:comment.#text,arxiv:journal_ref.@xmlns:arxiv,arxiv:journal_ref.#text,arxiv:primary_category.@xmlns:arxiv,arxiv:primary_category.@term,arxiv:primary_category.@scheme,author.name,category.@term,category.@scheme
0,http://arxiv.org/abs/2001.09282v2,2020-04-08T08:52:44Z,2020-01-25T09:13:39Z,Limited Angle Tomography reconstruction for no...,The purpose of the present work is the study o...,"[{'name': 'G. E. Poma'}, {'name': 'F. Garibald...","[{'@title': 'doi', '@href': 'http://dx.doi.org...","[{'@term': 'physics.med-ph', '@scheme': 'http:...",http://arxiv.org/schemas/atom,10.1088/1748-0221/15/04/C04019,http://arxiv.org/schemas/atom,"10 pages, 10 figures, proceeding conference",http://arxiv.org/schemas/atom,2020 IOP Publishing Ltd and Sissa Medialab,http://arxiv.org/schemas/atom,physics.med-ph,http://arxiv.org/schemas/atom,,,
1,http://arxiv.org/abs/2004.03500v1,2020-04-07T15:53:26Z,2020-04-07T15:53:26Z,BreastScreening: On the Use of Multi-Modality ...,"This paper describes the field research, desig...","[{'name': 'Francisco Maria Calisto'}, {'name':...",[{'@href': 'http://arxiv.org/abs/2004.03500v1'...,"[{'@term': 'cs.HC', '@scheme': 'http://arxiv.o...",,,http://arxiv.org/schemas/atom,"AVI 2020 Short Papers, 5 pages, 2 figures, for...",,,http://arxiv.org/schemas/atom,cs.HC,http://arxiv.org/schemas/atom,,,
2,http://arxiv.org/abs/2004.03037v1,2020-04-06T23:12:31Z,2020-04-06T23:12:31Z,Dense Steerable Filter CNNs for Exploiting Rot...,Histology images are inherently symmetric unde...,"[{'name': 'Simon Graham'}, {'name': 'David Eps...",[{'@href': 'http://arxiv.org/abs/2004.03037v1'...,"[{'@term': 'eess.IV', '@scheme': 'http://arxiv...",,,,,,,http://arxiv.org/schemas/atom,eess.IV,http://arxiv.org/schemas/atom,,,
3,http://arxiv.org/abs/2004.01397v1,2020-04-03T06:49:31Z,2020-04-03T06:49:31Z,Crossover-Net: Leveraging the Vertical-Horizon...,Robust segmentation for non-elongated tissues ...,"[{'name': 'Qian Yu'}, {'name': 'Yinghuan Shi'}...",[{'@href': 'http://arxiv.org/abs/2004.01397v1'...,"[{'@term': 'eess.IV', '@scheme': 'http://arxiv...",,,http://arxiv.org/schemas/atom,"11 pages, 14 figures",,,http://arxiv.org/schemas/atom,eess.IV,http://arxiv.org/schemas/atom,,,
4,http://arxiv.org/abs/2003.12248v2,2020-04-02T09:08:12Z,2020-03-27T06:09:13Z,Dark-field signal extraction in propagation-ba...,A method for extracting the dark-field signal ...,"[{'name': 'T. E. Gureyev'}, {'name': 'D. M. Pa...",[{'@href': 'http://arxiv.org/abs/2003.12248v2'...,"[{'@term': 'physics.med-ph', '@scheme': 'http:...",,,,,,,http://arxiv.org/schemas/atom,physics.med-ph,http://arxiv.org/schemas/atom,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,http://arxiv.org/abs/1904.04520v1,2019-04-09T08:26:02Z,2019-04-09T08:26:02Z,Regression Concept Vectors for Bidirectional E...,Explanations for deep neural network predictio...,"[{'name': 'Mara Graziani'}, {'name': 'Vincent ...",[{'@href': 'http://arxiv.org/abs/1904.04520v1'...,"[{'@term': 'cs.LG', '@scheme': 'http://arxiv.o...",,,http://arxiv.org/schemas/atom,"9 pages, 3 figures, 3 tables",http://arxiv.org/schemas/atom,Understanding and Interpreting Machine Learnin...,http://arxiv.org/schemas/atom,cs.LG,http://arxiv.org/schemas/atom,,,
196,http://arxiv.org/abs/1804.10812v2,2019-04-07T16:54:51Z,2018-04-28T14:57:53Z,Imaging of Fiber-Like Structures in Digital Br...,Fiber-like features are an important aspect of...,"[{'name': 'Sean D. Rose'}, {'name': 'Emil Y. S...","[{'@title': 'doi', '@href': 'http://dx.doi.org...",,http://arxiv.org/schemas/atom,10.1117/1.JMI.6.3.031404,http://arxiv.org/schemas/atom,This version has been updated with changes mad...,http://arxiv.org/schemas/atom,SPIE Journal of Medical Imaging 6 (2019) 031404,http://arxiv.org/schemas/atom,physics.med-ph,http://arxiv.org/schemas/atom,,physics.med-ph,http://arxiv.org/schemas/atom
197,http://arxiv.org/abs/1904.03517v1,2019-04-06T19:48:54Z,2019-04-06T19:48:54Z,Nonparametric tests for transition probabiliti...,This paper proposes nonparametric two-sample t...,,"[{'@title': 'doi', '@href': 'http://dx.doi.org...",,http://arxiv.org/schemas/atom,10.1080/10485252.2019.1705298,,,,,http://arxiv.org/schemas/atom,stat.ME,http://arxiv.org/schemas/atom,Giorgos Bakoyannis,stat.ME,http://arxiv.org/schemas/atom
198,http://arxiv.org/abs/1904.07295v1,2019-04-05T14:10:54Z,2019-04-05T14:10:54Z,The population-attributable fraction for time-...,The public health impact of a harmful exposure...,"[{'name': 'Maja von Cube'}, {'name': 'Martin S...","[{'@title': 'doi', '@href': 'http://dx.doi.org...","[{'@term': 'stat.AP', '@scheme': 'http://arxiv...",http://arxiv.org/schemas/atom,10.1002/bimj.201800252,http://arxiv.org/schemas/atom,A revised version has been submitted,http://arxiv.org/schemas/atom,"Biometrical Journal, 2019",http://arxiv.org/schemas/atom,stat.AP,http://arxiv.org/schemas/atom,,,


In [None]:
import requests as r
import xmltodict
import time
import json
from datetime import datetime
import string 


def search_arxiv(factorized_query, factorized_query_soft, results_by_source, sort_by):
    results_by_source = 200

    #Pass the factorized query to arxiv notation and call the API for the search
    final_query = translate(factorized_query)

    #Call the API specifying the desired number of results
    sort_parameter = 'lastUpdatedDate'
    ## 'lastUpdatedDate','submittedDate', 'relevance'

    response = r.get(
        c.arxiv.URL + final_query + '&start=0&max_results=' + str(results_by_source) +
        '&sortBy='+str(sort_parameter)+'&sortOrder=descending'
    )

    try:
        d = json.loads(json.dumps(xmltodict.parse(response.text)))['feed']['entry']
        number_of_results = json.loads(json.dumps(xmltodict.parse(response.text)))['feed']['opensearch:totalResults']['#text']
    except:
        return {'results': [], 'number_of_results': '0'}

    if type(d) is not list:
        d = [d]

    trials_list = list()

    count = 1001

    for trial in d:
        result_id = 'AXV' + str(count)[1:]
        count += 1

        try:
            title = trial['title']
        except:
            title = 'Unknown title'

        try:
            summary = trial['summary']
        except:
            summary = 'Unknown summary'

        try:
            authors_list = trial['author']
            authors = list()

            if type(authors_list) is not list:
                authors_list = [authors_list]

            for element in authors_list:
                authors.append(element['name'])

        except:
            authors = 'Unknown authors'

        try:
            printing_date = trial['published'].replace('T', ' ').replace('Z', '')
            date = trial['published'].replace('T', ' ').replace('Z', '')
            date_object = datetime.strptime(date, "%Y-%m-%d %X")

        except:
            printing_date = 'Unknown'
            date = 'January 1, 2000'
            date_object = datetime.strptime(date, "%B %d, %Y")

        try:
            url = trial['link'][0]['@href']
        except:
            url = ''

        trials_list.append(
            {
                'result_id': result_id,
                'title': title,
                'authors': ', '.join(authors),
                'abstract': summary,
                'printing_date': printing_date,
                'date': date_object.strftime("%Y-%m-%d %X"),
                'url': url,
                'type': 'trials',
                'hard_or_soft': 'hard'
            }
        )
    #t1 = time.time()
    final_result_list = add_result_tags(trials_list[:results_by_source], 'arxiv')
    #t2 = time.time()
    #print('FINAL TIME, arxiv: ', t2 - t1)

    return {'results': final_result_list, 'number_of_results': number_of_results}


## Definitive

In [18]:

# Global variables to control script flow
input_default_path = "data/breastcancer.json"
tmp_path = "tmp"

In [19]:
inputdir = input_default_path
# Assign output file for output CSV
if not os.path.exists(tmp_path):
    os.makedirs(tmp_path)
    print(f"[INFO] Created a new folder {tmp_path}")
# Evaluation output file config
head, tail = os.path.split(inputdir)
tail = '.'.join((tail.split('.')[0], 'csv'))
outputfile = f"{head}/{tail}"
print(outputfile)
if os.path.exists(outputfile):
    os.remove(outputfile)
    print(f"{outputfile} file removed")

data/breastcancer.csv
data/breastcancer.csv file removed


In [20]:
node = 'result'
json_file_path = inputdir
csv_file_path = outputfile

fp = open(json_file_path, 'r')
json_value = fp.read()
raw_data = json.loads(json_value)
fp.close()

data_to_be_processed = raw_data[node]['hits']['hit']
df = json_normalize(data_to_be_processed, max_level=2)
for i in range(len(df)):
    if type(df['info.authors.author'][i]) != type(list()):
        df['info.authors.author'][i] = [df['info.authors.author'][i]]



In [22]:
df = df.rename(columns={"@score":"score","@id":"id","url":"url_num","info.authors.author":"author","info.title":"title",
                   "info.year":"year","info.type":"pub_type","info.key":"key","info.ee":"ee","info.url":"url",
                   "info.venue":"venue","info.volume":"vol","info.number":"num","info.pages":"pages",
                   "info.doi":"doi", "info.publisher":"publisher"
                   })


In [57]:
df = df[df['doi'].notna()]

In [73]:
data['topics'][0]

{'topic': 'Deep learning',
 'topicId': '2762',
 'url': 'https://www.semanticscholar.org/topic/2762'}

In [85]:
bool({})

False

In [None]:
i = 0
new = []
to_drop = []
for row in df['doi'][:500]:
    i += 1
    data = sch.paper(row)
    print(row,bool(data))
    if data:
        new_topics = []
        for topic in data['topics']:
            new_topics.append([row, topic['topicId'],topic['topic']])
        new.append(new_topics)
    else:
        to_drop.append(i)


10.1186/S12880-018-0303-3 False
10.1109/TBME.2018.2887083 True
10.1007/978-3-030-30443-0_10 True
10.1109/CISP-BMEI.2017.8301908 True
10.1016/J.CMPB.2015.11.010 False
10.1109/BHI.2014.6864395 True
10.1145/2598394.2609853 True
10.1145/2464576.2466809 True
10.2200/S00325ED1V01Y201012BME039 True
10.1145/2001858.2002040 True
10.1016/J.FUTURE.2019.01.033 False
10.1016/J.IJMEDINF.2019.03.004 False
10.1016/J.NEUCOM.2018.03.072 False
10.1016/J.JBI.2019.103180 False
10.1186/S12911-019-0873-1 False
10.1109/TMI.2018.2870343 True
10.1007/S00371-017-1447-9 False
10.1145/3307339.3342164 True
10.1109/EMBC.2019.8856744 True
10.1109/ISBI.2019.8759205 True
10.1117/12.2513561 True
10.1117/12.2512943 True
10.1186/S12918-018-0530-9 False
10.1016/J.CMPB.2017.10.016 False
10.1504/IJMEI.2018.10014081 True
10.1109/TBME.2017.2695103 True
10.1109/BIBM.2018.8621315 True
10.1117/12.2318324 True
10.1117/12.2317675 True
10.1117/12.2293644 True
10.1117/12.2293412 True
10.3233/978-1-61499-852-5-661 True
10.1117/12.2293

In [81]:
len(to_drop)

707

In [119]:
df.to_csv(outputfile)


In [118]:
any([i!=np.nan for i in df['info.publisher']])

True

In [129]:
group = 'pub_type'
types = df.groupby(group)
#plt.bar(np.arange(len(types)),types.count().iloc[:,0])
#plt.title(group)
types.groups

{'Books and Theses': Int64Index([  0,  10,  12,  43,  44,  45,  46,  47,  55,  56,  67,  68,  81,
              82,  89, 140, 427, 428, 429, 716],
            dtype='int64'),
 'Conference and Workshop Papers': Int64Index([  3,   5,   7,   8,   9,  11,  13,  21,  22,  23,
             ...
             893, 894, 895, 896, 897, 898, 996, 997, 998, 999],
            dtype='int64', length=459),
 'Informal Publications': Int64Index([  4,  27,  28,  29,  30,  42,  61, 138, 139, 396, 397, 398, 399,
             400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412,
             413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425,
             426, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693,
             694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706,
             707, 708, 709, 710, 711, 712, 713, 714, 715, 899, 900, 901, 902,
             903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915,
             916],
            dtype

In [5]:
# Convert to string keeping encoding in mind...
def to_string(s):
    try:
        return str(s)
    except:
        # Change the encoding type if needed
        return s.encode('utf-8')


In [None]:
def reduce_item(key, value):
    global reduced_item

    # Reduction Condition 1
    if type(value) is list:
        i = 0
        for sub_item in value:
            reduce_item(key + '_' + to_string(i), sub_item)
            i = i + 1

    # Reduction Condition 2
    elif type(value) is dict:
        sub_keys = value.keys()
        for sub_key in sub_keys:
            reduce_item(key + '_' + to_string(sub_key), value[sub_key])

    # Base Condition
    else:
        reduced_item[to_string(key)] = to_string(value)


In [4]:
    node = 'result'
    json_file_path = inputdir
    csv_file_path = outputfile

    fp = open(json_file_path, 'r')
    json_value = fp.read()
    raw_data = json.loads(json_value)
    fp.close()

    data_to_be_processed = raw_data[node]['hits']

In [6]:
import json
from pandas.io.json import json_normalize
import argparse


def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x
    flatten(y)
    return out

In [10]:
import io
from pandas.io.json import json_normalize



df = pd.concat([pd.DataFrame(data_to_be_processed), 
                json_normalize(data_to_be_processed['nested_array_to_expand'])], 
                axis=1).drop('nested_array_to_expand', 1)

KeyError: 'nested_array_to_expand'

In [8]:
final_data

Unnamed: 0,@id,@score,info_authors_author_0_@pid,info_authors_author_0_text,info_authors_author_10_@pid,info_authors_author_10_text,info_authors_author_11_@pid,info_authors_author_11_text,info_authors_author_12_@pid,info_authors_author_12_text,...,info_publisher,info_title,info_type,info_url,info_venue,info_venue_0,info_venue_1,info_volume,info_year,url
0,388703,5,,,,,,,,,...,,Bio-statistical approaches to evaluate the lin...,Books and Theses,https://dblp.org/rec/phd/hal/Perrier18a,,,,,2018,URL#388703
1,59140,4,235/2149,Yoo Kyung Yeom,,,,,,,...,,Screening mammography for second breast cancer...,Journal Articles,https://dblp.org/rec/journals/bmcmi/YeomCKCSC19,BMC Medical Imaging,,,19,2019,URL#59140
2,159453,4,89/494,Hang Song,,,,,,,...,,Detectability of Breast Tumors in Excised Brea...,Journal Articles,https://dblp.org/rec/journals/tbe/SongSMKSOAOK19,IEEE Trans. Biomed. Engineering,,,66,2019,URL#159453
3,271032,4,249/6174,Zbigniew Leszczynski,,,,,,,...,,Artificial Neural Networks in Forecasting Canc...,Conference and Workshop Papers,https://dblp.org/rec/conf/isat/LeszczynskiJ19,ISAT,,,,2019,URL#271032
4,348056,4,239/5091,Han Le,02/99,Ashish Sharma,82/3926,Erich Bremer,02/5135,Jonas S. Almeida,...,,Utilizing Automated Breast Cancer Detection to...,Informal Publications,https://dblp.org/rec/journals/corr/abs-1905-10841,CoRR,,,abs/1905.10841,2019,URL#348056
5,878013,4,30/7002,Dongdong Sun,,,,,,,...,,Prognosis prediction of human breast cancer by...,Conference and Workshop Papers,https://dblp.org/rec/conf/bmei/SunWFL17,CISP-BMEI,,,,2017,URL#878013
6,1077541,4,00/9498,Maria João Cardoso,,,,,,,...,,The breast cancer conservative treatment. Cosm...,Journal Articles,https://dblp.org/rec/journals/cmpb/CardosoCOG16,Comput. Methods Programs Biomed.,,,126,2016,URL#1077541
7,1768787,4,90/9502,Verónica Burriel,,,,,,,...,,Conceptual Schema of Breast Cancer - The backg...,Conference and Workshop Papers,https://dblp.org/rec/conf/bhi/BurrielP14,BHI,,,,2014,URL#1768787
8,1805847,4,w/StephanMWinkler,Stephan M. Winkler,,,,,,,...,,Data based prediction of cancer diagnoses usin...,Conference and Workshop Papers,https://dblp.org/rec/conf/gecco/WinklerASS14,GECCO,,,,2014,URL#1805847
9,2082384,4,w/StephanMWinkler,Stephan M. Winkler,,,,,,,...,,Evolutionary identification of cancer predicto...,Conference and Workshop Papers,https://dblp.org/rec/conf/gecco/WinklerAS13,GECCO,,,,2013,URL#2082384


In [8]:
    processed_data = []
    header = []
    

In [11]:
    item = data_to_be_processed[3]
    reduced_item = {}

In [13]:
key ='result'
value= item

In [15]:
type(value) is list

False

In [12]:
    if type(value) is list:
        i = 0
        for sub_item in value:
            reduce_item(key + '_' + to_string(i), sub_item)
            i = i + 1

{'@score': '4',
 '@id': '271032',
 'info': {'authors': {'author': [{'@pid': '249/6174',
     'text': 'Zbigniew Leszczynski'},
    {'@pid': '249/6179', 'text': 'Tomasz Jasinski'}]},
  'title': 'Artificial Neural Networks in Forecasting Cancer Therapy Methods and Costs of Cancer Patient Treatment. Case Study for Breast Cancer.',
  'venue': 'ISAT',
  'pages': '111-120',
  'year': '2019',
  'type': 'Conference and Workshop Papers',
  'key': 'conf/isat/LeszczynskiJ19',
  'doi': '10.1007/978-3-030-30443-0_10',
  'ee': 'https://doi.org/10.1007/978-3-030-30443-0_10',
  'url': 'https://dblp.org/rec/conf/isat/LeszczynskiJ19'},
 'url': 'URL#271032'}

In [None]:
def JSON_to_CSV(inputdir, outputfile):
    # Reading arguments
    node = 'result'
    json_file_path = inputdir
    csv_file_path = outputfile

    fp = open(json_file_path, 'r')
    json_value = fp.read()
    raw_data = json.loads(json_value)
    fp.close()

    data_to_be_processed = raw_data[node]['hits']['hit']


    processed_data = []
    header = []
    for item in data_to_be_processed:
        reduced_item = {}
        reduce_item(node, item)

        header += reduced_item.keys()

        processed_data.append(reduced_item)

    header = list(set(header))
    header.sort()

    with open(csv_file_path, 'w+') as f:
        writer = csv.DictWriter(f, header, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        for row in processed_data:
            writer.writerow(row)

    print("Just completed writing csv file with %d columns" % len(header))




In [None]:
if __name__ == "__main__":
    inputdir = input_default_path
    # Assign output file for output CSV
    if not os.path.exists(tmp_path):
        os.makedirs(tmp_path)
        print(f"[INFO] Created a new folder {tmp_path}")
    # Evaluation output file config
    head, tail = os.path.split(inputdir)
    tail = '.'.join((tail.split('.')[0], 'csv'))
    outputfile = f"{head}/{tail}"
    print(outputfile)
    if os.path.exists(outputfile):
        os.remove(outputfile)
        print(f"{outputfile} file removed")
    # Run CSV to json
    JSON_to_CSV(inputdir, outputfile)
