In [182]:
import os

import bibtexparser as bp
import numpy as np
import pandas as pd
import yaml as yml

# Load the YAML configuration file that drives this notebook.
dirConfig = r'D:\Impacta\Python for Data Engineer\Aula 2\Arquivos'
configFileName = 'config_analisa_bibtex.yml'

# os.path.join replaces the hand-written '\\' separator. The explicit UTF-8
# encoding keeps decoding independent of the platform's default code page and
# matches the other open() calls in this notebook.
with open(os.path.join(dirConfig, configFileName), encoding='utf8') as f:
    # SafeLoader restricts parsing to standard YAML tags.
    configFile = yml.load(f, Loader=yml.loader.SafeLoader)


In [183]:
# Names of the BibTeX input files, read from the configuration.
listaArquivos = tuple(
    configFile[chave] for chave in ('FILE_ACM', 'FILE_IEE', 'FILE_SD')
)

# Requested export format, also taken from the configuration.
formatFile = configFile['FORMATO']

# Folders the BibTeX files are read from and the merged result is written to.
dirInput = r'D:\Impacta\Python for Data Engineer\Aula 2\Arquivos\input'
dirOutput = r'D:\Impacta\Python for Data Engineer\Aula 2\Arquivos\output'

# File name of the merged output, one per supported format.
mergedFilesCSV, mergedFilesJSON, mergedFilesYAML = (
    'ALL_ARTICLES.csv',
    'ALL_ARTICLES.json',
    'ALL_ARTICLES.yml',
)

def bibtexToDf(arqv):
    """Parse one BibTeX file from ``dirInput`` and return its entries.

    Despite the name, no DataFrame is built here; the caller does that.

    Parameters
    ----------
    arqv : str
        Name of the BibTeX file inside ``dirInput``.

    Returns
    -------
    list of dict
        One dict per parsed BibTeX entry.
    """
    with open(os.path.join(dirInput, arqv), encoding='utf8') as f:
        bib_database = bp.load(f)
    # `entries` keeps every parsed record. `entries_dict` is keyed by the entry
    # ID, so records sharing an ID would overwrite each other. A real list
    # (instead of a dict view) is also what the original comment promised.
    return list(bib_database.entries)
	
# Union of all sources: one DataFrame and one YAML dump per input file.
listaDf = []
listaYml = []
for i in listaArquivos:
    # Materialise once: the entries feed both the DataFrame and the YAML dump.
    sourceArticles = list(bibtexToDf(i))
    listaDf.append(pd.DataFrame(sourceArticles))
    # An empty source would dump as the flow sequence "[]", which cannot be
    # concatenated with the "- ..." sequences of the other sources.
    if sourceArticles:
        listaYml.append(yml.dump(sourceArticles))

# ignore_index=True gives the merged frame a unique 0..n-1 index instead of
# repeating every source's own row labels.
unionDf = pd.concat(listaDf, ignore_index=True)
# Each non-empty dump is a top-level "- ..." sequence ending in a newline, so
# joining them yields a single YAML list covering every source.
unionYml = ''.join(listaYml) if listaYml else yml.dump([])

In [185]:
# Put the sorted column names of every source side by side to standardise them.
# Each source becomes one column; shorter lists are padded with missing values.
dfCol = pd.DataFrame([tabela.columns.sort_values() for tabela in listaDf])
dfCompCol = dfCol.transpose().rename(
    columns=dict(enumerate(('ACM', 'IEEE', 'SD')))
)
print(dfCompCol)

# Distinct column names across all sources: mark the padding with a sentinel,
# mask it out, and de-duplicate what is left (order of first appearance).
dfToArray = dfCompCol.fillna('NULL').to_numpy()
dfValoresUnicos = pd.unique(dfToArray[dfToArray != 'NULL'])

print(f'\nDistinct de todas as colunas: {dfValoresUnicos}\n')

          ACM       IEEE         SD
0   ENTRYTYPE  ENTRYTYPE  ENTRYTYPE
1          ID         ID         ID
2    abstract   abstract   abstract
3     address     author     author
4   articleno  booktitle        doi
5      author        doi       issn
6   booktitle       issn    journal
7         doi   keywords   keywords
8        isbn      month       note
9    keywords     number      pages
10   location      pages      title
11   numpages      title        url
12      pages     volume     volume
13  publisher       year       year
14     series       None       None
15      title       None       None
16        url       None       None
17       year       None       None

Distinct de todas as colunas: ['ENTRYTYPE' 'ID' 'abstract' 'address' 'author' 'articleno' 'booktitle'
 'doi' 'issn' 'journal' 'keywords' 'isbn' 'month' 'note' 'number' 'pages'
 'location' 'title' 'numpages' 'url' 'volume' 'publisher' 'year' 'series']



In [186]:
# Columns that exist in exactly one source.
#   a ^ b = a.symmetric_difference(b)
#   a & b = a.intersection(b)
#   a - b = a.difference(b)
# set1 ^ set2 ^ set3 keeps names found in one or in all three sets; removing
# set1 & set2 & set3 leaves only the names found in a single source.

# Shorter column lists are padded with missing values in dfCompCol. Keep only
# real (string) names so the padding can never be reported as a column when
# just one source happens to be shorter than the others.
set1 = {col for col in dfCompCol['ACM'] if isinstance(col, str)}
set2 = {col for col in dfCompCol['IEEE'] if isinstance(col, str)}
set3 = {col for col in dfCompCol['SD'] if isinstance(col, str)}

(set1 ^ set2 ^ set3) - (set1 & set2 & set3)


{'address',
 'articleno',
 'isbn',
 'journal',
 'location',
 'month',
 'note',
 'number',
 'numpages',
 'publisher',
 'series'}

In [187]:
# Export the merged articles in the format requested by the configuration.
# The comparison ignores case and surrounding whitespace, so 'csv' or ' Json '
# in the config select the same branch as 'CSV' / 'JSON'.
formatoNormalizado = str(formatFile).strip().upper()

if formatoNormalizado == 'CSV':
    unionDf.to_csv(os.path.join(dirOutput, mergedFilesCSV), index=False, encoding='utf-8')
elif formatoNormalizado == 'JSON':
    # orient='records' writes one JSON object per article row.
    unionDf.to_json(os.path.join(dirOutput, mergedFilesJSON), orient='records')
elif formatoNormalizado == 'YAML':
    # The YAML text was already rendered when the sources were merged.
    with open(os.path.join(dirOutput, mergedFilesYAML), mode='w', encoding='utf8') as f:
        f.write(unionYml)
else:
    # Unknown format: nothing is written, only a notice is printed.
    print('Formato não disponivel.')
