# Bibtex

In [1]:
import bibtexparser as bp
import pandas as pd
import numpy as np
import yaml as yml
import os
import requests
import json

dir_input = os.path.join(os.getcwd(), 'input')
dir_output = os.path.join(os.getcwd(), 'output')
dir_config = os.path.join(os.getcwd(), 'config')
configFileName = 'config_analisa_bibtex.yml'


In [2]:
# CARREGA O ARQUIVO DE CONFIGURAÇÃO
with open(os.path.join(dir_config, configFileName)) as f:
    configFile = yml.load(f, Loader=yml.loader.SafeLoader)

# ANALISA BIBTEX
listaArquivos = ( \
	configFile['FILE_ACM'], \
	configFile['FILE_IEE'], \
	configFile['FILE_SD' ]
	)

def bibtexToDict(arqv):
	with open(os.path.join(dir_input, arqv), encoding='utf8') as f:
		bib_database = bp.load(f)		
	return bib_database.entries_dict.values() # retorna lista de dicionarios
	
# UNION DATAFRAMES AND YML
listaDf = []
listaYml =[]
for arqv in listaArquivos:
	sourceArticles = bibtexToDict(arqv)
	listaDf.append(pd.DataFrame(sourceArticles))
	listaYml.append(yml.dump(list(sourceArticles)))

unionDf = pd.concat(listaDf)
unionYml = ''.join(listaYml)

In [3]:
def df_export(df, file_name, file_format):
	if file_format == 'CSV':
		df.to_csv(os.path.join(dir_output, file_name + '.csv'), sep=';', index=False, encoding='utf-8')		
		print('Export ' + file_format + ' to ' + dir_output)
	elif file_format == 'JSON':
		df.to_json(os.path.join(dir_output, file_name + '.json'), orient = 'records')
		print('Export ' + file_format + ' to ' + dir_output)
	elif file_format == 'YAML':
		with open(os.path.join(dir_output, file_name + '.yml'), mode='w',encoding='utf8') as f:
			f.write(unionYml)
			print('Export ' + file_format + ' to ' + dir_output)	
	else:
		print('Formato não disponivel.')

df_export(unionDf,'ALL_ARTICLES', configFile['FORMATO'])

Export CSV to c:\Users\camil\OneDrive\Documentos\MBA\PythonDataEngineer\MBA_PyForDE\script_analisa_bibtex\output


# Article Impact

In [4]:
file_scimagojr = 'scimagojr 2020.csv'
file_jcs = 'jcs_2020.csv'
file_article_impact = 'ALL_Article_Impact.csv'

df_scimagojr = pd.read_csv(os.path.join(dir_input, file_scimagojr), delimiter=';', quotechar='"', header=0)
df_jcs = pd.read_csv(os.path.join(dir_input, file_jcs), delimiter=';', quotechar='"', header=0)

# Tratamento
dict_treat_data = {'-': np.nan, '': np.nan, None: np.nan, 'Not Available': np.nan}

# JCS
df_jcs.columns = df_jcs.columns.str.lower()
df_jcs.rename(columns ={'journal impact factor':'jcr_value'}, inplace=True)
df_jcs.jcr_value.replace(to_replace=dict_treat_data, inplace=True)
df_jcs.drop(df_jcs.columns[df_jcs.columns.str.contains('unnamed') == True], axis=1, inplace=True) # Remove colunas vazias no arquivo
df_jcs.drop('rank', axis=1, inplace=True)
df_jcs.drop_duplicates(inplace=True) # Linha inteira duplicada
df_jcs['full journal title'] = df_jcs['full journal title'].str.upper()
df_jcs['full journal title'] = df_jcs['full journal title'].str.strip()
df_jcs['jcr_value'].loc[df_jcs['jcr_value'].notnull()] = [number.replace(',','.') for number in df_jcs['jcr_value'] if type(number) != float]
df_jcs['jcr_value'] = pd.to_numeric(df_jcs['jcr_value'])

# SCIMAGO
df_scimagojr.columns = df_scimagojr.columns.str.lower()
df_scimagojr.rename(columns ={'sjr':'scimago_value'}, inplace=True)
df_scimagojr.scimago_value.replace(to_replace=dict_treat_data, inplace=True)
df_scimagojr.drop('rank', axis=1, inplace=True)
df_scimagojr['title'] = df_scimagojr['title'].str.upper()
df_scimagojr['title'] = df_scimagojr['title'].str.strip()
df_scimagojr['issn'].replace(to_replace=dict_treat_data, inplace=True)
df_scimagojr['scimago_value'].loc[df_scimagojr['scimago_value'].notnull()] = [number.replace(',','.') for number in df_scimagojr['scimago_value'] if type(number) != float]
df_scimagojr['scimago_value'] = pd.to_numeric(df_scimagojr['scimago_value'])



  df_scimagojr = pd.read_csv(os.path.join(dir_input, file_scimagojr), delimiter=';', quotechar='"', header=0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_jcs['jcr_value'].loc[df_jcs['jcr_value'].notnull()] = [number.replace(',','.') for number in df_jcs['jcr_value'] if type(number) != float]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_scimagojr['scimago_value'].loc[df_scimagojr['scimago_value'].notnull()] = [number.replace(',','.') for number in df_scimagojr['scimago_value'] if type(number) != float]


In [5]:
df_scimagojr[df_scimagojr['scimago_value'].notnull()]['scimago_value']

0        62.937
1        40.949
2        37.461
3        34.573
4        32.011
          ...  
32599     0.100
32600     0.100
32601     0.100
32602     0.100
32603     0.100
Name: scimago_value, Length: 32604, dtype: float64

# Article Impact and Bibtex

JOIN SCIMAGO AND BIBTEX

In [6]:
df_bibtex = unionDf.copy()
df_bibtex.columns = df_bibtex.columns.str.lower()

df_bibtex.issn = df_bibtex.issn.replace(np.nan,'')
df_bibtex.issn = list(map(lambda x: x.replace('-',''), df_bibtex.issn))

In [7]:
# Trata scimago issn, criando colunas dinamicamente
# 00257656, 16993993, 16994019
# 19853718, 19858345, 21804249
df_treat_scimago_issn = pd.DataFrame([x.split(',') for x in df_scimagojr.issn if type(x) != float])

for col in df_treat_scimago_issn.columns:
	column_name = 'issn' + str(col)
	df_scimagojr.insert(len(df_scimagojr.columns), column_name, df_treat_scimago_issn[col])

df_scimagojr.drop(columns='issn', inplace=True)

list_issn_col = df_scimagojr.columns[df_scimagojr.columns.str.contains('issn')]
list_df_join = []

for col_name in list_issn_col:
	df_join_temp = df_bibtex.merge(df_scimagojr, how='inner', left_on='issn', right_on=col_name, suffixes=['_1','_2']) 
	print('Column: ' + col_name + ' shape:' + str(df_join_temp.shape))
	list_df_join.append(df_join_temp)

df_bibtex_scimagoj = pd.concat(list_df_join).drop_duplicates()

Column: issn0 shape:(39, 46)
Column: issn1 shape:(0, 46)
Column: issn2 shape:(0, 46)


JOIN JCS AND BIBTEX

In [8]:
df_bibtex['journal'] = df_bibtex['journal'].str.upper()
df_bibtex['journal'] = df_bibtex['journal'].str.strip()

df_bibtex_jcs = df_bibtex.merge(df_jcs, how='inner', left_on='journal', right_on='full journal title', suffixes=['_1','_2'])

FILTRO

In [9]:
# CARREGA O ARQUIVO DE CONFIGURAÇÃO
config_file_name = 'config_article_impact.yml'

with open(os.path.join(dir_config, config_file_name)) as f:
    config_file = yml.load(f, Loader=yml.loader.SafeLoader)

filter_list = ( config_file['TITLE'],\
	config_file['KEYWORDS'],\
	config_file['YEAR'],\
	config_file['TYPE_PUBLICATION'],\
	config_file['DOI'],\
	config_file['JCR_VALUE'],\
	config_file['SCIMAGO_VALUE'],\
	config_file['FORMATO']
	)

if config_file['JCR_VALUE'] == None:
	config_file['JCR_VALUE'] = np.nan

if config_file['SCIMAGO_VALUE'] == None:
	config_file['SCIMAGO_VALUE'] = np.nan	

print(filter_list)

('Big Data', None, None, None, None, None, 2.082, 'CSV')


In [10]:
# JOIN
df_bibtex_impact = pd.concat([df_bibtex_scimagoj, df_bibtex_jcs])

# Filtro simultaneo
df_filtro_texto = df_bibtex_impact.loc[
	(df_bibtex_impact.title_1.str.contains("{}".format(config_file['TITLE']), case=False)) |
	(df_bibtex_impact.journal.str.contains("{}".format(config_file['TITLE']), case=False)) |
	(df_bibtex_impact.keywords.str.contains("{}".format(config_file['KEYWORDS']), case=False)) |
	(df_bibtex_impact.abstract.str.contains("{}".format(config_file['ABSTRACT']), case=False)) |
	(df_bibtex_impact.year.str.contains("{}".format(config_file['YEAR']), case=False)) |
	(df_bibtex_impact.type.str.contains("{}".format(config_file['TYPE_PUBLICATION']), case=False)) |
	(df_bibtex_impact.doi.str.contains("{}".format(config_file['DOI']), case=False))
	]

df_filtro_final = df_filtro_texto.loc[(df_filtro_texto['jcr_value'] == config_file['JCR_VALUE']) |
	(df_filtro_texto['scimago_value'] == config_file['SCIMAGO_VALUE'])]


In [11]:
df_filtro_final[['title_1','journal','jcr_value','scimago_value']].head(10)

Unnamed: 0,title_1,journal,jcr_value,scimago_value
0,Big Data Systems: A Software Engineering Persp...,ACM Comput. Surv.,,2.082
1,Multimedia Big Data Analytics: A Survey,ACM Comput. Surv.,,2.082
2,"A Survey on IoT Big Data: Current Status, 13 V...",ACM Comput. Surv.,,2.082
3,SLA Management for Big Data Analytical Applica...,ACM Comput. Surv.,,2.082
4,Computational Health Informatics in the Big Da...,ACM Comput. Surv.,,2.082


EXPORT

In [12]:
df_export(df_filtro_final,'bibtex_impact',config_file['FORMATO'] )

Export CSV to c:\Users\camil\OneDrive\Documentos\MBA\PythonDataEngineer\MBA_PyForDE\script_analisa_bibtex\output


In [16]:
# CARREGA O ARQUIVO DE CONFIGURAÇÃO
config_file_name = 'config_api.yml'

with open(os.path.join(dir_config, config_file_name)) as f:
    config_file = yml.load(f, Loader=yml.loader.SafeLoader)

url_config = ( config_file['API_TOKEN_SCOPUS'],\
	config_file['API_TOKEN_IEEE'],\
	config_file['API_STRING'],\
	config_file['FORMATO']
	)

print(url_config)

('dd43a9e67059d268b0a6eaf9f3d73cd4', 'zcw5heuva2mrz4an52gf32y6', "/'data quality' AND 'big data'", 'JSON')


In [22]:

url = 'https://ieeexploreapi.ieee.org/api/v1/search/articles?apikey=zcw5heuva2mrz4an52gf32y6&querytext='+config_file['API_STRING']
url_scopus = 'http://api.elsevier.com/content/search/scopus?query='+config_file['API_STRING']+'&apiKey=dd43a9e67059d268b0a6eaf9f3d73cd4'
##var_union = []

resposta_hoteis = requests.request('GET', url)
resposta_scopus = requests.request('GET', url_scopus)
var_teste = resposta_hoteis.json()
var_teste_scopus = resposta_scopus.json()
var_teste.items()
var_teste_scopus.items()
