## Use multiple Scopus Author IDs to retrieve lists of articles by author

In [2]:
# Dependencies
import requests
import json
import pandas as pd
import numpy as np
from config import api_key
from pandas.io.json import json_normalize  
import nltk
import re
import io
from nltk.corpus import stopwords, reuters
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import panel as pn
import panel.widgets 
from pathlib import Path
from panel.interact import interact
import hvplot.pandas
import param
from IPython.display import Image
from IPython.core.display import HTML
lemmatizer = WordNetLemmatizer()
pn.extension()

In [30]:
# Declare necessary parameters for Scopus request API search tool
url = "http://api.elsevier.com/content/search/scopus?" + "?format=json"
headers = {"X-ELS-APIKey": api_key, 'Accept':'application/json'}
view = "Complete"
field = ["dc:description"]

In [34]:
# Declare the timeframe to search by start and end years
# Year is the lowest granularity for this search function
start_year = 2009
end_year = 2010
if start_year == end_year:
    date = str(start_year)
else: 
    date = str(start_year) + "-" + str(end_year)

# Generate a lists of dates for the requested timeframe
date_list = []
for date in range(start_year, end_year + 1):
    date_list.append(int(date))
    
#date_list

In [35]:
# Input the query search term
query = "graphene"

In [36]:
# Test search function to determine the total results from the Get Request for the Specified Date Range
total_results = []
date_list = []
for date in range(start_year, end_year + 1):
    date_list.append(int(date))
    
def total_results_funt(date_list):
    for i in date_list:
        parameters = {"query": query, "view": view, "date": str(i), "count": 25, "start": 0}
        article_response = requests.get(url, headers=headers, params=parameters)
        article_response_json = article_response.json()
        total_results.append(int(article_response_json['search-results']['opensearch:totalResults']))
    return total_results

total_results = total_results_funt(date_list)
total_results

[3885, 7030]

In [37]:
# Search request function 
# Scopus request has a maximum of 5000 articles returned per year
def search_request_funt(query, start_year, end_year):
    scopus_search_appended_df = pd.DataFrame()
    for year in range (start_year, end_year + 1):
        parameters = {"query": query, "view": view, "field": field, "date": str(year), "start": 0}
        article_response = requests.get(url, headers=headers, params=parameters)
        article_response_json = article_response.json()
        total_results = int(article_response_json['search-results']['opensearch:totalResults'])
        for n in range(0, total_results, 25):
            parameters = {"query": query, "view": view, "date": str(year), "count": 25, "start": n}
            article_response = requests.get(url, headers=headers, params=parameters)
            article_response_json = article_response.json()
            scopus_articles_df = pd.DataFrame.from_dict(json_normalize(article_response_json, meta=["search-results"], record_path=["search-results", "entry"]), orient="columns")
            date_title_description_df = scopus_articles_df[["prism:coverDate", "dc:title", "dc:description"]]
            date_title_description_df.columns = ['Date','Title','Content']
            pd.to_datetime(date_title_description_df['Date'], format = "%Y-%m-%d")
            date_title_description_df.sort_values(by='Date')
            date_title_description_df = date_title_description_df.set_index('Date')
            scopus_search_appended_df = scopus_search_appended_df.append(date_title_description_df)
    return scopus_search_appended_df
            
            
    

In [38]:
# Generate DataFrame
pd.set_option('display.max_rows', None)
scopus_search_appended_df = search_request_funt(query, start_year, end_year)
# scopus_search_appended_df

KeyError: 'search-results'

In [11]:
# Save a csv of the dataframe
scopus_search_appended_df.to_csv("Scopus_Search_" + "_" + query + "_" + str(start_year) + "_" + str(end_year) + ".csv")

In [13]:
# Set file path
scopus_search_path = Path("Scopus_Search_" + "_" + query + "_" + str(start_year) + "_" + str(end_year) + ".csv", header = 0)
# Read CSV to DataFrame
query_df = pd.read_csv(scopus_search_path)
query_df.head()

Unnamed: 0.1,Unnamed: 0


In [None]:
# Date Range Slider
year_slider_1 = pn.widgets.IntRangeSlider(name='Timeframe', width=300, start=1990, end=2021, value=(2005, 2021))

@pn.depends(year_slider)
def freq_plot_funt(year_slider):
    total_results = []
    date_list = []
    for date in range(year_slider[0], year_slider[1] + 1):
        date_list.append(int(date))
    for i in date_list:
        parameters = {"query": query, "view": view, "date": str(i), "count": 25, "start": 0}
        article_response = requests.get(url, headers=headers, params=parameters)
        article_response_json = article_response.json()
        total_results.append(int(article_response_json['search-results']['opensearch:totalResults']))
    # Create an annual frequency bar plot for query search
    frequency_df = pd.DataFrame([date_list, total_results]).T
    frequency_df.columns = ['Year', 'Total Count per Year']
    frequency_csv = frequency_df.to_csv("Frequency_" + "_" + query + "_" + str(year_slider[0]) + "_" + str(year_slider[1]) + ".csv")
    return frequency_csv



In [None]:
# Date Range Slider
year_slider = pn.widgets.IntRangeSlider(name='Timeframe', width=300, start=1990, end=2021, value=(2010, 2021))

@pn.depends(year_slider)
def freq_plot_funt(year_slider):
    total_results = []
    date_list = []
    for date in range(year_slider[0], year_slider[1] + 1):
        date_list.append(int(date))
    for i in date_list:
        parameters = {"query": query, "view": view, "date": str(i), "count": 25, "start": 0}
        article_response = requests.get(url, headers=headers, params=parameters)
        article_response_json = article_response.json()
        total_results.append(int(article_response_json['search-results']['opensearch:totalResults']))
    # Create an annual frequency bar plot for query search
    frequency_df = pd.DataFrame([date_list, total_results]).T
    frequency_df.columns = ['Year', 'Total Count per Year']
    plot = frequency_df.hvplot.line(title= "Total Count per Year for the Word '" + query + "' used in Academic Articles", 
                                x = "Year", 
                                y = 'Total Count per Year',
                                invert = False, 
                                height = 400,
                                width = 800
                                )
    return plot
freq_plot = pn.Column(pn.Row(pn.WidgetBox(year_slider), pn.Column(freq_plot_funt)))
#freq_plot.show()

In [None]:
# Y value multiselectors
column_selector = pn.widgets.Select(name='Select for a Title or Abstract', value = 'Title', options=['Title', 'Content'])

# Word Count Slider
word_slider = pn.widgets.IntSlider(name='Max Words', start=5, end=50, step=1, value=15)

# Stop word addition
text_input = pn.widgets.TextInput(name='Enter Additional Words to Ignore', placeholder='Type Words Here, Seperated by a Space')
# Dynamic Plots

# Convert dataframe Title column to string text
@pn.depends(column_selector, word_slider, text_input)
def dataframe_to_string(column_selector, word_slider, text_input):
    if column_selector == 'Content':
        for row in query_df:
            big_string = ''.join(str(query_df['Content']))
        sw = set(stopwords.words('english'))
        regex = re.compile("[^a-zA-Z ]")
        wordlist = re.sub("[^\w]", " ",  text_input).split()
        sw_addons = {'using', 'via', 'based', 'nan', 'date', 'used', 'b', 'tio', 'nanote'}
        sw_addons.update(wordlist)
        re_clean = regex.sub('', big_string)
        words = word_tokenize(re_clean)
        lem = [lemmatizer.lemmatize(word) for word in words]
        output = [word.lower() for word in lem if word.lower() not in sw.union(sw_addons)]
        full_string = ' '.join(output)
        wc_content = WordCloud(width=800, height=600, background_color="white", max_words= word_slider).generate(full_string)
        image_1 = wc_content.to_image()
        return image_1
    else:
        for row in query_df:
            big_string = ''.join(str(query_df['Title']))
        sw = set(stopwords.words('english'))
        regex = re.compile("[^a-zA-Z ]")
        wordlist = re.sub("[^\w]", " ",  text_input).split()
        sw_addons = {'using', 'via', 'based', 'nan', 'date', 'used', 'b', 'tio', 'nanote'}
        sw_addons.update(wordlist)
        re_clean = regex.sub('', big_string)
        words = word_tokenize(re_clean)
        lem = [lemmatizer.lemmatize(word) for word in words]
        output = [word.lower() for word in lem if word.lower() not in sw.union(sw_addons)]
        full_string = ' '.join(output)
        wc_content = WordCloud(width=800, height=600, background_color="white", max_words= word_slider).generate(full_string)
        image_2 = wc_content.to_image()
        return image_2

word_cloud_tab = pn.Column('# Word Cloud', pn.Row(pn.WidgetBox(column_selector, word_slider, text_input), pn.Column(dataframe_to_string)), background='#f0f0f0')
freq_plot_tab = pn.Column('# Frequency Plot', pn.WidgetBox(year_slider), freq_plot_funt, background='#f0f0f0')
all_tabs = pn.Tabs(('Frequency Plot', freq_plot_tab), ('Word Cloud', word_cloud_tab))
all_tabs.show()