## Use multiple Scopus Author IDs to retrieve lists of articles by author

In [6]:
# Dependencies
import requests
import json
import pandas as pd
import numpy as np
from config import api_key
from pandas.io.json import json_normalize  
import nltk
import re
import io
from nltk.corpus import stopwords, reuters
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import panel as pn
import panel.widgets 
from pathlib import Path
from panel.interact import interact
import param
import time
import datetime as dt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.feature_extraction.text import CountVectorizer
import multidict as multidict
lemmatizer = WordNetLemmatizer()
pn.extension()
import hvplot.pandas

In [14]:

class search_scopus_dash(param.Parameterized):
    """Dashboard that searches Scopus and shows the hits as a table, CSV download and yearly plot.

    The Scopus Search API is queried with ``Query`` restricted to ``Year_Range``.
    The cover date, title and description of every hit are stored in
    ``sample_df``. One search runs on construction; further searches run only
    when ``update_sample_df`` is called (e.g. via the "Generate Data" button),
    so editing a parameter never triggers a hidden API crawl.
    """

    # Search Term Input to query search, default is "Nanosafety"
    Query  = param.String(default="Nanosafety", doc="Insert query term(s)")

    # Year range selection parameter for search
    Year_Range = param.Range((2005, 2010), bounds=(1970, 2021))

    # Action param to initiate search
    generate_sample_df = param.Action(lambda self: self.update_sample_df(), label="Generate Data", doc="""
      An action callback which will update the sample_df.""")

    # Dataframe param to store data from scopus search API
    sample_df = param.DataFrame()

    file_name = param.String(default="data.csv", doc="""
      The filename to save to.""")

    def __init__(self, **params):
        """Run the initial search, then build the download button and the table widget."""
        super().__init__(**params)
        # A single search is enough: update_sample_df() performs the request itself.
        self.update_sample_df()
        self.download = pn.widgets.FileDownload(name="Download Data", filename=self.file_name, callback=self._download_callback, button_type="primary")
        self.table = pn.widgets.DataFrame(self.sample_df, height=300)

    # Update filename in download widget
    @pn.depends('file_name', watch=True)
    def _update_filename(self):
        """Keep the download widget's filename in sync with ``file_name``."""
        self.download.filename = self.file_name

    # Download function for the CSV file
    def _download_callback(self):
        """
        A FileDownload callback will return a file-like object which can be serialized
        and sent to the client.

        Returns an ``io.StringIO`` holding ``sample_df`` as CSV (without the
        index); the buffer is empty when no data has been loaded yet.
        """
        self.download.filename = self.file_name
        sio = io.StringIO()
        if self.sample_df is not None:
            self.sample_df.to_csv(sio, index=False)
        sio.seek(0)
        return sio

    def search_request_funt(self):
        """Fetch every Scopus hit for ``Query`` within ``Year_Range``.

        Pages through the Scopus Search API using its cursor mechanism and
        collects three fields per entry.

        Returns:
            pandas.DataFrame with columns ``Date`` (datetime64, unparsable
            values become NaT), ``Title`` and ``Content``, sorted by ``Date``.
            The frame is empty (but has those columns) when nothing matched.

        Raises:
            requests.HTTPError: if the API answers with an error status.
        """
        source_fields = ["prism:coverDate", "dc:title", "dc:description"]
        column_names = ['Date', 'Title', 'Content']

        # Slider widgets may hand back floats; the API expects plain years.
        first_year, last_year = (int(year) for year in self.Year_Range)
        if first_year == last_year:
            date = str(first_year)
        else:
            date = str(first_year) + "-" + str(last_year)

        # https so the API key in the header is not sent in clear text.
        url = "https://api.elsevier.com/content/search/scopus"
        headers = {"X-ELS-APIKey": api_key, 'Accept': 'application/json'}
        parameters = {"query": self.Query, "view": "Complete", "date": date, "field": ",".join(source_fields)}

        pages = []
        cursor = "*"
        visited_cursors = set()
        # Stop when the API returns no cursor or repeats one (would loop forever).
        while cursor and cursor not in visited_cursors:
            visited_cursors.add(cursor)
            response = requests.get(url, headers=headers, params=dict(parameters, cursor=cursor), timeout=30)
            response.raise_for_status()
            search_results = response.json().get('search-results', {})
            # NOTE(review): an empty result set appears to come back as a single
            # entry carrying an "error" key - confirm against the API docs.
            entries = [entry for entry in (search_results.get('entry') or []) if 'error' not in entry]
            if not entries:
                break
            # reindex() tolerates entries that lack a field (filled with NaN).
            page = pd.DataFrame(entries).reindex(columns=source_fields)
            page.columns = column_names
            pages.append(page)
            cursor = search_results.get('cursor', {}).get('@next')

        if not pages:
            return pd.DataFrame(columns=column_names)

        results = pd.concat(pages, ignore_index=True)
        results['Date'] = pd.to_datetime(results['Date'], format="%Y-%m-%d", errors="coerce")
        return results.sort_values(by='Date').reset_index(drop=True)

    # Update dataframe with results
    def update_sample_df(self, event=None):
        """Run a fresh search and store the result in ``sample_df``.

        ``event`` is accepted so the method can be used as a widget callback;
        it is not used.
        """
        self.sample_df = pd.DataFrame(self.search_request_funt())

    @pn.depends("sample_df", watch=True)
    def _update_table(self):
        """Push the current ``sample_df`` into the table widget once it exists."""
        if hasattr(self, "table"):
            self.table.value = self.sample_df

    def save_sample_data(self, event=None):
        """Placeholder hook; it currently persists nothing and returns ``None``."""
        if self.sample_df is None:
            return
        # NOTE(review): no save logic is implemented; saving happens through
        # the download widget instead.

    # Re-render only when new data arrives, not on every parameter edit.
    @param.depends("sample_df")
    def freq_plot_funt(self):
        """Return a line plot of the number of hits per publication year.

        Returns a Markdown placeholder instead when there is no dated data.
        """
        frame = self.sample_df
        if frame is None or frame.empty or 'Date' not in frame.columns:
            return pn.pane.Markdown("No results to plot.")

        dates = pd.to_datetime(frame['Date'], format="%Y-%m-%d", errors="coerce").dropna()
        if dates.empty:
            return pn.pane.Markdown("No results to plot.")

        # One row per year: columns 'Date' (the year) and 'Count'.
        yearly_counts = dates.groupby(dates.dt.year).count().rename('Count').rename_axis('Date').reset_index()
        plot = yearly_counts.hvplot.line(title= "Total Count per Year for the Word '" + self.Query + "' used in Academic Articles",
                                    x = "Date",
                                    y = 'Count',
                                    invert = False,
                                    height = 600,
                                    width = 800
                                    )
        return plot

    def view(self):
        """Lay out the controls, download button and table next to the yearly plot."""
        controls = pn.Row(
            pn.Param(self, parameters=['Query', 'Year_Range', 'generate_sample_df'], show_name=False, widgets={"generate_sample_df": {"button_type": "primary"}}),
            pn.Column(self.param.file_name, self.download, align='end', margin=(10,5,5,5)),
        )
        left_column = pn.Column(
            "## Generate and Download Data",
            controls,
            "**Sample data (10 Rows)**",
            self.table,
        )
        return pn.Row(left_column, self.freq_plot_funt)

        

class word_dash(param.Parameterized):
    """Dashboard that turns an uploaded CSV into word clouds of its titles and abstracts.

    The CSV is expected to provide ``Title`` and/or ``Content`` columns. The
    clouds are rebuilt whenever the data, the word limit, the collocation
    threshold or the extra stop words change.
    """

    # Word Count Slider
    Word_Slider = param.Integer(15, bounds=(5,50))

    # Stop word addition
    Text_Input = param.String(default='', doc= 'Type Words Here, Seperated by a Space')

    # Holds the FileInput widget used for the upload (created per instance in __init__)
    File_Input = param.Parameter()

    # Parsed contents of the uploaded CSV
    data = param.DataFrame()

    # Not written anywhere in this class
    freq_title_data = param.DataFrame()

    # Not written anywhere in this class
    freq_content_data = param.DataFrame()

    # Collation Count Slider
    CT_Slider = param.Integer(30, bounds=(0,60))

    def __init__(self, **params):
        """Create the upload widget and the panes that display the word clouds."""
        # Give every instance its own upload widget; passing it through
        # ``params`` means it is in place before the "File_Input.value"
        # dependency below is resolved.
        params.setdefault("File_Input", pn.widgets.FileInput())
        super().__init__(**params)
        blank_title = self._blank_figure("Upload File to Generate Title WordCloud")
        blank_content = self._blank_figure("Upload File to Generate Content WordCloud")
        self.plotly_pane_title = pn.pane.Plotly(blank_title, background= 'white')
        self.plotly_pane_content = pn.pane.Plotly(blank_content, background = 'white')
        self.title_str_pane = pn.pane.Str(max_width = 600, height_policy = "fit")
        self.content_str_pane = pn.pane.Str(max_width = 600, height_policy = "fit")
        self.freq_title_df = pn.pane.DataFrame()

    @staticmethod
    def _blank_figure(title):
        """Return a white 600x600 placeholder figure carrying ``title``."""
        blank = np.full((600, 600, 3), 255, dtype = np.uint8)
        return px.imshow(blank, title= title).update_xaxes(showticklabels=False).update_yaxes(showticklabels=False)

    @pn.depends("File_Input.value", watch=True)
    def _parse_file_input(self):
        """Parse the uploaded bytes as UTF-8 CSV into ``data``; do nothing when empty."""
        value = self.File_Input.value
        if not value:
            return
        self.data = pd.read_csv(io.StringIO(value.decode("utf8")))

    def _stop_words(self):
        """Return the lower-cased stop-word set: NLTK English, fixed add-ons and user input."""
        stop_words = set(stopwords.words('english'))
        stop_words.update({'using', 'via', 'based', 'nan', 'date', 'used', 'b', 'tio', 'nanote'})
        # Tokens are compared in lower case, so the user's words must be too.
        stop_words.update(word.lower() for word in re.sub(r"[^\w]", " ", self.Text_Input).split())
        return stop_words

    @staticmethod
    def _column_to_text(column, stop_words):
        """Join a text column into one cleaned, lemmatized, stop-word-free string."""
        # Join the actual cell values; str(Series) would give the truncated
        # repr (with "...", index numbers and the dtype footer) instead.
        raw = ' '.join(column.dropna().astype(str))
        # Collapse newlines/tabs to spaces before they are stripped below,
        # otherwise neighbouring words would be glued together.
        raw = ' '.join(raw.split())
        letters_only = re.sub("[^a-zA-Z ]", "", raw)
        lemmas = (lemmatizer.lemmatize(token) for token in word_tokenize(letters_only))
        kept = [lemma.lower() for lemma in lemmas if lemma.lower() not in stop_words]
        return ' '.join(kept)

    def _wordcloud_figure(self, text):
        """Render ``text`` as a word-cloud figure, or return ``None`` if it has no usable words."""
        cloud = WordCloud(background_color="white", min_word_length= 3, normalize_plurals=True, collocation_threshold= self.CT_Slider, max_words= self.Word_Slider)
        try:
            cloud.generate(text)
        except ValueError:
            # WordCloud refuses to draw when no word survives its own filtering.
            return None
        figure = px.imshow(cloud.to_array())
        figure.update_xaxes(showticklabels=False)
        figure.update_yaxes(showticklabels=False)
        figure.update_layout(title_font_color="black", title_font_family= "Arial", title_font_size= 45, title_x = 0.5, title_y= 0.95)
        return figure

    @pn.depends("data", "Word_Slider", "Text_Input", 'CT_Slider', watch = True)
    def dataframe_to_string(self):
        """Rebuild the title and abstract word clouds from ``data``.

        A column that is missing, or that yields no words after cleaning,
        leaves its pane unchanged instead of raising inside the watcher.
        """
        scopus = self.data
        if scopus is None:
            return

        stop_words = self._stop_words()
        targets = (('Title', self.plotly_pane_title), ('Content', self.plotly_pane_content))
        for column_name, pane in targets:
            if column_name not in scopus.columns:
                continue
            text = self._column_to_text(scopus[column_name], stop_words)
            if not text:
                continue
            figure = self._wordcloud_figure(text)
            if figure is not None:
                pane.object = figure

    def view(self):
        """Lay out the upload widget and controls above the two word-cloud cards."""
        # Build the sliders from the parameters so range and start value agree
        # with what the parameters accept.
        word_low, word_high = self.param.Word_Slider.bounds
        ct_low, ct_high = self.param.CT_Slider.bounds
        controls = pn.Param(self, parameters = ['Word_Slider', 'CT_Slider', 'Text_Input',], show_name = False, widgets = {
            "Word_Slider": {
                "type": pn.widgets.IntSlider(name = 'Total Words', start= word_low, end= word_high, step = 1, value = self.Word_Slider, value_throttled= self.Word_Slider),
                "throttled": True,
            },
            "CT_Slider": {
                "type": pn.widgets.IntSlider(name = 'Collocation Threshold', start= ct_low, end= ct_high, step = 1, value = self.CT_Slider, value_throttled= self.CT_Slider),
                "throttled": True,
            }
        })
        return pn.Column(pn.Row(pn.Column("*Select the saved excel data file here*", self.File_Input),
                         controls),
                         pn.Row(pn.Column(pn.Card(self.plotly_pane_title, title = "WordCloud of Titles", background='WhiteSmoke')),
                                pn.Column(pn.Card(self.plotly_pane_content, title = "WordCloud of Abstracts", background='WhiteSmoke')
                                          ))
                        )
        
# Build both dashboards, place them in tabs and serve them in a Material template
search = search_scopus_dash()
search_view = search.view()

word = word_dash()
word_view = word.view()

all_tabs = pn.Tabs(
    ('Search', search_view),
    ('Word Cloud', word_view),
)

all_tabs_view = pn.template.MaterialTemplate(
    site="Search Panel",
    title="Download Search Data, Generate CSV File, Produce WordClouds",
    main=[all_tabs],
)

all_tabs_view.show()

Launching server at http://localhost:55472


<bokeh.server.server.Server at 0x1c20390a3c8>

In [2]:
?CountVectorizer

Object `CountVectorizer` not found.


In [3]:
# Scratch data: five (token, count) tuples - presumably word frequencies; their origin is not shown here
list_1 = [('re', 148), ('make', 117), ('ve', 106), ('said', 95), ('able', 88)]


In [4]:
list_1

[('re', 148), ('make', 117), ('ve', 106), ('said', 95), ('able', 88)]