## Use multiple Scopus Author IDs to retrieve lists of articles by author

In [22]:
# Dependencies
import requests
import json
import pandas as pd
import numpy as np
from config import api_key
from pandas.io.json import json_normalize  
import nltk
import re
import io
from nltk.corpus import stopwords, reuters
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import panel as pn
import panel.widgets 
from pathlib import Path
from panel.interact import interact
import param
import time
import datetime as dt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.feature_extraction.text import CountVectorizer
# Module-level lemmatizer instance; the word_dash class below uses it to lemmatize tokens.
lemmatizer = WordNetLemmatizer()
# Load the Panel extension before any Panel widgets/panes are created further down.
pn.extension()
import hvplot.pandas as hv

In [2]:
  
class word_dash(param.Parameterized):
    """Panel dashboard that renders word visuals from an uploaded CSV file.

    The uploaded bytes are parsed with ``pandas.read_csv`` into ``data``; the
    rendering code reads the ``Title`` and ``Content`` columns of that frame
    and refreshes two Plotly panes (``plotly_pane_title`` and
    ``plotly_pane_content``) whenever the data or one of the controls changes.

    Three display modes are selectable through ``cloud_select``:

    * ``'WordCloud'`` - lemmatized free-text word cloud.
    * ``'Frequency WordCloud'`` - word cloud built from 1- to 3-gram counts.
    * ``'Frequency Plot'`` - bar chart of the same n-gram counts.
    """

    # Maximum number of words / n-grams shown in each visual.
    Word_Slider = param.Integer(15, bounds=(5,50))

    # Extra stop words typed by the user; split on any non-word character.
    Text_Input = param.String(default='', doc= 'Type Words Here, Seperated by a Space')

    # Holds the FileInput widget; a fresh widget is supplied per instance in __init__.
    File_Input = param.Parameter()

    # Parsed contents of the uploaded CSV (None until a file is uploaded).
    data = param.DataFrame()

    # Declared but not written by any method of this class.
    freq_title_data = param.DataFrame()

    # Declared but not written by any method of this class.
    freq_content_data = param.DataFrame()

    # Which of the three visuals to render.
    cloud_select = param.ObjectSelector(default='Frequency WordCloud', objects=['WordCloud', 'Frequency WordCloud', 'Frequency Plot'])

    # Passed to WordCloud as ``collocation_threshold`` in 'WordCloud' mode.
    CT_Slider = param.Integer(30, bounds=(0,60))

    # Extra stop words applied in 'WordCloud' mode.
    _CLOUD_STOP_WORDS = frozenset({'using', 'via', 'based', 'nan', 'date', 'used', 'b', 'tio', 'nanote'})

    # Extra stop words applied in the two frequency modes.
    _FREQ_STOP_WORDS = frozenset({'inf', 'nano', 'based', 'effect', 'impact'})

    def __init__(self, **params):
        """Create the upload widget and the placeholder panes.

        Parameters
        ----------
        **params
            Parameter values forwarded to ``param.Parameterized``. When no
            ``File_Input`` is supplied a new ``pn.widgets.FileInput`` is used.
        """
        # Pass the widget as an instance value instead of overwriting the
        # class-level default: every dashboard gets its own upload widget and
        # ``self.param`` is not touched before the base initializer has run.
        if params.get("File_Input") is None:
            params["File_Input"] = pn.widgets.FileInput()
        super().__init__(**params)
        self.plotly_pane_title = pn.pane.Plotly(self._placeholder_figure("Upload File to Generate Title WordCloud"), background= 'white')
        self.plotly_pane_content = pn.pane.Plotly(self._placeholder_figure("Upload File to Generate Content WordCloud"), background = 'white')
        self.title_str_pane = pn.pane.Str(max_width = 600, height_policy = "fit")
        self.content_str_pane = pn.pane.Str(max_width = 600, height_policy = "fit")

    @staticmethod
    def _placeholder_figure(title):
        """Return a blank white 600x600 image figure titled ``title``, tick labels hidden."""
        blank = np.full((600, 600, 3), 255, dtype = np.uint8)
        return px.imshow(blank, title= title).update_xaxes(showticklabels=False).update_yaxes(showticklabels=False)

    @staticmethod
    def _image_figure(cloud):
        """Wrap a generated word cloud in a Plotly image figure with tick labels hidden."""
        fig = px.imshow(cloud)
        fig.update_xaxes(showticklabels=False)
        fig.update_yaxes(showticklabels=False)
        fig.update_layout(title_font_color="black", title_font_family= "Arial", title_font_size= 45, title_x = 0.5, title_y= 0.95)
        return fig

    @pn.depends("File_Input.value", watch=True)
    def _parse_file_input(self):
        """Decode the uploaded bytes as UTF-8 CSV and store the frame in ``data``."""
        value = self.File_Input.value
        if not value:
            return
        self.data = pd.read_csv(io.StringIO(value.decode("utf8")))

    def _column_text(self, column):
        """Join every cell of ``column`` into one space-separated string.

        Built from the actual cell values (missing cells become empty
        strings) rather than from ``str(series)``, whose repr is truncated for
        long columns and carries index numbers and a dtype footer.
        """
        return ' '.join(self.data[column].fillna('').astype(str))

    def _stop_words(self, extras):
        """Return NLTK English stop words plus ``extras`` plus the user's words.

        User words are lower-cased because they are compared against
        lower-cased tokens in every mode.
        """
        user_words = re.sub(r"[^\w]", " ", self.Text_Input.lower()).split()
        return set(stopwords.words('english')).union(extras, user_words)

    def _text_cloud_figure(self, column):
        """Build the 'WordCloud' mode figure for ``column``."""
        # Drop everything except ASCII letters and spaces before tokenizing.
        letters_only = re.sub("[^a-zA-Z ]", '', self._column_text(column))
        blocked = self._stop_words(self._CLOUD_STOP_WORDS)  # computed once, not per token
        kept = []
        for token in word_tokenize(letters_only):
            lowered = lemmatizer.lemmatize(token).lower()
            if lowered not in blocked:
                kept.append(lowered)
        cloud = WordCloud(background_color="white", min_word_length= 3, normalize_plurals=True, collocation_threshold= self.CT_Slider, max_words= self.Word_Slider).generate(' '.join(kept))
        return self._image_figure(cloud)

    def _top_ngram_counts(self, column):
        """Return the ``Word_Slider`` most frequent 1- to 3-grams of ``column``.

        Returns a ``pandas.Series`` of counts indexed by n-gram, sorted from
        most to least frequent.
        """
        vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words = sorted(self._stop_words(self._FREQ_STOP_WORDS)), analyzer = 'word')
        # A single document is enough: only the counts of that one row are read.
        matrix = vectorizer.fit_transform([self._column_text(column)])
        # get_feature_names() is absent from current scikit-learn releases;
        # fall back to it only when the newer accessor does not exist.
        if hasattr(vectorizer, "get_feature_names_out"):
            names = vectorizer.get_feature_names_out()
        else:
            names = vectorizer.get_feature_names()
        counts = pd.Series(matrix.toarray()[0], index=names)
        return counts.sort_values(ascending=False).head(self.Word_Slider)

    def _frequency_cloud_figure(self, column):
        """Build the 'Frequency WordCloud' mode figure for ``column``."""
        frequencies = self._top_ngram_counts(column).to_dict()
        # A new WordCloud per column: generate_from_frequencies returns the
        # instance itself, so sharing one would make both panes show the
        # layout of whichever column was generated last.
        cloud = WordCloud(min_word_length =3, background_color='white').generate_from_frequencies(frequencies)
        return self._image_figure(cloud)

    def _frequency_bar_figure(self, column):
        """Build the 'Frequency Plot' mode bar chart for ``column``."""
        top = self._top_ngram_counts(column)
        table = pd.DataFrame({'Word': top.index, 'Count': top.values})
        return px.bar(table, x='Word', y='Count')

    @pn.depends("data", "Word_Slider", "Text_Input", 'CT_Slider', 'cloud_select', watch = True)
    def dataframe_to_string(self):
        """Re-render both Plotly panes for the currently selected mode.

        Does nothing until ``data`` has been populated. Both figures are built
        before either pane is updated, so a failure leaves the panes as they were.
        """
        if self.data is None:
            return
        builders = {
            'WordCloud': self._text_cloud_figure,
            'Frequency WordCloud': self._frequency_cloud_figure,
            'Frequency Plot': self._frequency_bar_figure,
        }
        build = builders.get(self.cloud_select)
        if build is None:
            return
        fig_title = build('Title')
        fig_content = build('Content')
        self.plotly_pane_title.object = fig_title
        self.plotly_pane_content.object = fig_content

    def view(self):
        """Return the dashboard layout.

        The top row holds the upload widget and the parameter controls; the
        bottom row holds the "Titles" and "Abstracts" cards wrapping the two
        Plotly panes.
        """
        # NOTE(review): 'Column_Selector' is not declared as a parameter on this
        # class - confirm how Panel treats the unknown name and whether it can be dropped.
        # NOTE(review): the Word_Slider widget starts at 0 while the parameter
        # bounds are (5, 50) - confirm the slider cannot emit out-of-bounds values.
        return pn.Column(pn.Row(pn.Column("*Select the saved excel data file here*", self.File_Input),
                         pn.Param(self, parameters = ['Column_Selector', 'Word_Slider', 'CT_Slider', 'Text_Input','cloud_select'], show_name = False, widgets = {
                             "Column_Selector": {
                                 "type": pn.widgets.Select(name = 'Title', options=['Title', 'Content'])
                             },
                             "Word_Slider": {
                                 "type": pn.widgets.IntSlider(name = 'Total Words', start=0, end= 50, step = 1, value = 10, value_throttled= 10),
                                 "throttled": True,
                             },
                             "CT_Slider": {
                                 "type": pn.widgets.IntSlider(name = 'Collocation Threshold', start=0, end= 60, step = 1, value = 30, value_throttled= 10),
                                 "throttled": True,
                             }
                         })),
                         pn.Row(pn.Column(pn.Card(self.plotly_pane_title, title = "Titles", background='WhiteSmoke')),
                                pn.Column(pn.Card(self.plotly_pane_content, title = "Abstracts", background='WhiteSmoke')
                                          ))
                        )
        
# Assemble the dashboard and serve it.

# Dashboard instance and its layout.
word = word_dash()
word_view = word.view()

# Single-tab container around the layout.
all_tabs = pn.Tabs(('Word Cloud', word_view))

# Page template hosting the tabs.
all_tabs_view = pn.template.MaterialTemplate(
    site="Search Panel",
    title="Download Search Data, Generate CSV File, Produce WordClouds",
    main=[all_tabs],
)

all_tabs_view.show()

Launching server at http://localhost:53490


<bokeh.server.server.Server at 0x2d8fa954908>

In [6]:
# Interactive IPython help lookup (prints the CountVectorizer init signature); not needed by the dashboard and safe to delete.
?CountVectorizer

[1;31mInit signature:[0m
[0mCountVectorizer[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0minput[0m[1;33m=[0m[1;34m'content'[0m[1;33m,[0m[1;33m
[0m    [0mencoding[0m[1;33m=[0m[1;34m'utf-8'[0m[1;33m,[0m[1;33m
[0m    [0mdecode_error[0m[1;33m=[0m[1;34m'strict'[0m[1;33m,[0m[1;33m
[0m    [0mstrip_accents[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mlowercase[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mpreprocessor[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mtokenizer[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mstop_words[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mtoken_pattern[0m[1;33m=[0m[1;34m'(?u)\\b\\w\\w+\\b'[0m[1;33m,[0m[1;33m
[0m    [0mngram_range[0m[1;33m=[0m[1;33m([0m[1;36m1[0m[1;33m,[0m [1;36m1[0m[1;33m)[0m[1;33m,[0m[1;33m
[0m    [0manalyzer[0m[1;33m=[0m[1;34m'word'[0m[1;33m,[0m[1;33m
[0m    [0m

In [None]:
# Scratch list of (string, integer) pairs; not referenced by the dashboard code in this notebook section.
list_1 = [
    ('re', 148),
    ('make', 117),
    ('ve', 106),
    ('said', 95),
    ('able', 88),
]


In [None]:
# Bare expression: echoes list_1 as this cell's output.
list_1