# Trend Analysis

This notebook sets up all the methods and tools for the analysis of trends in the literature. First, methods that are useful for storing, importing and manipulating data are introduced, followed by the class "TrendAnalysis()", which serves as the main object to manage searches in Scopus as well as trend evaluation and analysis.

## Imports

In [None]:
#from tqdm import tqdm
#The above import is only for console
import pandas as pd
from pybliometrics.scopus import ScopusSearch
from pybliometrics.scopus import AbstractRetrieval
import json
from datetime import datetime
import os
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from collections import Counter
from tqdm.auto import tqdm
from nltk.tokenize import word_tokenize

## Save and import data

In [None]:
def save_df(df, name, time = None, folder = "data"):
    """
    Writes the DataFrame "df" to a time-stamped CSV file and returns the path of that file.

    Parameters
    ----------
    df : DataFrame to be written.

    name : Base file name. The extension ".csv" is appended unless "name" already ends with it.

    time : Optional string that is used as prefix of the file name. If omitted, the current local time is used.

    folder : Folder the file is written to, joined onto the current working directory. Default "data".
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H-%M-%S") if time is None else time
    extension = '' if name.endswith('.csv') else '.csv'
    target = os.path.join(os.path.abspath(os.getcwd()), folder, f'{stamp} {name}{extension}')

    df.to_csv(target)

    return target

In [None]:
def read_df(filename, folder = "data"):
    """
    Loads the CSV file "filename" from "folder" (default "data", relative to the current working directory)
    and returns it as a DataFrame indexed by the column "eid".

    The column "authkeywords" is parsed from its printed set form (e.g. "{'a', 'b'}") back into a set of strings.
    An empty cell therefore becomes the set {''}.
    """
    def parse_keywords(cell):
        # Strip the surrounding braces and the quotes, then split the remaining comma separated list.
        return set(cell.strip("{}").replace("'","").split(", "))

    source = os.path.join(os.path.abspath(os.getcwd()), folder, filename)

    return pd.read_csv(source, index_col = 'eid', converters={"authkeywords": parse_keywords})

In [None]:
def save_dict(dict_, name, time = None, folder ="data"):
    """
    Saves the dictionary "dict_" as JSON under the name "name", prefixed with a time stamp, into "folder".

    Parameters
    ----------
    dict_ : Dictionary to be saved. It has to be serializable by the json module.

    name : File name that follows the time stamp. No extension is appended.

    time : Optional string that is used as prefix of the file name. If omitted, the current local time is used.

    folder : Folder the file is written to, joined onto the current working directory. Default "data".

    Returns the path of the written file.
    """
    if time is None:
        time = datetime.now().strftime("%Y-%m-%d %H-%M-%S")

    filepath = os.path.join(os.path.abspath(os.getcwd()), folder, f'{time} {name}')

    # The context manager closes the file even if the serialization fails.
    with open(filepath, "w") as file:
        json.dump(dict_, file)

    return filepath

In [None]:
def read_dict(filename, folder ="data"):
    """
    Reads the JSON file "filename" from "folder" (default "data", relative to the current working directory)
    and returns its content as a dictionary.

    The file is expected to contain JSON as written by save_dict().
    """
    filepath = os.path.join(os.path.abspath(os.getcwd()), folder, filename)

    # Open the requested file (not a fixed file name) and parse it, so that a dictionary is returned
    # instead of the raw file content.
    with open(filepath, "r") as file:
        dict_ = json.load(file)

    return dict_

In [None]:
def create_keyword_file(folder, confirm_manually, min_appearances):
    """
    Creates a time-stamped text file that logs all new keywords included in the extended search.

    Parameters
    ----------
    folder : Folder the file is created in, joined onto the current working directory.

    confirm_manually : Selects which of the two header texts is written to the file.

    min_appearances : Number of appearances that is quoted in the header text.

    Returns the path of the created file. The file is opened in exclusive mode, so a FileExistsError is raised
    if a file with the same time stamp (resolution: seconds) already exists.
    """
    time = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
    filepath = os.path.join(os.path.abspath(os.getcwd()), folder, f'{time} additional keywords.txt')

    if confirm_manually:
        header = ("The following keywords were used in the extended search. Keywords denied by the user can be found in a separate file. "+
                  f"All keywords appeared at least {min_appearances} times in a previous search.\n\n")
    else:
        header = ("The following keywords were used in the extended search. Keywords were not manually confirmed by the user and "+
                  f"automatically included when they appeared more than {min_appearances} times in a previous search.\n\n")

    # The context manager closes the file even if writing the header fails.
    with open(filepath, "x") as f:
        f.write(header)

    return filepath

In [None]:
def create_denied_keyword_file(folder):
    """
    Creates a time-stamped text file that logs all keywords that are manually denied by the user.

    "folder" is joined onto the current working directory. Returns the path of the created file.
    The file is opened in exclusive mode, so a FileExistsError is raised if a file with the same
    time stamp (resolution: seconds) already exists.
    """
    time = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
    filepath = os.path.join(os.path.abspath(os.getcwd()), folder, f'{time} denied keywords.txt')

    # The context manager closes the file even if writing the header fails.
    with open(filepath, "x") as f:
        f.write("The following keywords were manually selected by the user to not be included in the extended search:\n\n")

    return filepath

In [None]:
def write_to_keyword_file(filepath, content, recursion):
    """
    Appends "content" to the file at "filepath", prefixing every written line with "recursion".

    Parameters
    ----------
    filepath : Path of the file to append to.

    content : A single keyword (string) or a list of keywords. A list is written with one keyword per line.
              Any other type is converted with str() and written as a single line after printing a warning.

    recursion : Value that is written in front of every line (the iteration in which the function was called).
    """
    if isinstance(content, str):
        lines = [content]
    elif isinstance(content, list):
        lines = content
    else:
        print(f"When trying to write to {filepath}, the input was neither a string nor a list. Trying string conversion. " + 
             "It is recommended to check for validity in the file.")
        lines = [str(content)]

    # The context manager closes the file even if one of the writes fails.
    with open(filepath, "a") as f:
        for keyword in lines:
            f.write(f'{recursion}: {keyword}\n')

## Keyword and data manipulation

In [None]:
def convert_keywords(series):
    """
    Converts the keyword strings of a Series into sets of normalized keywords so that they can be more easily
    used in the program.

    Every string entry is lower-cased, split into single keywords and each keyword is passed through transform().
    Entries that contain '|' are split at ' | ', e.g.

        'Information | Network | Optimization'  ->  {'information', 'network', 'optimization'}

    Entries that are not strings (e.g. None or NaN for documents without keywords) are converted to None.

    Returns a Series of sets (or None) with the same index as the input.
    """
    author_keywords = []

    # Iterate over the values directly: the Series is indexed by document id, so integer look-ups like
    # series[i] would rely on the deprecated positional fallback of pandas.
    for entry in series:
        if not isinstance(entry, str):
            author_keywords.append(None)
            continue

        lowered = entry.lower()
        if '|' in entry:
            keywords = lowered.split(' | ')
        else:
            # NOTE(review): an entry without '|' is split at whitespace, so a single multi-word keyword
            # such as 'big data' ends up as {'big', 'data'}. Kept as is - confirm whether this is intended.
            keywords = lowered.split()
        author_keywords.append({transform(keyword) for keyword in keywords})

    return pd.Series(author_keywords, index=series.index)

In [None]:
def transform(string):
    """
    Normalizes a keyword: the string is tokenized with word_tokenize, every token that has an entry in the
    replacement table below is substituted (British to American spelling, plural to singular for common words)
    and the tokens are joined again with single spaces.
    """
    replacements = {
        # british american spelling
        "digitalisation": "digitalization",
        "digitisation": "digitization",
        "digitised": "digitized",
        "optimisation": "optimization",
        "servitisation": "servitization",
        
        # plural removal
        "assessments": "assessment",
        "automations": "automation",
        "chains": "chain",
        "changes": "change",
        "companies": "company",
        "contracts": "contract",
        "decisions": "decision",
        "dependencies": "dependency",
        "experiments": "experiment",
        "impacts": "impact",
        "industries": "industry",
        "informations": "information",
        "interviews": "interview",
        "managements": "management",
        "models": "model",
        "networks": "network",
        "processes": "process",
        "projects": "project",
        "relationships": "relationship",
        "researches": "research",
        "revolutions": "revolution",
        "services": "service",
        "systems": "system",
        "technologies": "technology",
        "transfers": "transfer",
        "transformations": "transformation"
        
    }

    # Tokens without an entry in the table are kept unchanged.
    return ' '.join(replacements.get(token, token) for token in word_tokenize(string))

In [None]:
def concat_and_remove_duplicates(dfs):
    """
    Concatenates the given list of DataFrames and drops all rows whose index value has already occurred
    (the first occurrence is kept). The number of dropped rows is printed.

    Returns the concatenated DataFrame without duplicate index entries.
    """
    combined = pd.concat(dfs)

    # Boolean mask that marks every repeated occurrence of an index value.
    is_repeat = combined.index.duplicated(keep='first')
    n_duplicates = int(is_repeat.sum())

    if n_duplicates > 1:
        print(f"{n_duplicates} of the results were duplicates.\n")
    elif n_duplicates == 1:
        print(f"{n_duplicates} result was a duplicate.\n")
    else:
        print("None of the search results were duplicates.\n")

    return combined[~is_repeat]

In [None]:
def count_entries(series):
    """
    Returns a Counter object for all keywords that appear in the series.

    Every element of "series" is expected to be an iterable of keywords or None. None is skipped.
    Elements that cannot be iterated (a TypeError is raised, e.g. for a float) are printed and skipped.
    """
    entries = []

    for keywords in series:
        if keywords is None:
            continue
        try:
            entries.extend(keywords)
        except TypeError:
            # Only non-iterable cells are tolerated here; anything else (including KeyboardInterrupt)
            # is no longer swallowed.
            print(keywords)

    return Counter(entries)

In [None]:
def get_total_publications(df, column_name = None):
    """
    Returns a DataFrame of daily time series of total publications and publications without keywords.

    Parameters
    ----------
    df : DataFrame with the columns "coverDate" (parsed with pd.to_datetime) and "authkeywords".

    column_name : Currently unused. Kept for backward compatibility.

    Returns a DataFrame with one row per day between the earliest and the latest cover date and the columns
    "publications" (documents with that cover date) and "no_keywords" (those of them whose "authkeywords" is None).
    """
    # Create date index
    cover_dates = pd.to_datetime(df["coverDate"])
    index_col = pd.date_range(cover_dates.min(), cover_dates.max(), freq="d")

    # Create DataFrame
    publications = pd.DataFrame(0, index=index_col, columns=['publications', 'no_keywords'])

    # Fill data frame. The cells are addressed with .at because chained assignment
    # (publications[col][date] += 1) does not write to the frame under pandas Copy-on-Write.
    for date, keywords in zip(cover_dates, df['authkeywords']):
        publications.at[date, 'publications'] += 1
        if keywords is None:
            publications.at[date, 'no_keywords'] += 1

    return publications

## Main class

In [None]:
class TrendAnalysis():
    """
    This class is used to retrieve, store and analyze literature data from Scopus. It is based on two sets of search terms that are used to identify relevant publications in Scopus.
    """
    def __init__(self, base_terms, trend_terms, to_file = True, refresh = True, folder = 'data', max_year = 2020, min_year = 2000, gf_base_year = 2015):
        """
        Initialized the TrendAnalysis object. This object is used for all search and primary analysis purposes.

        Parameters
        ----------
        
        to_file : Specifies whether interim and final results shall be saved in a local file in the project folder.
        
        refresh : Controls whether all searches in Scopus shall be rerun or loaded from cashed files on the system.

        folder : Folder in the project folder where the results shall be saved.
        
        max_year : Searches in Scopus will only regard documents that were published until this year. max_year will also be used to calculate the percentage change of publications for a certain topic until this year. Default 2020.
        
        min_year : Searches in Scopus will only regard documents that were published from this year onward. Default 2000.
        
        gf_base_year : To calculate the percentage change of publications for a certain topic (GF) this year is used as the base yea
        """  
        # Check if the input has the right format. If only one argument is given in the form of a string it will be wrapped in a list.
        if type(base_terms) != list:
            if type(base_terms) is str:
                self.st1 = [base_terms]
            else:
                print(f"The given input argument for production keywords should be a list but is {type(base_terms)}")
                return None
        else:
            self.st1 = base_terms
        if type(trend_terms) != list:
            if type(trend_terms) is str:
                self.st2 = [trend_terms]
            else:
                print(f"The given input argument for digitalisation keywords should be a list but is {type(trend_terms)}")
                return None
        else:    
            self.st2 = trend_terms
        
        self.to_file = to_file
        self.refresh = refresh
        self.folder = folder
        self.min_year = min_year
        self.max_year = max_year
        self.gf_base_year = gf_base_year
        self.st_ts = None
        
    def reset_covered_kws(self):
        """
        Reset the covered keywords to those of the initial search.
        """
        self.covered_kws = self.st1 + self.st2 + ['', "manufacturing", "production", "manufacturing network", "digital transformation"]
            
        return self.covered_kws
    
    def remove_duplicates(self, dict_):
        """
        Ensures that for every key in the dictionary, the associated list does not contain any duplicates.

        The dictionary is modified in place and also returned. The order of first occurrence is preserved,
        so that the result is identical between runs.
        """
        for key, values in dict_.items():
            # dict.fromkeys keeps the first occurrence of every entry in its original position.
            dict_[key] = list(dict.fromkeys(values))
        return dict_
        
        
    def run_initial_search(self):
        """
        Runs an initial search in Scopus for every combination of a term from self.st1 and a term from self.st2.

        The results of all queries are concatenated (duplicates removed) and stored in self.initial_results.
        self.initial_dict maps every search term to the document ids that were retrieved for it.
        The covered keywords are reset and, if self.to_file is set, results and dictionary are written to self.folder.

        If no query returns any result, a message is printed and nothing is stored.
        """
        # Search for all queries
        print("Commencing search...")
        appended_dfs = []
        srch_affil = {search_term: [] for search_term in (self.st1+self.st2)}
        for term1 in self.st1:
            for term2 in self.st2:
                query = f'TITLE-ABS-KEY("{term1}" AND "{term2}") AND PUBYEAR > {self.min_year-1} AND PUBYEAR < {self.max_year+1}'
                search = ScopusSearch(query, verbose=True, refresh=self.refresh)
                results = pd.DataFrame(search.results)
                print(f'{len(results)} results downloaded for query "{query}".')
                if results.empty:
                    continue
                results = results.set_index('eid')
                appended_dfs.append(results)
                srch_affil[term1] += list(results.index)
                srch_affil[term2] += list(results.index)

        # pd.concat raises on an empty list, so stop here if none of the queries returned documents.
        if not appended_dfs:
            print("No results were found for any of the queries. Nothing was stored.")
            return

        # Concatenate and drop duplicates
        appended_dfs = concat_and_remove_duplicates(appended_dfs)

        appended_dfs['authkeywords'] = convert_keywords(appended_dfs['authkeywords'])
        appended_dfs['citedby_count'] = appended_dfs['citedby_count'].astype(int)

        self.initial_results = appended_dfs
        self.initial_dict = self.remove_duplicates(srch_affil)
        self.reset_covered_kws()
        self.ts = None

        # Write to disk (into the folder configured for this object)
        if self.to_file:
            time = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
            save_df(appended_dfs, "initial results", time=time, folder=self.folder)
            save_dict(srch_affil, "dictionary", time=time, folder=self.folder)

        print("Done.")
    
    def run_extended_search(self, initial_min = 10, iterations = 1, min_appearances = 100):
        """
        Runs an extended search for the most frequently occurring keywords in the initial results and previous iterations.
        For each keyword, a dictionary entry is stored with all corresponding document ids of the search.

        Requires that run_initial_search() has been run before (self.initial_results and self.covered_kws are read).
        Results are stored in self.ext_results, self.ext_dict and self.covered_kws. self.st_ts is reset to None so
        that time series are recalculated for the new results.

        Parameters
        ----------
        iterations : 1 by default. Change to a value larger than 1 to allow for further iterations.
        
        initial_min : 10 by default. Minimum number of occurrences of an author keyword to be proposed for the first iteration.
        
        min_appearances : 100 by default. Minimum appearances for the second iteration. This number will double after each iteration.
                          Only has an effect, if the number of iterations is larger than 1.
        """
        # Validate the number of iterations. "and" short-circuits, so non-numeric input ends in the error message
        # below instead of raising inside the comparison.
        if iterations == 1:
             print(f"\nThe search will run for {iterations} iteration and select all keywords that occur more than {initial_min} in the initial results.")
        elif isinstance(iterations, int) and iterations > 1:
            print(f"\nThe search will select all keywords that occur more than {initial_min} in the initial results for the first iteration. In the second iteration,"+
                 f" the search will start with {min_appearances} minimum keyword occurrences and double for each of the remaining {iterations-1} iterations.")
        else:
            print("The parameter 'iterations' should be set to an integer number larger or equal to 1. Please try again.")
            return

        # Create the list of additional keywords by looking at the most common occurences and filtering out certain topics.
        counter = count_entries(self.initial_results['authkeywords'])
        most_common = counter.most_common()

        # Manually blacklisted keywords that have been covered in the initial search.
        # Note: this is the same list object as self.covered_kws and is extended in place below.
        covered_keywords = self.covered_kws

        additional_keywords = []
        # The empty string is a substring of every keyword and has to be excluded from the substring check.
        covered = covered_keywords.copy()
        if '' in covered:
            covered.remove('')
        for entry in most_common:
            if entry[0] not in covered_keywords:
                if entry[1] >= initial_min:
                    # Check that the new keyword is not a part of any previously covered keyword
                    if not any((keyword in entry[0]) for keyword in covered):
                        additional_keywords.append(entry[0])
                else:
                    # most_common is sorted in descending order, so no further keyword can meet the minimum.
                    break

        # Stop here if no additional keywords are found.
        if len(additional_keywords) == 0:
            print("No additional keywords found that meet criteria. Consider lowering the minimum number of appearances required.")
            print("(If the extended search has been run before, you might can use self.reset_covered_kws() to reset the covered keywords to those of the initial search.)")
            return None

        # Let user decide whether he/she wants to confirm additional keywords
        print(f"\nDo you want to confirm these additional keywords manually at each iteration ? (Y/N).")
        confirm_manually = True
        while True:
            user_input = input()
            if user_input == 'y' or user_input == 'Y':
                if self.to_file:
                    denied_keyword_file = create_denied_keyword_file(self.folder)
                break
            elif user_input == 'n' or user_input == 'N':
                confirm_manually = False
                break
            else:
                print("Please enter a valid input like 'Y' or 'N'.")

        if self.to_file:
            keyword_file = create_keyword_file(self.folder, confirm_manually, min_appearances)

        # Iterating through keywords
        covered_documents = self.initial_results.index
        recursion = 0
        srch_affil = {search_term: [] for search_term in (self.st1+additional_keywords)}
        extended_results = self.initial_results
        # Minimum number of appearances that was used to select the currently proposed keywords
        # (only relevant from the second iteration on; the first iteration uses initial_min).
        threshold = min_appearances

        while (len(additional_keywords) != 0) and recursion < iterations:

            # The user is only asked for input if manual confirmation was chosen.
            if confirm_manually:
                # Print most frequently occurring keywords.
                shown_min = initial_min if recursion == 0 else threshold
                print(f"\nThe most common keywords that appear more than {shown_min} times are:")
                for keyword in additional_keywords:
                    print(f"\t{keyword} ({counter[keyword]})")
                print("To remove one of the above keywords from the search enter it below (e.g. 'big data'). "+
                 "Otherwise enter 'N'. If you want to stop altogether enter 'STOP'.")

                # Let user confirm additional keywords at each iteration.
                while True:
                    user_input = input()
                    if user_input in additional_keywords:
                        additional_keywords.remove(user_input)
                        # Keywords proposed after the first iteration have no dictionary entry yet.
                        srch_affil.pop(user_input, None)
                        covered_keywords.append(user_input)
                        # Write denied keyword to log file.
                        if self.to_file:
                            write_to_keyword_file(denied_keyword_file, user_input, recursion)
                        print(f"\nRemoved '{user_input}' from additional keywords. The new list is:")
                        for keyword in additional_keywords:
                            print(f"\t{keyword} ({counter[keyword]})")
                        print("To remove another keyword from the search enter it below (e.g. 'big data'). "+
                             "Otherwise enter 'N'.")
                    elif user_input == 'n' or user_input == 'N':
                        break
                    elif user_input == 'STOP' or user_input == 'stop':
                        print("This might not return valid results if no iteration has been run.")
                        # Save the results that have been collected so far.
                        self.covered_kws = covered_keywords
                        self.ext_results = extended_results
                        self.ext_dict = self.remove_duplicates(srch_affil)
                        self.st_ts = None
                        print("Search finished.")
                        if self.to_file:
                            new_filename = save_df(extended_results, 'extended results', folder=self.folder)
                            print(f"Extended results are also saved in {new_filename}")
                        return None
                    else:
                        print("Please enter a valid keyword to remove or enter 'N' to accept current keyword selection. Or if you want to stop enther 'STOP'")

            # Minimum appearances for the keywords of the next iteration: the base value doubles with every
            # iteration (min_appearances, 2*min_appearances, 4*min_appearances, ...).
            threshold = min_appearances * pow(2, recursion)

            # Search for all queries
            results = [extended_results]
            print("\nCommencing search...")
            for term1 in self.st1:
                for term2 in additional_keywords:
                    query = f'TITLE-ABS-KEY("{term1}" AND "{term2}") AND PUBYEAR > {self.min_year-1} AND PUBYEAR < {self.max_year+1}'
                    search = ScopusSearch(query, verbose=True, refresh=self.refresh)

                    result = pd.DataFrame(search.results)
                    print(f'{len(result)} results downloaded for query "{query}"".')

                    if not result.empty:
                        result = result.set_index('eid')
                        result['authkeywords'] = convert_keywords(result['authkeywords'])
                        # Register all retrieved documents for both terms of the query.
                        for term in [term1, term2]:
                            srch_affil.setdefault(term, [])
                            srch_affil[term] += list(result.index)
                        # Drop already covered documents.
                        result.drop(covered_documents.intersection(result.index), inplace=True)
                        # Update covered documents and keywords.
                        covered_documents = covered_documents.union(result.index)
                        # Append to list of data frames
                        results.append(result)

            # Write keywords to log file
            if self.to_file:
                write_to_keyword_file(keyword_file, additional_keywords, recursion)

            # Update the already covered search terms and recalculate most common keywords
            covered_keywords += additional_keywords
            additional_keywords = []
            extended_results = concat_and_remove_duplicates(results)
            counter = count_entries(extended_results['authkeywords'])
            most_common = counter.most_common()

            # Create a copy of covered_keywords that does not contain the following keywords and pre-select new terms for the next iteration
            covered = covered_keywords.copy()
            for generic in ('', 'production', 'manufacturing'):
                if generic in covered:
                    covered.remove(generic)
            for entry in most_common:
                if entry[0] not in covered_keywords:
                    if entry[1] >= threshold:
                        # Check that the new keyword is not a part of any previously covered keyword
                        if not any(keyword in entry[0] for keyword in covered):
                            additional_keywords.append(entry[0])
                    else:
                        break
            recursion += 1

        # Save final results after all iterations are complete
        extended_results['citedby_count'] = extended_results['citedby_count'].astype(int)
        self.covered_kws = covered_keywords
        self.ext_results = extended_results
        self.ext_dict = self.remove_duplicates(srch_affil)
        self.st_ts = None
        if self.to_file:
            new_filename = save_df(extended_results, 'extended results', folder=self.folder)
            print(f"Search finished. Extended results are also saved in {new_filename}")
        else:
            print("Search finished.")
        
        
    def create_kw_ts(self, keyword_column = 'authkeywords', threshold = 10):
        """
        Creates a dataframe of daily time series for every keyword that appears at least as many times as specified in the
        threshold. Every entry represents the number of documents with a certain keyword and a specific cover date.

        Parameters
        ----------
        keyword_column : Column of self.ext_results that holds the keywords of every document. Default 'authkeywords'.

        threshold : Minimum number of appearances of a keyword to get its own time series. Default 10.

        Results are stored in self.ts
        
        Note: Currently unused.
        """
        # Create date index
        print("Creating time series of publications per keyword...")

        cover_dates = pd.to_datetime(self.ext_results["coverDate"])
        index_col = pd.date_range(cover_dates.min(), cover_dates.max(), freq="d")

        # Get keywords. most_common() is sorted in descending order, so the loop can stop at the first
        # keyword below the threshold.
        keywords = []
        for keyword, appearances in count_entries(self.ext_results[keyword_column]).most_common():
            if appearances < threshold:
                break
            keywords.append(keyword)

        # Create empty dataframes for every time series
        ts = pd.DataFrame(0, index=index_col, columns=keywords)
        tracked = set(keywords)

        # Fill data frame. Cells are addressed with .at because chained assignment (ts[keyword][date] += 1)
        # does not write to the frame under pandas Copy-on-Write.
        documents = zip(cover_dates, self.ext_results[keyword_column])
        for cover_date, doc_keywords in tqdm(documents, total=len(cover_dates)):
            if pd.isna(cover_date):
                continue
            try:
                doc_keywords = iter(doc_keywords)
            except TypeError:
                # Documents without keywords (e.g. None) are skipped.
                continue
            for keyword in doc_keywords:
                if keyword in tracked:
                    ts.at[cover_date, keyword] += 1

        self.ts = ts
    
    def create_search_ts(self):
        """
        Creates a dataframe of daily time series for every search term in the subsequent/extended search. 
        Each entry represents the number of documents that were identified for the specific search term
        (the keys of self.ext_dict) with a given cover date.

        Results are stored in self.st_ts
        """
        # Create date index
        print("Creating time series of publications per keyword...")

        cover_dates = pd.to_datetime(self.ext_results["coverDate"])
        index_col = pd.date_range(cover_dates.min(), cover_dates.max(), freq="d")

        # Get keywords
        keywords = list(self.ext_dict)

        # Create empty dataframes for every time series
        ts = pd.DataFrame(0, index=index_col, columns=keywords)

        # Fill data frame. Cells are addressed with .at because chained assignment (ts[keyword][date] += 1)
        # does not write to the frame under pandas Copy-on-Write.
        for keyword in tqdm(keywords):
            for document in self.ext_dict[keyword]:
                cover_date = cover_dates.loc[document]
                # Documents without a usable cover date cannot be assigned to a day and are skipped.
                if pd.isna(cover_date):
                    continue
                ts.at[cover_date, keyword] += 1

        self.st_ts = ts
        
    
    def plot_analysis(self):
        """
        Plots and prints information on the trends in descending order of their growth factor (GF).
        
        Printed are the number of documents per year per trend as well as the percentage of total publications, 
        the most cited documents per trend and the most frequently occurring keywords for each trend.

        The GF of a trend is the difference of its share of the total publications between self.max_year and
        self.gf_base_year. Trends are all keys of self.ext_dict that are not contained in self.st1.

        Besides the output, the following attributes are set: self.total_pub, self.st_ts_a, self.st_ts_share_a
        (annual aggregates) and self.rankings (DataFrame with the columns 'Growth', 'Publications', 'Citations').
        """
        # The time series per search term are only created if they do not exist yet. The annual aggregates are
        # always derived, so that they are also available if create_search_ts() was called manually before.
        if self.st_ts is None:
            self.create_search_ts()
        self.total_pub = get_total_publications(self.ext_results).resample('YS').sum()
        self.st_ts_a = self.st_ts.resample('YS').sum()
        self.st_ts_share_a = self.st_ts_a.divide(self.total_pub['publications'], axis = 0)

        # Rank publications, citations and growth factors.
        pub_count = {}
        cit_count = {}
        gf = {}

        for trend in self.ext_dict.keys():
            if trend not in self.st1:
                pub_count[trend] = len(self.ext_dict[trend])
                cit_count[trend] = self.ext_results.loc[self.ext_dict[trend]].citedby_count.sum()
                # Selecting by year string returns a slice with the single annual value of that year.
                share = self.st_ts_share_a[trend]
                gf[trend] = float(share.loc[str(self.max_year)].iloc[0] - share.loc[str(self.gf_base_year)].iloc[0])

        def rank_values(scores):
            # Equal values share a rank; the highest value gets rank 1.
            order = {value: rank for rank, value in enumerate(sorted(set(scores.values()), reverse=True), 1)}
            return {key: order[value] for key, value in scores.items()}

        pub_r = rank_values(pub_count)
        cit_r = rank_values(cit_count)
        gf_r = rank_values(gf)

        # Save rankings
        data = list(zip(gf.values(), pub_count.values(), cit_count.values()))
        self.rankings = pd.DataFrame(data = data, index = pub_count.keys(), columns = ['Growth', 'Publications', 'Citations'])

        # Plot graphs by GF
        last_year = str(self.max_year)
        for trend, v in sorted(gf.items(), key=lambda item: item[1], reverse=True):

            fig = make_subplots(
                rows=2, cols=1,
                subplot_titles=(f"Number of documents retrieved for '{trend}' per year.",f"Percentage of documents with {trend} in title/abstract/keywords."),
                print_grid=False)

            # Plot number of publications per year
            fig.add_trace(go.Scatter(x=self.st_ts_a['2010':last_year].index, y=self.st_ts_a[trend]['2010':last_year]), row=1, col=1)
            # Plot share of publications per year
            fig.add_trace(go.Scatter(x=self.st_ts_share_a['2010':last_year].index, y=self.st_ts_share_a[trend]['2010':last_year]), row=2, col=1)
            fig.update_layout(height=600, width=600, title_text=trend.upper(), template='simple_white', showlegend=False, yaxis2=dict(tickformat=".2%", range= [0,0.5]))

            # Print summary statistics. The period is taken from the configured years instead of a fixed label.
            print(trend.upper())
            print(f'Total number of publications: {pub_count[trend]} (Rank: {pub_r[trend]})')
            print(f'Total number of citations: {cit_count[trend]} (Rank: {cit_r[trend]})')
            print(f'Total growth rate of publications as share of total ({self.gf_base_year}-{self.max_year}): {round(gf[trend]*100, 2)}% (Rank: {gf_r[trend]})')

            fig.show()

            relevant = self.ext_results.loc[self.ext_dict[trend]]

            # Print most frequently occurring keywords
            print("Number of documents with a specific keyword:")
            for keyword in count_entries(relevant.authkeywords).most_common()[:15]:
                print(f"\t{keyword[0]}: {keyword[1]}")

            # Print top 5 documents by citations
            print(f"\n\nThe most-cited documents in the context of production networks and {trend} are:")
            top_5 = relevant.citedby_count.sort_values(ascending=False)[:5]
            for doc in top_5.index:
                print(f"\tTitle: {relevant.loc[doc].title}")
                print(f"\tAuthors: {relevant.loc[doc].author_names}")
                print(f"\tCitations: {top_5[doc]}")
                print(f"\tKeywords: {relevant.loc[doc].authkeywords}\n")

    def calc_combined_gf(self, terms, filename = None):
        """
        Calculates the GF when combining synonymous terms. "terms" should be a list (or tuple) of two strings,
        each of which must be a key of "self.ext_dict".

        The GF is computed as the difference between the combined terms' share of total publications in
        "self.max_year" and in "self.gf_base_year". Documents that appear under both terms are counted once.

        If the time series of the combined terms shall be saved, provide a filename in the form of a string;
        the annual publication counts are then written to that CSV file.

        Prints the GF as a percentage and returns it as a float (fraction, not percent). If "terms" is not a
        sequence of exactly two entries, a usage hint is printed and None is returned.
        """
        # "and" short-circuits, so len() is never called on an object without a length.
        if not (isinstance(terms, (list, tuple)) and len(terms) == 2):
            print("Please enter a list of two strings, e.g. '[\"industry 4.0\", \"industrie 4.0\"]'")
            return

        # Daily index spanning every cover date of the extended search results
        cover_dates = pd.to_datetime(self.ext_results["coverDate"])
        index_col = pd.date_range(cover_dates.min(), cover_dates.max(), freq="D")

        # Union of both document lists so that shared documents are counted only once
        doc_ids = list(set(self.ext_dict[terms[0]]) | set(self.ext_dict[terms[1]]))
        per_day = cover_dates.loc[doc_ids].value_counts()

        # Write the counts with a single .loc assignment; chained indexing (ts['trend'][date] += 1)
        # may operate on a temporary copy and silently leave the frame unchanged.
        ts = pd.DataFrame(0, index=index_col, columns=['trend'])
        ts.loc[per_day.index, 'trend'] = per_day.to_numpy()

        # Aggregate to yearly counts and express them as a share of all publications.
        # NOTE(review): assumes self.total_pub is indexed by year-start timestamps - confirm.
        ts_a = ts.resample('YS').sum()
        ts_a_share = ts_a.divide(self.total_pub['publications'], axis = 0)
        if isinstance(filename, str):
            ts_a.to_csv(filename)

        # Each .loc[year] yields a one-row frame; .item() extracts the single value as a Python float
        latest_share = ts_a_share.loc[str(self.max_year)].to_numpy()
        base_share = ts_a_share.loc[str(self.gf_base_year)].to_numpy()
        gf = (latest_share - base_share).item()
        print(f"The GF for {terms[0]} and {terms[1]} is: {round(gf*100,2)}%")
        return gf
      
    
    def export_top_pub_table(self, search_terms = None, number = 10):
        """
        Exports data on the top publications. If no argument is given, the data on the most frequently cited documents 
        from the extended search will be exported. If an export for specific topics is desired, enter a single topic as a 
        string or multiple ones as list.

        "number" is the maximum number of documents to export. For every selected document the document type,
        first author (with "et al." for multiple authors), publication year and title are retrieved through
        AbstractRetrieval, and the table is written as a CSV file into the current working directory. For a list
        of search terms the file is named after the first term.
        """
        # Check for search_terms type and initialize filename and choose the relevant documents
        if isinstance(search_terms, str):
            candidates = self.ext_results.loc[self.ext_dict[search_terms]]
            filename = f"{search_terms} top {number}.csv"
        elif isinstance(search_terms, list) and search_terms:
            docs = []
            for st in search_terms:
                docs += self.ext_dict[st]
            # De-duplicate via dict keys and index with a list: pandas no longer accepts a set as indexer
            candidates = self.ext_results.loc[list(dict.fromkeys(docs))]
            filename = f"{search_terms[0]} top {number}.csv"
        elif search_terms is None:
            candidates = self.ext_results
            filename = f"Overall top {number}.csv"
        else:
            # Also reached for an empty list, which offers no term to name the file after
            print("Error: Enter a search term as string or a list of search_terms")
            return

        # Citation counts of the most-cited documents, highest first
        top = candidates.citedby_count.sort_values(ascending=False).iloc[:number]

        subtypes =  []
        authors = []
        citations = []
        titles = []
        
        # Retrieve detailed data on these publications
        for doc, cited_by in top.items():
            ab = AbstractRetrieval(doc)
            year = ab.coverDate.split("-")[0]

            subtypes.append(ab.subtypedescription)
            # NOTE(review): ab.authors is presumably None for records without author data - confirm
            doc_authors = ab.authors or []
            if len(doc_authors) >= 2:
                authors.append(f'{doc_authors[0].surname} et al. ({year})')
            elif doc_authors:
                authors.append(f'{doc_authors[0].surname} ({year})')
            else:
                authors.append(f'N/A ({year})')

            citations.append(cited_by)
            titles.append(ab.title)

        table = pd.DataFrame({
            "Category": subtypes,
            "Author (Year)": authors,
            "Titles": titles,
            "Citations": citations,
        })
        # Group rows by document type and list the most-cited documents first within each type
        table = table.set_index("Category")
        table = table.sort_values(by =['Category', 'Citations'], ascending = [True, False])
        table.to_csv(filename)
        print(f"Table exported into project folder as '{filename}'.")
        
        
    def export_for_vos(self, search_terms = None):
        """
        Exports author keywords so they can be displayed in a thematic map in VOSviewer. If no argument 
        is given, the author keywords for all documents from the extended search will be exported.
        If an export for specific topics is desired, enter a single topic as a string or multiple ones as list.

        The keywords of each document are joined with "; " into a single "Author Keywords" column and written
        as a CSV file into the current working directory. Documents without keywords get an empty entry.
        Keyword sets are written in sorted order so that repeated exports produce identical files.
        """
        # Check for the type of search_terms and initialize the filename and the author keywords of the relevant documents
        if isinstance(search_terms, str):
            base = self.ext_results.loc[self.ext_dict[search_terms]].authkeywords
            filename = f'vos_{search_terms}.csv'
        elif isinstance(search_terms, list):
            doc_ids = []
            for term in search_terms:
                doc_ids += self.ext_dict[term]
            # De-duplicate via dict keys and index with a list: pandas no longer accepts a set as indexer
            base = self.ext_results.loc[list(dict.fromkeys(doc_ids))].authkeywords
            topic = "".join(f"_{term}" for term in search_terms)
            filename = f'vos{topic}.csv'
        elif search_terms is None:
            base = self.ext_results.authkeywords
            filename = 'vos_full_export.csv'
        else:
            print("Please enter a string or list of search terms to export for specific topics or call without an argument for a full export.")
            return
        
        # Store the author keywords in the form required by VOS.
        kws = []
        for doc in base:
            # Missing keywords may show up as None or as a float NaN (NaN is the only value unequal to itself)
            if doc is None or (isinstance(doc, float) and doc != doc):
                kws.append("")
            elif isinstance(doc, (set, frozenset)):
                # Sets have no stable iteration order; sort for a reproducible export
                kws.append("; ".join(sorted(doc)))
            else:
                kws.append("; ".join(list(doc)))
        # Export keywords to csv in project folder
        pd.DataFrame(index = base.index, data = kws, columns=['Author Keywords']).to_csv(filename)
        print(f"Data exported for VOS into project folder as '{filename}'.")