In [2]:
from metapub import PubMedFetcher
from time import sleep
import numpy as np
import pandas as pd
from pybliometrics.scopus import ScopusSearch

<br>

# PubMed Analysis

---

- User inputs the keyowrd of interest, starting year to fetch articles,
  and ending year

- Bellow code cell saves the results to a pandas dataframe, and if
  inputted by user, saves the results to a csv

In [4]:
keyword = str(input('Keyword to search for articles in the DataBase: '))
year_1 = int(input('Starting year: '))
year_2 = int(input('Ending year: '))
save_path = str(input('Save file to a csv? (file_path/n): '))

# Fetching the PubMed database
fetch = PubMedFetcher()

# Creating empty pandas dataframe with 'date' and 'articles' columns
pubmed_df = pd.DataFrame({'date': [],
                          'articles': []})

# Creating a list containing the number correspondant to every year month
month_list = np.arange(1, 13, 1)

# Creating a pandas dataframe with every month number in one column, named
# 'month', and in the other column 'ends', the corresponding months ending
# day
months_days = pd.DataFrame({'month': month_list,
                            'ends': [31, 28, 31, 30, 31, 30, 31, 31, 30, 31,
                                    30, 31]})

# Creating a list for with every year in the given range
year_list = np.arange(year_1, year_2 + 1, 1)

# Looping for every month in every year
for year in year_list:
    for month in months_days['month']:
        # Creating a variable to store the current loop month ending day
        endsin = (months_days.loc[months_days['month'] == month, 'ends'])[month-1]

        # Fetching all articles containing the given keyword, in the date
        # ranging from the first day (1) of the current loop month, to the
        # above defined current loop month last day
        pmids = fetch.pmids_for_query(f'{keyword} '+str(year)+f'/{month}/01[MDAT] : '+str(year)+f'/{month}/{endsin}[MDAT]',retmax=100000000)

        # Appending to the result dataframe, in the first available row
        # (given by the dataframe length), the current loop date in the
        # format year-month, and the number of published articles in this
        # month, given by the above articles fetch length
        pubmed_df.loc[len(pubmed_df)] = [f'{year}-{month}', len(pmids)]

        # Printing for tracking the current loop date, in the year-month
        # format, and the number of articles containing the given keyword
        # found in the Scoupus database with corresponding year-month
        # publication date
        print(f"{year}-{month}: ", len(pmids))

        # Sleeping for avoiding to many/simultaneous API requests, if any
        # error is presented during this function execution, this sleeping
        # time may be enlarged
        sleep(0.1)

# Converting the result dataframe 'date' column to pandas date format
pd.to_datetime(pubmed_df.date, format="%Y-%m")

# If optional argument save_path is given, the bellow saving loop is
# executed
if save_path != 'n' and save_path != 'N':
    pathcsv = f'{save_path}/{keyword}_pubmed.csv'
    pubmed_df.to_csv(path_or_buf=pathcsv)
pubmed_df

Keyword to search for articles in the DataBase:  Plasmonic
Starting year:  2018
Ending year:  2018
Save file to a csv? (file_path/n):  n


2018-1:  212
2018-2:  299
2018-3:  123
2018-4:  124
2018-5:  110
2018-6:  131
2018-7:  758
2018-8:  216
2018-9:  167
2018-10:  4422
2018-11:  1990
2018-12:  936


Unnamed: 0,date,articles
0,2018-1,212
1,2018-2,299
2,2018-3,123
3,2018-4,124
4,2018-5,110
5,2018-6,131
6,2018-7,758
7,2018-8,216
8,2018-9,167
9,2018-10,4422


<br>

## Read PubMed saved csv file

---


In [6]:
# Reading the csv file with pandas, stating that the file first column
# corresponds to the index values (row number)
csv_path = str(input('Path to the csv file: '))
df_readed = pd.read_csv(csv_path, index_col=0)

# Converting 'date' column to pandas date format
df_readed['date'] = pd.to_datetime(df_readed.date, format="%Y-%m")

print(df_readed)

Path to the csv file:  /home/eduardotc/Programação/my_gits/materials_chempy/example_data/Upconversion_pubmed.csv


         date  articles
0  2015-01-01        17
1  2015-02-01        20
2  2015-03-01        33
3  2015-04-01        22
4  2015-05-01        20
5  2015-06-01        17
6  2015-07-01        20
7  2015-08-01        31
8  2015-09-01        20
9  2015-10-01        12
10 2015-11-01        64
11 2015-12-01        17


In [None]:
csv_path = str(input('Path to the csv file: '))
# Reading in a pandas dataframe the csv
csv_stat = read_pubmed_csv(csv_path)

# Getting the date column sum, mean and max
csv_stat = csv_stat.groupby(csv_stat['date'].dt.year)['articles'].agg(['sum', 'mean', 'max'])

return csv_stat

In [None]:
def scopusfetcher(keyword, year_1, year_2, **kwargs):
    """

    Queryes from the Scopus database the number of articles containing a
    specific keyword, in an specified year range, returning a pandas dataframe
    with one column named 'date' containing the year month in format year-month
    and one column named 'articles', with the number of published articles
    containing the given keyword in that corresponding month.If the argument
    csv_path is given, the function saves the results to a csv file.

    Parameters
    ----------
    keyword : str
        Keyword to search articles tha contains it.
        Required.

    year_1 : int
        Starting year to search for articles. Remember that pubmed was created
        in january 1996.
        Required.

    year_2 : int
        Ending year to search for articles.
        Required.

    save_path : str
        Path to save the dataframe into a csv file.
        Optional.

    Returns
    -------
    scopus_df : pandas.DataFrame
        Pandas dataframe with every selected year months in a column named
        "date", and the corresponding number of published articles in that
        month containing the given keyword in other column named "articles".

    Examples
    --------
    Querying, per month, the number of published articles containing the
    keyword 'Upconversion', in the year of 2015, in the Scopus database.

    >>> scopus_df = scopusfetcher('Upconversion', 2015, 2015, save_path="./tmp")
    2015-january:  136
    2015-february:  126
    2015-march:  122
    2015-april:  135
    2015-june:  154
    2015-july:  178
    2015-august:  187
    2015-september:  149
    2015-october:  125
    2015-november:  120
    2015-december:  120
    >>> print(scopus_df)
             date  articles
    0  2015-01-01       136
    1  2015-02-01       126
    2  2015-03-01       122
    3  2015-04-01       135
    4  2015-06-01       154
    5  2015-07-01       178
    6  2015-08-01       187
    7  2015-09-01       149
    8  2015-10-01       125
    9  2015-11-01       120
    10 2015-12-01       120

    """
    # Defining optional argument save_path
    save_path = kwargs.get('save_path', None)

    # Creating a list with every month in the year
    months_list = ['january', 'february', 'march', 'april', 'june', 'july',
                   'august', 'september', 'october', 'november', 'december']

    # Creating an empty dataframe with one column named 'date' and one column
    # named 'articles'
    scopus_df = pd.DataFrame({'date': [],
                       'articles': []})

    # Creating a list containing every year in the given range
    years_list = np.arange(year_1, year_2+1, 1)

    # Looping for every month in every year
    for year in years_list:
        for month in months_list:
            # Searching articles containing the given keyword, in the specific
            # month and year, in the scopus database. subscriber=False refers
            # to be using a free API key.
            s = ScopusSearch(f'KEY {keyword}, PUBDATETXT({month} {year})',
                             subscriber=False)

            # Prints the year-month and number of published articles (size of
            # the search result)
            print(f"{year}-{month}: ", s.get_results_size())

            # Appending to previously created empty list, always in the first
            # available row (given by the length of the list), the date in the
            # format year-month of current loop, and the number of published
            # articles containing the keyword in this month
            scopus_df.loc[len(scopus_df)] = [f'{year}-{month}',
                                             s.get_results_size()]

            # Some errors have being ocurring while querying the database, to
            # try to minimize it, i setted a relatvielly large sleep time, may
            # be lowered to test your specific case results.
            sleep(2)

    # Converting the 'date' column to pandas datetime format
    scopus_df['date'] = pd.to_datetime(scopus_df.date.astype(str),
                                       format="%Y-%B")

    # If optional argument save_path is given, the saving loop bellow is
    # executed
    if save_path:
        pathcsv = f'{save_path}/{keyword}_scopus.csv'
        scopus_df.to_csv(path_or_buf=pathcsv)

    return scopus_df


def df_statistics(df):
    """

    Given a dataframe with 'date' column, in the format year-month, returns
    the sum, the mean and the max of the 'date' column, organized as well in
    a dataframe.

    Paramters
    ---------
    df : pd.DataFrame
        Pandas DataFrame, with one column named 'date', in the year-month
        format, and one column with integer values correlating to the date
        column.

    Returns
    -------
    df_stat : pd.DataFrame
        Pandas dataframe with the results, one column named 'sum', one named
        'mean' and one named 'max', all the columns refers to the values
        calculated from the original dataframe 'date' column values.

    Examples
    --------
    Defining an example dataframe, formated similar to articles databases
    functions present in this file.

    >>> x = ['2015-01-01', '2015-02-01', '2015-03-01', '2015-04-01',
    ...      '2015-06-01', '2015-07-01', '2015-08-01', '2015-09-01',
    ...      '2015-10-01', '2015-11-01', '2015-12-01']
    >>> y = [136, 126, 122, 135, 154, 178, 187, 149, 125, 120, 120]
    >>> df_test = pd.DataFrame({'date': x,
    ...                         'articles': y})
    >>> stat_test_df = df_statistics(df_test)
    >>> print(stat_test_df)
           sum        mean  max
    Date ...
    2015  1552  141.090909  187
    """
    df_clean = clean_df(df)
    # df_clean['Date'] = pd.to_datetime(df.date.astype(str), format="%Y-%m-%B")
    df_stat = df_clean.groupby(df_clean['Date'].dt.year)['Articles'].agg(['sum', 'mean', 'max'])
    return df_stat


def find_matching_positions(string1, string2):
    """

    Find the matching letters between 2 strings, and attribute the positions
    values of the matching cases to a list variable.

    Parameters
    ----------
    string1 : str
        First string to check matching letters

    string2 : str
        Second string to compare matching letters

    Returns
    -------
    matching_postions : list
        List with the positions of matching letters between the 2 strings

    Examples
    --------
    Find the matching letters between the strings 'hello' and 'hallo'

    >>> string1 = "hello"
    >>> string2 = "hallo"
    >>> positions = find_matching_positions(string1, string2)
    >>> print("Matching positions:", positions)
    Matching positions: [0, 2, 3, 4]

    Find the matching letters between '2015-02-01' and '2015-01-01'

    >>> string1 = "2015-02-01"
    >>> string2 = "2015-01-01"
    >>> positions = find_matching_positions(string1, string2)
    >>> print("Matching positions:", positions)
    Matching positions: [0, 1, 2, 3, 4, 5, 7, 8, 9]
    """
    matching_positions = []

    # Make sure both strings are of the same length
    if len(string1) != len(string2):
        raise ValueError("Both strings must have the same length")

    for i in range(len(string1)):
        if string1[i] == string2[i]:
            matching_positions.append(i)

    return matching_positions


def clean_csv(csv_path):
    """

    Reads a csv file, organizing its columns, returning a dataframe with one
    'Date' column (pandas datetime), one 'Year' column (pandas datetime), one
    'Month' column (pandas datetime), and if the csv has an articles column,
    the cleaned dataframe also has a 'Articles' column (integer).

    Parameters
    ----------
    csv_path : str
        Path to the csv file to be cleaned

    Returns
    -------
    df_clean : Pandas.DataFrame
        Pandas dataframe of cleaned and organized csv

    Examples
    --------
    Clean example_data csv

    >>> test_clean_csv = clean_csv('../example_data/Upconversion_pubmed.csv')
    >>> print(test_clean_csv)
        Year  Month       Date  Articles
    0   2015      1 2015-01-01        17
    1   2015      2 2015-02-01        20
    2   2015      3 2015-03-01        33
    3   2015      4 2015-04-01        22
    4   2015      5 2015-05-01        20
    5   2015      6 2015-06-01        17
    6   2015      7 2015-07-01        20
    7   2015      8 2015-08-01        31
    8   2015      9 2015-09-01        20
    9   2015      1 2015-01-01        12
    10  2015     11 2015-11-01        64
    11  2015     12 2015-12-01        17

    """
    month_new = []
    df = pd.read_csv(csv_path, index_col=0)

    # Creating empty dataframe in case readed csv 'date' column is shorter
    # than 7
    if (len(df.date[0])) < 7:
        df_split = pd.DataFrame({'a': [],
                                'b': []})

    # Creating empty dataframe in case readed csv 'date' column is longer
    # than 7
    else:
        df_split = pd.DataFrame({'a': [],
                                'b': [],
                                'c': []})

    df_clean = pd.DataFrame({'Year': [],
                        'Month': [],
                        'Date': [],
                        'Articles': []})

    # Iterating for every column from inputted dataframe
    for col in df.columns:

        # If input dataframe has a column named 'articles'
        if col == 'articles' or col == 'Articles':
            df_clean['Articles'] = df['articles']

        # If input dataframe has a column named 'date'
        if col == 'date' or col == 'Date':

            # Iterating for every value from the row 'date'
            for values in df.date:

                # Adding each date element to empty dataframe
                df_split.loc[len(df_split)] = values.split("-")

            # Iterating for every column in the new created every
            # date element df
            for cols in df_split.columns:

                # If length from every element from the first row >
                # 4 (correspond (High chances of being a year column)
                if len(df_split[cols][0]) == 4:
                    df_clean['Year'] = df_split[cols]

                # Getting total column values range (max - min)
                col_min = int(df_split[cols].min())
                col_max = int(df_split[cols].max())
                col_diff = col_max - col_min

                # Loop if the column range is between 17 and 4 (high chances of
                # being a month column
                if 17 > col_diff > 4:
                    df_clean['Month'] = df_split[cols]

    # Removing month column trailing zeroes
    for number in df_clean.Month:
        month_new.append(int(str(number).rstrip('0')))

    # Converting year month and date columns to pandas datetime
    df_clean['Year'] = pd.to_datetime(df_clean['Year'])
    df_clean['Year'] = df_clean.Year.dt.year
    df_clean['Month'] = month_new
    df_clean['Month'] = pd.to_datetime(df_clean['Month'], format="%m")
    df_clean['Month'] = df_clean.Month.dt.month
    df_clean['Date'] =  pd.to_datetime(df_clean[['Year', 'Month']].assign(Day=1))


    return(df_clean)


def clean_df(df):
    """

    Organize and clean a pandas DataFrame, returning a dataframe with one
    'Date' column (pandas datetime), one 'Year' column (pandas datetime), one
    'Month' column (pandas datetime), and if the csv has an articles column,
    the cleaned dataframe also has a 'Articles' column (integer).

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas Dataframe to be cleaned

    Returns
    -------
    df_clean : pandas.DataFrame
        Cleaned and organized pandas DataFrame

    Examples
    --------
    Clean a fetched pubmed dataframe with the keyword 'Singlet Oxygen' in the
    year of 2017

    >>> test_df = pubmedfetcher('Singlet Oxygen', 2017, 2017)
    2017-1:  10
    2017-2:  20
    2017-3:  29
    2017-4:  8
    2017-5:  9
    2017-6:  8
    2017-7:  2
    2017-8:  11
    2017-9:  11
    2017-10:  16
    2017-11:  128
    2017-12:  27
    >>> test_clean = clean_df(test_df)
    >>> print(test_clean)
        Year  Month       Date  Articles
    0   2017      1 2017-01-01        10
    1   2017      2 2017-02-01        20
    2   2017      3 2017-03-01        29
    3   2017      4 2017-04-01         8
    4   2017      5 2017-05-01         9
    5   2017      6 2017-06-01         8
    6   2017      7 2017-07-01         2
    7   2017      8 2017-08-01        11
    8   2017      9 2017-09-01        11
    9   2017      1 2017-01-01        16
    10  2017     11 2017-11-01       128
    11  2017     12 2017-12-01        27

    """
    month_new = []

    # Creating empty dataframe in case readed csv 'date' column is shorter
    # than 7
    if (len(df.date[0])) < 7:
        df_split = pd.DataFrame({'a': [],
                                'b': []})

    # Creating empty dataframe in case readed csv 'date' column is longer
    # than 7
    else:
        df_split = pd.DataFrame({'a': [],
                                'b': [],
                                'c': []})

    df_clean = pd.DataFrame({'Year': [],
                        'Month': [],
                        'Date': [],
                        'Articles': []})

    # Iterating for every column from inputted dataframe
    for col in df.columns:

        # If input dataframe has a column named 'articles'
        if col == 'articles' or col == 'Articles':
            df_clean['Articles'] = df['articles']

        # If input dataframe has a column named 'date'
        if col == 'date' or col == 'Date':

            # Iterating for every value from the row 'date'
            for values in df.date:

                # Adding each date element to empty dataframe
                df_split.loc[len(df_split)] = values.split("-")

            # Iterating for every column in the new created every
            # date element df
            for cols in df_split.columns:

                # If length from every element from the first row >
                # 4 (correspond (High chances of being a year column)
                if len(df_split[cols][0]) == 4:
                    df_clean['Year'] = df_split[cols]

                # Getting total column values range (max - min)
                col_min = int(df_split[cols].min())
                col_max = int(df_split[cols].max())
                col_diff = col_max - col_min

                # Loop if the column range is between 17 and 4 (high chances of
                # being a month column
                if 17 > col_diff > 4:
                    df_clean['Month'] = df_split[cols]

    # Removing month column trailing zeroes
    for number in df_clean.Month:
        month_new.append(int(str(number).rstrip('0')))

    # Converting year month and date columns to pandas datetime
    df_clean['Year'] = pd.to_datetime(df_clean['Year'])
    df_clean['Year'] = df_clean.Year.dt.year
    df_clean['Month'] = month_new
    df_clean['Month'] = pd.to_datetime(df_clean['Month'], format="%m")
    df_clean['Month'] = df_clean.Month.dt.month
    df_clean['Date'] =  pd.to_datetime(df_clean[['Year', 'Month']].assign(Day=1))


    return(df_clean)