In [1]:
import pandas as pd
import json
from pybliometrics.scopus import AbstractRetrieval, AuthorRetrieval, AffiliationRetrieval
import warnings
import datetime

In [2]:
## Load configuration
# Read the notebook settings (e.g. the spreadsheet id under 'url_id') from
# config.json. The context manager closes the file even when json.load raises,
# which the previous manual open()/close() pair did not guarantee.
with open("config.json", encoding="utf-8") as con_file:
    config = json.load(con_file)

## Importing Excel sheet from Google Drive

In [85]:
#importing and opening file
#ignoring warnings
# Install the filter BEFORE any work is done so it is already active on a
# fresh kernel; previously it was registered only after the workbook was read.
warnings.filterwarnings("ignore", category=DeprecationWarning, message=".*utcnow.*")

url_id = config['url_id']
url_base = "https://docs.google.com/spreadsheets/d/"
# NOTE(review): the pieces are joined with no separator, so config['url_id']
# must already end with "/" — and Google's documented export path is usually
# "/export?format=xlsx". Confirm against the value stored in config.json.
url_e = "export/format=xlsx"

# Open the workbook exactly once and let the context manager close it
# (the original wrapped an ExcelFile inside a second ExcelFile).
file = pd.ExcelFile(f"{url_base}{url_id}{url_e}")
with file as xls:
    df = pd.read_excel(xls, "cleaner table", header=0)

# NOTE(review): `now` is not used by any cell visible here; kept so later
# cells that might read it keep working.
now = datetime.datetime.utcnow()

#removing blank columns
# Keep only the first 44 columns of the sheet.
df = df.iloc[:, :44]
#removing useless columns for analysis
columns_to_drop = ["Author(s) ID", "Volume", "Issue", "Art. No.",
                   "Page start", "Page end", "Page count",
                   "Link", "Index Keywords", "Document Type",
                   "Publication Stage", "Open Access", "Source"]
df = df.drop(columns=columns_to_drop)

In [4]:
# Print the column names, non-null counts and dtypes of the trimmed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 855 entries, 0 to 854
Data columns (total 31 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0   Authors                                  855 non-null    object
 1   Author full names                        843 non-null    object
 2   Title                                    854 non-null    object
 3   Year                                     855 non-null    int64 
 4   Source title                             855 non-null    object
 5   Cited by                                 855 non-null    int64 
 6   DOI                                      849 non-null    object
 7   Author Keywords                          613 non-null    object
 8   Abstract                                 824 non-null    object
 9   EID                                      855 non-null    object
 10  SDG                                      855 non-null    int64

## Authors dataframe

In [71]:
#creating the function
def articles_info(eids):
    """Retrieve one Scopus abstract record per EID.

    Parameters
    ----------
    eids : iterable
        Scopus EIDs; each one is passed to ``AbstractRetrieval`` with
        ``view='FULL'``.

    Returns
    -------
    list
        The ``AbstractRetrieval`` objects, in the same order as ``eids``.
    """
    #applying pybliometrics
    return [AbstractRetrieval(scopus_eid, view='FULL') for scopus_eid in eids]

In [72]:
#extracting all the EIDs from the sheet (one list entry per row, in row order)
eids = df["EID"].tolist()
#calling the function, make sure to have your API key running
# One AbstractRetrieval is created per list entry; duplicates are not removed here.
articles = articles_info(eids)

In [73]:
#creating dictionary
# Column-oriented accumulator: every key is a column of authors_df and all
# lists grow in lockstep, one element per authorgroup entry.
authors = {
            'article_eid': [],
            'author_id': [],
            'author_name': [],
            'affiliation_country': [],
            'position in article': []
            }
#iterating over each element in the articles list
for article in articles:
    # authorgroup can be None when a record carries no author/affiliation
    # data; treat that as "no entries" instead of crashing on len(None).
    # NOTE(review): the position below is the 1-based index inside
    # `authorgroup`; confirm that this matches byline order, since an author
    # may be listed once per affiliation.
    for position, entry in enumerate(article.authorgroup or [], start=1):
        authors['article_eid'].append(article.eid)
        authors['author_id'].append(entry.auid)
        # Join only the name parts that are present so a missing given name
        # or surname does not end up as the literal text "None".
        name_parts = [part for part in (entry.given_name, entry.surname) if part]
        authors['author_name'].append(' '.join(name_parts))
        authors['affiliation_country'].append(entry.country)
        # Stored as a string because later cells compare against "1".
        authors['position in article'].append(str(position))

authors_df = pd.DataFrame(authors)

In [74]:
# Rows whose position string is "1", i.e. the first authorgroup entry of each article.
authors_df.loc[authors_df["position in article"]=="1",]

Unnamed: 0,article_eid,author_id,author_name,affiliation_country,position in article
0,2-s2.0-85091803788,56319399400,Scott Thiebes,Germany,1
3,2-s2.0-85089383999,57218510909,Juin-Hao Ho,Taiwan,1
6,2-s2.0-85128853821,56512092600,Marius Bartmann,Germany,1
7,2-s2.0-85127588032,57562370000,Aurélie Halsband,Germany,1
8,2-s2.0-85177554926,36618183700,Raghu Raman,India,1
...,...,...,...,...,...
6361,2-s2.0-85128453304,57580812100,Jiyoung Jang,South Korea,1
6363,2-s2.0-85174748358,57205140187,Johnblack K. Kabukye,Sweden,1
6376,2-s2.0-85171645006,58606568400,Meizhen Deng,China,1
6379,2-s2.0-85160969682,58221931100,Xingce Zhu,China,1


## Converting columns to boolean

In [86]:
# Spreadsheet flag columns that get cast to boolean dtype.
columns_to_boolean = [
    "AI (yes/no)",
    "Sustainability (yes/no)",
    "AI as buzzword? (0/1)",
    "policy recommendations (1/0)",
]

In [87]:
def _flag_to_bool(value):
    """Convert one spreadsheet flag cell to a Python bool.

    A plain ``astype('bool')`` maps every non-empty string (including "no")
    and NaN to True. Here text is compared against the usual affirmative
    spellings, missing cells become False, and anything else (numbers, real
    booleans) keeps ordinary truthiness, so 0/1 columns convert as before.
    """
    if isinstance(value, str):
        return value.strip().lower() in {"yes", "y", "true", "1"}
    if pd.isna(value):
        return False
    return bool(value)


# NOTE(review): text other than yes/y/true/1 is treated as False — confirm
# that this covers every spelling used in the sheet.
df[columns_to_boolean] = (
    df[columns_to_boolean]
    .apply(lambda column: column.map(_flag_to_bool))
    .astype('bool')
)

## Metadata dataframe

In [88]:
# Columns carried into the metadata table: identifiers, bibliographic fields
# and the two boolean flags.
metadata_columns = [
    "EID",
    "Title",
    "Year",
    "SDG",
    "Source title",
    "Cited by",
    "DOI",
    "Abstract",
    "Author Keywords",
    "AI (yes/no)",
    "Sustainability (yes/no)",
]
metadata_df = df.loc[:, metadata_columns]
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 855 entries, 0 to 854
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   EID                      855 non-null    object
 1   Title                    854 non-null    object
 2   Year                     855 non-null    int64 
 3   SDG                      855 non-null    int64 
 4   Source title             855 non-null    object
 5   Cited by                 855 non-null    int64 
 6   DOI                      849 non-null    object
 7   Abstract                 824 non-null    object
 8   Author Keywords          613 non-null    object
 9   AI (yes/no)              855 non-null    bool  
 10  Sustainability (yes/no)  855 non-null    bool  
dtypes: bool(2), int64(3), object(6)
memory usage: 61.9+ KB


## Function to separate values in the same cell

In [89]:
def separating(df, old_column, new_column, sep=';'):
    """Split a delimited text column into one row per value.

    Each cell of ``old_column`` is split on ``sep``; the frame is exploded so
    every piece gets its own row (other columns are repeated), ``old_column``
    is removed, the index is renumbered from 0, and the pieces are stripped
    of surrounding whitespace and lower-cased. Missing cells stay missing.

    Unlike the earlier version, the caller's frame is left untouched: the
    split column is added to a copy via ``assign`` rather than written into
    ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; not modified.
    old_column : str
        Name of the column holding delimiter-separated text.
    new_column : str
        Name of the column that receives the individual values.
    sep : str, default ';'
        Delimiter passed to ``Series.str.split``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``new_column`` in place of ``old_column``.
    """
    pieces = df[old_column].str.split(sep)
    exploded = (
        df.assign(**{new_column: pieces})
        .explode(new_column)
        .drop(columns=old_column)
        .reset_index(drop=True)
    )
    exploded[new_column] = exploded[new_column].str.strip().str.lower()
    return exploded

## AI_type dataframe

In [90]:
# Take the EID / AI-type pair and expand it to one row per listed AI type.
ai_df = df[["EID", "Type of AI"]]
aitype_df = separating(ai_df, "Type of AI", "AI_type")
aitype_df

Unnamed: 0,EID,AI_type
0,2-s2.0-85091803788,overall
1,2-s2.0-85089383999,overall
2,2-s2.0-85128853821,overall
3,2-s2.0-85127588032,overall
4,2-s2.0-85177554926,nlp
...,...,...
1054,2-s2.0-85174748358,machine learning
1055,2-s2.0-85171645006,unsupervised machine learning
1056,2-s2.0-85171645006,nlp
1057,2-s2.0-85171645006,deep learning


In [91]:
# Keep only the first AI_type row for each EID (drop_duplicates keeps the
# first occurrence by default) and renumber the index.
dff = aitype_df.drop_duplicates(subset=["EID"]).reset_index(drop=True)
dff

Unnamed: 0,EID,AI_type
0,2-s2.0-85091803788,overall
1,2-s2.0-85089383999,overall
2,2-s2.0-85128853821,overall
3,2-s2.0-85127588032,overall
4,2-s2.0-85177554926,nlp
...,...,...
850,2-s2.0-85099138027,overall
851,2-s2.0-85128453304,overall
852,2-s2.0-85174748358,machine learning
853,2-s2.0-85171645006,unsupervised machine learning


In [92]:
# Check whether any EID's first-listed AI type is "blockchain".
dff.loc[dff["AI_type"] == "blockchain",]

Unnamed: 0,EID,AI_type


In [93]:
# Distinct first-listed AI types, in order of first appearance.
dff["AI_type"].unique()

array(['overall', 'nlp', 'deep learning', 'machine learning',
       'supervised machine learning', 'idss',
       'unsupervised machine learning', 'evolutionary algorithms',
       'fuzzy logic', 'google earth engine', 'reinforcement learning',
       'symbolic ai', 'mas', 'edge ai', 'mcdm ai', 'eai', 'llm',
       'geospatial ai', 'generative ai', 'quantum ai', 'explainable ai'],
      dtype=object)

## Algorithms dataframe

In [94]:
# One row per (EID, algorithm) pair, ordered alphabetically by algorithm.
a_df = df[["EID", "Algorithm(s) used"]]
algo_df = (
    separating(a_df, "Algorithm(s) used", "Algorithms")
    .sort_values(by="Algorithms", ascending=True)
    .reset_index(drop=True)
)

In [97]:
# Display the sorted (EID, algorithm) table.
algo_df

Unnamed: 0,EID,Algorithms
0,2-s2.0-85121255385,aaa
1,2-s2.0-85102000803,ablwl
2,2-s2.0-85042656860,aco
3,2-s2.0-85038906891,aco
4,2-s2.0-85084937718,aco
...,...,...
1189,2-s2.0-85150761407,xlnet
1190,2-s2.0-85111814672,xyf
1191,2-s2.0-85133863118,yolo
1192,2-s2.0-85146707006,yolo


In [95]:
# Sanity check: list rows whose algorithm value is missing.
algo_df.loc[algo_df["Algorithms"].isnull(),]

Unnamed: 0,EID,Algorithms


In [96]:
# Distinct algorithm labels; unique() keeps first-appearance order, so the
# array follows the sort order of algo_df.
temp_array = algo_df["Algorithms"].unique()

In [98]:
# Display the distinct algorithm labels.
temp_array

array(['aaa', 'ablwl', 'aco', 'adaboost', 'adam optimizer', 'aeba',
       'anfis', 'ann', 'ap', 'ba', 'bayesian algorithm', 'bbo', 'bert',
       'blwl', 'bn', 'bnm', 'bpit', 'bpnn', 'bra', 'brt', 'canfis',
       'cart', 'catboost', 'ccnn', 'cdrm', 'ceda', 'cgda', 'cgpb',
       'chatbots', 'chatgpt', 'cma-es', 'cnn', 'cpann', 'crm',
       'cultural algorithm', 'dbm', 'dbscan', 'ddqn', 'de',
       'decision tree', 'dexined', 'dnn', 'dpm', 'efo', 'elm', 'elnn',
       'emla', 'enn', 'ensemble learning', 'ensemble nn', 'fa', 'fahp',
       'fda', 'ffbp', 'fnn', 'fpd-smp-bal', 'frl', 'gan', 'garch',
       'gaussian yolov3', 'gbfs', 'gbm', 'gdalbnn', 'genetic algorithm',
       'genetic programming', 'gep', 'ggcn', 'gmdh', 'gmm', 'goa', 'gpr',
       'gpt models', 'gpt-2', 'gpt3', 'grnn', 'gru', 'gtm', 'gwoa', 'gwr',
       'harmony search', 'hierarchical clustering', 'hnn', 'hog',
       'hybrid reasoning', 'ica', 'id3', 'isnn', 'iwo', 'joa', 'k-means',
       'knn', 'knowledge graph

In [37]:
# Put the distinct labels in a single-column frame whose index starts at 2.
# NOTE(review): the offset of 2 presumably lines the index up with spreadsheet
# row numbers (header in row 1) — TODO confirm.
new_df = pd.DataFrame(temp_array, index=range(2, 2 + len(temp_array)))

In [58]:
# Positional slice: show the labels from position 179 to the end.
new_df[179:len(temp_array)]

Unnamed: 0,0
181,svm
182,svr
183,"svr, xgboost, adaboost"
184,swlr
185,tcn
186,tcnn
187,tdnn
188,tlbom
189,topsis
190,tree bost


In [17]:
#ignoring warnings
# Install the filter BEFORE any work is done so it is already active on a
# fresh kernel; previously it was registered only after the workbook was read.
warnings.filterwarnings("ignore", category=DeprecationWarning, message=".*utcnow.*")

url_id = config['url_id']
url_base = "https://docs.google.com/spreadsheets/d/"
# NOTE(review): the pieces are joined with no separator, so config['url_id']
# must already end with "/" — and Google's documented export path is usually
# "/export?format=xlsx". Confirm against the value stored in config.json.
url_e = "export/format=xlsx"

# Open the workbook exactly once and let the context manager close it
# (the original wrapped an ExcelFile inside a second ExcelFile).
file = pd.ExcelFile(f"{url_base}{url_id}{url_e}")
with file as xls:
    names_df = pd.read_excel(xls, "algorithms abbreviations", header=0)

# NOTE(review): `now` is not used by any cell visible here; kept so later
# cells that might read it keep working.
now = datetime.datetime.utcnow()

In [18]:
# Lower-case both lookup columns; abbreviations are additionally stripped of
# surrounding whitespace.
names_df["Algorithm name"] = names_df["Algorithm name"].str.lower()
names_df["Abbreviations"] = names_df["Abbreviations"].str.lower().str.strip()
names_df

Unnamed: 0,Algorithm name,Abbreviations
0,artificial algae algorithm,aaa
1,adaboost with locally weighted learning,ablwl
2,ant-colony optimization,aco
3,auto encoder based approach,aeba
4,adaptive network-based fuzzy inference system ...,anfis
...,...,...
169,sentiment reasoner,vader
170,very deep convolutional networks for large sca...,vggnet
171,wind driven optimization,wdo
172,xlnet neural network mode,xlnet


In [113]:
# Left-join every (EID, algorithm) row to its full algorithm name via the
# abbreviation; labels without a match keep missing name columns.
merged_columns = ["EID", "Algorithms", "Algorithm name", "Abbreviations"]
merged_df = algo_df.merge(
    names_df,
    left_on="Algorithms",
    right_on="Abbreviations",
    how="left",
)[merged_columns]
merged_df.head(50)

Unnamed: 0,EID,Algorithms,Algorithm name
0,2-s2.0-85121255385,aaa,artificial algae algorithm
1,2-s2.0-85102000803,ablwl,adaboost with locally weighted learning
2,2-s2.0-85042656860,aco,ant-colony optimization
3,2-s2.0-85038906891,aco,ant-colony optimization
4,2-s2.0-85084937718,aco,ant-colony optimization
5,2-s2.0-85079043107,adaboost,
6,2-s2.0-85143803463,"adaboost, knn",
7,2-s2.0-85133285340,adam optimizaer,
8,2-s2.0-85150761407,adam optimizer,
9,2-s2.0-85122798542,aeba,auto encoder based approach


## Role of AI

In [99]:
# One row per (EID, role) pair, ordered alphabetically by role.
role_df = (
    separating(df.loc[:, ["EID", "Role of AI"]], "Role of AI", "AI Role")
    .sort_values(by="AI Role", ascending=True)
    .reset_index(drop=True)
)

In [101]:
# Distinct role labels in sorted order; useful for spotting spelling variants.
role_df["AI Role"].unique()

array(['a.i integrated in products', 'accelerated experimentation', 'all',
       'classification', 'communication infraestructure', 'data mining',
       'data mining and remote sensing', 'decision making',
       'fast approximate simulation', 'forecasting',
       'increase effectiveness of blended learning',
       'managing system data', 'maximizing power generation', 'overall',
       'performance optimization', 'planning sustainable infrastructure',
       'predictive maintainance', 'predictive maintaionance',
       'predictive maintenance', 'reducing emissions', 'security',
       'simulation', 'supply and demand prediction',
       'system optimization', 'system optimization?',
       'system optimzation', 'systemt optimization', nan], dtype=object)