# Google Scholar paper collection

### To run this notebook, you need to install the following packages:

- ```google-search-results``` (a [SerpApi](https://serpapi.com/) API key is also required)
- ```pandas```
- ```matplotlib```
- ```networkx```
- ```python-slugify```

### The notebook is divided into the following sections:

- [Metadata collection](#Metadata-collection)
- [Metadata exploration](#Metadata-exploration)
- [Fulltext collection](#Fulltext-collection)
- [Unavailable papers](#Unavailable-papers)

In [None]:
# Metadata collection

import json
import os
from serpapi import GoogleSearch

In [None]:
# Metadata exploration

import pandas as pd
pd.set_option("display.max.columns", None)
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Fulltext collection

import os
import shutil
import pandas as pd
pd.set_option("display.max.columns", None)
import urllib.request as ur
from slugify import slugify

## Metadata collection

- [Back to top](#Google-Scholar-paper-collection)

In [None]:
# Collecting metadata of Google Scholar papers
#
# Each request fetches one page of 20 results and stores the raw response as
# ./data/google-scholar/metadata/results<page>.json.

# The SerpApi key is read from the SERPAPI_API_KEY environment variable so it
# never has to be pasted into (and accidentally shared with) the notebook.
# Falls back to an empty key when the variable is not set.
serpapi_key = os.environ.get("SERPAPI_API_KEY", "")

# open(..., "w") below fails with FileNotFoundError unless the folder exists.
os.makedirs("./data/google-scholar/metadata", exist_ok=True)

for i in range(0, 49): # Maximum of 980 papers were available at the time of request (2022/11/21)
    start_page = i*20

    params = {
      "api_key": serpapi_key,
      "device": "desktop",
      "engine": "google_scholar",
      "q": "artificial intelligence climate change",
      "hl": "en",
      "scisbd": "0",
      "num": "20",
      "start": str(start_page)
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    # The organizing step indexes results["organic_results"], so flag pages
    # that came back without it instead of failing later with a KeyError.
    if "organic_results" not in results:
        print("warning: no organic_results in response for page", str(i))

    results_filename = "./data/google-scholar/metadata/results" + str(i) + ".json"

    with open(results_filename, "w") as results_file:
        json.dump(results, results_file)

    print("printed:" + results_filename)

In [None]:
# Organizing results: every paper is collected in organic_results; papers whose
# publication_info holds more than one field go to the "authors" list, all
# others go to the "summary" list

organic_results = []
organic_results_authors = []
organic_results_summary = []

for page in range(49):
    first_position = page * 20
    page_path = "./data/google-scholar/metadata/results" + str(page) + ".json"

    with open(page_path) as page_file:
        page_dict = json.load(page_file)

    for offset, result in enumerate(page_dict["organic_results"]):
        # Position = 20 * page number + index within the page
        result["position"] = first_position + offset
        organic_results.append(result)

        only_summary = not (len(result["publication_info"]) > 1)
        if only_summary:
            organic_results_summary.append(result)
            print("added paper no.", str(first_position + offset), "to summary")
        else:
            organic_results_authors.append(result)
            print("added paper no.", str(first_position + offset), "to authors")

In [None]:
# Display the first collected result to inspect the structure of a raw record
organic_results[0]

In [None]:
# Write the combined organic results to one indented JSON file for later use

with open("./data/google-scholar/metadata/google-scholar.json", "w") as combined_file:
    json.dump(organic_results, combined_file, indent=2)

In [None]:
# Flattening nested data for papers and getting unique authors
#
# `papers` receives one flat dict per organic result.
# `authors` maps each author_id to that author's profile fields plus a
# "papers" list with (position, result_id, authorship_order) entries.

papers = []
authors = {}

for index, result in enumerate(organic_results):
    print("---> started analyzing paper no.", str(index))

    # Top-level fields ("link" is optional)
    paper = {
        "position": result["position"],
        "title": result["title"],
        "result_id": result["result_id"],
    }
    if "link" in result:
        paper["link"] = result["link"]
    paper["snippet"] = result["snippet"]

    # publication_info: summary plus, when present, the list of authors
    publication_info = result["publication_info"]
    paper["pi_summary"] = publication_info["summary"]

    author_names = []
    author_links = []
    author_serpapi_scholar_links = []
    author_author_ids = []
    author_count = 0

    # More than one field in publication_info is treated as "authors are listed"
    if len(publication_info) > 1:
        listed_authors = publication_info["authors"]

        for order, listed in enumerate(listed_authors):
            author_names.append(listed["name"])
            author_links.append(listed["link"])
            author_serpapi_scholar_links.append(listed["serpapi_scholar_link"])
            author_id = listed["author_id"]
            author_author_ids.append(author_id)

            # Register the author on first sight
            if author_id not in authors:
                authors[author_id] = {
                    "name": listed["name"],
                    "link": listed["link"],
                    "serpapi_scholar_link": listed["serpapi_scholar_link"],
                    "author_id": author_id,
                    "papers": [],
                }
                print("new author", author_id, "added at no.", str(len(authors)))

            # Record this paper on the author's list
            authors[author_id]["papers"].append({
                "position": result["position"],
                "result_id": result["result_id"],
                "authorship_order": order,
            })
            print("new paper no.", str(result["position"]), "added to the author", author_id)

        author_count = len(listed_authors)

    paper["pi_author_names"] = author_names
    paper["pi_author_links"] = author_links
    paper["pi_author_serpapi_scholar_links"] = author_serpapi_scholar_links
    paper["pi_author_author_ids"] = author_author_ids
    paper["pi_author_count"] = author_count

    # resources: titles and links for every resource, file formats only
    # for the resources that declare one
    resource_titles = []
    resource_file_formats = []
    resource_links = []

    if "resources" in result:
        for resource in result["resources"]:
            resource_titles.append(resource["title"])
            resource_links.append(resource["link"])
            if "file_format" in resource:
                resource_file_formats.append(resource["file_format"])

    paper["r_title"] = resource_titles
    paper["r_file_format"] = resource_file_formats
    paper["r_link"] = resource_links

    # inline_links: the cite link is always read, the remaining entries are optional
    inline_links = result["inline_links"]
    paper["il_serpapi_cite_link"] = inline_links["serpapi_cite_link"]

    if "cited_by" in inline_links:
        cited_by = inline_links["cited_by"]
        paper["il_cb_total"] = cited_by["total"]
        paper["il_cb_link"] = cited_by["link"]

        if "cites_id" in cited_by:
            paper["il_cb_cites_id"] = cited_by["cites_id"]

        if "serpapi_scholar_link" in cited_by:
            paper["il_cb_serpapi_scholar_link"] = cited_by["serpapi_scholar_link"]

    if "related_pages_link" in inline_links:
        paper["il_related_pages_link"] = inline_links["related_pages_link"]

    if "serpapi_related_pages_link" in inline_links:
        paper["il_serpapi_related_pages_link"] = inline_links["serpapi_related_pages_link"]

    if "versions" in inline_links:
        versions = inline_links["versions"]
        paper["il_v_total"] = versions["total"]
        paper["il_v_link"] = versions["link"]
        paper["il_v_cluster_id"] = versions["cluster_id"]
        paper["il_v_serpapi_scholar_link"] = versions["serpapi_scholar_link"]

    papers.append(paper)

    print("new paper", str(result["position"]), "added\n")

In [None]:
# Enriching authors data
#
# Builds authors_list with one flat record per author: the profile fields,
# the number of papers, and three parallel lists (positions, result ids and
# authorship orders) taken from the author's "papers" entries.
#
# The records are built as new dicts rather than by editing `authors` in
# place. Removing "papers" from `authors` made this cell fail with a KeyError
# when it was run a second time.

authors_list = []

for author in authors.values():
    author_papers = author["papers"]

    # Copy every profile field except the nested "papers" list
    author_record = {field: value for field, value in author.items() if field != "papers"}

    author_record["paper_count"] = len(author_papers)
    author_record["paper_positions"] = [entry["position"] for entry in author_papers]
    author_record["paper_result_ids"] = [entry["result_id"] for entry in author_papers]
    author_record["paper_authorship_orders"] = [entry["authorship_order"] for entry in author_papers]

    authors_list.append(author_record)

In [None]:
# Display the first flattened paper record
papers[0]

In [None]:
# Write the flattened paper records to an indented JSON file for later use

with open("./data/google-scholar/metadata/google-scholar-papers.json", "w") as papers_file:
    json.dump(papers, papers_file, indent=2)

In [None]:
# Display the first enriched author record
authors_list[0]

In [None]:
# Write the enriched author records to an indented JSON file for later use

with open("./data/google-scholar/metadata/google-scholar-authors.json", "w") as authors_file:
    json.dump(authors_list, authors_file, indent=2)

## Metadata exploration

- [Back to top](#Google-Scholar-paper-collection)

In [None]:
# Load the papers and authors JSON files into DataFrames

papers_df = pd.read_json("./data/google-scholar/metadata/google-scholar-papers.json")
authors_df = pd.read_json("./data/google-scholar/metadata/google-scholar-authors.json")

In [None]:
# Display the authors table
authors_df

In [None]:
# Number of distinct author_id values in the authors table
len(authors_df["author_id"].unique())

In [None]:
# Display the papers table
papers_df

In [None]:
# Checking for duplicate pairs
#
# `papers_duplicates` was never defined in the notebook, so this cell raised
# a NameError on a fresh kernel. It is now derived here from `papers`.

# Collect the positions of all papers sharing a title (single pass)
positions_by_title = {}
for p in papers:
    positions_by_title.setdefault(p["title"], []).append(p["position"])

# Titles that occur more than once, in order of first appearance
papers_duplicates = [title for title, positions in positions_by_title.items()
                     if len(positions) > 1]

# One list of positions per duplicated title
duplicate_pairs = [positions_by_title[title] for title in papers_duplicates]

duplicate_pairs

In [None]:
# Checking different features of duplicate titles
#
# duplicate_pairs holds "position" values. Papers are looked up by that field
# rather than by list index, because the two only coincide when every result
# page contained exactly 20 papers.

papers_by_position = {p["position"]: p for p in papers}

for dp in duplicate_pairs[:1]:
    x = papers_by_position[dp[0]]
    y = papers_by_position[dp[1]]
    # Fields of the first paper that are missing from, or differ in, the second
    diff = {k: x[k] for k in x if k not in y or x[k] != y[k]}
    print(diff.keys())

In [None]:
# Filtering out unique titles with listed authors
#
# For every title keep the single row that lists the most authors (the first
# such row on ties). Keeping a whole row avoids groupby(...).max(), which
# takes the maximum of each column independently and can therefore combine
# fields (result_id, author ids, links, ...) from different duplicate rows.
# The result is indexed by title and sorted by author count, descending.

papers_df_uniques = (
    papers_df
    .sort_values("pi_author_count", ascending=False, kind="mergesort")  # stable sort
    .drop_duplicates(subset="title", keep="first")
    .set_index("title")
)

papers_w_authors = papers_df_uniques[papers_df_uniques["pi_author_count"] > 0]

papers_w_authors

In [None]:
# Creating connections between authors and papers
#
# One {"paper_id", "author_id"} record per (paper, listed author). The two
# columns are iterated in parallel instead of using series[i]: with an integer
# key on a title-indexed Series that relies on pandas' deprecated positional
# fallback.

connections = [
    {"paper_id": paper_id, "author_id": author_id}
    for paper_id, author_ids in zip(papers_w_authors["result_id"],
                                    papers_w_authors["pi_author_author_ids"])
    for author_id in author_ids
]

len(connections)

In [None]:
# Group the connections by author: author_id -> list of that author's paper ids

authors_dict = {}

for conn in connections:
    authors_dict.setdefault(conn["author_id"], []).append(conn["paper_id"])

len(authors_dict)

In [None]:
# Pair every paper with each other paper written by one of its authors
# (one record per shared author, self-pairs excluded)

paper_pairs = [
    {"paper_1": conn["paper_id"], "paper_2": other_paper_id}
    for conn in connections
    for other_paper_id in authors_dict[conn["author_id"]]
    if conn["paper_id"] != other_paper_id
]

len(paper_pairs)

In [None]:
# Count how many times each (paper_1, paper_2) pair occurs, i.e. how many
# authors the two papers have in common
collab_df = (
    pd.DataFrame.from_records(paper_pairs)
    .groupby(["paper_1", "paper_2"])
    .agg({"paper_1": ["count"]})
    .reset_index()
)

# Flatten the two-level column labels produced by the aggregation
collab_df.columns = ["paper_1", "paper_2", "common_authors"]

collab_df

In [None]:
# Connecting authors who worked on same papers
#
# For every paper, emit each unordered pair of its authors once (the earlier
# listed author is "author_1"). The author-id column is iterated directly
# instead of using series[i]: with an integer key on a title-indexed Series
# that relies on pandas' deprecated positional fallback.

author_pairs = []

for author_ids in papers_w_authors["pi_author_author_ids"]:
    for j, first_author in enumerate(author_ids):
        for second_author in author_ids[j + 1:]:
            author_pairs.append({"author_1": first_author, "author_2": second_author})

len(author_pairs)

In [None]:
# Display the collected author pairs
author_pairs

In [None]:
# Author-to-author pairs as a table.
# Kept under its own name: assigning it to `collab_df` replaced the
# paper-to-paper table (paper_1, paper_2, common_authors) that the graph
# cells read, so they failed on the missing columns.
author_collab_df = pd.DataFrame.from_records(author_pairs)

author_collab_df

In [None]:
# Creating graph from papers that have authors in common
#
# Directed graph with one edge per (paper_1, paper_2) row; the number of
# shared authors is stored as the "common_authors" edge attribute.

collab_graph = nx.from_pandas_edgelist(collab_df, "paper_1", "paper_2", edge_attr = "common_authors", create_using = nx.DiGraph())

# nx.info() was removed in NetworkX 3.0, so the summary is printed from the
# graph's own counts, which works on old and new versions alike.
print(type(collab_graph).__name__, "with",
      collab_graph.number_of_nodes(), "nodes and",
      collab_graph.number_of_edges(), "edges")

In [None]:
# Draw the whole paper collaboration graph in grey on a 20x20 figure
plt.figure(figsize=(20, 20))
layout = nx.spring_layout(collab_graph, k=0.7)

nx.draw_networkx_edges(collab_graph, layout, edge_color='#AAAAAA')

# Keep the graph nodes that occur as an endpoint in the edge table
endpoint_values = collab_df[["paper_1", "paper_2"]].values
uni_dots = []
for node in collab_graph.nodes():
    if node in endpoint_values:
        uni_dots.append(node)

nx.draw_networkx_nodes(collab_graph, layout,
                       node_color='#AAAAAA', node_size=30, nodelist=uni_dots)

In [None]:
# Eigenvector centrality of every paper, highest score first

eigenvector_scores = nx.eigenvector_centrality(collab_graph)
dict(sorted(eigenvector_scores.items(), key=lambda pair: pair[1], reverse=True))

In [None]:
# In-degree centrality of every paper, highest score first

in_degree_scores = nx.in_degree_centrality(collab_graph)
dict(sorted(in_degree_scores.items(), key=lambda pair: pair[1], reverse=True))

In [None]:
# Betweenness centrality (normalized, endpoints included), highest score
# first, keeping only the papers with a score above zero

betweenness_scores = nx.betweenness_centrality(collab_graph, normalized=True, endpoints=True)
ranked_betweenness = sorted(betweenness_scores.items(), key=lambda pair: pair[1], reverse=True)
{node: score for node, score in ranked_betweenness if score > 0}

In [None]:
# Collaboration graph with the most central papers highlighted.
#
# Papers are grouped by which centrality measures (eigenvector, in-degree,
# betweenness) exceed 0.0005, and each group is drawn in its own colour with
# bubbles sized by in-degree centrality:
#   purple - all three measures
#   aqua   - in-degree + betweenness only
#   yellow - eigenvector + in-degree only
#   orange - eigenvector + betweenness only


def _above_threshold(scores, threshold = 0.0005):
    """Return the entries of ``scores`` whose value exceeds ``threshold``, highest first."""
    ranked = sorted(scores.items(), key = lambda item: item[1], reverse = True)
    return {node: value for node, value in ranked if value > threshold}


def _highlight_nodes(graph, positions, nodes, sizes, color, alpha, font_size):
    """Draw ``nodes`` of ``graph`` as coloured bubbles and label each with its node id."""
    nx.draw_networkx_nodes(graph, positions, nodelist = nodes,
                           node_size = sizes, node_color = color, alpha = alpha)
    nx.draw_networkx_labels(graph, positions,
                            labels = dict(zip(list(nodes), list(nodes))),
                            font_size = font_size)


plt.figure(figsize =(20, 20))
layout = nx.spring_layout(collab_graph, k = 0.7)

# Base layer: every edge and node in grey

nx.draw_networkx_edges(collab_graph, layout, edge_color = '#AAAAAA')

uni_dots = [node for node in collab_graph.nodes() 
            if node in collab_df[["paper_1", "paper_2"]].values]

nx.draw_networkx_nodes(collab_graph, layout, nodelist = uni_dots, 
                       node_size = 30, node_color = '#AAAAAA')

# Calculating different centralities (each one computed once and reused)

in_degree_scores = nx.in_degree_centrality(collab_graph)

eig_dict = _above_threshold(nx.eigenvector_centrality(collab_graph))

inde_dict = _above_threshold(in_degree_scores)

btn_dict = _above_threshold(nx.betweenness_centrality(collab_graph, normalized = True, endpoints = True))

# Finding and mapping intersections

intersection =  [node for node in collab_graph.nodes() 
                 if node in eig_dict.keys()
                 if node in inde_dict.keys() 
                 if node in btn_dict.keys()]

size_intersection = [in_degree_scores[node] * 500000 for node in intersection]

_highlight_nodes(collab_graph, layout, intersection, size_intersection, 'purple', 0.3, 12)

# Finding and mapping parts with higher in-degree + between centralities

inde_btn = [node for node in collab_graph.nodes()
            if node not in eig_dict.keys()
            if node in inde_dict.keys()
            if node in btn_dict.keys()]

size_inde_btn = [in_degree_scores[node] * 500000 for node in inde_btn]

_highlight_nodes(collab_graph, layout, inde_btn, size_inde_btn, 'aqua', 0.3, 10)

# Finding and mapping parts with higher eigenvalue + in-degree centralities

eig_inde = [node for node in collab_graph.nodes()
            if node in eig_dict.keys()
            if node in inde_dict.keys()
            if node not in btn_dict.keys()]

size_eig_inde = [in_degree_scores[node] * 500000 for node in eig_inde]

_highlight_nodes(collab_graph, layout, eig_inde, size_eig_inde, 'yellow', 0.3, 10)

# Finding and mapping parts with higher eigenvalue + betweenness centralities

eig_btn = [node for node in collab_graph.nodes()
           if node in eig_dict.keys()
           if node not in inde_dict.keys()
           if node in btn_dict.keys()]

size_eig_btn = [in_degree_scores[node] * 500000 for node in eig_btn]

_highlight_nodes(collab_graph, layout, eig_btn, size_eig_btn, 'orange', 0.4, 10)

plt.axis('off')
plt.title("Collaborative papers")

plt.show()

In [None]:
# Map each paper's result_id to its title (a later paper with the same
# result_id replaces an earlier one)

papers_map = {entry["result_id"]: entry["title"] for entry in papers}
len(papers_map)

In [None]:
# Look up the title stored for one specific result_id
papers_map["XkWqVRksXioJ"]

## Fulltext collection

- [Back to top](#Google-Scholar-paper-collection)

In [None]:
# Load the papers and authors JSON files into DataFrames

gs_papers = pd.read_json("./data/google-scholar/metadata/google-scholar-papers.json")
gs_authors = pd.read_json("./data/google-scholar/metadata/google-scholar-authors.json")

In [None]:
# List the column names of the papers table
gs_papers.columns

In [None]:
# Display the papers table
gs_papers

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts)
gs_papers.info()

In [None]:
# Descriptive statistics restricted to the object-dtype columns
gs_papers.describe(include=object)

In [None]:
# Count how often each r_file_format value occurs
gs_papers["r_file_format"].value_counts()

In [None]:
# Count how often each r_title value occurs
gs_papers["r_title"].value_counts()

In [None]:
# Downloading available fulltexts automatically
#
# For every paper whose r_file_format is exactly ["PDF"], the first resource
# link is downloaded to <parent_dir>/<publisher>/<slugified title>.pdf and a
# copy is placed directly in parent_dir. Files that already exist are skipped.
#   publishers  - per publisher: number of papers and their links
#   unavailable - link, target filename and error text of failed downloads
# NOTE(review): the first entries of r_title / r_link are assumed to belong to
# the PDF resource - confirm for papers that list several resources.

# Relative to the working directory, like the metadata paths used elsewhere in
# the notebook, so the cell does not depend on one user's home folder.
parent_dir = "./data/google-scholar/papers/"
publishers = {}
unavailable = []
j = 1

# The per-publisher folders are created inside parent_dir, so it must exist.
os.makedirs(parent_dir, exist_ok=True)

for i, row in gs_papers.iterrows():
    if row["r_file_format"] != ["PDF"]:
        continue

    publisher = row["r_title"][0]
    link = row["r_link"][0]
    publisher_dir = os.path.join(parent_dir, publisher)
    paper = os.path.join(publisher_dir, slugify(row["title"], separator='_', lowercase=False) + ".pdf")

    if publisher in publishers:
        publishers[publisher]["paper_count"] += 1
        publishers[publisher]["paper_links"].append(link)
    else:
        publishers[publisher] = {"paper_count": 1, "paper_links" : [link]}
        if not os.path.isdir(publisher_dir):
            os.makedirs(publisher_dir, exist_ok=True)
            print("Created directory:", publisher, "\n")

    if os.path.exists(paper):
        print(str(j), "--> Downloaded at:", paper, "\n")
    else:
        try:
            ur.urlretrieve(link, paper)
            shutil.copy(paper, parent_dir)
            print(str(j), "--> Downloaded at:", paper, "\n")
        except Exception as error:
            # `Exception` rather than a bare except, so interrupting the kernel
            # (KeyboardInterrupt) still stops this long-running loop.
            print(str(j), "--> Error:", link, "(" + str(error) + ")", "\n")
            unavailable.append({"link": link, "filename": paper, "error": str(error)})
            # Remove a partially written file; otherwise the next run would
            # find it and report the paper as downloaded.
            if os.path.exists(paper):
                os.remove(paper)
    j += 1

# 157 fulltexts were downloaded automatically, log saved

In [None]:
# Print a numbered list of the first resource link of every paper whose
# r_file_format is exactly ["HTML"] (fulltexts to be downloaded manually)

j = 1
for file_formats, links in zip(gs_papers["r_file_format"], gs_papers["r_link"]):
    if file_formats != ["HTML"]:
        continue
    print(j, "-->", links[0], "\n")
    j += 1

In [None]:
# Print a numbered list of the first resource link of every paper whose
# r_file_format is exactly ["DOC"]
j = 1
for file_formats, links in zip(gs_papers["r_file_format"], gs_papers["r_link"]):
    if file_formats != ["DOC"]:
        continue
    print(j, "-->", links[0], "\n")
    j += 1

# 333 fulltexts were downloaded manually, log saved

## Unavailable papers

- [Back to top](#Google-Scholar-paper-collection)

### Not online:

- https://www.academia.edu/download/67761160/nhess_2020_90.pdf
- https://www.currentscience.ac.in/data/forthcoming/397.pdf
- https://www.ijee.net/article_85007_6e39d36f6d36c1df57493a30a974f629.pdf
- https://repository.unescap.org/bitstream/handle/20.500.12870/4694/ESCAP-1995-RP-Improving-access-women-formal-credit-financial-institutions.pdf?sequence=1
- https://wellcomeopenresearch.s3.eu-west-1.amazonaws.com/manuscripts/20348/a99e9e04-e34b-4ce8-8b8e-fdfab2d44d45_17263_-_angela_mcbride_v3.pdf
- http://www.scielo.org.co/pdf/dyna/v85n204/0012-7353-dyna-85-204-00194.pdf
- https://demtech.oii.ox.ac.uk/wp-content/uploads/sites/127/2015/01/Trusted-Innovation-Project-Executive-Summary.pdf
- https://www.aaai.org/ocs/index.php/AAAI/AAAI10/paper/download/1644/2011

### Not accessible:

- https://asmedigitalcollection.asme.org/memagazineselect/article-abstract/142/04/36/1082911/Resilient-Technologies-Battling-Climate-ChangeAs?redirectedFrom=PDF
- https://www.researchgate.net/publication/348363962_A_novel_framework_for_risk_assessment_and_resilience_of_critical_infrastructure_towards_climate_change
- https://www.researchgate.net/publication/364769527_Cities_Allocating_climate_change_responsibilities_at_planetary_scale
- https://www.tandfonline.com/doi/abs/10.1080/10106049.2022.2088861?journalCode=tgei20
- https://meetingorganizer.copernicus.org/EGU22/EGU22-6568.html
- https://dl.acm.org/doi/abs/10.4018/jats.2010100103
- https://www.science.org/doi/10.1126/science.abj4216