# Fulltext analysis

This notebook is a work in progress - to be updated soon...

### To run this notebook, you need to install the following packages:

- `paperetl` (Python)
- `GROBID` (Java, [installation](https://grobid.readthedocs.io/en/latest/Install-Grobid/))
- `pyResearchInsights` (Python, needed only for the abstract-analysis section)

### The notebook is sectioned into the following parts:

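- [Parsing PDFs](#Parsing-PDFs)
- [Handling duplicates](#Handling-duplicates)
- [Finding reviews and summaries](#Finding-reviews-and-summaries)
- [Analyzing Springer abstracts with pyResearchInsights](#Analyzing-Springer-abstracts-with-pyResearchInsights)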

In [None]:
# Parsing PDFs

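# paperetl needs a running GROBID instance, and GROBID in turn needs a JVM.
# Start GROBID from a separate terminal (a `!cd` inside a notebook cell does not
# carry over to the next `!` command, and the server keeps running in the foreground):
#   wget https://github.com/kermitt2/grobid/archive/0.7.2.zip
#   unzip 0.7.2.zip
#   cd grobid-0.7.2
#   ./gradlew run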

import glob

In [None]:
# Analyzing fulltexts

import glob
import json
from pathlib import Path

## Parsing PDFs

- [Back to top](#Fulltext-analysis)

In [None]:
# Parsing Google Scholar PDFs
# !mkdir ./data/google-scholar/papers/json
# !python -m paperetl.file data/google-scholar/papers json://data/google-scholar/papers/json

# Parsing Springer PDFs
# !mkdir ./data/springer/papers/json
# !python -m paperetl.file data/springer/papers json://data/springer/papers/json

# Parsing arXiv PDFs
# !mkdir ./data/arxiv/papers/json
# !python -m paperetl.file data/arxiv/papers json://data/arxiv/papers/json

In [None]:
# Collect the parsed JSON files of each source and report how many there are.
json_pattern = "./data/{0}/papers/json/*.json"

gs_papers = glob.glob(json_pattern.format("google-scholar"))
s_papers = glob.glob(json_pattern.format("springer"))
a_papers = glob.glob(json_pattern.format("arxiv"))

parsed_counts = [len(found) for found in (gs_papers, s_papers, a_papers)]
print("Parsed {0}, {1}, {2} papers from Google Scholar (various publishers), Springer, and arXiv; respectively.".format(*parsed_counts))

## Handling duplicates

- [Back to top](#Fulltext-analysis)

In [None]:
def pdf_location_for(json_path):
    """Derive the path of the PDF that a parsed JSON file belongs to.

    Assumes the JSON file is named ``<pdf name>-<suffix>.json`` and sits in a
    ``papers/json/`` folder below the folder holding the PDFs -- TODO confirm
    against the actual paperetl output naming.

    Only the file name is searched for the trailing ``-<suffix>`` part, so a
    hyphen in a directory name (e.g. ``google-scholar``) never truncates the
    path, and a file name without any hyphen keeps its whole stem.

    Args:
        json_path: Path of the JSON file, using ``/`` as separator.

    Returns:
        The derived PDF path as a string.
    """
    directory, separator, filename = json_path.rpartition("/")
    stem = filename[:-len(".json")] if filename.endswith(".json") else filename
    if "-" in stem:
        # Drop only the last hyphen-delimited segment of the file name.
        stem = stem.rsplit("-", 1)[0]
    return (directory + separator + stem).replace("/papers/json/", "/papers/") + ".pdf"


# Load every parsed paper and group the records by title, so that titles
# occurring more than once can be inspected as potential duplicates.
all_papers = []
all_papers_map = {}
sources = ["google-scholar", "springer", "arxiv"]

for source in sources:
    # glob returns matches in arbitrary order; sort to keep runs reproducible.
    source_papers = sorted(glob.glob("./data/{0}/papers/json/*.json".format(source)))
    for source_paper in source_papers:
        with open(source_paper, encoding="utf-8") as json_file:
            paper = json.load(json_file)
        paper_dict = {
            "source": source,
            "location_json": source_paper,
            "location_pdf": pdf_location_for(source_paper),
            "details": paper,
        }
        all_papers.append(paper_dict)
        # One list of records per title; more than one entry means a duplicate title.
        all_papers_map.setdefault(paper["title"], []).append(paper_dict)
    print("Added papers from source:", source)
print("Total papers collected:", len(all_papers))
print("Total unique titles collected:", len(all_papers_map))

In [None]:
# Keep only the titles that were collected more than once and regroup the
# fields of their copies into parallel lists (one entry per copy).
duplicates = []

for versions in all_papers_map.values():
    if len(versions) <= 1:
        continue
    duplicates.append({
        "title": versions[0]["details"]["title"],
        "sources": [version["source"] for version in versions],
        "versions": [version["details"] for version in versions],
        "pdfs": [version["location_pdf"] for version in versions],
        "jsons": [version["location_json"] for version in versions],
    })
print("Separated {} duplicate papers for further analysis".format(len(duplicates)))

In [None]:
# Write a trimmed copy of every duplicate group (without the parsed paper
# contents) plus an empty "comment" field to a JSON log.
# NOTE(review): the comment field is presumably filled in by hand and the file
# saved as duplicates-log-commented.json, which the next cell reads -- confirm.
duplicates_log = [
    {
        "title": duplicate["title"],
        "sources": duplicate["sources"],
        "pdfs": duplicate["pdfs"],
        "jsons": duplicate["jsons"],
        "comment": "",
    }
    for duplicate in duplicates
]

with open("./log/duplicates-log.json", "w") as duplicates_log_file:
    json.dump(duplicates_log, duplicates_log_file, indent=2)

In [None]:
# Load the commented duplicates log and show the comment stored for each title.
with open("./log/duplicates-log-commented.json", "r") as commented_log_file:
    duplicates_log_commented = json.load(commented_log_file)

for entry in duplicates_log_commented:
    print(entry["title"], "-->", entry["comment"])

In [None]:
# Re-count the JSON files of each source now that duplicates have been handled.
json_pattern = "./data/{0}/papers/json/*.json"

gs_papers = glob.glob(json_pattern.format("google-scholar"))
s_papers = glob.glob(json_pattern.format("springer"))
a_papers = glob.glob(json_pattern.format("arxiv"))

remaining_counts = [len(found) for found in (gs_papers, s_papers, a_papers)]
print("After handling duplicates, {0}, {1}, {2} papers from Google Scholar (various publishers), Springer, and arXiv are left; respectively.".format(*remaining_counts))

## Finding reviews and summaries

- [Back to top](#Fulltext-analysis)

In [None]:
def load_papers(sources, data_dir="./data"):
    """Load every parsed paper JSON found under ``<data_dir>/<source>/papers/json``.

    The PDF location is derived from the JSON file name by dropping its last
    hyphen-delimited segment. This assumes files are named
    ``<pdf name>-<suffix>.json`` -- TODO confirm against the paperetl output.
    Only the file name is inspected, so hyphens in directory names (e.g.
    ``google-scholar``) do not truncate the path, and a file name without a
    hyphen keeps its whole stem.

    Args:
        sources: Names of the per-source sub-folders to scan.
        data_dir: Folder containing the per-source sub-folders.

    Returns:
        A list of dicts with the keys ``source``, ``location_json``,
        ``location_pdf`` and ``details`` (the parsed JSON content).
    """
    papers = []
    for source in sources:
        pattern = "{0}/{1}/papers/json/*.json".format(data_dir, source)
        # glob returns matches in arbitrary order; sort to keep runs reproducible.
        for location_json in sorted(glob.glob(pattern)):
            with open(location_json, encoding="utf-8") as json_file:
                details = json.load(json_file)
            directory, separator, filename = location_json.rpartition("/")
            stem = filename[:-len(".json")] if filename.endswith(".json") else filename
            if "-" in stem:
                stem = stem.rsplit("-", 1)[0]
            location_pdf = (directory + separator + stem).replace("/papers/json/", "/papers/") + ".pdf"
            papers.append({
                "source": source,
                "location_json": location_json,
                "location_pdf": location_pdf,
                "details": details,
            })
        print("Added papers from source:", source)
    return papers


# Reload the papers from disk so the list reflects the files currently present.
sources = ["google-scholar", "springer", "arxiv"]
all_papers = load_papers(sources)
print("Total papers collected:", len(all_papers))

In [None]:
# Print a file:// link to the PDF of every paper whose title contains one of
# the overview keywords.
overview_keywords = ("review", "summary", "survey")

for paper in all_papers:
    # Compare in lower case so capitalised titles ("A Review of ...") match too.
    # `or ""` keeps a null title from raising -- TODO confirm titles can be null.
    title = (paper["details"]["title"] or "").lower()
    if any(keyword in title for keyword in overview_keywords):
        # Resolve against the current working directory instead of a
        # machine-specific absolute prefix; as_uri() also percent-encodes the path.
        file_url = Path(paper["location_pdf"]).resolve().as_uri()
        print(file_url, "\n")

In [None]:
# Show which fields the record of the first collected paper carries.
first_paper = all_papers[0]
first_paper.keys()

In [None]:
# Show which fields the "details" entry of the first collected paper carries.
first_paper_details = all_papers[0]["details"]
first_paper_details.keys()

## Analyzing Springer abstracts with pyResearchInsights

- [Back to top](#Fulltext-analysis)

In [None]:
from pyResearchInsights.Cleaner import cleaner_main

# Run the pyResearchInsights cleaner on the abstracts file, passing "test_run"
# as the status logger name.
# NOTE(review): the following cells read abstracts_CLEANED.txt, presumably the
# file this step produces -- confirm.
status_logger_name = "test_run"
abstracts_log_name = "./LOGS/log/abstracts.txt"
cleaner_main(abstracts_log_name, status_logger_name)

In [None]:
from pyResearchInsights.Analyzer import analyzer_main

# Run the pyResearchInsights analyzer on the cleaned abstracts file, passing
# "test_run" as the status logger name.
status_logger_name = "test_run"
abstracts_log_name = "./LOGS/log/abstracts_CLEANED.txt"
analyzer_main(abstracts_log_name, status_logger_name)

In [None]:
from pyResearchInsights.NLP_Engine import nlp_engine_main

# Run the pyResearchInsights NLP engine on the cleaned abstracts file, passing
# "test_run" as the status logger name.
status_logger_name = "test_run"
abstracts_log_name = "./LOGS/log/abstracts_CLEANED.txt"
nlp_engine_main(abstracts_log_name, status_logger_name)