# An open-source Python framework for systematic review based on PRISMA

#### Required Dependencies

In [None]:
!pip install rispy
!pip install pandas
!pip install matplotlib
!pip install seaborn

Collecting rispy
  Downloading rispy-0.7.1-py3-none-any.whl (15 kB)
Installing collected packages: rispy
Successfully installed rispy-0.7.1


#### Installing systematic-reviewpy

In [None]:
!python3 -m pip install systematic-reviewpy

Google Colab (Jupyter notebook) instructions:  
`Ctrl m m` will convert a code cell to a text cell.       
 `Ctrl m y` will convert a text cell to a code cell.       

##### Install pdftotext dependencies: system libraries needed by the Python PDF readers used for validation and for counting search words in PDF text.

<font color="#F7B905" size="3">Please run only the cell that matches your OS and keep the other cells as Markdown.</font>

In [None]:
##### Debian, Ubuntu, and friends
!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev

##### Fedora, Red Hat, and friends
!sudo yum install gcc-c++ pkgconfig poppler-cpp-devel python3-devel

##### macOS
!brew install pkg-config poppler python

##### Windows using conda
!conda install -c conda-forge poppler

#### Install python pdf readers

In [None]:
## https://pypi.org/project/PyMuPDF/
!python -m pip install --upgrade pip
!python -m pip install --upgrade pymupdf
## https://pypi.org/project/pdftotext/
!pip install pdftotext

Reading package lists... Done
Building dependency tree       
Reading state information... Done
build-essential is already the newest version (12.4ubuntu1).
pkg-config is already the newest version (0.29.1-0ubuntu2).
python3-dev is already the newest version (3.6.7-1~18.04).
python3-dev set to manually installed.
The following additional packages will be installed:
  libpoppler-cpp0v5
The following NEW packages will be installed:
  libpoppler-cpp-dev libpoppler-cpp0v5
0 upgraded, 2 newly installed, 0 to remove and 37 not upgraded.
Need to get 36.7 kB of archives.
After this operation, 188 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libpoppler-cpp0v5 amd64 0.62.0-2ubuntu2.12 [28.0 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libpoppler-cpp-dev amd64 0.62.0-2ubuntu2.12 [8,676 B]
Fetched 36.7 kB in 1s (30.6 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is i

#### Importing systematic-reviewpy

In [None]:
import systematic_review

ModuleNotFoundError: ignored

Most of the objects provide methods such as `to_csv` and `to_excel` for writing their results to files.

Check documentation for more string manipulation methods : 
- preprocess_string (default and applied before all other implemented functions)
- custom_text_manipulation_function : for supplying your own function to preprocess the text
- nltk_remove_stopwords
- pattern_lemma_or_lemmatize_text 
- nltk_word_net_lemmatizer 
- nltk_porter_stemmer
- nltk_lancaster_stemmer 
- spacy_lemma 
- nltk_remove_stopwords_spacy_lemma 
- convert_string_to_lowercase
- preprocess_string_to_space_separated_words

<font color="#F7B905" size="3">Please provide the name of the string manipulation method.</font>

In [None]:
# Name of the string manipulation method; it is passed to SearchWords and SearchCount below.
# Choose one of the method names listed in the markdown cell above.
string_manipulation_method = 'convert_string_to_lowercase'

## Optional: Converting and wrangling citation files

Wrangling or modifying the citation files is required only if a format error occurs while uploading the files into the reference manager.

In [None]:
# Optional step (uncomment to run): judging by its name and arguments, converts the Springer
# CSV citation export into an RIS file.
# The call is qualified with `systematic_review.` because only that package is imported in
# this notebook; a bare `citation` name would raise NameError.
#systematic_review.citation.csv_citations_to_ris_converter("./Data files and Python Code/Downloaded files/springer.csv", "./Data files and Python Code/Modified files/springer.ris")

In [None]:
# Optional step (uncomment to run): judging by its name and arguments, writes a copy of the
# downloaded MDPI RIS file with empty lines removed.
# The call is qualified with `systematic_review.` because only that package is imported in
# this notebook; a bare `citation` name would raise NameError.
#systematic_review.citation.remove_empty_lines("./Data files and Python Code/Downloaded files/entropy-v12-i12_20210610.ris", "./Data files and Python Code/Modified files/MDPI.ris")

In [None]:
# Optional step (uncomment to run): reads MDPI.ris and writes an edited copy to mdpi.ris;
# the exact edit is defined by the library function — see its documentation.
# The call is qualified with `systematic_review.` because only that package is imported in
# this notebook; a bare `citation` name would raise NameError.
#systematic_review.citation.edit_ris_citation_paste_values_after_regex_pattern("./Data files and Python Code/Modified files/MDPI.ris", "./Data files and Python Code/Modified files/mdpi.ris")

In [None]:
# Optional clean-up (uncomment to run): delete the intermediate MDPI.ris file.
# NOTE(review): "MDPI.ris" and "mdpi.ris" differ only by case; on a case-insensitive
# filesystem they are the same file, so this would delete the edited output too — confirm
# before running there.
#import os
#os.remove("./Data files and Python Code/Modified files/MDPI.ris")

## All files are uploaded to the Mendeley reference manager, updated using the Mendeley database, and downloaded in RIS format.

<font color="#F7B905" size="3">Please provide the path of the folder that contains all citations ris files.</font>

In [None]:
# Folder containing the citation RIS files; passed to Citations and SystematicReviewInfo below.
CITATIONS_FILES_PARENT_DIR_PATH = "./Data files and Python Code/Articles_by_sources"

In [None]:
# Build the Citations object from the folder of citation files configured above.
citations = systematic_review.citation.Citations(CITATIONS_FILES_PARENT_DIR_PATH)

In [None]:
# Get the loaded citations as a dataframe (presumably a pandas DataFrame — confirm in the
# library docs); the bare name on the last line displays it as the cell output.
citations_df = citations.get_dataframe()
citations_df

## Search Words

<span style="color:green">Please provide the path of search_words.json or make keyword dictionary.</span>

In [None]:
# Produce the sample search-words template that the next step asks you to edit.
# Presumably written to "./sample_search_words_template.json", the default path used below
# — TODO confirm against the library docs.
systematic_review.search_count.SearchWords().get_sample_keywords_json()

<font color="#F7B905" size="3">Edit the template as needed and provide its file path in the cell below. If the filename and location are unchanged, nothing needs to be changed.</font>

In [None]:
# Path of the search-words JSON file, passed to SearchWords below. The default is the
# sample template; point it at your own file if you saved it elsewhere,
# e.g. "./Data files and Python Code/keywords.json".
SEARCH_WORDS_JSON_FILE_PATH = "./sample_search_words_template.json"

In [None]:
# Build the SearchWords object from the JSON file, passing the string manipulation method
# chosen earlier in the notebook.
search_words = systematic_review.search_count.SearchWords(SEARCH_WORDS_JSON_FILE_PATH, string_manipulation_method)

In [None]:
# Print the loaded search words so the file contents can be checked before counting.
print(search_words.value)

## Search and count words in citations

In [None]:
# Set up the search-word count over the citations dataframe, using the same search words
# and string manipulation method as above.
citations_search_words_count = systematic_review.search_count.SearchCount(citations_df, search_words, string_manipulation_method)

In [None]:
# Get the citation counts as a dataframe; the bare name on the last line displays it.
# This dataframe is the input to FilterSort in the next section.
citations_search_words_count_df = citations_search_words_count.get_dataframe()
citations_search_words_count_df

citations_search_words_count.to_csv("./Data files and Python Code/OutputFiles/citations_keywords_count_df.csv")

## Sort and Filter the citations

<font color="#F7B905" size="3">Please provide how many research papers are needed.</font>

In [None]:
# Filter the citations to required number
required_citations_number = 500

In [None]:
# Set up filtering and sorting of the counted citations, given the search words and the
# required number of citations.
filter_sorted_citations = systematic_review.filter_sort.FilterSort(citations_search_words_count_df, search_words, required_citations_number)

In [None]:
# Dataframe of the filtered and sorted citations; used by Validation and
# SystematicReviewInfo later in the notebook.
filter_sorted_citations_df = filter_sorted_citations.get_dataframe()

In [None]:
# Print how many citations remain, for comparison with required_citations_number.
print(len(filter_sorted_citations_df))

filter_sorted_citations.to_csv("./Data files and Python Code/OutputFiles/filter_sorted_citations_df.csv")

## Downloading above selected pdf from databases.

This is completed with [browser-automationpy](https://github.com/chandraveshchaudhari/browser-automationpy)

## Validating the downloaded articles

<font color="#F7B905" size="3">Please provide parent directory path of all downloaded research papers.</font>

In [None]:
# Parent directory of all downloaded research papers; passed to Validation below.
DOWNLOADED_ARTICLES_PATH = "./Data files and Python Code/downloadedArticles"

<font color="#F7B905" size="3">Please provide path of text file containing names of research papers separated by new line OR write None.</font>

In [None]:
# Text file listing, one per line, the names of research papers that could not be accessed.
# Per the instruction above, set this to None if there is no such file.
IN_ACCESSIBLE_ARTICLES_TEXT_FILE_PATH = "./Data files and Python Code/not_accessible_articles.txt"

In [None]:
# Set up validation of the downloaded articles: Validation receives the selected citations,
# the downloaded-articles folder, and the list of inaccessible articles.
validation = systematic_review.validation.Validation(filter_sorted_citations_df, DOWNLOADED_ARTICLES_PATH, IN_ACCESSIBLE_ARTICLES_TEXT_FILE_PATH)

In [None]:
# Validation result as a dataframe; this is the input to the full-text search count below.
validated_research_papers = validation.get_dataframe()

In [None]:
# Report on the validation result; the content of the report is defined by the library.
validation.info()

validation.to_csv("validation.csv")

## Search and count the research papers files.

In [None]:
# Set up the same SearchCount step as for the citations, this time over the validated
# research papers.
research_paper_search_words_count = systematic_review.search_count.SearchCount(validated_research_papers, search_words, string_manipulation_method)

In [None]:
# Research-paper counts as a dataframe; filtered and sorted in the next section.
research_paper_search_words_count_df = research_paper_search_words_count.get_dataframe()

research_paper_search_words_count.to_csv("./Data files and Python Code/OutputFiles/pdf_keywords_count_df.csv")

## Filter and sort pdf counted df

<font color="#F7B905" size="3">Please provide how many research papers are needed.</font>

In [None]:
# Number of full-text documents required for the review; passed to FilterSort in the next cell.
required_full_text_documents = 100

In [None]:
# Set up filtering and sorting of the counted research papers, given the search words and
# the required number of documents.
filter_sorted_research_papers = systematic_review.filter_sort.FilterSort(research_paper_search_words_count_df, search_words, required_full_text_documents)

In [None]:
# Dataframe of the selected review articles; used for the review files and the workflow
# summary below.
selected_review_articles_df = filter_sorted_research_papers.get_dataframe()

filter_sorted_research_papers.to_csv("./Data files and Python Code/OutputFiles/selected_review_articles_df.csv")

### Generating research papers review files: 
choose any of following

- sorted based on sources: to make it easier to find articles in folder.

In [None]:
# Sort the selected articles by their 'source' column, making them easier to find in the
# downloaded-articles folder.
sorted_Finaldf = systematic_review.filter_sort.sort_dataframe_based_on_column(selected_review_articles_df, 'source')

In [None]:
# Optional: uncomment to save the source-sorted table as a CSV file.
#sorted_Finaldf.to_csv("./Data files and Python Code/OutputFiles/sorted_Finaldf.csv")

- Creating the sample literature review file:      
by adding review columns in which details are entered manually. The keyword counts are not required at this point, so they are dropped.

In [None]:
# Step 1: drop the search-word count columns from the source-sorted table.
selected_citation = systematic_review.citation.drop_search_words_count_columns(sorted_Finaldf, search_words)
# Step 2: create the sample review file from the remaining columns; per the note above,
# it adds review columns to be filled in manually.
selected_citation_review = systematic_review.analysis.creating_sample_review_file(selected_citation)

selected_citation_review.to_csv("./Data files and Python Code/OutputFiles/selected_citation_review.csv")

# Systematic Review Workflow diagram and info

In [None]:
# Gather the artefact of every review stage into one summary object. Arguments, in order:
# the citations folder, the filtered citations, the validated papers, and the final selection.
my_analysis = systematic_review.analysis.SystematicReviewInfo(
    CITATIONS_FILES_PARENT_DIR_PATH,
    filter_sorted_citations_df,
    validated_research_papers,
    selected_review_articles_df,
)

In [None]:
# Report the systematic-review workflow information; the content is defined by the library.
my_analysis.info()

In [None]:
# Draw the systematic-review workflow diagram for the stages collected in my_analysis.
my_analysis.systematic_review_diagram()

## Analysis

| Analysis needed                                     | Fact table | Diagram |
| --------------------------------------------------- | ---------- | ------- |
| The number of articles                              | yes        | no      |
| Period of the publications                          | yes        | yes     |
| Number of authors                                   | yes        | no      |
| Articles with single authors                        | yes        | no      |
| Articles per authors                                | yes        | no      |
| Authors per articles                                | yes        | no      |
| Top N countries with the highest number of articles | yes        | yes     |
| Top N journals with the highest number of articles  | yes        | yes     |
| Top N keywords most used in the articles            | yes        | yes     |
| The year with the highest number of articles        | yes        | yes     |

In [None]:
# Citation analysis over the source-sorted selection; all *_info / *_diagram cells below use it.
my_cite_analysis = systematic_review.analysis.CitationAnalysis(sorted_Finaldf)

In [None]:
# Publication-year facts for the selected articles; output format is defined by the library.
my_cite_analysis.publication_year_info()

In [None]:
# Diagram counterpart of the publication-year facts.
my_cite_analysis.publication_year_diagram()

In [None]:
# Author facts for the selected articles; presumably covers the author rows of the Analysis
# table above — confirm against the library docs.
my_cite_analysis.authors_info()

In [None]:
# Publication-place facts for the selected articles; output format is defined by the library.
my_cite_analysis.publication_place_info()

In [None]:
# Diagram counterpart of the publication-place facts.
my_cite_analysis.publication_place_diagram()

In [None]:
# Keyword facts for the selected articles; output format is defined by the library.
my_cite_analysis.keywords_info()

In [None]:
# Keyword diagram; top_result=10 presumably limits it to the 10 most-used keywords
# — confirm against the library docs.
my_cite_analysis.keyword_diagram(top_result=10)

In [None]:
# Publisher facts for the selected articles; output format is defined by the library.
my_cite_analysis.publisher_info()

In [None]:
# Diagram counterpart of the publisher facts.
my_cite_analysis.publisher_diagram()

# End of the File