#### Publication Paper Analysis

From paper extract/generate:
1. title
2. author
3. publication date
4. publisher
5. summary

In [1]:
# !pip install PyPDF2

In [1]:
# Imports
import pandas as pd
import numpy as np
import time
# PyPDF2 for loading PDF docs
import PyPDF2
import openai
from langchain.llms import OpenAI
from langchain.llms import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
import time
import os
os.environ["OPENAI_API_KEY"] = ""
api_key = ""

In [2]:
# specify directory path to read all PDF files from directory
directory = "../DGreen_test_papers-20230726T075419Z-001/DGreen_test_papers/"

In [3]:
llm = OpenAI(openai_api_key=api_key, temperature=0)

In [4]:
# function to extract title, author, publisher and date of publication from paper
# Implement catching connection lost error from chatGPT server being overloaded
def paper_info(pdf_file_path, llm):
    pdf_reader = PyPDF2.PdfReader(open(pdf_file_path, 'rb'))
    pdf_page_1 = pdf_reader.pages[0].extract_text()
    query_ = "Provide the following information from this paper: title, author, publisher, and date of publication"
    info = llm(pdf_page_1 + query_)
    return info.strip()

In [5]:
files = os.listdir(directory)
print(len(files))

52


In [4]:
# create file to save paper details generated by the chatGPT model
file_name = os.path.join(directory, "paper_details.txt")

In [5]:
print(file_name)

../DGreen_test_papers-20230726T075419Z-001/DGreen_test_papers/paper_details.txt


In [None]:
with open(file_name, 'w', encoding="utf-8") as file:
    print("writing...")

In [17]:
# removed parameters start_index and last_index
def extract_papers_info(directory, llm):
    files = os.listdir(directory)
    start_index = 0
    pdf_info = ""
    while start_index < len(files):
        pdf_file = files[start_index]
        if pdf_file.endswith('.pdf'):
            pdf_file_path = os.path.join(directory, pdf_file)
            pdf_info = paper_info(pdf_file_path, llm)
            # txt_file = pdf_file_path.replace(os.path.splitext(pdf_file_path)[1], "paper_details.txt")
            with open(file_name, 'a', encoding="utf-8") as file:
                file.write(pdf_info)
                file.write("\n")
                # print()
            # print("\n")
        start_index = start_index + 1
        time.sleep(5)

In [18]:
# Think of handling case when chatGPT isn't responding

In [19]:
extract_papers_info(directory, llm)

In [6]:
file = open(file_name)
file_content = file.readlines()
print(len(file_content))

200


In [7]:
refined_ = os.path.join(directory, "refined_paper_details.txt")
# opening file and writing new one without trailing spaces and empty new lines
with open(
    file_name, 'r') as r, open(
    refined_, 'w', encoding="utf-8") as w:
    for line in r:
        if not line.isspace():
            w.write(line)

In [8]:
f = open(refined_)
f_content = f.readlines()
print(len(f_content))

200


In [9]:
# Compute indixes for determining right key for title, author, ...
# create list of titles, authors, etc.
len_t = len("Title: ")
len_a = len("Author: ")
len_p = len("Publisher: ")
len_d = len("Date of publication: ")
titles = []
authors = []
publishers = []
dates_p = []
for line in f_content:
    end_index = len(line) - 1
    if line.startswith("Title"):
        titles.append(line[len_t:end_index])
    elif line.startswith("Author"):
        authors.append(line[len_a:end_index])
    elif line.startswith("Publisher"):
        publishers.append(line[len_p:end_index])
    elif line.startswith("Date"):
        dates_p.append(line[len_d:end_index])

In [11]:
# Creating a dataframe
data = {"Title_pred": titles, "Author_pred": authors, "Publisher_pred": publishers, "Date_published_pred": dates_p}
df = pd.DataFrame(data)

In [12]:
df.head()

Unnamed: 0,Title_pred,Author_pred,Publisher_pred,Date_published_pred
0,Cultivation and Processing of Potato in Bihar:...,"K. M. Singh, Abhay Kumar",ICAR Research Complex for Eastern Region,October 2014
1,The Potato Value Chain in Bihar: An Assessment...,"K. M. Singh, Rajib Sutradhar",SSRN Electronic Journal,January 2011
2,Potato Diseases: A Constraint in Potato Cultiv...,Ajay Kumar and Dr Jai P. Rai,Guru Gobind Singh Indr aprastha University and...,January 2016
3,Training Needs of Potato Growers in Nalanda Di...,S.L. Verma and M.N. Ansari,Hind Agricultural Research and Training Institute,"August, 2013"
4,Bihar Agriculture Growth and Reform Initiative...,"Submitted to Department of Agriculture, Govern...",,"August, 2018"


In [13]:
df.shape

(50, 4)

In [14]:
# loading csv file with paper details entered manually
df_papers = pd.read_csv('../DGreen_test_papers-20230726T075419Z-001/DGreen_test_papers/Papers_details.csv')
df_papers.head()

Unnamed: 0,Title,Author,Publisher,Date_published
0,Cultivation and Processing of Potato in Bihar:...,"K. M. Singh, Abhay Kumar, Tara Shankar, S. K. ...",Environment & Ecology 32 (4B),May 2014
1,The Potato Value Chain in Bihar: An assessment...,"K. M. Singh, Rajib Sutradhar",Electronic Journal,January 2011
2,Potato diseases: a constraint potato cultivation,"Ajay Kumar, Jai P. Rai",Indian Agriculture and Farmers,January 2016
3,Training needs of potato growers in Nalanda di...,"S. L. Verma, M.N. Ansari",Hind Agricultural Research and training institute,August 2013
4,Bihar Agriculture Growth and reform initiative,,,August 2018


In [15]:
# Preprocessing values in dataset (df) with chatGPT extracted details
# Fill N/A, Not Specified, Unkown with NaNs
to_replace = ["N/A", "Not Specified.", "Not Specified", "Unknown", "Unknown.", "Unknow", "N/A."]
for s in to_replace:
    df.replace(s, np.nan, inplace=True)

In [16]:
df

Unnamed: 0,Title_pred,Author_pred,Publisher_pred,Date_published_pred
0,Cultivation and Processing of Potato in Bihar:...,"K. M. Singh, Abhay Kumar",ICAR Research Complex for Eastern Region,October 2014
1,The Potato Value Chain in Bihar: An Assessment...,"K. M. Singh, Rajib Sutradhar",SSRN Electronic Journal,January 2011
2,Potato Diseases: A Constraint in Potato Cultiv...,Ajay Kumar and Dr Jai P. Rai,Guru Gobind Singh Indr aprastha University and...,January 2016
3,Training Needs of Potato Growers in Nalanda Di...,S.L. Verma and M.N. Ansari,Hind Agricultural Research and Training Institute,"August, 2013"
4,Bihar Agriculture Growth and Reform Initiative...,"Submitted to Department of Agriculture, Govern...",,"August, 2018"
5,Organic Potato in Nalanda (Bihar): Using Eco-F...,M.D. Ojha and Bholanath Saha,Indian Res. J. Ext. Edu.,"September, 2014"
6,Potato Production Scenario and Analysis of its...,Rajesh K Rana and Md. Ejaz Anwer,Indian Journal of Agricultural Sciences,September 2018
7,Analysis of Yield and Technological Gaps of Po...,"Dhiraj K. Singh, N.K. Pandey, P . Kharumnuid, ...",Economic Affairs,March 2020
8,Origin and History of Potato,,,
9,Development of Potato in Bihar: Issues and Str...,K.M. Singh and Abhay Kumar,"ICAR-RCER, Patna, India",3 November 2013


In [38]:
# !pip install torchmetrics

In [17]:
# concant both datasets
merged_df = pd.concat([df_papers, df], axis=1)

In [47]:
merged_df.head()

Unnamed: 0,Title,Author,Publisher,Date_published,Title_pred,Author_pred,Publisher_pred,Date_published_pred
0,Cultivation and Processing of Potato in Bihar:...,"K. M. Singh, Abhay Kumar, Tara Shankar, S. K. ...",Environment & Ecology 32 (4B),May 2014,Cultivation and Processing of Potato in Bihar:...,"K. M. Singh, Abhay Kumar",ICAR Research Complex for Eastern Region,October 2014
1,The Potato Value Chain in Bihar: An assessment...,"K. M. Singh, Rajib Sutradhar",Electronic Journal,January 2011,The Potato Value Chain in Bihar: An Assessment...,"K. M. Singh, Rajib Sutradhar",SSRN Electronic Journal,January 2011
2,Potato diseases: a constraint potato cultivation,"Ajay Kumar, Jai P. Rai",Indian Agriculture and Farmers,January 2016,Potato Diseases: A Constraint in Potato Cultiv...,Ajay Kumar and Dr Jai P. Rai,Guru Gobind Singh Indr aprastha University and...,January 2016
3,Training needs of potato growers in Nalanda di...,"S. L. Verma, M.N. Ansari",Hind Agricultural Research and training institute,August 2013,Training Needs of Potato Growers in Nalanda Di...,S.L. Verma and M.N. Ansari,Hind Agricultural Research and Training Institute,"August, 2013"
4,Bihar Agriculture Growth and reform initiative,,,August 2018,Bihar Agriculture Growth and Reform Initiative...,"Submitted to Department of Agriculture, Govern...",,"August, 2018"


In [20]:
# computing the error in extracting titles, authors, publilshers and date published
from torchmetrics.text import CharErrorRate
# cherr_ = CharErrorRate()

# total number of rows 
rows = len(merged_df.index)

def get_char_err(df, col1, col2):
    cherr_ = CharErrorRate()
    df_cols = df[[col1, col2]].dropna()
    col_err = cherr_(df_cols[col1].str.lower(), df_cols[col2].str.lower())
    len_cols = len(df_cols)
    return col_err, len_cols

err_t, len_t = get_char_err(merged_df, "Title", "Title_pred")
err_auth, len_auth = get_char_err(merged_df, "Author", "Author_pred")
err_pub, len_pub = get_char_err(merged_df, "Publisher", "Publisher_pred")
err_date, len_date = get_char_err(merged_df, "Date_published", "Date_published_pred")

print(f'''
      Total number of research papers/articles: {rows}\n
      Error in predicting titles is {err_t * 100:.2f}% for {len_t}/{rows}\n
      Error in predicting authors is {err_auth * 100:.2f}% for {len_auth}/{rows}\n
      Error in predicting publishers is {err_pub * 100:.2f}% for {len_pub}/{rows}\n
      Error in predicting publication dates is {err_date * 100:.2f}% for {len_date}/{rows}
      ''')


      Total number of research papers/articles: 50

      Error in predicting titles is 10.65% for 50/50

      Error in predicting authors is 58.89% for 31/50

      Error in predicting publishers is 79.32% for 24/50

      Error in predicting publication dates is 19.05% for 28/50
      


In [81]:
# Iterate to get error on each column row and store error to dataframe
# Declare a method to compute error rate on each column row
def get_row_err(df, col1, col2):
    cherr_ = CharErrorRate()
    df_cols = df[[col1, col2]].dropna()
    df_cols.reset_index(inplace = True, drop = True) # drop removes old indices
    list_err = []
    
    # iterate and get error for each row
    for i in range(len(df_cols)):
        col_err = cherr_(df_cols[col1].str.lower()[i], df_cols[col2].str.lower()[i])
        cer = f'''{col_err * 100:.2f}''' # turn error to percentage and limit to 2 decimal places
        list_err.append(cer)
    
    # convert list to dataframe
    col_name = f"error_{col1}"
    df_err = pd.DataFrame({col_name: list_err})
    
    # append column to df_cols
    df_cols[col_name] = df_err[col_name]
    
    return df_cols

In [82]:
title_sub_df = get_row_err(merged_df, "Title", "Title_pred")

In [83]:
title_sub_df.head()

Unnamed: 0,Title,Title_pred,error_Title
0,Cultivation and Processing of Potato in Bihar:...,Cultivation and Processing of Potato in Bihar:...,0.0
1,The Potato Value Chain in Bihar: An assessment...,The Potato Value Chain in Bihar: An Assessment...,7.14
2,Potato diseases: a constraint potato cultivation,Potato Diseases: A Constraint in Potato Cultiv...,5.88
3,Training needs of potato growers in Nalanda di...,Training Needs of Potato Growers in Nalanda Di...,0.0
4,Bihar Agriculture Growth and reform initiative,Bihar Agriculture Growth and Reform Initiative...,16.36


In [84]:
# Computing error for author, publisher and publication date
author_sub_df = get_row_err(merged_df, "Author", "Author_pred")
publisher_sub_df = get_row_err(merged_df, "Publisher", "Publisher_pred")
date_sub_df = get_row_err(merged_df, "Date_published", "Date_published_pred")

In [1]:
author_sub_df

NameError: name 'author_sub_df' is not defined

In [106]:
merged_inter_title = pd.merge(merged_df, title_sub_df, how = 'inner', on = ["Title", "Title_pred"])

In [107]:
len(merged_inter_title)

50

In [116]:
# Creating a CSV file after adding error columns to the merged dataset
# merge with title error df
merged_err_df2 = pd.merge(merged_df, title_sub_df["Title"], how = 'inner', on = "Title")

In [117]:
len(merged_err_df2)

50

In [21]:
# Finding authors that were dropped from the author error calc
dropped_authors = merged_df[merged_df["Author"].isnull() | merged_df["Author_pred"].isnull()]

In [24]:
merged_df[["Date_published", "Date_Published_pred", "Publisher", "Publisher_pred"]]

Unnamed: 0,Date_published,Date_Published_pred,Publisher,Publisher_pred
0,May 2014,October 2014,Environment & Ecology 32 (4B),ICAR Research Complex for Eastern Region
1,January 2011,January 2011,Electronic Journal,SSRN Electronic Journal
2,January 2016,January 2016,Indian Agriculture and Farmers,Guru Gobind Singh Indr aprastha University and...
3,August 2013,"August, 2013",Hind Agricultural Research and training institute,Hind Agricultural Research and Training Institute
4,August 2018,"August, 2018",,
5,September 2014,"September, 2014",Indian Res. J. Ext. Edu.,Indian Res. J. Ext. Edu.
6,May 2018,September 2018,ICAR-National Institute of Agricultural Econom...,Indian Journal of Agricultural Sciences
7,March 2020,March 2020,Economic Affairs,Economic Affairs
8,,,,
9,November 2013,3 November 2013,ICAR-RCER,"ICAR-RCER, Patna, India"


#### Predicting chatGPT accuracy on Paper details
This portion is computing the accuracy of the chatGPT model by comparing the paper details (author, title, publisher, and date of publication) gathered manually

##### Summarize document using chatGPT model

In [77]:
# Implement catching connection lost error from chatGPT server being overloaded
def summarize_pdf_gpt(pdf_file_path):
    # Declare a string variable to hold the summary  
    summary = ""

    # Open file
    pdf_file = open(pdf_file_path, 'rb')
    # Read pdf file with PyPDF2
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Iterate over the pages in the PDF file
    for page_num in range(len(pdf_reader.pages)):
        # Extract the text from the page
        page_content = pdf_reader.pages[page_num].extract_text().lower()
        messages = [
            {"role": "system", "content": "You are a helpful research associate"},
            {"role": "user", "content": f"summarize this: {page_content}"},
        ]

        response = openai.ChatCompletion.create(
                        api_key,
                        model = "gpt-3.5-turbo-0613",
                        messages = messages,
        )
        page_summary = response["choices"][0]["message"]["content"]
        summary = summary + page_summary + "\n"

    pdf_file.close()
    return summary.strip()

In [61]:
files_directory = os.listdir('../DGreen_test_papers-20230726T075419Z-001/DGreen_test_papers/')

In [62]:
len(files_directory)

56

In [63]:
files_directory[2]

'02.pdf'

In [64]:
pdf_file_path = '../DGreen_test_papers-20230726T075419Z-001/DGreen_test_papers/06.pdf'

In [68]:
summary = summarize_pdf_gpt(pdf_file_path)

In [69]:
print(summary)

This article discusses a study conducted in the Nalanda district of Bihar, India, to identify the training needs of potato growers. The study found that training in plant protection measures, high-yielding varieties, seed treatment, sowing methods, and sowing time were the top training needs among potato growers. Manure and fertilizer management, irrigation and drainage, and weed management were also identified as important training needs. The study revealed that potato growers wanted to learn about insecticide and pesticide use, insect pest and disease identification, spread causes, and time and method of control. The article emphasizes the need to effectively transfer technology and provide training to improve potato cultivation practices, increase productivity, and address constraints faced by potato growers in terms of production and marketing. The study was conducted in the Nalanda district of Bihar, a major potato-growing region in India.
A study conducted in Bihar, India examine

In [70]:
# Put this in a function and
# Implement catching connection lost error from chatGPT server being overloaded
q_ = "summarize this content"
summary_2 = llm(summary + q_)

In [71]:
# this is a summary of summary
print(summary_2.strip())

This study examined the training needs of potato growers in the Nalanda district of Bihar, India. The top training needs identified by the growers were plant protection measures, high yielding varieties, fungicidal treatment, and sowing methods and time. Other areas such as manure and fertilizer management, irrigation and drainage, and weed management were also considered important. Awareness about insecticides and pesticides, insect pest and disease identification, spread causes, and time and method of control were identified as the key training needs. The study concluded that control of diseases is a major challenge for potato growers and that training in this area is crucial to maintaining crop yields.


##### Summarize paper using langchain

In [72]:
# llm = OpenAI(temperature=0, openai_api_key=api_key)

In [73]:
# function to summarize paper using langchain chain
# Implement catching connection lost error from chatGPT server being overloaded
def summarize_pdf(pdf_file_path):
    loader = PyPDFLoader(pdf_file_path)
    docs = loader.load_and_split()
    chain = load_summarize_chain(llm, chain_type="map_reduce")
    summary_ = chain.run(docs)
    return summary_

In [74]:
print(summarize_pdf(pdf_file_path))

 This study examined the training needs of potato growers in two blocks of Bihar, India. The top most relative need for training was found to be awareness about the use of various insecticides and pesticides, followed by identification of major diseases, cause of spread, time and method of control, preparation of pesticide solution, handling of plant protection implements, and residual effect of insecticides and pesticides.


In [75]:
# Testing multiple files comparing between map_reduce chain and openai ChatCompletion
def summarize_pages_summaries(llm, summary):
    query = "summarize this content"
    return llm(summary + query)

In [76]:
summarize_pages_summaries(llm, summarize_pdf_gpt('../DGreen_test_papers-20230726T075419Z-001/DGreen_test_papers/03.pdf'))

'\nThis research article discusses potato diseases as a constraint in potato cultivation in India. It focuses on two diseases, blackheart and early blight, and provides information on their causes and management strategies. It also mentions two other diseases, leaf curl and late blight, and provides information on their causes and management strategies. The article also discusses potato wart, powdery scab, common scab, black scurf/rhizoctonia canker, and soft rot/blackleg, as well as potato mosaic viruses. Management strategies for these diseases include cultural practices, using resistant varieties, controlling soil pH, and using chemical seed treatments or soil fumigation.'

In [78]:
summarize_pdf('../DGreen_test_papers-20230726T075419Z-001/DGreen_test_papers/03.pdf')

' This article discusses the causes, symptoms, and management of various potato diseases, including Black Heart, Early Blight, Late Blight, Potato Wart, Powdery Scab, Common Scab, Black Scurf, Rhizoctonia Canker, Soft Rot, Blackleg, Leaf Roll, and Leaf Mosaic. It also provides recommendations for disease management, such as crop rotation, field sanitation, resistant varieties, and chemical fungicides. The authors are Ajay Kumar from Guru Gobind Singh Indr aprastha University and Dr Jai P. Rai from Banaras Hindu University.'