Skip to content

Commit

Permalink
Add plagiarism checker and fetch papers data functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
gamingflexer committed Jan 17, 2024
1 parent 9bccfe7 commit db9914d
Showing 1 changed file with 73 additions and 23 deletions.
96 changes: 73 additions & 23 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
import pandas as pd
import logging
from scrapper.main import ArxivPaper
from config import *
from db.db_functions import get_correct_author_name, insert_papers_data, fetch_papers_data
from utils import compare_paper_ids

"""
author_obj = ArxivPaper("Andrew Ng")
Expand All @@ -10,26 +13,73 @@
author_obj.get_paper_details_batch(paper_ids=paper_ids, path="./data/papers")
"""

def plagiarism_checker(authors_name):
    """Return a placeholder table of papers for the given author.

    The author name is only echoed to stdout; the returned rows are
    hard-coded sample data and do not depend on the input.

    Args:
        authors_name: Author name entered in the UI.

    Returns:
        A ``pandas.DataFrame`` with the columns ``paper``, ``paper_id``,
        ``paper_link`` and ``report_link``, one row per sample paper.
    """
    print(authors_name)

    # arXiv identifiers are kept as strings: a float literal would drop
    # trailing zeros (e.g. 2303.10000 -> 2303.1) and corrupt the identifier.
    sample_id = "2303.13798"
    titles = [
        "Collective annotation of wikipedia entities in web text",
        "Glister: Generalization based data subset selection for efficient and robust learning",
        "Grad-match: Gradient matching based data subset selection for efficient deep model training",
    ]
    data = {
        "paper": titles,
        "paper_id": [sample_id] * len(titles),
        "paper_link": [f"https://arxiv.org/pdf/{sample_id}"] * len(titles),
        "report_link": [""] * len(titles),
    }
    return pd.DataFrame(data)

# Single-input demo UI: the author name goes in, a table of papers comes out.
iface = gr.Interface(
    fn=plagiarism_checker,
    inputs=gr.Textbox(show_copy_button=True, label="Enter Authors Name"),
    outputs=gr.Dataframe(headers=["Paper Name", "Paper id", "Paper Link", "Report link"]),
    title="Arxiv author's plagiarism check just by entering the arxiv author",
    description="Arxiv Plagiarism Checker LLM - Enter Authors Name",
    # Gradio documents string values ("never" / "auto" / "manual") for this
    # option; the boolean form is deprecated, so spell out "never" explicitly.
    allow_flagging="never",
)

iface.launch()
def _read_saved_papers(papers_dir):
    """Return the text content of every file directly inside *papers_dir*."""
    import os

    # Nothing was downloaded for this author: treat a missing directory as
    # "no papers" instead of letting os.listdir raise FileNotFoundError.
    if not os.path.isdir(papers_dir):
        return []

    contents = []
    for paper in os.listdir(papers_dir):
        paper_path = os.path.join(papers_dir, paper)
        with open(paper_path, "r") as f:
            contents.append(f.read())
    return contents


def plagiarism_checker(authors_name: str, number_of_results=5, progress=gr.Progress()):
    """Fetch an author's papers and store them in the database.

    Searches for the author's paper links, downloads the details of papers
    that are not yet stored (all of them when the author is unknown to the
    database), reads the downloaded files back from ``./data/papers`` and
    hands their contents to ``insert_papers_data``.

    Args:
        authors_name: Author name as typed by the user.
        number_of_results: How many search results to collect. Coerced to
            ``int``; ``None`` falls back to 5.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        The status string ``"Fetched Latest Papers"``.
    """
    import os

    # A gr.Number input may supply a float (or None when left empty), so
    # normalise before passing the value on to the scraper.
    number_of_results = 5 if number_of_results is None else int(number_of_results)

    progress(0.2, desc="Collecting Links")
    author_obj = ArxivPaper(authors_name)
    db_author_name = get_correct_author_name(authors_name)
    paper_links = author_obj.get_results_google(number_of_results=number_of_results)
    paper_ids = author_obj.get_paper_id(paper_links)
    progress(0.4, desc="Collecting Papers")

    # Directory the downloaded papers of this author are read back from.
    local_saved_papers = os.path.join(
        os.getcwd(), "data", "papers", authors_name.replace(" ", "_")
    )
    data_to_save = []

    if db_author_name is None:
        print("No similar author found in the database")
        author_obj.get_paper_details_batch(paper_ids=paper_ids, path="./data/papers")
        progress(0.6, desc="Making summary")
        data_to_save = _read_saved_papers(local_saved_papers)
    else:
        print(f"Found similar author in the database: {db_author_name}")
        data = fetch_papers_data(db_author_name)
        remaining_paper_ids = compare_paper_ids(data, paper_ids)
        progress(0.6, desc="Making summary")
        if remaining_paper_ids:
            author_obj.get_paper_details_batch(paper_ids=remaining_paper_ids, path="./data/papers")
            # NOTE(review): this reads every file in the author's directory,
            # which may include papers saved by an earlier run - confirm that
            # insert_papers_data tolerates re-inserting them.
            data_to_save = _read_saved_papers(local_saved_papers)
        else:
            print("All papers already present in the database")

    progress(0.8, desc="Saving to Database")
    insert_papers_data(data_to_save, authors_name)
    return "Fetched Latest Papers"

def fetch_papers_data_df(authors_name: str, progress=gr.Progress()):
    """Look up the stored papers for an author and return them as a table.

    Args:
        authors_name: Author name passed straight to ``fetch_papers_data``.
        progress: Gradio progress tracker; accepted for the UI but not used
            inside this function.

    Returns:
        A ``pandas.DataFrame`` built from whatever ``fetch_papers_data``
        returns for the author.
    """
    records = fetch_papers_data(authors_name)
    papers_frame = pd.DataFrame(records)
    return papers_frame

# Three-tab UI. The inputs of the first two tabs have their own variable
# names so each click handler reads the widgets of the tab its button lives
# in; reusing one name per tab would bind every handler to the last tab.
with gr.Blocks() as demo:

    with gr.Tab("Arxiv Plagiarism Fetcher & Save to DB"):
        with gr.Row():
            fetch_authors_name = gr.Textbox(label="Enter Author's Name")
            # Integer default so an untouched field does not send None.
            fetch_number_of_results = gr.Number(
                label="Number of results - Min - 5", value=5, precision=0
            )
            submit_button_tab_1 = gr.Button("Start")
        with gr.Row():
            completed = gr.Textbox(label="Completed")

    with gr.Tab("Get Papers Data"):
        with gr.Row():
            lookup_authors_name = gr.Textbox(label="Enter Author's Name")
            submit_button_tab_2 = gr.Button("Start")
        with gr.Row():
            dataframe_output = gr.Dataframe(headers=['doi_no', 'title', 'summary', 'authors', 'year', 'pdf_link',
                                                     'references', 'categories', 'comment', 'journal_ref', 'source',
                                                     'primary_category', 'published'])

    # This tab has no click handler yet; its widgets keep the original
    # module-level names.
    with gr.Tab("Arxiv Plagiarism Checker"):
        with gr.Row():
            authors_name = gr.Textbox(label="Enter Author's Name")
            number_of_results = gr.Number(label="Number of results - Min - 5")
            submit_button = gr.Button("Start")

    # Event listeners are registered inside the Blocks context.
    submit_button_tab_1.click(
        fn=plagiarism_checker,
        inputs=[fetch_authors_name, fetch_number_of_results],
        outputs=completed,
    )
    submit_button_tab_2.click(
        fn=fetch_papers_data_df,
        inputs=[lookup_authors_name],
        outputs=dataframe_output,
    )

demo.launch()

0 comments on commit db9914d

Please sign in to comment.