In [1]:
#!py -m pip install pdfreader

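#!py -m pip install requests bs4 PdfReader
# NOTE: the cells below also import pdfplumber, pandas and tqdm, which the commands above do not install.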

In [17]:
import requests
import pdfplumber
import csv
import pandas as pd
from datetime import datetime

def download_pdf_from_url(url, output_file, timeout=30):
    """Download ``url`` and save the response body to ``output_file``.

    Args:
        url: Address of the file to fetch with an HTTP GET request.
        output_file: Path of the local file to (over)write in binary mode.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of saving an HTML error page under a .pdf name,
    # which would only surface later as an obscure PDF parsing error.
    response.raise_for_status()
    with open(output_file, "wb") as file:
        file.write(response.content)

def _parse_pdf_mod_date(raw_value, default='1900-01-01T00:00:00'):
    """Convert a PDF ``ModDate`` string into ``YYYY-MM-DDTHH:MM:SS``.

    PDF dates look like ``D:YYYYMMDDHHmmSS+HH'mm'``; the UTC offset is
    optional. Returns ``default`` when the value is missing, is not a string,
    or matches neither supported layout.
    """
    try:
        cleaned = raw_value.replace("D:", "").replace("'", "")
    except AttributeError:
        # None (key absent) or any other non-string metadata value.
        return default

    # Try the offset-aware layout first, then the offset-less one.
    for fmt in ("%Y%m%d%H%M%S%z", "%Y%m%d%H%M%S"):
        try:
            return datetime.strptime(cleaned, fmt).strftime("%Y-%m-%dT%H:%M:%S")
        except ValueError:
            continue
    return default


def transform_input_data(uploaded_file) -> pd.DataFrame:
    """Extract the text of every page of a PDF into a DataFrame.

    Args:
        uploaded_file: Path (or file-like object) accepted by
            ``pdfplumber.open``. It is also stored verbatim in the
            ``filename`` column.

    Returns:
        A DataFrame with the columns ``page_content`` (page text with all
        whitespace runs collapsed to single spaces), ``page_number``
        (1-based), ``filename``, ``last_modified`` (ISO string taken from the
        PDF ``ModDate`` metadata, or ``1900-01-01T00:00:00`` when unavailable)
        and ``filetype`` (always ``"PDF"``). Rows whose ``page_content``
        duplicates an earlier page are dropped. A PDF without pages yields an
        empty DataFrame with the same columns.
    """
    page_content = []
    page_number = []

    with pdfplumber.open(uploaded_file) as pdf:
        for number, page in enumerate(pdf.pages, start=1):
            # extract_text() can return None for pages without a text layer.
            text = page.extract_text() or ""
            page_content.append(" ".join(text.split()))
            page_number.append(number)

        # Metadata may be empty or lack ModDate entirely.
        metadata = pdf.metadata or {}
        modified = _parse_pdf_mod_date(metadata.get('ModDate'))

    # Use the collected page count rather than the loop variable, which is
    # undefined when the document has no pages.
    n_pages = len(page_number)

    df = pd.DataFrame(
        {
            "page_content": page_content,
            "page_number": page_number,
            "filename": [uploaded_file] * n_pages,
            "last_modified": [modified] * n_pages,
            "filetype": ["PDF"] * n_pages,
        }
    )

    return df.drop_duplicates(["page_content"])


def write_to_csv(link, text, csv_file):
    """Append a single ``link, text`` record to the CSV file ``csv_file``.

    The file is opened in append mode with UTF-8 encoding, so it is created
    on first use and existing rows are kept.
    """
    record = [link, text]
    with open(csv_file, mode="a", newline="", encoding="utf-8") as handle:
        csv.writer(handle).writerow(record)


In [1]:
import pandas as pd

# Load the previously extracted table; its 'pdf' column supplies the download links used further down.
df = pd.read_csv("extracted_data.csv")

In [2]:
df.head()

Unnamed: 0,position,rank,patent_id,serpapi_link,title,priority_date,filing_date,grant_date,publication_date,inventor,assignee,publication_number,language,thumbnail,pdf,page
0,1,0,patent/US10427604B2/en,https://serpapi.com/search.json?engine=google_...,Vision system for a vehicle,2000-03-02,2018-08-27,2019-10-01,2019-10-01,Kenneth Schofield,Magna Electronics Inc.,US10427604B2,en,https://patentimages.storage.googleapis.com/24...,https://patentimages.storage.googleapis.com/57...,1.0
1,2,1,patent/US20210234767A1/en,https://serpapi.com/search.json?engine=google_...,Vehicle middleware,2011-11-16,2021-04-16,,2021-07-29,Christopher P. Ricci,Autoconnect Holdings Llc,US20210234767A1,en,https://patentimages.storage.googleapis.com/aa...,https://patentimages.storage.googleapis.com/42...,1.0
2,3,2,patent/US10979875B2/en,https://serpapi.com/search.json?engine=google_...,System and method for wireless interface selec...,2011-01-14,2018-09-11,2021-04-13,2021-04-13,Lillian Lei Dai,"Cisco Technology, Inc.",US10979875B2,en,https://patentimages.storage.googleapis.com/46...,https://patentimages.storage.googleapis.com/31...,1.0
3,4,3,patent/US10320836B2/en,https://serpapi.com/search.json?engine=google_...,Automotive ECU controller and data network hav...,2017-01-03,2018-07-16,2019-06-11,2019-06-11,Tal Efraim Ben David,Karamba Security Ltd.,US10320836B2,en,https://patentimages.storage.googleapis.com/50...,https://patentimages.storage.googleapis.com/40...,1.0
4,5,4,patent/US11790420B2/en,https://serpapi.com/search.json?engine=google_...,Visual discovery tool for automotive manufactu...,2016-10-18,2021-01-04,2023-10-17,2023-10-17,Jeffrey Stuart Cotton,"Autoalert, Llc",US11790420B2,en,https://patentimages.storage.googleapis.com/38...,https://patentimages.storage.googleapis.com/f7...,1.0


In [19]:
import os

from tqdm import tqdm

# Folder for the downloaded PDFs and path of the resulting CSV
pdf_dir = "raw_pdf"
output_csv = "raw_data/output.csv"

# Create the output folders up front; otherwise the first download (or the
# final to_csv) fails with FileNotFoundError on a fresh checkout.
os.makedirs(pdf_dir, exist_ok=True)
os.makedirs(os.path.dirname(output_csv), exist_ok=True)

# List of links to PDFs (first three only); rows without a link are skipped
# because a NaN value has no .split() and would abort the loop.
pdf_links = df['pdf'].dropna().values[:3]

# Initialize an empty list to store dataframes
dfs = []

# Iterate over the list of links
for link in tqdm(pdf_links):
    output_pdf = link.split("/")[-1]  # Extract the filename from the URL
    pdf_path = pdf_dir + "/" + output_pdf
    download_pdf_from_url(link, pdf_path)
    extract_df = transform_input_data(pdf_path)
    extract_df["link"] = link
    dfs.append(extract_df)

# Concatenate all dataframes in the list; pd.concat raises on an empty list,
# so fall back to an empty frame when there was nothing to process.
final_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

# Save the final dataframe as a CSV
final_df.to_csv(output_csv, index=False)



In [20]:
final_df

Unnamed: 0,page_content,page_number,filename,last_modified,filetype,link
0,US010427604B2 (1 2 ) United States Patent ( 10...,1,raw_pdf/US10427604.pdf,1900-01-01T00:00:00,PDF,https://patentimages.storage.googleapis.com/57...
1,"US 10 ,4 27 ,6 04 B2 Page 2 Related U . S . Ap...",2,raw_pdf/US10427604.pdf,1900-01-01T00:00:00,PDF,https://patentimages.storage.googleapis.com/57...
2,"US 10 ,4 27 ,6 04 B2 Page 3 References Cited (...",3,raw_pdf/US10427604.pdf,1900-01-01T00:00:00,PDF,https://patentimages.storage.googleapis.com/57...
3,"US 10 ,4 27 ,6 04 B2 Page 4 (5 6 ) 7 , 188 , 9...",4,raw_pdf/US10427604.pdf,1900-01-01T00:00:00,PDF,https://patentimages.storage.googleapis.com/57...
4,"US 10 ,4 27 ,6 04 B2 Page 5 References Cited (...",5,raw_pdf/US10427604.pdf,1900-01-01T00:00:00,PDF,https://patentimages.storage.googleapis.com/57...
...,...,...,...,...,...,...
222,"US 10,979,875 B2 59 60 turer. In one specific ...",59,raw_pdf/US10979875.pdf,1900-01-01T00:00:00,PDF,https://patentimages.storage.googleapis.com/31...
223,"US 10,979,875 B2 62 61 OBU 30 and assigned min...",60,raw_pdf/US10979875.pdf,1900-01-01T00:00:00,PDF,https://patentimages.storage.googleapis.com/31...
224,"US 10,979,875 B2 64 63 external networks ( e.g...",61,raw_pdf/US10979875.pdf,1900-01-01T00:00:00,PDF,https://patentimages.storage.googleapis.com/31...
225,"US 10,979,875 B2 66 65 pass all such changes ,...",62,raw_pdf/US10979875.pdf,1900-01-01T00:00:00,PDF,https://patentimages.storage.googleapis.com/31...
