In [12]:
# Program to retrieve tables from a PDF and store their contents in a database table.

import pdfplumber
import pandas as pd
import pyodbc
import datetime

# 1️⃣ Location of the generated invoice PDF
pdf_path = r"C:\Users\Dell\Desktop\Python Practice\source_datafiles\pdfs\generated_tabular_invoice.pdf"

# 2️⃣ Walk every page and turn each extracted table into its own DataFrame
all_tables = []

with pdfplumber.open(pdf_path) as pdf:
    for page_number, page in enumerate(pdf.pages, start=1):
        tables = page.extract_tables()
        for table_number, table in enumerate(tables, start=1):
            # A usable table has a header row plus at least one data row
            if len(table) <= 1:
                continue
            header, *body = table
            # Tag each row with where it came from (1-based page / table index)
            df = pd.DataFrame(body, columns=header).assign(
                Page=page_number,
                Table_Number=table_number,
            )
            all_tables.append(df)

# Stack the per-table frames; fall back to an empty frame when nothing was found
final_df = pd.concat(all_tables, ignore_index=True) if all_tables else pd.DataFrame()

# Audit columns: create_date is the load time as text, last_update starts unset
load_timestamp = datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")
final_df["create_date"] = load_timestamp
final_df["last_update"] = None

# Optional: quick look at the first rows
print(final_df.head())

# 3️⃣ SQL Server connection parameters
server = '(localdb)\\MSSQLLocalDB'  # Adjust if using SQLEXPRESS: 'localhost\\SQLEXPRESS'
database = 'Arsipa'                 # Your database name
driver = '{ODBC Driver 17 for SQL Server}'
table_name = "PDF_Invoice_Tables"   # Destination table; dropped and rebuilt on every run

conn_str = f'DRIVER={driver};SERVER={server};DATABASE={database};Trusted_Connection=yes;'


def _quote_identifier(name):
    """Return `name` as a bracket-quoted SQL Server identifier.

    Column names come straight from PDF header cells, so a closing bracket
    inside a name is doubled to keep it from terminating the quoted identifier.
    """
    return "[" + str(name).replace("]", "]]") + "]"


# Stop before touching the database: with no extracted rows the code below
# would drop the existing table and then have nothing to insert.
if final_df.empty:
    raise ValueError("No table rows were extracted from the PDF; nothing to load.")

quoted_table = _quote_identifier(table_name)
quoted_columns = [_quote_identifier(col) for col in final_df.columns]

# Convert the frame to plain Python tuples. Missing cells (NaN appears when the
# concatenated tables do not share the same headers) become None so they are
# stored as NULL.
rows_df = final_df.astype(object).where(final_df.notna(), None)
data_tuples = list(rows_df.itertuples(index=False, name=None))

conn = pyodbc.connect(conn_str)
try:
    cursor = conn.cursor()
    print("✅ Connected to SQL Server")
    try:
        # Drop the previous copy of the table, but only if it exists, so the
        # very first run against a fresh database does not fail.
        cursor.execute(
            f"IF OBJECT_ID(N'{table_name}', N'U') IS NOT NULL DROP TABLE {quoted_table};"
        )
        print("Table DROP  successfully.")

        # 4️⃣ Create table dynamically based on PDF columns
        # (every column is NVARCHAR(MAX) for simplicity)
        columns_sql = ", ".join(f"{col} NVARCHAR(MAX)" for col in quoted_columns)
        create_table_query = (
            f"IF NOT EXISTS (SELECT * FROM sysobjects WHERE name='{table_name}' AND xtype='U') "
            f"CREATE TABLE {quoted_table} ({columns_sql})"
        )
        cursor.execute(create_table_query)
        print(f"✅ Table '{table_name}' checked/created")

        # 5️⃣ Insert DataFrame into SQL Server using fast executemany
        cursor.fast_executemany = True

        columns_str = ", ".join(quoted_columns)
        placeholders = ", ".join(["?"] * len(quoted_columns))
        insert_query = f"INSERT INTO {quoted_table} ({columns_str}) VALUES ({placeholders})"
        cursor.executemany(insert_query, data_tuples)

        # Single commit: drop, create and insert succeed or fail together.
        # NOTE(review): relies on the connection being in manual-commit mode,
        # which matches the explicit commit() calls this code always used.
        conn.commit()
        print("✅ PDF table data inserted successfully")
    except Exception:
        # Undo the drop/create so a failed load does not lose the old data
        conn.rollback()
        raise
    finally:
        cursor.close()
finally:
    # 6️⃣ Close connection, even when a statement above raised
    conn.close()
    print("✅ Connection closed")


   Product Quantity Unit Price         Total Price  Page  Table_Number  \
0  Charger        4     260.92             1043.68     1             1   
1  Monitor        3      161.6  484.79999999999995     1             1   
2   Laptop        3      52.54              157.62     1             1   
3   Laptop        8      277.9              2223.2     1             1   
4  Monitor        1     324.63              324.63     1             1   

           create_date last_update  
0  2025-09-06 14:59:47        None  
1  2025-09-06 14:59:47        None  
2  2025-09-06 14:59:47        None  
3  2025-09-06 14:59:47        None  
4  2025-09-06 14:59:47        None  
✅ Connected to SQL Server
Table DROP  successfully.
✅ Table 'PDF_Invoice_Tables' checked/created
✅ PDF table data inserted successfully
✅ Connection closed


In [13]:
import os
# os.getcwd() is the working directory of the running process; despite the
# variable name, nothing here looks up the notebook file's own location.
notebook_path = os.getcwd()
print("Current notebook directory:", notebook_path)


Current notebook directory: C:\Users\Dell\Documents\Jupyter
