# Making it more robust and user-friendly
We’ll add some enhancements, such as:

OCR Integration for Physical Bills: Using a library like pytesseract to extract text from scanned images of physical bills.

Data Validation: Ensuring the extracted data is accurate and complete.

Unified Data Storage: Combining all extracted data into a single DataFrame for easier management.

User Interface (Optional): Adding a simple UI for uploading files or entering data.

In [None]:
# Setup and Imports
import pandas as pd
import re
import pytesseract
from PIL import Image
import os

# Ensure Tesseract is installed and accessible
# pytesseract.pytesseract.tesseract_cmd = r'<path_to_tesseract_executable>'

# Sample inputs for the three data sources handled below.

# Source 1: SMS confirmations of UPI payments (free text).
sms_data = [
    "Your UPI payment of INR 350.00 to DineOut was successful. Ref ID: 1234567890.",
    "Paid INR 1200.00 to OYO Rooms via UPI. Ref ID: 0987654321.",
    "INR 250.00 paid to Uber via UPI. Ref ID: 1122334455.",
]

# Source 2: records from discount apps (already structured).
discount_app_data = [
    {
        "app": "DineOut",
        "amount": 350.00,
        "discount": 50.00,
        "date": "2023-10-01",
    },
    {
        "app": "Swiggy",
        "amount": 500.00,
        "discount": 100.00,
        "date": "2023-10-02",
    },
]

# Source 3: scanned physical bills, given as image file paths.
# These are placeholders; point them at real image files before running.
physical_bill_images = [
    "bill1.jpg",
    "bill2.jpg",
]

# Data Extraction and Processing
def extract_sms_data(sms_list):
    """Parse UPI payment SMS texts into a DataFrame of transactions.

    Each message is expected to contain an amount such as ``INR 350.00`` and
    a payee introduced by the word ``to`` (e.g. ``to OYO Rooms via UPI``).
    Messages in which either field cannot be found are reported and skipped
    instead of aborting the whole batch.

    Args:
        sms_list: Iterable of SMS message strings.

    Returns:
        DataFrame with the columns ``source`` (always ``"SMS"``), ``vendor``
        and ``amount`` (float), one row per parsed message. The columns are
        present even when no message could be parsed.
    """
    # Digits with optional thousands separators and optional decimals,
    # so "1200.00", "1,200.00" and "1200" are all accepted.
    amount_pattern = re.compile(r'INR (\d[\d,]*(?:\.\d{1,2})?)')
    # A standalone "to" (not the tail of a word like "Auto") followed by the
    # vendor name. Further words are appended until a connector word such as
    # "via"/"was" or any punctuation is reached, so "OYO Rooms" is kept whole.
    vendor_pattern = re.compile(
        r'\bto (\w+(?: (?!(?i:via|was|is|has|on|at|for|from|using|through)\b)\w+)*)'
    )

    extracted_data = []
    for sms in sms_list:
        amount_match = amount_pattern.search(sms)
        vendor_match = vendor_pattern.search(sms)
        if amount_match is None or vendor_match is None:
            # One unrelated or malformed message must not stop the batch.
            print(f"Skipping SMS without a recognizable amount/vendor: {sms!r}")
            continue
        extracted_data.append({
            "source": "SMS",
            "vendor": vendor_match.group(1),
            # Drop thousands separators before converting to a number.
            "amount": float(amount_match.group(1).replace(",", "")),
        })
    # Explicit columns keep the schema stable when nothing was extracted.
    return pd.DataFrame(extracted_data, columns=["source", "vendor", "amount"])

def process_discount_app_data(app_data):
    """Convert discount-app records into a DataFrame tagged with their source.

    Args:
        app_data: Records to load, e.g. a list of dictionaries with one
            dictionary per transaction.

    Returns:
        DataFrame holding the records plus a ``source`` column whose value
        is ``"Discount App"`` on every row.
    """
    return pd.DataFrame(app_data).assign(source="Discount App")

def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text returned by ``pytesseract.image_to_string``, or an empty
        string when opening the image or running OCR fails. The error is
        printed so that a caller processing many files can carry on.
    """
    try:
        # Image.open loads lazily and keeps the file open; the context
        # manager makes sure the handle is released once OCR is done.
        with Image.open(image_path) as img:
            return pytesseract.image_to_string(img)
    except Exception as e:  # best effort: a bad image must not stop a batch
        print(f"Error processing {image_path}: {e}")
        return ""

def extract_physical_bill_data(image_paths):
    """OCR each bill image and extract its amount and description.

    The recognized text is searched for ``Amount: <number>`` and
    ``Description: <text>``. Images for which either field is missing
    (including images that could not be read at all) are reported and left
    out of the result.

    Args:
        image_paths: Iterable of paths to scanned bill images.

    Returns:
        DataFrame with the columns ``source`` (always ``"Physical Bill"``),
        ``description`` and ``amount`` (float). The columns are present even
        when no bill could be parsed.
    """
    # Digits with optional thousands separators and optional decimals.
    amount_pattern = re.compile(r'Amount: (\d[\d,]*(?:\.\d{1,2})?)')
    # Only spaces/tabs are allowed inside the description: OCR output is
    # multi-line, and matching "\s" would swallow the lines that follow.
    description_pattern = re.compile(r'Description: ([\w \t]+)')

    extracted_data = []
    for image_path in image_paths:
        text = extract_text_from_image(image_path)
        amount = amount_pattern.search(text)
        description = description_pattern.search(text)
        description_text = description.group(1).strip() if description else ""
        if amount is None or not description_text:
            print(f"Skipping {image_path}: amount or description not found")
            continue
        extracted_data.append({
            "source": "Physical Bill",
            "description": description_text,
            # Drop thousands separators before converting to a number.
            "amount": float(amount.group(1).replace(",", "")),
        })
    # Explicit columns keep the schema stable when nothing was extracted.
    return pd.DataFrame(
        extracted_data, columns=["source", "description", "amount"]
    )

# Run each extractor on its sample input.
sms_df = extract_sms_data(sms_data)
discount_app_df = process_discount_app_data(discount_app_data)
physical_bill_df = extract_physical_bill_data(physical_bill_images)

# Stack the per-source frames into one table with a fresh 0..n-1 index.
combined_df = pd.concat(
    (sms_df, discount_app_df, physical_bill_df),
    ignore_index=True,
)

# Show the unified table.
print("Combined Extracted Data:", combined_df, sep="\n")

# Persist the unified table without the index column, then confirm.
combined_df.to_csv('all_transactions.csv', index=False)
print("\nData saved to 'all_transactions.csv'")

#Basic Version Without OCR
Setup and Imports

Example Data

SMS/UPI Transactions

Discount App Data

Physical Bill/Invoice Data

Data Extraction and Processing

Saving Extracted Data

In [None]:
# Setup and Imports
import pandas as pd
import re

# Sample inputs for the three data sources handled below.

# Source 1: SMS confirmations of UPI payments (free text).
sms_data = [
    "Your UPI payment of INR 350.00 to DineOut was successful. Ref ID: 1234567890.",
    "Paid INR 1200.00 to OYO Rooms via UPI. Ref ID: 0987654321.",
    "INR 250.00 paid to Uber via UPI. Ref ID: 1122334455.",
]

# Source 2: records from discount apps (already structured).
discount_app_data = [
    {
        "app": "DineOut",
        "amount": 350.00,
        "discount": 50.00,
        "date": "2023-10-01",
    },
    {
        "app": "Swiggy",
        "amount": 500.00,
        "discount": 100.00,
        "date": "2023-10-02",
    },
]

# Source 3: physical bills/invoices, simulated here as plain text lines.
physical_bill_data = [
    "Invoice No: 789456, Date: 01-10-2023, Amount: 1200.00, Description: OYO Rooms",
    "Invoice No: 123789, Date: 02-10-2023, Amount: 250.00, Description: Uber Ride",
]

# Data Extraction and Processing
def extract_sms_data(sms_list):
    """Parse UPI payment SMS texts into a DataFrame of transactions.

    Each message is expected to contain an amount such as ``INR 350.00`` and
    a payee introduced by the word ``to`` (e.g. ``to OYO Rooms via UPI``).
    Messages in which either field cannot be found are reported and skipped
    instead of aborting the whole batch.

    Args:
        sms_list: Iterable of SMS message strings.

    Returns:
        DataFrame with the columns ``vendor`` and ``amount`` (float), one
        row per parsed message. The columns are present even when no
        message could be parsed.
    """
    # Digits with optional thousands separators and optional decimals,
    # so "1200.00", "1,200.00" and "1200" are all accepted.
    amount_pattern = re.compile(r'INR (\d[\d,]*(?:\.\d{1,2})?)')
    # A standalone "to" (not the tail of a word like "Auto") followed by the
    # vendor name. Further words are appended until a connector word such as
    # "via"/"was" or any punctuation is reached, so "OYO Rooms" is kept whole.
    vendor_pattern = re.compile(
        r'\bto (\w+(?: (?!(?i:via|was|is|has|on|at|for|from|using|through)\b)\w+)*)'
    )

    extracted_data = []
    for sms in sms_list:
        amount_match = amount_pattern.search(sms)
        vendor_match = vendor_pattern.search(sms)
        if amount_match is None or vendor_match is None:
            # One unrelated or malformed message must not stop the batch.
            print(f"Skipping SMS without a recognizable amount/vendor: {sms!r}")
            continue
        extracted_data.append({
            "vendor": vendor_match.group(1),
            # Drop thousands separators before converting to a number.
            "amount": float(amount_match.group(1).replace(",", "")),
        })
    # Explicit columns keep the schema stable when nothing was extracted.
    return pd.DataFrame(extracted_data, columns=["vendor", "amount"])

def process_discount_app_data(app_data):
    """Load discount-app records into a DataFrame.

    Args:
        app_data: Records to load, e.g. a list of dictionaries with one
            dictionary per transaction.

    Returns:
        DataFrame built directly from ``app_data``, without further changes.
    """
    discount_frame = pd.DataFrame(app_data)
    return discount_frame

def extract_physical_bill_data(bill_list):
    """Extract the amount and description from bill/invoice texts.

    Each text is searched for ``Amount: <number>`` and
    ``Description: <text>``. Texts in which either field is missing are
    reported and skipped instead of aborting the whole batch.

    Args:
        bill_list: Iterable of bill/invoice texts.

    Returns:
        DataFrame with the columns ``description`` and ``amount`` (float),
        one row per parsed bill. The columns are present even when no bill
        could be parsed.
    """
    # Digits with optional thousands separators and optional decimals.
    amount_pattern = re.compile(r'Amount: (\d[\d,]*(?:\.\d{1,2})?)')
    # Only spaces/tabs are allowed inside the description, so a multi-line
    # bill does not leak the following lines into it.
    description_pattern = re.compile(r'Description: ([\w \t]+)')

    extracted_data = []
    for bill in bill_list:
        amount = amount_pattern.search(bill)
        description = description_pattern.search(bill)
        description_text = description.group(1).strip() if description else ""
        if amount is None or not description_text:
            # One malformed bill must not stop the batch.
            print(f"Skipping bill without amount or description: {bill!r}")
            continue
        extracted_data.append({
            "description": description_text,
            # Drop thousands separators before converting to a number.
            "amount": float(amount.group(1).replace(",", "")),
        })
    # Explicit columns keep the schema stable when nothing was extracted.
    return pd.DataFrame(extracted_data, columns=["description", "amount"])

# Run each extractor on its sample input.
sms_df = extract_sms_data(sms_data)
discount_app_df = process_discount_app_data(discount_app_data)
physical_bill_df = extract_physical_bill_data(physical_bill_data)

# Show the three tables, each under its own heading; the headings that
# start with a newline produce the blank line between sections.
print(
    "SMS/UPI Transactions:",
    sms_df,
    "\nDiscount App Data:",
    discount_app_df,
    "\nPhysical Bill Data:",
    physical_bill_df,
    sep="\n",
)

# Write one CSV file per source, leaving out the index column.
sms_df.to_csv('sms_transactions.csv', index=False)
discount_app_df.to_csv('discount_app_transactions.csv', index=False)
physical_bill_df.to_csv('physical_bills.csv', index=False)

#Explanation
SMS/UPI Transactions: The function extract_sms_data uses regular expressions to extract the amount and vendor from simulated SMS texts.

Discount App Data: The function process_discount_app_data directly converts the list of dictionaries into a DataFrame.

Physical Bill/Invoice Data: The function extract_physical_bill_data extracts the amount and description from simulated invoice texts.

This notebook provides a basic framework for extracting and processing data from different sources. You can expand upon this by adding more sophisticated OCR capabilities for physical bills and integrating APIs for real-time data fetching from SMS and discount apps.