In [None]:
import os, re, json, time
import pdfplumber, requests, pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


In [None]:

# Setup paths
# NOTE(review): BASE is a hard-coded, machine-specific location; edit it here if
# the project data lives elsewhere.
BASE = Path(r"\Project\Python App\CreditCardData")
PDF_DIR = BASE / "pdfs"      # downloaded PDFs, one per card
OUT_DIR = BASE / "output"    # extracted JSON records, one per card
# parents=True: on a fresh machine BASE itself may not exist yet, and a plain
# mkdir() would then fail with FileNotFoundError.
PDF_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR.mkdir(parents=True, exist_ok=True)


In [None]:

# Fields
# Ordered list of every label that appears in a card's output record.
# NOTE(review): several labels look mis-encoded (e.g. "OAicial", "OAer" and the
# currency/hyphen characters). They are kept verbatim here because other cells
# use the exact same strings as dictionary keys.
FIELDS = [
    # Identity and eligibility
    "Card Name",
    "Issuer / Bank",
    "Network(s)",
    "Min Income Required",
    "Allowed Employment (Salaried | Self‚ÄëEmp | Both)",
    "Primary Tags",
    "Age Range",
    "Credit Score (CIBIL/CIR)",
    "Special Eligibility Note",
    # Fees and charges
    "Joining Fee (‚Çπ)",
    "Annual Fee (‚Çπ)",
    "Annual‚ÄëFee Waiver Rule",
    "Forex Mark‚Äëup (%)",
    "APR on Purchases (% p.a.)",
    "Cash‚ÄëAdvance Fee",
    "Late‚ÄëPayment Fee Slab",
    "Reward Redemption Fee",
    "Rent / Wallet Load Fee",
    "Railway or Fuel Waiver",
    "Any Other Important Fee",
    # Source document
    "Link to OAicial MITC / T&C PDF",
    # Rewards and benefits
    "Base Reward Rate",
    "Reward Redemption Options",
    "Top‚ÄëLine Benefit List",
    "Welcome OAer",
    "Insurance Included",
    # Selling points
    "Key USP #1",
    "Key USP #2",
    "Key USP #3",
]

# PDF regex patterns
# Maps a field label (spelled exactly as in FIELDS) to the regex applied to the
# text of one PDF page.
#
# Contract: capture group 1 is ALWAYS the value to store, because the PDF
# extractor reads m.group(1). Keyword/label alternations are therefore written
# as non-capturing groups (?:...); previously several patterns captured the
# keyword first, so the keyword was stored instead of the value.
#
# The rupee sign (\u20b9) and dash variants (\u2013 en dash, \u2011 non-breaking
# hyphen) are spelled as regex escapes so matching does not depend on how this
# file's own text was encoded.
PDF_PATTERNS = {
    "Joining Fee (‚Çπ)": re.compile(r"joining\s+fee[:\-]?\s*(?:Rs\.?\s*)?(\u20b9?\s*\d[\d,]*\.?\d*\s*(?:per annum|pa|p\.a\.)?)", re.I),
    "Annual Fee (‚Çπ)": re.compile(r"annual\s+fee[:\-]?\s*(?:Rs\.?\s*)?(\u20b9?\s*\d[\d,]*\.?\d*\s*(?:per annum|pa|p\.a\.)?)", re.I),
    # Accept either currency marker, consistent with the fee patterns above.
    "Annual‚ÄëFee Waiver Rule": re.compile(r"fee\s+waiver.*?if.*?((?:\u20b9|Rs\.?)\s*\d[\d,]*.*)", re.I),
    "Forex Mark‚Äëup (%)": re.compile(r"forex\s+mark[\-\u2013\u2011]?\s*up[:\-]?\s*(\d+(?:\.\d+)?\s*%)", re.I),
    "APR on Purchases (% p.a.)": re.compile(r"interest\s+rate.*?(\d+(?:\.\d+)?\s*%.*?per\s+annum)", re.I),
    "Cash‚ÄëAdvance Fee": re.compile(r"cash\s+advance\s+fee[:\-]?\s*([^\n]+)", re.I),
    "Late‚ÄëPayment Fee Slab": re.compile(r"late\s+payment\s+fee[:\-]?\s*([^\n]+)", re.I),
    "Reward Redemption Fee": re.compile(r"reward\s+redemption\s+fee[:\-]?\s*([^\n]+)", re.I),
    "Rent / Wallet Load Fee": re.compile(r"(?:rent|wallet\s+load).*?fee[:\-]?\s*([^\n]+)", re.I),
    # "surcharge waiver" is tried first so the value starts after the whole
    # phrase rather than in the middle of it.
    "Railway or Fuel Waiver": re.compile(r"(?:fuel|railway)[^\n]*?(?:surcharge\s+waiver|waiver|surcharge)\s*[:\-]?\s*([^\n]+)", re.I),
    "Any Other Important Fee": re.compile(r"(?:other\s+important\s+fee|miscellaneous\s+fee)[:\-]?\s*([^\n]+)", re.I),
    "Base Reward Rate": re.compile(r"base\s+reward\s+rate[:\-]?\s*([^\n]+)", re.I),
    "Reward Redemption Options": re.compile(r"reward\s+redemption\s+options[:\-]?\s*([^\n]+)", re.I),
    "Top‚ÄëLine Benefit List": re.compile(r"(?:benefits|features).*?:?\s*(.*)", re.I),
    "Welcome OAer": re.compile(r"welcome\s+offer[:\-]?\s*([^\n]+)", re.I),
    "Insurance Included": re.compile(r"(?:insurance.*?cover|included).*?:?\s*([^\n]+)", re.I),
    # Here the captured value is the bureau name itself.
    "Credit Score (CIBIL/CIR)": re.compile(r"credit\s+score.*?(CIBIL|Experian|Equifax|CRIF)", re.I),
    # Captures the first spend-category keyword found on the page.
    "Primary Tags": re.compile(r"(fuel|shopping|dining|travel|utilities|grocery|online food)", re.I),
}

# Web-page extractors: field label -> callable(soup) that returns the field's
# text, or None when the page does not provide it. Each extractor tolerates
# missing elements: the web extraction loop runs all of them inside a single
# try block, so one raising extractor would otherwise drop every later field.
_WAIVER_TEXT_RE = re.compile(r"(waiver.*fee|spending.*\u20b9)", re.I)  # \u20b9 = rupee sign
_CREDIT_SCORE_RE = re.compile("CIBIL|credit score", re.I)


def _card_name(soup):
    """Text of the first <h1>, or None if the page has no <h1>."""
    heading = soup.find("h1")
    return heading.get_text(strip=True) if heading is not None else None


def _waiver_rule(soup):
    """First non-empty text node mentioning a fee waiver or a spend threshold."""
    hits = soup.find_all(string=_WAIVER_TEXT_RE)
    return next((hit.get_text(strip=True) for hit in hits if hit), None)


def _first_list_items(soup):
    """Stripped texts of the <li> items of the first <ul> (empty list if none)."""
    first_ul = soup.find("ul")
    if first_ul is None:
        return []
    return [item.get_text(strip=True) for item in first_ul.find_all("li")]


def _nth_usp(index):
    """Build an extractor returning the index-th item of the first <ul>, else None."""
    def extract(soup):
        items = _first_list_items(soup)
        return items[index] if len(items) > index else None
    return extract


def _benefit_list(soup):
    """All text of the first <ul>, comma-separated, or None if there is no <ul>."""
    first_ul = soup.find("ul")
    if first_ul is None:
        return None
    return first_ul.get_text(separator=", ").strip()


def _issuer(soup):
    """Site name from the og:site_name meta tag, or None when it is absent.

    Open Graph tags use the ``property`` attribute; ``name`` is still checked
    for pages that publish it that way. None (rather than a placeholder string)
    is returned on a miss so that a missing value is not cited as coming from
    the web page.
    """
    for attr in ("property", "name"):
        tag = soup.find("meta", {attr: "og:site_name"})
        if tag is not None and tag.get("content"):
            return tag.get("content")
    return None


def _credit_score(soup):
    """Text of the element enclosing the first CIBIL / credit-score mention."""
    mention = soup.find(string=_CREDIT_SCORE_RE)
    if mention is None:
        return None
    parent = mention.find_parent()
    if parent is None:
        return str(mention).strip()
    return parent.get_text(strip=True)


HTML_SELECTORS = {
    "Card Name": _card_name,
    "Annual‚ÄëFee Waiver Rule": _waiver_rule,
    "Key USP #1": _nth_usp(0),
    "Key USP #2": _nth_usp(1),
    "Key USP #3": _nth_usp(2),
    "Top‚ÄëLine Benefit List": _benefit_list,
    "Issuer / Bank": _issuer,
    "Credit Score (CIBIL/CIR)": _credit_score,
}


In [None]:


def make_driver():
    """Create and return a Chrome WebDriver configured with the flags below."""
    chrome_options = Options()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox"):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

def download_file(url, dst):
    """Download ``url`` to ``dst`` unless a usable copy is already there.

    Args:
        url: Address to fetch with an HTTP GET (20 s timeout).
        dst: ``pathlib.Path`` the content is stored at.

    Returns:
        ``dst`` when the file is available (cached or freshly downloaded),
        ``None`` when anything went wrong; the failure is printed, not raised.
    """
    try:
        # A zero-byte file cannot be a valid download, so it does not count as
        # a cache hit and is fetched again.
        if dst.exists() and dst.stat().st_size > 0:
            return dst
        r = requests.get(url, timeout=20)
        r.raise_for_status()
        if not r.content:
            raise ValueError("empty response body")
        # Write to a sibling temp file and rename it into place, so an
        # interrupted write never leaves a truncated file that a later run
        # would mistake for a finished download.
        tmp = dst.with_name(dst.name + ".part")
        tmp.write_bytes(r.content)
        tmp.replace(dst)
        return dst
    except Exception as e:
        # Best effort by design: the caller skips the card when None comes back.
        # The warning sign / arrow are spelled as escapes (they were mis-encoded).
        print(f"\u26a0\ufe0f  Download failed: {url} \u2192 {e}")
        return None

def extract_from_pdf(pdf_path):
    """Scan a PDF page by page and pull out every field in PDF_PATTERNS.

    For each field the first page with a non-empty match wins; later pages are
    not consulted for that field.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.

    Returns:
        ``(data, cites)``: ``data`` maps field label -> extracted text and
        ``cites`` maps the same labels -> a "p. <page number>" citation.
        Both are empty, or partially filled, if the PDF could not be read;
        the error is printed, not raised.
    """
    data, cites = {}, {}
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_no, page in enumerate(pdf.pages, start=1):
                text = page.extract_text() or ""
                if not text:
                    continue
                for field, pattern in PDF_PATTERNS.items():
                    if field in data:
                        continue
                    m = pattern.search(text)
                    if not m:
                        continue
                    # Several patterns capture a label/keyword before the value,
                    # so group(1) would store the keyword. lastindex is the last
                    # capturing group that took part in the match, which is the
                    # value group for these patterns; it is None when a pattern
                    # has no groups, hence the fallback to the whole match.
                    value = (m.group(m.lastindex or 0) or "").strip()
                    if not value:
                        # An empty capture must not block a later page (or the
                        # web page) from supplying the field.
                        continue
                    data[field] = value
                    # \u00a0 is a non-breaking space between "p." and the number.
                    cites[field] = f"p.\u00a0{page_no}"
                if len(data) == len(PDF_PATTERNS):
                    break  # every field found; remaining pages add nothing
    except Exception as e:
        # Best effort: keep whatever was collected before the failure.
        print(f"\u26a0\ufe0f  PDF error {getattr(pdf_path, 'name', pdf_path)}: {e}")
    return data, cites

def extract_from_web(url, driver, wait_seconds=3):
    """Load a product page in the browser and run every HTML_SELECTORS extractor.

    Args:
        url: Page to open.
        driver: Selenium WebDriver used to fetch and render the page.
        wait_seconds: Fixed pause after navigation before the page source is
            read (defaults to the 3 seconds used so far).

    Returns:
        ``(data, cites)``: ``data`` maps field label -> extracted value and
        ``cites`` maps the same labels -> ``"web"``. Both are empty if the page
        could not be loaded; errors are printed, not raised.
    """
    data, cites = {}, {}
    try:
        driver.get(url)
        time.sleep(wait_seconds)
        soup = BeautifulSoup(driver.page_source, "html.parser")
    except Exception as e:
        print(f"\u26a0\ufe0f  Selenium error {url}: {e}")
        return data, cites

    for field, extractor in HTML_SELECTORS.items():
        # Each extractor is isolated: a page that lacks one element must not
        # discard the fields that the remaining extractors can still find.
        try:
            val = extractor(soup)
        except Exception as e:
            print(f"\u26a0\ufe0f  Selector error {field} ({url}): {e}")
            continue
        if val:
            data[field] = val
            cites[field] = "web"
    return data, cites

def process_row(row, driver):
    """Build and save the JSON record for one card.

    Downloads the card's PDF, extracts fields from the PDF and from the product
    page, merges them (PDF values take precedence over web values) and writes
    ``<safe card name>.json`` into OUT_DIR.

    Args:
        row: Mapping/Series with the keys "Credit Card Name",
            "Official Product Page URL" and "PDF Download Link".
        driver: Selenium WebDriver passed on to the web extraction.

    Returns:
        None. The card is skipped (with a message) when it has no usable name
        or its PDF cannot be downloaded.
    """
    name = row["Credit Card Name"]
    page_url = row["Official Product Page URL"]
    pdf_url = row["PDF Download Link"]

    # A blank spreadsheet cell arrives as NaN (a float); re.sub() would raise
    # TypeError on it and take the whole run down.
    if pd.isna(name) or not str(name).strip():
        print("   \u2192 skipped (row has no card name)")
        return
    name = str(name).strip()

    safe_name = re.sub(r"[^\w\-]+", "_", name).strip("_")
    if not safe_name:
        # Name made only of punctuation: there is no usable file name for it.
        print(f"   \u2192 skipped (cannot derive a file name from {name!r})")
        return
    pdf_path = PDF_DIR / f"{safe_name}.pdf"
    out_path = OUT_DIR / f"{safe_name}.json"
    print(f"\n\U0001f50d  Processing: {name}")
    if not download_file(pdf_url, pdf_path):
        print("   \u2192 skipped")
        return

    pdf_data, pdf_cites = extract_from_pdf(pdf_path)
    web_data, web_cites = extract_from_web(page_url, driver)

    record = {}
    for field in FIELDS:
        if field in pdf_data:
            record[field] = {"value": pdf_data[field], "citation": pdf_cites[field]}
        elif field in web_data:
            record[field] = {"value": web_data[field], "citation": web_cites[field]}
        else:
            # \u2014 (em dash) marks "no source".
            record[field] = {"value": "Not mentioned", "citation": "\u2014"}

    # The PDF link always comes from the input sheet, never from extraction.
    record["Link to OAicial MITC / T&C PDF"] = {"value": pdf_url, "citation": "dataset"}
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be explicit; the platform default can fail on them.
    out_path.write_text(json.dumps(record, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"   \u2714 saved \u2192 {out_path.name}")


In [None]:

def main():
    """Process every card listed in ``credit_card_info.xlsx`` under BASE.

    Raises:
        KeyError: if the sheet lacks one of the columns process_row() reads.
        Errors from reading the sheet or starting the browser also propagate.
        A failure inside a single row is printed and the run continues.
    """
    df = pd.read_excel(BASE / "credit_card_info.xlsx")

    # Fail before launching a browser if the sheet has the wrong shape.
    required = ("Credit Card Name", "Official Product Page URL", "PDF Download Link")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"credit_card_info.xlsx is missing column(s): {missing}")

    driver = make_driver()
    try:
        # Plain loop instead of DataFrame.apply: the rows are processed only for
        # their side effects, and apply() probes an empty frame with a dummy row.
        for _, row in df.iterrows():
            try:
                process_row(row, driver)
            except Exception as e:
                # One bad card must not abort the rest of the batch.
                print(f"\u26a0\ufe0f  Row failed ({row.get('Credit Card Name')}): {e}")
    finally:
        driver.quit()
    print("\n\u2705 All cards processed. Check the 'output/' folder.")

# Run the whole pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()
