In [49]:
from sqlalchemy import create_engine, text
import pandas as pd
import os
from dotenv import load_dotenv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import paramiko
from scp import SCPClient



In [56]:
def download_remote_db():
    """Download the latest database from remote server using SSH key"""
    try:
        ssh_host = os.environ.get("SSH_HOST")
        ssh_user = os.environ.get("SSH_USER") 
        ssh_key_path = os.environ.get("SSH_KEY_PATH")
        
        print(f"Connecting to {ssh_user}@{ssh_host}...")
        
        # Load SSH key (same pattern as your Start9 example)
        key = paramiko.Ed25519Key(filename=ssh_key_path)
        
        # Connect
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect(hostname=ssh_host, username=ssh_user, pkey=key)
        
        # Download the database file
        with SCPClient(ssh.get_transport()) as scp:
            remote_db_path = os.environ.get("REMOTE_DB_PATH")
            local_db_path = "./remote_mavericks.db"
            scp.get(remote_db_path, local_db_path)
        
        ssh.close()
        print("✅ Successfully downloaded remote database")
        return f"sqlite:///{local_db_path}"
        
    except Exception as e:
        print(f"❌ Failed to download remote DB: {e}")
        return None

In [57]:
if 'USE_LOCAL_DB' in os.environ:
    del os.environ['USE_LOCAL_DB']

load_dotenv()

use_local_db = os.environ['USE_LOCAL_DB']

# DB Toggle
if use_local_db == "false":
    print("Downloading remote database via SSH...")
    ssh_host = os.environ.get("SSH_HOST")
    ssh_user = os.environ.get("SSH_USER")
    ssh_key_path = os.environ.get("SSH_KEY_PATH")
    
    DB_URL = download_remote_db()
    if not DB_URL:
        print("Failed to download, falling back to local...")
        DB_URL = os.environ.get("LOCAL_DB_URL", "sqlite:///mavericks.db")
else:
    print("Using local db url...")
    DB_URL = os.environ.get("LOCAL_DB_URL", "sqlite:///mavericks.db")

if DB_URL:
    engine = create_engine(DB_URL)
    print(f"Connected to: {DB_URL}")
else:
    print("ERROR: DB_URL is None or empty!")



Downloading remote database via SSH...
Connecting to ubuntu@23.239.17.215...
✅ Successfully downloaded remote database
Connected to: sqlite:///./remote_mavericks.db


In [25]:
# via SQL (simple)
df = pd.read_sql(text("SELECT * FROM cars"), engine)

In [26]:
df.head()

Unnamed: 0,id,listing,link,mileage,price,is_hybrid,date_found,still_available,year,manual_price
0,1,2024 FORD MAVERICK XLT,https://www.clasificadosonline.com/UDTransDeta...,7332,"$32,995",0,2025-07-22,1,2024,0
1,2,FORD MAVERICK 2022,https://www.clasificadosonline.com/UDTransDeta...,0,,0,2025-07-22,1,2022,0
2,3,FORD MAVERICK LARIAT FX4 AWD 2022 EQUIPADA!,https://www.clasificadosonline.com/UDTransDeta...,19100,,0,2025-07-22,1,2022,0
3,4,FORD MAVERICK HYBRID 2024,https://www.clasificadosonline.com/UDTransDeta...,3,"$29,990",1,2025-07-22,0,2024,0
4,5,FORD MAVERICK XL DESDE 27995,https://www.clasificadosonline.com/UDTransDeta...,11,"$27,995",0,2025-07-22,1,Unknown,0


In [27]:
import re
import pandas as pd
from datetime import date

df = df.copy()

def to_int_price(p):
    if not isinstance(p, str): return pd.NA
    digits = re.sub(r"[^\d]", "", p)
    return int(digits) if digits else pd.NA

def to_int_miles(m):
    if not isinstance(m, str): return pd.NA
    digits = re.sub(r"[^\d]", "", m)
    return int(digits) if digits else pd.NA

def extract_year(text):
    if not isinstance(text, str): return pd.NA
    m = re.search(r"\b(20[12]\d)\b", text)  # 2010–2029 quick grab
    return int(m.group(1)) if m else pd.NA

df["price_num"]   = df["price"].map(to_int_price)
df["mileage_num"] = df["mileage"].map(to_int_miles)
df["year"]        = df["listing"].map(extract_year)
df["is_hybrid"]   = df["is_hybrid"].astype("boolean")
df["days_on_market"] = (pd.Timestamp.today().normalize() - pd.to_datetime(df["date_found"])).dt.days
df.head()


Unnamed: 0,id,listing,link,mileage,price,is_hybrid,date_found,still_available,year,manual_price,price_num,mileage_num,days_on_market
0,1,2024 FORD MAVERICK XLT,https://www.clasificadosonline.com/UDTransDeta...,7332,"$32,995",False,2025-07-22,1,2024.0,0,32995.0,7332,52
1,2,FORD MAVERICK 2022,https://www.clasificadosonline.com/UDTransDeta...,0,,False,2025-07-22,1,2022.0,0,,0,52
2,3,FORD MAVERICK LARIAT FX4 AWD 2022 EQUIPADA!,https://www.clasificadosonline.com/UDTransDeta...,19100,,False,2025-07-22,1,2022.0,0,,19100,52
3,4,FORD MAVERICK HYBRID 2024,https://www.clasificadosonline.com/UDTransDeta...,3,"$29,990",True,2025-07-22,0,2024.0,0,29990.0,3,52
4,5,FORD MAVERICK XL DESDE 27995,https://www.clasificadosonline.com/UDTransDeta...,11,"$27,995",False,2025-07-22,1,,0,27995.0,11,52


In [28]:
hybrids = df.query("is_hybrid == True").copy()
hybrids_sorted = hybrids.sort_values(["price_num", "mileage_num"], ascending=[True, True])

cols = ["listing", "year", "price", "mileage_num", "days_on_market", "link"]
hybrids_sorted[cols].head(50)  # show top 50; adjust as you like


Unnamed: 0,listing,year,price,mileage_num,days_on_market,link
20,2023 Ford Maverick XLT Hibrida,2023.0,"$27,100",31000.0,52,https://www.clasificadosonline.com/UDTransDeta...
119,"Ford Maverick XLT hybrid 2023 | 37,036 | CPO",2023.0,"$27,990",37036.0,52,https://www.clasificadosonline.com/UDTransDeta...
362,FORD MAVERICK XL HYBRID,,"$28,990",9.0,5,https://www.clasificadosonline.com/UDTransDeta...
27,NUEVA Ford Maverick Hybrid 2024 Cactus Gray,2024.0,"$28,995",0.0,52,https://www.clasificadosonline.com/UDTransDeta...
29,NUEVA Ford Maverick XL Hybrid 2024 LIQUIDACIÓ,2024.0,"$28,995",0.0,52,https://www.clasificadosonline.com/UDTransDeta...
19,Ford Maverick XL Hybrid 2024,2024.0,"$29,871",,52,https://www.clasificadosonline.com/UDTransDeta...
160,MAVERIK XL HYBRID 2024,2024.0,"$29,891",14.0,52,https://www.clasificadosonline.com/UDTransDeta...
392,"2024 Ford Maverick HYBRID, Solo 6k millas!",2024.0,"$29,987",6347.0,5,https://www.clasificadosonline.com/UDTransDeta...
30,Ford Maverick 2024 XL Hybrida ShadowBlack,2024.0,"$29,987",,52,https://www.clasificadosonline.com/UDTransDeta...
238,Ford Maverick 2024 XL Hybrida ShadowBlack,2024.0,"$29,987",,33,https://www.clasificadosonline.com/UDTransDeta...


In [29]:
PRICE_CAP = 30000
AGED_DAYS = 14

aged_hybrids = hybrids_sorted.query("days_on_market >= @AGED_DAYS")
aged_under_cap = aged_hybrids.query("price_num.notna() and price_num <= @PRICE_CAP")

print("Hybrids total:", len(hybrids))
print("Aged hybrids (>=14d):", len(aged_hybrids))
print("Aged hybrids under $30k:", len(aged_under_cap))

aged_under_cap[cols].head(25)


Hybrids total: 39
Aged hybrids (>=14d): 26
Aged hybrids under $30k: 10


Unnamed: 0,listing,year,price,mileage_num,days_on_market,link
20,2023 Ford Maverick XLT Hibrida,2023,"$27,100",31000.0,52,https://www.clasificadosonline.com/UDTransDeta...
119,"Ford Maverick XLT hybrid 2023 | 37,036 | CPO",2023,"$27,990",37036.0,52,https://www.clasificadosonline.com/UDTransDeta...
27,NUEVA Ford Maverick Hybrid 2024 Cactus Gray,2024,"$28,995",0.0,52,https://www.clasificadosonline.com/UDTransDeta...
29,NUEVA Ford Maverick XL Hybrid 2024 LIQUIDACIÓ,2024,"$28,995",0.0,52,https://www.clasificadosonline.com/UDTransDeta...
19,Ford Maverick XL Hybrid 2024,2024,"$29,871",,52,https://www.clasificadosonline.com/UDTransDeta...
160,MAVERIK XL HYBRID 2024,2024,"$29,891",14.0,52,https://www.clasificadosonline.com/UDTransDeta...
30,Ford Maverick 2024 XL Hybrida ShadowBlack,2024,"$29,987",,52,https://www.clasificadosonline.com/UDTransDeta...
238,Ford Maverick 2024 XL Hybrida ShadowBlack,2024,"$29,987",,33,https://www.clasificadosonline.com/UDTransDeta...
3,FORD MAVERICK HYBRID 2024,2024,"$29,990",3.0,52,https://www.clasificadosonline.com/UDTransDeta...
84,FORD MAVERICK XL HYBRID 2024,2024,"$29,990",3.0,52,https://www.clasificadosonline.com/UDTransDeta...


In [30]:
from fuzzywuzzy import fuzz

def looks_hybrid(text: str) -> bool:
    if not isinstance(text, str):
        return False
    t = text.lower()
    return (
        fuzz.partial_ratio(t, "hybrid")  >= 70 or
        fuzz.partial_ratio(t, "hibrido") >= 70 or
        "híbrido" in t or               # accent exact match
        "hibrida" in t or               # feminine spelling
        "híbrida" in t or
        "hev" in t or                   # common shorthand
        "hyb" in t                      # very loose
    )

df["hybrid_from_text"] = df["listing"].map(looks_hybrid)
print("DB is_hybrid True count:", int(df["is_hybrid"].sum(skipna=True)))
print("Text-derived hybrid count:", int(df["hybrid_from_text"].sum()))
df.loc[df["hybrid_from_text"] & ~df["is_hybrid"], ["listing","price","link"]].head(10)


DB is_hybrid True count: 39
Text-derived hybrid count: 39


Unnamed: 0,listing,price,link


In [31]:
cols = ["listing","price","mileage","date_found","link"]

print("== current hybrids ==")
display(df.loc[df["is_hybrid"] == True, cols].head(20))

print("== rows that mention hyb-like terms in listing (not flagged) ==")
mask_near = df["listing"].str.contains(r"hyb|h[ií]br[ií]d", case=False, na=False)
display(df.loc[mask_near & (df["is_hybrid"] != True), cols].head(20))


== current hybrids ==


Unnamed: 0,listing,price,mileage,date_found,link
3,FORD MAVERICK HYBRID 2024,"$29,990",3.0,2025-07-22,https://www.clasificadosonline.com/UDTransDeta...
18,Ford Maverick XL Hibrida 2.5 Premium nueva,,10.0,2025-07-22,https://www.clasificadosonline.com/UDTransDeta...
19,Ford Maverick XL Hybrid 2024,"$29,871",,2025-07-22,https://www.clasificadosonline.com/UDTransDeta...
20,2023 Ford Maverick XLT Hibrida,"$27,100",31000.0,2025-07-22,https://www.clasificadosonline.com/UDTransDeta...
25,Ford Maverick XL Hybrid 24,"$35,995",16.0,2025-07-22,https://www.clasificadosonline.com/UDTransDeta...
27,NUEVA Ford Maverick Hybrid 2024 Cactus Gray,"$28,995",0.0,2025-07-22,https://www.clasificadosonline.com/UDTransDeta...
29,NUEVA Ford Maverick XL Hybrid 2024 LIQUIDACIÓ,"$28,995",0.0,2025-07-22,https://www.clasificadosonline.com/UDTransDeta...
30,Ford Maverick 2024 XL Hybrida ShadowBlack,"$29,987",,2025-07-22,https://www.clasificadosonline.com/UDTransDeta...
56,Ford Maverick XLT Hybrid 2024,"$31,990",,2025-07-22,https://www.clasificadosonline.com/UDTransDeta...
62,NUEVA Ford Maverick XLT Hybrid 2024 Sunroof,"$34,995",0.0,2025-07-22,https://www.clasificadosonline.com/UDTransDeta...


== rows that mention hyb-like terms in listing (not flagged) ==


Unnamed: 0,listing,price,mileage,date_found,link


In [32]:
# Save full dataset to CSV
df.to_csv("maverick_listings.csv", index=False, encoding="utf-8-sig")
print("✅ CSV saved as maverick_listings.csv")


✅ CSV saved as maverick_listings.csv


In [33]:
# all hybrids, cheapest first
hybrids_sorted = (
    df[df["is_hybrid"] == True]
    .sort_values(["price_num","mileage_num"], ascending=[True, True])
)
hybrid_rows_html = "".join(
    f"<tr><td>{r['listing']}</td><td>{r['price']}</td><td>{r['mileage']}</td>"
    f"<td>{r['days_on_market']}</td><td><a href='{r['link']}'>link</a></td></tr>"
    for _, r in hybrids_sorted.head(12).iterrows()
)

# aged hybrids under cap
PRICE_CAP, AGED_DAYS = 30000, 14
aged_under_cap = hybrids_sorted.query("price_num.notna() and price_num<=@PRICE_CAP and days_on_market>=@AGED_DAYS")
aged_rows_html = "".join(
    f"<tr><td>{r['listing']}</td><td>{r['price']}</td><td>{r['mileage']}</td>"
    f"<td>{r['days_on_market']}</td><td><a href='{r['link']}'>link</a></td></tr>"
    for _, r in aged_under_cap.head(12).iterrows()
)
