In [1]:
# set the root folder as ../
import os

os.chdir("../../")
# get current working directory
print("Current working directory:", os.getcwd())

Current working directory: /home/ettore/projects/hackathons/MVA_hackathon_2025/vintage_ai


In [2]:
from datetime import date
import pandas as pd
from pytrends.request import TrendReq


def fetch_trends_global(
    query: str, start_date: str = "2006-01-01", granularity: str = "weekly"
) -> pd.DataFrame:
    """
    Fetch global (worldwide) Google Trends interest over time (0-100) for a given query.

    Args:
        query (str): The search term or topic.
        start_date (str): Start date in 'YYYY-MM-DD' format.
        granularity (str): One of 'weekly', 'monthly', or 'yearly' for resampling.

    Returns:
        pd.DataFrame: Time-indexed interest levels (0-100), or empty DataFrame on error.
    """
    try:
        end_date = date.today().isoformat()
        timeframe = f"{start_date} {end_date}"
        pytrends = TrendReq(hl="en-US", tz=360)

        suggestions = pytrends.suggestions(query)
        if not suggestions:
            return pd.DataFrame()

        topic_id = suggestions[0]["mid"]
        pytrends.build_payload([topic_id], timeframe=timeframe, geo="")  # Global

        iot = pytrends.interest_over_time()
        if iot.empty or "isPartial" not in iot.columns:
            return pd.DataFrame()

        series = iot.drop(columns="isPartial")[topic_id].rename("Global")

        # Resampling
        if granularity == "monthly":
            return series.resample("M").mean().round(2).to_frame()
        elif granularity == "yearly":
            return series.resample("Y").mean().round(2).to_frame()
        else:  # default: weekly
            return series.to_frame()

    except Exception as e:
        # Optionally log error: print(f"Error: {e}")
        return pd.DataFrame()

In [24]:
fetch_trends_global("Ferrari Dino 246", start_date="2006-01-01", granularity="yearly")

In [25]:
from datetime import date, timedelta
import time
import pandas as pd
from pytrends.request import TrendReq


def fetch_trends_global_robust(
    query: str,
    start_date: str = "2006-01-01",
    granularity: str = "weekly",
    slice_years: int = 5,
    retry_pause: int = 2,
    max_retries: int = 3,
) -> pd.DataFrame:
    """
    Robust Google Trends fetcher.
    1) Try Topic MID; if that fails → raw keyword.
    2) If the full-range request is empty → pull in `slice_years`-year chunks and stitch.
    3) Returns NaNs (not empty) if absolutely nothing is available.

    Args
    ----
    query : str            • Search phrase (e.g. "Ferrari Dino 246")
    start_date : str       • YYYY-MM-DD for earliest slice
    granularity : str      • 'weekly' (default), 'monthly', 'yearly' (resampled)
    slice_years : int      • Window size for fallback slicing
    retry_pause : int      • Seconds to wait between retries
    max_retries : int      • How many API retries before giving up

    Returns
    -------
    pd.DataFrame           • Index = date (or year), column = "Global"
    """

    def _build_and_pull(kw_list, tf):
        for _ in range(max_retries):
            try:
                py.build_payload(kw_list, timeframe=tf, geo="")
                out = py.interest_over_time()
                if not out.empty:
                    out = out.drop(columns="isPartial")
                return out
            except Exception:
                time.sleep(retry_pause)
        return pd.DataFrame()

    end_date = date.today().isoformat()
    timeframe_all = f"{start_date} {end_date}"
    py = TrendReq(hl="en-US", tz=360)

    # --- Step 1: Topic MID first
    mid = None
    suggestions = py.suggestions(query)
    if suggestions:
        mid = suggestions[0]["mid"]
        df = _build_and_pull([mid], timeframe_all)
        if df.empty:
            mid = None  # will try keyword next
    else:
        df = pd.DataFrame()

    # --- Step 2: Raw keyword fallback
    if df.empty and mid is None:
        df = _build_and_pull([query], timeframe_all)

    # --- Step 3: Slice the timeframe if still empty
    if df.empty:
        print("⚠️  empty result, switching to sliced windows …")
        start = pd.to_datetime(start_date)
        today = pd.to_datetime(end_date)
        frames = []
        while start < today:
            stop = min(
                start + pd.DateOffset(years=slice_years) - timedelta(days=1), today
            )
            tf = f"{start.date()} {stop.date()}"
            chunk = _build_and_pull([query], tf)
            if not chunk.empty:
                frames.append(chunk)
            start = stop + timedelta(days=1)
        if frames:
            df = pd.concat(frames).sort_index()

    # --- Step 4: final tidy-up / resample / NaNs
    if df.empty:
        print("🤷 No Google Trends data available for this query.")
        return pd.DataFrame({"Global": []})  # clear but not None

    if granularity == "monthly":
        df = df.resample("M").mean()
    elif granularity == "yearly":
        df = df.resample("Y").mean()

    return df.rename(columns={df.columns[0]: "Global"})

In [26]:
print(os.getcwd())

/home/ettore/projects/hackathons/MVA_hackathon_2025/vintage_ai


In [27]:
df_asset_classic = pd.read_excel(
    "/home/ettore/projects/hackathons/MVA_hackathon_2025/vintage_ai/data/raw/ asset_classic_car_prices.xlsx",
    skiprows=4,
    index_col=None,
)

In [None]:
df_asset_classic.head()

0     2006
1     2007
2     2008
3     2009
4     2010
5     2011
6     2012
7     2013
8     2014
9     2015
10    2016
11    2017
12    2018
13    2019
14    2020
15    2021
16    2022
17    2023
18    2024
19    2025
Name: Year, dtype: int64

In [6]:
car_cols = df_asset_classic.columns.tolist()[1:]
car_cols

['Ferrari Testarossa',
 'Ferrari Daytona GTB',
 'Ferrari 550 Maranello',
 'Ferrari 355 GTB',
 'Ferrari 308 GTB',
 'Ferrari 330 GTC',
 'Ferrari 250 GTE',
 'Ferrari Dino 246',
 'Ferrari F40',
 'Lamborghini 350 GT',
 'Lamborghini 400 GT',
 'Lamborghini Miura P400',
 'Lamborghini Countach 25th',
 'Lamborghini LM002',
 'Lamborghini Diablo VT',
 'Maserati 3500 GT',
 'Maserati Mistral',
 'Maserati MC12',
 'Bugatti EB110',
 'Bugatti Type 57']

In [7]:
def merge_prices_with_popularity(
    df_prices: pd.DataFrame, df_popularity: pd.DataFrame, car_name: str
) -> pd.DataFrame:
    """
    Merge classic car prices with Google Trends popularity data on the 'Year' column.
    If years are missing in either DataFrame, NaN is used.

    Args:
        df_prices (pd.DataFrame): DataFrame with columns 'Year' and car price columns.
        df_popularity (pd.DataFrame): DataFrame with index or column 'Year' and column 'Global'.
        car_name (str): Column name of the car in df_prices to merge.

    Returns:
        pd.DataFrame: DataFrame with 'Year', 'Price', and 'Popularity' columns.
    """
    df_prices = df_prices.copy()
    if "Year" not in df_prices.columns:
        raise ValueError("df_prices must have a 'Year' column.")

    # Normalize df_popularity to have a 'Year' column
    if df_popularity.index.name == "Year" or isinstance(
        df_popularity.index, pd.DatetimeIndex
    ):
        df_popularity = df_popularity.copy()
        df_popularity = df_popularity.reset_index()

    df_popularity["Year"] = df_popularity["Year"].astype(int)
    df_prices["Year"] = df_prices["Year"].astype(int)

    # Merge
    df = pd.merge(
        df_prices[["Year", car_name]],
        df_popularity[["Year", "Global"]],
        on="Year",
        how="outer",
    )

    return df.rename(columns={car_name: "Price", "Global": "Popularity"})

In [12]:
def add_popularity_columns(
    df_prices: pd.DataFrame,
    car_names: list,
    start_date: str = "2006-01-01",
    granularity: str = "yearly",
) -> pd.DataFrame:
    """
    Adds a '<car>_Popularity' column for each car in car_names,
    by merging Google Trends popularity into the price DataFrame.

    Args:
        df_prices (pd.DataFrame): Original price DataFrame with 'Year' and car price columns.
        car_names (list): List of car column names to fetch popularity for.
        start_date (str): Start date for trends.
        granularity (str): One of 'weekly', 'monthly', or 'yearly'.

    Returns:
        pd.DataFrame: Original df_prices with additional '<car>_Popularity' columns.
    """
    df_prices = df_prices.copy()

    # Ensure 'Year' exists and is int
    if "Year" not in df_prices.columns:
        raise ValueError("'Year' column is missing in df_prices.")
    df_prices["Year"] = df_prices["Year"].astype(int)

    for car in car_names:
        popularity_df = fetch_trends_global(
            car, start_date=start_date, granularity=granularity
        )

        if popularity_df.empty:
            print(f"⚠️ Popularity not found for: {car}")
            df_prices[f"{car}_Popularity"] = pd.NA
            continue

        # Prepare popularity DataFrame with 'Year' and renamed popularity column
        popularity_df = popularity_df.copy()
        # Convert DatetimeIndex to int years
        popularity_df.index = popularity_df.index.year
        popularity_df.index.name = "Year"
        popularity_df = popularity_df.reset_index().rename(
            columns={"Global": f"{car}_Popularity"}
        )
        popularity_df["Year"] = popularity_df["Year"].astype(int)

        # Merge popularity into the prices DataFrame
        df_prices = pd.merge(
            df_prices,
            popularity_df[["Year", f"{car}_Popularity"]],
            on="Year",
            how="left",
        )

    return df_prices

In [13]:
df_augmented = add_popularity_columns(df_asset_classic, car_cols)
df_augmented

  df = df.fillna(False)
  return series.resample("Y").mean().round(2).to_frame()
  df = df.fillna(False)
  return series.resample("Y").mean().round(2).to_frame()
  df = df.fillna(False)
  return series.resample("Y").mean().round(2).to_frame()
  df = df.fillna(False)
  return series.resample("Y").mean().round(2).to_frame()
  df = df.fillna(False)
  return series.resample("Y").mean().round(2).to_frame()
  df = df.fillna(False)
  return series.resample("Y").mean().round(2).to_frame()
  df = df.fillna(False)
  return series.resample("Y").mean().round(2).to_frame()
  df = df.fillna(False)
  return series.resample("Y").mean().round(2).to_frame()
  df = df.fillna(False)
  return series.resample("Y").mean().round(2).to_frame()
  df = df.fillna(False)
  return series.resample("Y").mean().round(2).to_frame()
  df = df.fillna(False)
  return series.resample("Y").mean().round(2).to_frame()
  df = df.fillna(False)
  return series.resample("Y").mean().round(2).to_frame()
  df = df.fillna(False)
  re

Unnamed: 0,Year,Ferrari Testarossa,Ferrari Daytona GTB,Ferrari 550 Maranello,Ferrari 355 GTB,Ferrari 308 GTB,Ferrari 330 GTC,Ferrari 250 GTE,Ferrari Dino 246,Ferrari F40,...,Lamborghini 400 GT_Popularity,Lamborghini Miura P400_Popularity,Lamborghini Countach 25th_Popularity,Lamborghini LM002_Popularity,Lamborghini Diablo VT_Popularity,Maserati 3500 GT_Popularity,Maserati Mistral_Popularity,Maserati MC12_Popularity,Bugatti EB110_Popularity,Bugatti Type 57_Popularity
0,2006,43828.78915,178116.9882,85364.23041,57522.27684,133531.3912,124199.489,84770.29704,73892.69081,273470.4,...,19.33,29.83,16.08,26.25,16.42,48.67,27.17,80.0,57.58,6.58
1,2007,46400.360332,194066.5417,139850.7717,53838.8931,139174.1398,132162.8084,94209.58791,97571.97861,299355.3,...,24.5,16.75,15.17,25.17,0.0,58.17,33.92,61.5,55.17,7.0
2,2008,43075.570441,232021.4504,92058.16048,49934.45962,153436.8881,133604.8417,98687.31285,93435.65853,371628.2,...,20.67,15.0,13.08,20.83,0.0,48.42,27.17,48.75,42.0,7.67
3,2009,40210.030937,202434.2997,69831.55685,45838.02099,156104.9916,124577.7128,87276.90625,77137.96978,358935.6,...,18.5,17.08,13.92,19.83,0.0,49.58,29.17,45.17,40.33,20.33
4,2010,37678.659025,214796.487,58292.15067,41041.99251,168577.0912,146554.7207,77756.76146,89946.00198,351637.9,...,15.67,16.17,12.75,18.25,1.92,38.0,28.0,34.17,36.17,15.67
5,2011,36693.518376,230947.9067,50820.87823,41117.95398,182117.9308,159669.5673,89161.79645,98781.91749,360274.5,...,15.33,15.67,12.92,18.5,0.0,36.67,34.5,27.58,28.33,14.58
6,2012,41871.628114,269514.8421,50746.60919,40416.32217,215861.2972,271130.8295,122885.6664,127087.5467,515570.6,...,15.58,17.17,13.08,30.5,0.0,38.83,31.17,25.67,25.25,11.83
7,2013,47455.255422,331012.8668,50195.42695,46330.67565,251973.8486,335404.9668,173875.0126,159250.5568,571862.2,...,18.25,22.58,13.5,16.0,0.0,41.17,33.17,21.75,26.92,15.58
8,2014,56201.047252,493855.2597,53921.58858,41997.37989,304531.0884,516729.5197,279996.4071,222195.7939,710166.2,...,18.33,19.33,13.5,14.08,0.0,40.92,38.25,22.0,26.33,15.67
9,2015,96762.818919,651325.8228,85178.16689,50744.69105,369514.9233,700148.362,363673.2162,288756.9609,1081445.0,...,17.17,21.25,12.58,17.92,0.0,43.75,31.0,18.08,27.42,11.42


In [29]:
# save df augmented to csv
df_augmented.to_csv(
    "/home/ettore/projects/hackathons/MVA_hackathon_2025/vintage_ai/data/processed/asset_classic_car_prices_with_popularity.csv",
    index=False,
)

In [14]:
from scipy.stats import pearsonr
import pandas as pd


def compute_normalized_pearson(df: pd.DataFrame, car_names: list) -> pd.DataFrame:
    """
    Computes normalized Pearson correlation between price and popularity for each car.

    Args:
        df (pd.DataFrame): DataFrame with '<car>' and '<car>_Popularity' columns.
        car_names (list): List of car base names (without '_Popularity').

    Returns:
        pd.DataFrame: Correlation results with 'car', 'pearson_r', and 'p_value'.
    """
    results = []

    for car in car_names:
        price_col = car
        pop_col = f"{car}_Popularity"

        if price_col not in df.columns or pop_col not in df.columns:
            continue

        sub_df = df[[price_col, pop_col]].dropna()

        if sub_df.empty or sub_df.shape[0] < 3:
            results.append({"car": car, "pearson_r": None, "p_value": None})
            continue

        # Normalize price (z-score)
        price_norm = (sub_df[price_col] - sub_df[price_col].mean()) / sub_df[
            price_col
        ].std()

        # Popularity is already in 0–100 scale, but we'll keep normalization consistent
        pop_norm = (sub_df[pop_col] - sub_df[pop_col].mean()) / sub_df[pop_col].std()

        r, p = pearsonr(price_norm, pop_norm)
        results.append({"car": car, "pearson_r": round(r, 3), "p_value": round(p, 4)})

    return pd.DataFrame(results)

In [15]:
correlation_df = compute_normalized_pearson(df_augmented, car_cols)
print(correlation_df.sort_values("pearson_r", ascending=False))

                          car  pearson_r  p_value
8                 Ferrari F40      0.722   0.0003
6             Ferrari 250 GTE      0.658   0.0016
11     Lamborghini Miura P400      0.563   0.0097
12  Lamborghini Countach 25th      0.547   0.0126
5             Ferrari 330 GTC      0.519   0.0191
14      Lamborghini Diablo VT      0.356   0.1237
0          Ferrari Testarossa      0.269   0.2519
9          Lamborghini 350 GT      0.141   0.5525
10         Lamborghini 400 GT      0.140   0.5558
19            Bugatti Type 57      0.121   0.6108
1         Ferrari Daytona GTB      0.118   0.6198
3             Ferrari 355 GTB     -0.130   0.5838
2       Ferrari 550 Maranello     -0.320   0.1691
18              Bugatti EB110     -0.496   0.0261
16           Maserati Mistral     -0.529   0.0164
7            Ferrari Dino 246     -0.573   0.0083
15           Maserati 3500 GT     -0.700   0.0006
17              Maserati MC12     -0.718   0.0004
13          Lamborghini LM002     -0.806   0.0000


In [18]:
from scipy.stats import pearsonr
import pandas as pd


def generate_car_insight(
    df_prices: pd.DataFrame, df_popularity: pd.DataFrame, car_name: str
) -> dict:
    """
    Generate insight scores for a single car: current popularity, correlation, and predictive score.
    """
    df_pop = df_popularity.copy()

    # Handle index to ensure 'Year' column exists
    if isinstance(df_pop.index, pd.DatetimeIndex):
        df_pop.index = df_pop.index.year
    df_pop.index.name = "Year"

    if "Year" not in df_pop.columns:
        df_pop = df_pop.reset_index()

    if "Global" not in df_pop.columns or car_name not in df_prices.columns:
        return {
            "car": car_name,
            "current_popularity": None,
            "correlation_score": None,
            "p_value": None,
            "predictive_score": None,
        }

    df_prices = df_prices.copy()
    df_prices["Year"] = df_prices["Year"].astype(int)
    df_pop["Year"] = df_pop["Year"].astype(int)

    df_merged = pd.merge(df_prices[["Year", car_name]], df_pop, on="Year", how="inner")
    df_merged = df_merged.dropna()

    if df_merged.empty or len(df_merged) < 3:
        return {
            "car": car_name,
            "current_popularity": (
                int(df_pop["Global"].iloc[-1]) if not df_pop.empty else None
            ),
            "correlation_score": None,
            "p_value": None,
            "predictive_score": None,
        }

    # Normalize
    price_z = (df_merged[car_name] - df_merged[car_name].mean()) / df_merged[
        car_name
    ].std()
    pop_z = (df_merged["Global"] - df_merged["Global"].mean()) / df_merged[
        "Global"
    ].std()

    # Pearson correlation
    r, p = pearsonr(price_z, pop_z)

    # Momentum (popularity trend)
    momentum = df_merged["Global"].diff().rolling(3).mean().iloc[-1]
    significance_boost = max(0, (1 - min(p, 0.05) / 0.05))
    predictive_score = max(0, min(100, momentum * significance_boost * 5))

    return {
        "car": car_name,
        "current_popularity": int(df_pop["Global"].iloc[-1]),
        "correlation_score": round(r * 100, 1),
        "p_value": round(p, 4),
        "predictive_score": round(predictive_score, 1),
    }

In [23]:
df = fetch_trends_global(car_name, start_date="2006-01-01", granularity="yearly")
df

In [22]:
car_name = "Ferrari Dino 246"
generate_car_insight(
    df_prices=df_asset_classic,
    df_popularity=fetch_trends_global(
        car_name, start_date="2006-01-01", granularity="yearly"
    ),
    car_name=car_name,
)

{'car': 'Ferrari Dino 246',
 'current_popularity': None,
 'correlation_score': None,
 'p_value': None,
 'predictive_score': None}

In [41]:
from datetime import date, timedelta
import time
import pandas as pd
from pytrends.request import TrendReq


def fetch_trends_global_robust(
    query: str,
    start_date: str = "2006-01-01",
    granularity: str = "weekly",
    slice_years: int = 5,
    retry_pause: int = 2,
    max_retries: int = 3,
) -> pd.DataFrame:
    """
    Robust Google Trends fetcher.
    1) Try Topic MID; if that fails → raw keyword.
    2) If the full-range request is empty → pull in `slice_years`-year chunks and stitch.
    3) Returns NaNs (not empty) if absolutely nothing is available.

    Args
    ----
    query : str            • Search phrase (e.g. "Ferrari Dino 246")
    start_date : str       • YYYY-MM-DD for earliest slice
    granularity : str      • 'weekly' (default), 'monthly', 'yearly' (resampled)
    slice_years : int      • Window size for fallback slicing
    retry_pause : int      • Seconds to wait between retries
    max_retries : int      • How many API retries before giving up

    Returns
    -------
    pd.DataFrame           • Index = date (or year), column = "Global"
    """

    def _build_and_pull(kw_list, tf):
        for _ in range(max_retries):
            try:
                py.build_payload(kw_list, timeframe=tf, geo="")
                out = py.interest_over_time()
                if not out.empty:
                    out = out.drop(columns="isPartial")
                return out
            except Exception:
                time.sleep(retry_pause)
        return pd.DataFrame()

    end_date = date.today().isoformat()
    timeframe_all = f"{start_date} {end_date}"
    py = TrendReq(
        hl="en-US",
        tz=360,
        timeout=(10, 25),
        backoff_factor=0.1,
        requests_args={"verify": False},
    )

    # --- Step 1: Topic MID first
    mid = None
    suggestions = py.suggestions(query)
    if suggestions:
        mid = suggestions[0]["mid"]
        df = _build_and_pull([mid], timeframe_all)
        if df.empty:
            mid = None  # will try keyword next
    else:
        df = pd.DataFrame()

    # --- Step 2: Raw keyword fallback
    if df.empty and mid is None:
        df = _build_and_pull([query], timeframe_all)

    # --- Step 3: Slice the timeframe if still empty
    if df.empty:
        print("⚠️  empty result, switching to sliced windows …")
        start = pd.to_datetime(start_date)
        today = pd.to_datetime(end_date)
        frames = []
        while start < today:
            stop = min(
                start + pd.DateOffset(years=slice_years) - timedelta(days=1), today
            )
            tf = f"{start.date()} {stop.date()}"
            chunk = _build_and_pull([query], tf)
            if not chunk.empty:
                frames.append(chunk)
            start = stop + timedelta(days=1)
        if frames:
            df = pd.concat(frames).sort_index()

    # --- Step 4: final tidy-up / resample / NaNs
    if df.empty:
        print("🤷 No Google Trends data available for this query.")
        return pd.DataFrame({"Global": []})  # clear but not None

    if granularity == "monthly":
        df = df.resample("M").mean()
    elif granularity == "yearly":
        df = df.resample("Y").mean()

    return df.rename(columns={df.columns[0]: "Global"})

In [42]:
df_pop = fetch_trends_global_robust(
    "Bugatti Type 57",  # try Topic first, then keyword
    start_date="2015-01-01",
    granularity="yearly",
)



TypeError: Retry.__init__() got an unexpected keyword argument 'method_whitelist'

In [33]:
from scipy.stats import pearsonr
import pandas as pd


def generate_single_car_popularity_metrics(df: pd.DataFrame, car_name: str) -> dict:
    """
    Generate popularity metrics for a single car:
    - Current popularity
    - Correlation between price and popularity (z-scored)
    - P-value of the correlation
    - Predictive score: trend momentum * significance

    Args:
        df (pd.DataFrame): DataFrame with 'Year', price column, and popularity column for the car.
        car_name (str): Base name of the car (e.g., 'Ferrari Dino 246').

    Returns:
        dict: Metrics for the car.
    """
    price_col = car_name
    pop_col = f"{car_name}_Popularity"

    if price_col not in df.columns or pop_col not in df.columns:
        return {
            "car": car_name,
            "current_popularity": None,
            "correlation_score": None,
            "p_value": None,
            "predictive_score": None,
        }

    sub_df = df[["Year", price_col, pop_col]].dropna()

    if sub_df.empty or len(sub_df) < 3:
        return {
            "car": car_name,
            "current_popularity": None,
            "correlation_score": None,
            "p_value": None,
            "predictive_score": None,
        }

    price_z = (sub_df[price_col] - sub_df[price_col].mean()) / sub_df[price_col].std()
    pop_z = (sub_df[pop_col] - sub_df[pop_col].mean()) / sub_df[pop_col].std()

    r, p = pearsonr(price_z, pop_z)

    momentum = sub_df[pop_col].diff().rolling(3).mean().iloc[-1]
    significance_boost = max(0, (1 - min(p, 0.05) / 0.05))
    predictive_score = max(0, min(100, momentum * significance_boost * 5))

    return {
        "car": car_name,
        "current_popularity": int(sub_df[pop_col].iloc[-1]),
        "correlation_score": round(r * 100, 1),
        "p_value": round(p, 4),
        "predictive_score": round(predictive_score, 1),
    }

In [34]:
generate_single_car_popularity_metrics(df=df_augmented, car_name="Ferrari Dino 246")

{'car': 'Ferrari Dino 246',
 'current_popularity': 28,
 'correlation_score': np.float64(-57.3),
 'p_value': np.float64(0.0083),
 'predictive_score': 0}