In [98]:
import requests
import zipfile
import io
import pandas as pd
import re

import pandas as pd

import pandas as pd

class FamaFrenchFactor:
    """Download and parse factor files from Ken French's data library.

    The public entry point is :meth:`get_data`; it downloads a zipped CSV,
    keeps either the monthly (``YYYYMM``) or the annual (``YYYY``) rows and
    returns them as a numeric DataFrame with a datetime index.
    """

    BASE_URL = "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/"

    # Seconds to wait for the server before giving up instead of hanging the kernel.
    REQUEST_TIMEOUT = 30

    FAMA_FRENCH_REGIONS = [
        "US", "North_America", "Europe", "Japan",
        "Asia_Pacific_ex_Japan", "Developed", "Developed_ex_US"
    ]
    FAMA_FRENCH_FACTORS = ["3", "5", "MOM"]

    # US files do not follow the "<region>_<factors>_Factors" naming scheme.
    _US_FILES = {
        "3": "F-F_Research_Data_Factors_CSV.zip",
        "5": "F-F_Research_Data_5_Factors_2x3_CSV.zip",
        "MOM": "F-F_Momentum_Factor_CSV.zip",
    }

    @classmethod
    def get_data(cls, annual=True, region="North_America", factors="5"):
        """Return the requested factor data as a DataFrame.

        Args:
            annual: If true keep the annual rows (index set to each year's
                end), otherwise keep the monthly rows (index set to the first
                day of each month).
            region: One region name from ``FAMA_FRENCH_REGIONS``, or a
                list/tuple of them. With several regions the per-region frames
                are joined on the date index and every column is prefixed
                with ``"<region>_"``.
            factors: ``3``, ``5`` or ``"MOM"`` (int or str, case-insensitive).

        Returns:
            pandas.DataFrame of numeric factor values with a DatetimeIndex.

        Raises:
            ValueError: Unknown region or factor set, empty region sequence,
                or a downloaded file in which no header row could be found.
            requests.HTTPError: The download returned an error status.
        """
        factors_str = str(factors).upper()

        # Several regions: fetch each one and place them side by side.
        if isinstance(region, (list, tuple)):
            if not region:
                raise ValueError("At least one region is required.")
            frames = []
            for name in region:
                single = cls.get_data(annual=annual, region=name, factors=factors_str)
                # Prefix with the cleaned name so " Japan" does not yield " Japan_SMB".
                frames.append(single.add_prefix(f"{name.strip()}_"))
            return pd.concat(frames, axis=1)

        region = region.strip()
        if region not in cls.FAMA_FRENCH_REGIONS:
            raise ValueError(
                f"Region '{region}' not supported. Choose from {cls.FAMA_FRENCH_REGIONS}"
            )
        if factors_str not in cls.FAMA_FRENCH_FACTORS:
            raise ValueError(
                f"Factor '{factors}' not valid. Choose from {cls.FAMA_FRENCH_FACTORS}"
            )

        url = cls._build_url(region, factors_str)

        response = requests.get(url, timeout=cls.REQUEST_TIMEOUT)
        response.raise_for_status()

        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            # Only the first archive member is read.
            with archive.open(archive.namelist()[0]) as handle:
                return cls._parse_csv(handle, annual)

    @classmethod
    def _build_url(cls, region, factors_str):
        """Return the download URL for an already validated region / factor set."""
        if region == "US":
            filename = cls._US_FILES[factors_str]
        elif factors_str == "MOM":
            filename = f"{region}_Mom_Factor_CSV.zip"
        else:
            filename = f"{region}_{factors_str}_Factors_CSV.zip"
        return cls.BASE_URL + filename

    @staticmethod
    def _parse_csv(file, annual):
        """Parse one factor CSV into a numeric DataFrame.

        Args:
            file: Iterable of lines (bytes are decoded as latin1).
            annual: Keep rows labelled ``YYYY`` when true, ``YYYYMM`` otherwise.

        Returns:
            DataFrame indexed by date; a ``Mom`` column is renamed to ``WML``.

        Raises:
            ValueError: No line containing a ``Mkt-RF``/``WML``/``Mom`` column
                header was found.
        """
        lines = [raw.decode("latin1") if isinstance(raw, bytes) else raw for raw in file]

        # Everything before the first header row is free-text description.
        header_line_index = next(
            (i for i, text in enumerate(lines) if re.search(r",\s*(Mkt-RF|WML|Mom)", text)),
            None,
        )
        if header_line_index is None:
            raise ValueError(
                "Could not find a factor header row (Mkt-RF / WML / Mom) in the file."
            )

        df = pd.read_csv(io.StringIO("".join(lines[header_line_index:])), index_col=0)
        df.columns = df.columns.str.strip()
        df.index = df.index.astype(str).str.strip()

        # The row label format tells the sections apart: YYYY vs. YYYYMM.
        if annual:
            df = df[df.index.str.fullmatch(r"\d{4}")]
            df.index = pd.to_datetime(df.index, format="%Y") + pd.offsets.YearEnd(0)
        else:
            df = df[df.index.str.fullmatch(r"\d{6}")]
            df.index = pd.to_datetime(df.index, format="%Y%m")

        df = df.apply(pd.to_numeric, errors="coerce").dropna(how="all")

        # Normalise the momentum column name across files.
        return df.rename(columns={"Mom": "WML"})

In [105]:
# Annual 5-factor data for Japan and Europe. With a list of regions every
# column is prefixed with its region name (e.g. "Japan_SMB", "Europe_SMB").
df = FamaFrenchFactor.get_data(annual=True, region=["Japan", "Europe"], factors=5)

display(df)


Unnamed: 0,Japan_Mkt-RF,Japan_SMB,Japan_HML,Japan_RMW,Japan_CMA,Japan_RF,Europe_Mkt-RF,Europe_SMB,Europe_HML,Europe_RMW,Europe_CMA,Europe_RF
1991-12-31,0.98,0.98,3.94,-1.98,3.75,5.6,4.55,-10.11,-6.87,18.77,-2.24,5.6
1992-12-31,-27.06,-5.55,4.12,3.03,8.72,3.51,-11.29,-11.38,-1.31,8.89,2.71,3.51
1993-12-31,21.57,-5.66,5.39,2.92,-4.18,2.9,25.52,5.14,22.25,-4.54,13.48,2.9
1994-12-31,18.84,8.94,18.79,-14.33,13.9,3.9,-0.48,9.33,6.68,2.7,4.28,3.9
1995-12-31,-7.98,-4.07,-4.04,-2.95,3.19,5.6,13.33,-9.73,-6.66,8.73,-7.44,5.6
1996-12-31,-21.31,-6.23,7.52,10.13,-6.9,5.21,16.52,-1.27,2.01,15.7,-3.08,5.21
1997-12-31,-33.89,-25.7,-8.71,15.16,-13.85,5.26,14.85,-12.0,12.51,-1.31,1.27,5.26
1998-12-31,2.73,9.13,-1.55,6.17,0.6,4.86,20.52,-15.14,-0.95,2.06,-6.0,4.86
1999-12-31,77.05,-21.79,-65.29,43.33,-75.43,4.68,15.03,11.75,-23.26,11.1,-20.79,4.68
2000-12-31,-38.94,7.44,38.38,-8.32,13.92,5.89,-15.88,-3.54,27.73,-1.78,15.67,5.89


In [20]:
import pandas as pd
import requests
import zipfile
import io
import re

class FamaFrenchDownloader:
    """Download Fama-French factor files by region, factor set and frequency."""

    BASE_URL = "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/"

    # Seconds to wait for the server before giving up instead of hanging the kernel.
    REQUEST_TIMEOUT = 30

    # Accepted spellings -> canonical frequency.
    _FREQUENCY_ALIASES = {
        "daily": "daily",
        "monthly": "monthly",
        "annual": "annual",
        "yearly": "annual",
        "year": "annual",
    }

    # US files do not follow the "<region>_<factors>_Factors" naming scheme.
    _US_FILE_STEMS = {
        "3": "F-F_Research_Data_Factors",
        "5": "F-F_Research_Data_5_Factors_2x3",
        "momentum": "F-F_Momentum_Factor",
    }

    @classmethod
    def get_data(cls, region: str = "Developed", factors: str = "3", frequency: str = "monthly") -> pd.DataFrame:
        """Download the factors for one region.

        Args:
            region: Region name such as 'Developed' or 'Europe'; spaces are
                converted to underscores. 'US' selects the US research files.
            factors: '3', '5' or 'momentum' ('mom' is accepted as well).
            frequency: 'monthly', 'daily' or 'annual' ('yearly'/'year' are
                aliases of 'annual'). Annual rows are read from the same file
                as the monthly ones.

        Returns:
            pd.DataFrame: Numeric factor values with a datetime index.

        Raises:
            ValueError: Unknown factor set or frequency, or a downloaded file
                in which no header row could be found.
            requests.HTTPError: The download returned an error status.
        """
        url = cls._build_url(region, factors, frequency)

        response = requests.get(url, timeout=cls.REQUEST_TIMEOUT)
        response.raise_for_status()

        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            # Only the first archive member is read.
            with archive.open(archive.namelist()[0]) as handle:
                return cls._parse_csv(handle, frequency)

    @staticmethod
    def _normalize_frequency(frequency: str) -> str:
        """Map a user-supplied frequency to 'daily', 'monthly' or 'annual'."""
        key = str(frequency).strip().lower()
        try:
            return FamaFrenchDownloader._FREQUENCY_ALIASES[key]
        except KeyError:
            # Previously any unknown value silently fell back to monthly data.
            raise ValueError(
                f"Frequenza non valida: {frequency} (usa 'daily', 'monthly', 'annual')"
            ) from None

    @staticmethod
    def _build_url(region: str, factors: str, frequency: str) -> str:
        """Return the download URL for the given region, factor set and frequency."""
        freq = FamaFrenchDownloader._normalize_frequency(frequency)
        region_clean = region.strip().replace(" ", "_")
        # Only daily data has its own file; annual rows come from the monthly file.
        freq_suffix = "_daily" if freq == "daily" else ""

        factors_key = str(factors).strip().lower()
        if factors_key == "mom":
            factors_key = "momentum"
        if factors_key not in {"3", "5", "momentum"}:
            raise ValueError(f"Tipo di fattori non valido: {factors} (usa '3', '5', 'momentum')")

        if region_clean == "US":
            stem = FamaFrenchDownloader._US_FILE_STEMS[factors_key]
            filename = f"{stem}{freq_suffix}_CSV.zip"
        elif factors_key == "momentum":
            filename = f"{region_clean}_Mom_Factor{freq_suffix}_CSV.zip"
        else:
            filename = f"{region_clean}_{factors_key}_Factors{freq_suffix}_CSV.zip"

        return FamaFrenchDownloader.BASE_URL + filename

    @staticmethod
    def _parse_csv(file, frequency: str) -> pd.DataFrame:
        """Parse one factor CSV, keeping the rows of the requested frequency.

        Args:
            file: Iterable of lines (bytes are decoded as latin1).
            frequency: 'daily' keeps ``YYYYMMDD`` rows, 'monthly' keeps
                ``YYYYMM`` rows, 'annual' keeps ``YYYY`` rows (indexed at
                year end).

        Raises:
            ValueError: Unknown frequency, or no header row found.
        """
        freq = FamaFrenchDownloader._normalize_frequency(frequency)
        lines = [raw.decode("latin1") if isinstance(raw, bytes) else raw for raw in file]

        # Header row looks like ",Mkt-RF,SMB,HML,..." or, for momentum, ",Mom" / ",WML".
        header_idx = next(
            (i for i, text in enumerate(lines)
             if re.match(r",?\s*((Mkt|Rm)?-?RF|Mom|WML)", text)),
            None,
        )
        if header_idx is None:
            raise ValueError("Could not find a factor header row in the file.")

        df = pd.read_csv(io.StringIO("".join(lines[header_idx:])), index_col=0)
        df.columns = df.columns.str.strip()
        df.index = df.index.astype(str).str.strip()

        pattern, date_format = {
            "daily": (r"\d{8}", "%Y%m%d"),
            "monthly": (r"\d{6}", "%Y%m"),
            "annual": (r"\d{4}", "%Y"),
        }[freq]

        df = df[df.index.str.fullmatch(pattern)]
        df.index = pd.to_datetime(df.index, format=date_format)
        if freq == "annual":
            # Label each year by its last day rather than January 1st.
            df.index = df.index + pd.offsets.YearEnd(0)

        return df.apply(pd.to_numeric, errors="coerce").dropna(how="all")


In [31]:
# "year" is not one of the frequencies the downloader documents ('monthly' or
# 'daily') and was silently treated as monthly, so request monthly data explicitly.
# NOTE(review): the name says "3f" but factors="5" is requested; the name is kept
# in case other cells reference it.
df_3f = FamaFrenchDownloader.get_data("Japan", factors="5", frequency="monthly")
# df_mom = FamaFrenchDownloader.get_data("Europe", factors="momentum", frequency="daily")

display(df_3f)
# print(df_mom.tail())

Unnamed: 0,Mkt-RF,SMB,HML,RMW,CMA,RF
1990-07-01,0.10,6.27,3.63,0.60,0.24,0.68
1990-08-01,-11.88,-4.97,0.33,1.71,-0.96,0.66
1990-09-01,-17.38,0.64,0.14,-1.12,-0.11,0.60
1990-10-01,24.90,0.71,-4.55,-0.12,4.75,0.68
1990-11-01,-14.12,-5.20,-0.20,3.86,-2.18,0.57
...,...,...,...,...,...,...
2024-11-01,0.43,0.92,2.84,-1.09,0.81,0.40
2024-12-01,-1.21,-1.16,2.35,0.84,-0.53,0.37
2025-01-01,0.87,-0.12,0.42,-1.09,0.28,0.37
2025-02-01,-1.06,2.08,2.55,-0.75,3.18,0.33
