In [None]:
class AIMetricsProcessor:
    """
    Processes earnings call data to extract AI metrics and uploads the results to S3.

    Assumes an external LLM instance is provided (which works with invoke_json_former
    that already handles retries).

    S3 clients and reversed mappings are cached on the instance, so repeated calls
    do not re-fetch credentials or re-download the mapping file.
    NOTE(review): the caches assume a client and a mapping stay valid for the
    lifetime of the processor -- confirm credentials do not expire mid-session.
    """

    # Bucket holding the mapping file read by get_ai_metrics; override on a
    # subclass or instance to point at a different bucket.
    MAPPING_BUCKET = "bbg_cid_refinitiv_id_map"

    @staticmethod
    def _setup_logging() -> None:
        """
        Sets up logging with Loguru.
        This method removes the currently registered handlers and adds a new one
        that prints messages at INFO level and above.
        """
        logger.remove()  # Drops every handler currently registered on the logger.
        logger.add(sink=lambda msg: print(msg, end=""), level="INFO")

    def __init__(self, llm: Any, batch_size: int = 50):
        """
        Initialize the processor with an LLM instance and a default batch size.

        Args:
            llm: Object passed as the first argument to invoke_json_former.
            batch_size: Number of rows per batch when get_ai_metrics is called
                without an explicit batch size.
        """
        self._setup_logging()
        self.llm = llm
        self.batch_size = batch_size
        # Cache of S3 clients keyed by bucket name.
        self.s3_clients: Dict[str, Any] = {}
        # Cache of reversed mappings keyed by "<bucket>/<mapping_file>".
        self.mappings: Dict[str, Dict] = {}

    def _get_s3_client(self, bucket: str) -> Any:
        """
        Return an S3 client instance for the given bucket.

        The client is created on first use and reused afterwards, so
        credentials are fetched at most once per bucket.
        """
        if bucket not in self.s3_clients:
            credentials = get_s3_credentials()  # Replace with your actual credentials function.
            self.s3_clients[bucket] = S3(credentials=credentials, bucket=bucket)
        return self.s3_clients[bucket]

    def _load_mappings(self, bucket: str, mapping_file: str = "mapping.json") -> Dict:
        """
        Load and reverse the mapping from the specified S3 bucket.

        The file is parsed as a JSON object and inverted so that each original
        value becomes a key pointing at its original key. The inverted mapping
        is cached per (bucket, mapping_file) pair.
        """
        cache_key = f"{bucket}/{mapping_file}"
        if cache_key not in self.mappings:
            s3_client = self._get_s3_client(bucket)
            raw_mapping = json.loads(s3_client.read_file(mapping_file))
            # Reverse mapping: key becomes value and vice versa.
            self.mappings[cache_key] = {v: k for k, v in raw_mapping.items()}
        return self.mappings[cache_key]

    def _load_earnings_data(self, start_date: dt.date, end_date: dt.date) -> pd.DataFrame:
        """
        Load earnings data via Spark (using load_earnings_call_data_ai_metrics) and convert to Pandas.
        """
        df = load_earnings_call_data_ai_metrics(start_date, end_date)
        return df.toPandas()

    def _process_batch(self, batch: pd.DataFrame, metric: Any, mapping: Dict) -> pd.DataFrame:
        """
        Process a batch of earnings data and extract AI metrics.

        For every row, invoke_json_former is called with the row's "eventBody"
        and its "value" / "detail" entries are stored in new "Metrics" /
        "Detail" fields. If the row's "ID_BB_COMPANY" is a key of ``mapping``
        it is replaced by the mapped value.

        Rows that raise are logged and skipped (best effort), so the result may
        contain fewer rows than ``batch``; it is an empty DataFrame when every
        row fails.
        """
        results = []
        for _, row in batch.iterrows():
            try:
                # invoke_json_former is assumed to handle retries.
                metrics_data = invoke_json_former(self.llm, row["eventBody"], metric)
                row_copy = row.copy()
                row_copy["Metrics"] = metrics_data.get("value")
                row_copy["Detail"] = metrics_data.get("detail")
                if row_copy.get("ID_BB_COMPANY") in mapping:
                    row_copy["ID_BB_COMPANY"] = mapping[row_copy["ID_BB_COMPANY"]]
                results.append(row_copy)
            except Exception as e:
                # Deliberately broad: one bad row must not abort the whole batch.
                logger.error(f"Error processing row {row.get('companyTicker', 'unknown')}: {e}")
        return pd.DataFrame(results)

    def get_ai_metrics(
        self,
        metric: Any,
        s3_bucket_id: str,
        start_date: dt.date,
        end_date: dt.date,
        batch_size: Optional[int] = None,
        output_prefix: Optional[str] = None
    ) -> Optional[str]:
        """
        Extract AI metrics from earnings data and upload as a parquet file to S3.

        The earnings data is loaded via Spark (for the given date range) and processed in batches.
        If no output_prefix is provided, one is generated based on the date range.

        Args:
            metric: Passed through to invoke_json_former for every row.
            s3_bucket_id: Bucket the parquet file is written to.
            start_date: Start of the date range handed to the data loader.
            end_date: End of the date range handed to the data loader.
            batch_size: Rows per batch; a falsy value (None or 0) falls back to
                the instance default.
            output_prefix: Output object name without the ".parquet" suffix.

        Returns:
            The path written to S3, or None when there was no input data or no
            row could be processed.

        Raises:
            ValueError: If the effective batch size is not positive.
        """
        effective_batch_size = batch_size or self.batch_size
        # Fail before any I/O: a zero step makes range() raise only after the
        # data load, and a negative step silently processes nothing.
        if effective_batch_size <= 0:
            raise ValueError(
                f"batch_size must be a positive integer, got {effective_batch_size}"
            )

        # Created up front so credential problems surface before the LLM work.
        s3_client = self._get_s3_client(s3_bucket_id)
        mapping = self._load_mappings(self.MAPPING_BUCKET)
        df = self._load_earnings_data(start_date, end_date)
        if df.empty:
            logger.warning("No earnings data to process")
            return None

        # Rename columns as needed for downstream processing.
        df = df.rename(columns={
            "FILE_DATE": "Document Date",
            "SOURCE_TIMESTAMP": "Event Time",
            "companyId": "ID_BB_COMPANY",
            "year": "Year",
            "quarter": "Quarter",
        })

        # Collect per-batch results and concatenate once; concatenating inside
        # the loop re-copies the accumulated frame on every iteration.
        processed_batches = []
        batch_starts = range(0, len(df), effective_batch_size)
        for batch_number, start in enumerate(batch_starts, start=1):
            batch = df.iloc[start:start + effective_batch_size]
            logger.info(f"Processing batch {batch_number} ({len(batch)} rows)")
            processed_batch = self._process_batch(batch, metric, mapping)
            if not processed_batch.empty:
                processed_batches.append(processed_batch)

        if not processed_batches:
            logger.warning("No data processed")
            return None

        processed_df = pd.concat(processed_batches, ignore_index=True)

        # Generate output prefix if not provided.
        if output_prefix is None:
            output_prefix = f"fluentai_{start_date}_{end_date}"
        output_path = f"{output_prefix}.parquet"

        parquet_buffer = BytesIO()
        processed_df.to_parquet(parquet_buffer, index=False)
        # getvalue() returns the whole buffer regardless of the stream position.
        s3_client.write_file(output_path, parquet_buffer.getvalue())
        logger.info(f"Results saved to S3 at: {output_path}")
        return output_path