In [1]:
!pip install firebase-admin



In [5]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.0
    Uninstalling pip-25.0:
      Successfully uninstalled pip-25.0
Successfully installed pip-25.0.1


In [6]:
!pip install --upgrade firebase-admin



In [3]:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import pandas as pd
import matplotlib as plt
import numpy as np

# Initialize Firebase Admin SDK (Only do this ONCE in your application)
cred = credentials.Certificate("octogone-2024-firebase-adminsdk-c9abj-01878c54f4.json")
try:
  firebase_admin.initialize_app(cred)
except Exception as e:
  print(f"Firebase initialization error: {e}")
  exit()

In [6]:
!pip install --upgrade google-cloud-firestore



In [7]:
import csv
import pandas as pd
import firebase_admin
from google.cloud.firestore_v1.field_path import FieldPath

def clean_string(text):
    cleaned_text = text.replace('\r', '').replace('\n', '').replace('\t', '')  # Remove \r, \n, and \t
    cleaned_text = ''.join(char for char in cleaned_text if char.isalnum() or char.isspace())
    return cleaned_text

def extract_product_data(client_id, page_size=3000, start_after=None):
    try:
        # Create a Firestore client
        db = firestore.client()
        products_ref = db.collection(f"clients/{client_id}/establishments/global/products")
        query = products_ref.order_by(FieldPath.document_id())

        if start_after:
            query = query.start_after(start_after)

        products = query.limit(page_size).stream()

        all_product_data = []
        last_product = None  # To store the last document of the current page

        for product in products:
            last_product = product # Store the current product for pagination
            product_data = product.to_dict()
            product_id = product.id
            brand = product_data.get("brand")
            product_name = product_data.get("name")
            is_active = product_data.get("isActive")
            upc = product_data.get("upc")
            category_id = product_data.get("categoryId")
            has_all_establishments_access = product_data.get("hasAllEstablishmentAccess")

            if product_name:
                product_name_cl = clean_string(product_name)
            else:
                print(f"Warning: Product {product_id} for client {client_id} has no name.")
                product_name_cl = None

            ingredient_info_list = []
            ingredient_info_ref = product.reference.collection("ingredientInfo")
            
            for ingredient_info in ingredient_info_ref.stream():
                ingredient_info_count = sum(1 for _ in ingredient_info_ref.stream())
                print(f"Found {ingredient_info_count} ingredientInfo entries for product {product_id}.")
                ingredient_info_data = ingredient_info.to_dict()
                unit_equivalence_list = []
                volume_equivalence_list = []
                weight_equivalence_list = []

                unit_equivalence_ref = ingredient_info.reference.collection("unitEquivalence")
                for unit_equivalence in unit_equivalence_ref.stream():
                    unit_equivalence_list.append(unit_equivalence.to_dict())

                volume_equivalence_ref = ingredient_info.reference.collection("volumeEquivalence")
                for volume_equivalence in volume_equivalence_ref.stream():
                    volume_equivalence_list.append(volume_equivalence.to_dict())

                weight_equivalence_ref = ingredient_info.reference.collection("weightEquivalence")
                for weight_equivalence in weight_equivalence_ref.stream():
                    weight_equivalence_list.append(weight_equivalence.to_dict())

                ingredient_info_list.append({
                    **ingredient_info_data,
                    "unitEquivalence": unit_equivalence_list,
                    "volumeEquivalence": volume_equivalence_list,
                    "weightEquivalence": weight_equivalence_list
                })

            all_product_data.append({
                "client_id": client_id,
                "product_id": product_id,
                "brand": brand,
                "product_name": product_name_cl,
                "is_active": is_active,
                "upc": upc,
                "category_id": category_id,
                "has_all_establishments_access": has_all_establishments_access,
                "ingredientInfo": ingredient_info_list,
            })


        print(f"extract_product_data({client_id}) returned {len(all_product_data)} products.")
        return all_product_data, last_product  # Return data and the last document

    except Exception as e:
        print(f"Error extracting product data: {e}")
        return None, None

def extract_products(csv_filepath, output_filepath):
    try:
        with open(csv_filepath, 'r') as csvfile:
            reader = csv.reader(csvfile)
            next(reader, None)  # skip the header
            client_ids = [row[0] for row in reader] # List comprehension for efficiency
    
    except FileNotFoundError:
        print(f"Error: CSV file not found at {csv_filepath}")
        return
    except Exception as e:
        print(f"Error reading CSV: {e}")
        return
    
    df_data = []
    
    for client_id in client_ids:
        all_products = []
        next_page_token = None
        while True:
            product_data, last_doc = extract_product_data(client_id, page_size=3000, start_after=next_page_token)
            if product_data is None:  # Error handling
                print(f"Error retrieving products for client {client_id}")
                break  # Exit the inner loop if there's an error
            
            all_products.extend(product_data)

            if len(product_data) < 3000:
                break
            next_page_token = last_doc
        print(f"Number of products for client {client_id}: {len(all_products)}")
     
        for product in all_products:
            # Check if 'ingredientInfo' exists and is a non-empty list
            ingredient_info = product.get("ingredientInfo", [])
            print(f"No ingredientInfo {len(ingredient_info)}")
    
            if ingredient_info:  # Only process if ingredientInfo is present
               for ingredient in ingredient_info:
                   # For each ingredient, handle the presence of each child equivalence type
                   unit_equivalence = ingredient.get("unitEquivalence", [])
                   weight_equivalence = ingredient.get("weightEquivalence", [])
                   volume_equivalence = ingredient.get("volumeEquivalence", [])
                                       
                   # Ensure at least one of the equivalence lists is non-empty
                   if unit_equivalence:
                      # If unit_equivalence exists, check for weight_equivalence or volume_equivalence
                      if weight_equivalence:
                         # Process both unit_equivalence and weight_equivalence
                         for unit_eq in unit_equivalence:
                             for weight_eq in weight_equivalence:
                                 df_data.append({
                                 "product_id": product["product_id"],
                                 "brand": product.get("brand"),
                                 "product_name": product.get("product_name"),
                                 "is_active": product.get("is_active"),
                                 "upc": product.get("upc"),
                                 "category_id": product.get("category_id"),
                                 "has_all_establishments_access": product.get("has_all_establishments_access"),
                                 "ingredient_is_active": ingredient.get("isActive"),
                                 "ingredient_is_unit": ingredient.get("isUnit"),
                                 "ingredient_is_volume": ingredient.get("isVolume"),
                                 "unit_eq_abbreviation": unit_eq.get("abbreviation"),
                                 "unit_eq_is_active": unit_eq.get("isActive"),
                                 "unit_eq_measure_type": unit_eq.get("measureType"),
                                 "unit_eq_measure_name": unit_eq.get("name"),
                                 "weight_eq_measure_id": weight_eq.get("measureId"),
                                 "weight_eq_qty_of_measure": weight_eq.get("qtyOfMeasure"),
                                 })
                      elif volume_equivalence:
                           # Process both unit_equivalence and volume_equivalence
                           for unit_eq in unit_equivalence:
                               for vol_eq in volume_equivalence:
                                   df_data.append({
                                   "product_id": product["product_id"],
                                   "brand": product.get("brand"),
                                   "product_name": product.get("product_name"),
                                   "is_active": product.get("is_active"),
                                   "upc": product.get("upc"),
                                   "category_id": product.get("category_id"),
                                   "has_all_establishments_access": product.get("has_all_establishments_access"),
                                   "ingredient_is_active": ingredient.get("isActive"),
                                   "ingredient_is_unit": ingredient.get("isUnit"),
                                   "ingredient_is_volume": ingredient.get("isVolume"),
                                   "unit_eq_abbreviation": unit_eq.get("abbreviation"),
                                   "unit_eq_is_active": unit_eq.get("isActive"),
                                   "unit_eq_measure_type": unit_eq.get("measureType"),
                                   "unit_eq_measure_name": unit_eq.get("name"),
                                   "vol_eq_measure_id": vol_eq.get("measureId"),
                                   "vol_eq_qty_of_measure": vol_eq.get("qtyOfMeasure")
                                   })
                      else:
                           # Process only unit_equivalence if no weight or volume equivalence exists
                           for unit_eq in unit_equivalence:
                               df_data.append({
                               "product_id": product["product_id"],
                               "brand": product.get("brand"),
                               "product_name": product.get("product_name"),
                               "is_active": product.get("is_active"),
                               "upc": product.get("upc"),
                               "category_id": product.get("category_id"),
                               "has_all_establishments_access": product.get("has_all_establishments_access"),
                               "ingredient_is_active": ingredient.get("isActive"),
                               "ingredient_is_unit": ingredient.get("isUnit"),
                               "ingredient_is_volume": ingredient.get("isVolume"),
                               "unit_eq_abbreviation": unit_eq.get("abbreviation"),
                               "unit_eq_is_active": unit_eq.get("isActive"),
                               "unit_eq_measure_type": unit_eq.get("measureType"),
                               "unit_eq_measure_name": unit_eq.get("name"),
                               })
                   elif weight_equivalence:
                        # If unit_equivalence doesn't exist, check for weight_equivalence
                        for weight_eq in weight_equivalence:
                            if volume_equivalence:
                               # Process weight_equivalence and volume_equivalence
                               for vol_eq in volume_equivalence:
                                   df_data.append({
                                   "product_id": product["product_id"],
                                   "brand": product.get("brand"),
                                   "product_name": product.get("product_name"),
                                   "is_active": product.get("is_active"),
                                   "upc": product.get("upc"),
                                   "category_id": product.get("category_id"),
                                   "has_all_establishments_access": product.get("has_all_establishments_access"),
                                   "ingredient_is_active": ingredient.get("isActive"),
                                   "ingredient_is_unit": ingredient.get("isUnit"),
                                   "ingredient_is_volume": ingredient.get("isVolume"),
                                   "weight_eq_measure_id": weight_eq.get("measureId"),
                                   "weight_eq_qty_of_measure": weight_eq.get("qtyOfMeasure"),
                                   "vol_eq_measure_id": vol_eq.get("measureId"),
                                   "vol_eq_qty_of_measure": vol_eq.get("qtyOfMeasure")
                                   })
                            else:
                                   # Process only weight_equivalence if no volume_equivalence exists
                                   df_data.append({
                                   "product_id": product["product_id"],
                                   "brand": product.get("brand"),
                                   "product_name": product.get("product_name"),
                                   "is_active": product.get("is_active"),
                                   "upc": product.get("upc"),
                                   "category_id": product.get("category_id"),
                                   "has_all_establishments_access": product.get("has_all_establishments_access"),
                                   "ingredient_is_active": ingredient.get("isActive"),
                                   "ingredient_is_unit": ingredient.get("isUnit"),
                                   "ingredient_is_volume": ingredient.get("isVolume"),
                                   "weight_eq_measure_id": weight_eq.get("measureId"),
                                   "weight_eq_qty_of_measure": weight_eq.get("qtyOfMeasure")
                                   })
                   elif volume_equivalence:
                        # Process only volume_equivalence if it exists and others do not
                        for vol_eq in volume_equivalence:
                            df_data.append({
                            "product_id": product["product_id"],
                            "brand": product.get("brand"),
                            "product_name": product.get("product_name"),
                            "is_active": product.get("is_active"),
                            "upc": product.get("upc"),
                            "category_id": product.get("category_id"),
                            "has_all_establishments_access": product.get("has_all_establishments_access"),
                            "ingredient_is_active": ingredient.get("isActive"),
                            "ingredient_is_unit": ingredient.get("isUnit"),
                            "ingredient_is_volume": ingredient.get("isVolume"),
                            "vol_eq_measure_id": vol_eq.get("measureId"),
                            "vol_eq_qty_of_measure": vol_eq.get("qtyOfMeasure")
                            })
            else:
                 pass  # print(f"No ingredientInfo for product {product.get('product_id')}")    
    df = pd.DataFrame(df_data)
    if df.shape[0] > 0:
       print(f"The DataFrame has {df.shape[0]} rows and {df.shape[1]} columns.")
       print(df.head())  # Optionally print the first few rows to check the contents
    else:
       print("The DataFrame is empty.")

    try:
        # Convert 'name' column to string type to handle potential mixed types
        df['product_name'] = df['product_name'].astype(str)
        # Replace multiple spaces with a single space and then strip leading/trailing spaces
        df['product_name'] = df['product_name'].str.replace(r'\s+', ' ', regex=True).str.strip() 
        # Filter out rows where 'name' is empty after cleaning (meaning it was all spaces) or purely numeric
        df = df[(df['product_name'] != '') & (~df['product_name'].str.isnumeric())]      
        df.to_csv(output_filepath, index=False, encoding='utf-8')
        print(f"Products saved to {output_filepath}")
    except Exception as e:
        print(f"Error saving to CSV: {e}")

# Example usage:
csv_file = "octogone_clients_cleaned.csv"
output_file = "octogone_products_with_measures.csv"
extract_products(csv_file, output_file)

extract_product_data(1CizJe6Mc8lUi3I6ul9n) returned 349 products.
Number of products for client 1CizJe6Mc8lUi3I6ul9n: 349
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 0
No ingredientInfo 

KeyboardInterrupt: 

In [None]:
import csv
import pandas as pd
import firebase_admin
from google.cloud.firestore_v1.field_path import FieldPath

def clean_string(text):
    cleaned_text = text.replace('\r', '').replace('\n', '').replace('\t', '')  # Remove \r, \n, and \t
    cleaned_text = ''.join(char for char in cleaned_text if char.isalnum() or char.isspace())
    return cleaned_text

def extract_product_data(client_id, page_size=3000, start_after=None):
    try:
        # Create a Firestore client
        db = firestore.client()
        products_ref = db.collection(f"clients/{client_id}/establishments/global/products")
        query = products_ref.order_by(FieldPath.document_id())

        if start_after:
            query = query.start_after(start_after)

        products = query.limit(page_size).stream()

        all_product_data = []
        last_product = None  # To store the last document of the current page

        for product in products:
            last_product = product # Store the current product for pagination
            product_data = product.to_dict()
            product_id = product.id
            brand = product_data.get("brand")
            product_name = product_data.get("name")
            is_active = product_data.get("isActive")
            upc = product_data.get("upc")
            category_id = product_data.get("categoryId")
            has_all_establishments_access = product_data.get("hasAllEstablishmentAccess")

            if product_name:
                product_name_cl = clean_string(product_name)
            else:
                print(f"Warning: Product {product_id} for client {client_id} has no name.")
                product_name_cl = None

            ingredient_info_list = []
            ingredient_info_ref = product.reference.collection("ingredientInfo")
            for ingredient_info in ingredient_info_ref.stream():
                ingredient_info_data = ingredient_info.to_dict()
                unit_equivalence_list = []
                volume_equivalence_list = []
                weight_equivalence_list = []

                unit_equivalence_ref = ingredient_info.reference.collection("unitEquivalence")
                for unit_equivalence in unit_equivalence_ref.stream():
                    unit_equivalence_list.append(unit_equivalence.to_dict())

                volume_equivalence_ref = ingredient_info.reference.collection("volumeEquivalence")
                for volume_equivalence in volume_equivalence_ref.stream():
                    volume_equivalence_list.append(volume_equivalence.to_dict())

                weight_equivalence_ref = ingredient_info.reference.collection("weightEquivalence")
                for weight_equivalence in weight_equivalence_ref.stream():
                    weight_equivalence_list.append(weight_equivalence.to_dict())

                ingredient_info_list.append({
                    **ingredient_info_data,
                    "unitEquivalence": unit_equivalence_list,
                    "volumeEquivalence": volume_equivalence_list,
                    "weightEquivalence": weight_equivalence_list
                })

            all_product_data.append({
                "client_id": client_id,
                "product_id": product_id,
                "brand": brand,
                "product_name": product_name_cl,
                "is_active": is_active,
                "upc": upc,
                "category_id": category_id,
                "has_all_establishments_access": has_all_establishments_access,
                "ingredientInfo": ingredient_info_list,
            })


        print(f"extract_product_data({client_id}) returned {len(all_product_data)} products.")
        return all_product_data, last_product  # Return data and the last document

    except Exception as e:
        print(f"Error extracting product data: {e}")
        return None, None

def extract_products(csv_filepath, output_filepath):
    try:
        with open(csv_filepath, 'r') as csvfile:
            reader = csv.reader(csvfile)
            next(reader, None)  # skip the header
            client_ids = [row[0] for row in reader] # List comprehension for efficiency

    except FileNotFoundError:
        print(f"Error: CSV file not found at {csv_filepath}")
        return
    except Exception as e:
        print(f"Error reading CSV: {e}")
        return

    df_data = []

    for client_id in client_ids:
        all_products = []
        next_page_token = None
        while True:
            product_data, last_doc = extract_product_data(client_id, page_size=3000, start_after=next_page_token)
            if product_data is None:  # Error handling
                print(f"Error retrieving products for client {client_id}")
                break  # Exit the inner loop if there's an error
            
            all_products.extend(product_data)

            if len(product_data) < 3000:
                break
            next_page_token = last_doc

        if all_products:
            for product in all_products:
                for ingredient in product.get("ingredientInfo", []): # Handle missing ingredientInfo
                    for unit_eq in ingredient.get("unitEquivalence", []):
                        for weight_eq in ingredient.get("weightEquivalence", []):
                            for vol_eq in ingredient.get("volumeEquivalence", []):
                                df_data.append({
                                    "product_id": product["product_id"],
                                    "brand": product.get("brand"),  # Handle missing brand
                                    "product_name": product.get("product_name"), # Handle missing product_name
                                    "is_active": product.get("is_active"), # Handle missing is_active
                                    "upc": product.get("upc"), # Handle missing upc
                                    "category_id": product.get("category_id"), # Handle missing category_id
                                    "has_all_establishments_access": product.get("has_all_establishments_access"), # Handle missing has_all_establishments_access
                                    "ingredient_is_active": ingredient.get("isActive"),
                                    "ingredient_is_unit": ingredient.get("isUnit"),
                                    "ingredient_is_volume": ingredient.get("isVolume"),
                                    "unit_eq_abbreviation": unit_eq.get("abbreviation"),
                                    "unit_eq_is_active": unit_eq.get("isActive"),
                                    "unit_eq_measure_type": unit_eq.get("measureType"),
                                    "unit_eq_measure_name": unit_eq.get("name"),
                                    "weight_eq_measure_id": weight_eq.get("measureId"),
                                    "weight_eq_qty_of_measure": weight_eq.get("qtyOfMeasure"),
                                    "vol_eq_measure_id": vol_eq.get("measureId"),
                                    "vol_eq_qty_of_measure": vol_eq.get("qtyOfMeasure")
                                })
        else:
            print(f"No product data found for client ID: {client_id}")

    if df_data: # Check if df_data is not empty
        df = pd.DataFrame(df_data)

        try:
            # Convert 'name' column to string type to handle potential mixed types
            df['product_name'] = df['product_name'].astype(str)
            # Replace multiple spaces with a single space and then strip leading/trailing spaces
            df['product_name'] = df['product_name'].str.replace(r'\s+', ' ', regex=True).str.strip() 
            # Filter out rows where 'name' is empty after cleaning (meaning it was all spaces) or purely numeric
            df = df[(df['product_name'] != '') & (~df['product_name'].str.isnumeric())]      
            df.to_csv(output_filepath, index=False, encoding='utf-8')
            print(f"Products saved to {output_filepath}")
        except Exception as e:
            print(f"Error saving to CSV: {e}")
    else:
        print("No product data to save.")

# Example usage:
csv_file = "octogone_clients_cleaned.csv"
output_file = "octogone_products_with_measures.csv"
extract_products(csv_file, output_file)