In [80]:
# IMPORT LIBRARIES

import os
import re
import pandas as pd
from google.cloud import bigquery

In [81]:
# SET CREDENTIALS
#
# Configures Google credentials, the target project/dataset and the BigQuery
# client used by every query cell below.

# Fallback service-account key. It is applied only when
# GOOGLE_APPLICATION_CREDENTIALS is not already set, so the notebook also runs
# on machines where the key lives elsewhere (export the variable before
# starting Jupyter instead of editing this path).
DEFAULT_CREDENTIALS_PATH = "/Users/dariaserbichenko/code/DariaSerb/key-gcp/trash-optimizer-479913-91e59ecc96c9.json"
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", DEFAULT_CREDENTIALS_PATH)

PROJECT = "trash-optimizer-479913"
DATASET = "nantes"
client = bigquery.Client(project=PROJECT)

print("="*60)
print("Creating optimized trash collection points table")
print("="*60)

Creating optimized trash collection points table


In [82]:
# 1ST QUERY FOR ALIMENTARY GARBAGE (FOOD WASTE)

# Builds `query1`, the BigQuery SQL that reads `alimentary_garbage_clean` from
# the configured PROJECT/DATASET and shapes it into the shared collection-point
# layout: ID, Name, Address, Longitude, Latitude and one 0/1 `Is_*_enabled`
# flag per waste type.
#
# - Only Is_Food_enabled is 1; every other flag is the constant 0.
# - Name is 'Food Waste - <commune>' ('Nantes' when commune is NULL); a NULL
#   adresse becomes 'Address not specified'.
# - Rows with a NULL lat or lon are excluded.
# The query is only defined here; it is executed in the next cell.

print("="*60)
print("\n1. Querying alimentary garbage (food waste)")
print("="*60)

query1 = f"""
SELECT
  ROW_NUMBER() OVER () as ID,
  CONCAT('Food Waste - ', COALESCE(commune, 'Nantes')) as Name,
  COALESCE(adresse, 'Address not specified') as Address,
  lon as Longitude,
  lat as Latitude,
  0 as Is_Cardboard_enabled,
  1 as Is_Food_enabled,
  0 as Is_Glass_enabled,
  0 as Is_Metal_enabled,
  0 as Is_Paper_enabled,
  0 as Is_Wood_enabled,
  0 as Is_Plastic_enabled,
  0 as Is_Textile_enabled,
  0 as Is_Vegetation_enabled,
  0 as Is_Neon_enabled,
  0 as Is_Cartridge_enabled,
  0 as Is_Lamp_Light_enabled,
  0 as Is_Pile_enabled,
  0 as Is_Battery_enabled,
  0 as Is_Car_Battery_enabled,
  0 as Is_Miscellanous_Trash_enabled,
  0 as Is_Pharmacy_enabled,
  0 as Is_Tire_enabled,
  0 as Is_Ressourcerie_enabled
FROM `{PROJECT}.{DATASET}.alimentary_garbage_clean`
WHERE lat IS NOT NULL AND lon IS NOT NULL
"""


1. Querying alimentary garbage (food waste)


In [83]:
# Execute the food-waste query. Any failure is reported and replaced by an
# empty frame so the later combine step can simply skip this source.
try:
    df1 = client.query(query1).to_dataframe()
except Exception as e:
    print(f"Error: {e}")
    df1 = pd.DataFrame()
else:
    print(f"Retrieved {len(df1):,} food waste locations")

Retrieved 1,644 food waste locations


In [86]:
# 2ND QUERY FOR ECOPOINTS WITH ACTUAL COLUMNS
#
# Builds `query2`, the BigQuery SQL that reads the `ecopoints` table and shapes
# it into the shared collection-point layout.
#
# - IDs are ROW_NUMBER() + 10000.
# - Name is 'Recycling Center - ' followed by nom, else commune, else 'Ecopoint'.
# - A flag is 1 only when the matching source column equals 'OUI'
#   (case-insensitive via UPPER); any other value, including NULL, gives 0.
# - `batterie` feeds Is_Car_Battery_enabled; this query emits no
#   Is_Battery_enabled column.
# - Food, Plastic, Lamp_Light, Miscellanous_Trash, Pharmacy and Ressourcerie
#   are hard-coded to 0.
# - Rows with a NULL lat or lon are excluded.

print("="*60)
print("\n2. Querying ecopoints with actual columns")
print("="*60)

# From inspection: columns are ['bois', 'carton', 'ferraille', 'cartouche', 'neon', 'papier', 'textile', 'verre' etc.]

query2 = f"""
SELECT
  ROW_NUMBER() OVER () + 10000 as ID,
  CONCAT('Recycling Center - ', COALESCE(nom, commune, 'Ecopoint')) as Name,
  COALESCE(adresse, 'Address not specified') as Address,
  lon as Longitude,
  lat as Latitude,

  -- Use actual columns found

  CASE WHEN UPPER(carton) = 'OUI' THEN 1 ELSE 0 END as Is_Cardboard_enabled,
  0 as Is_Food_enabled,     -- 0 in food waste column
  CASE WHEN UPPER(verre) = 'OUI' THEN 1 ELSE 0 END as Is_Glass_enabled,
  CASE WHEN UPPER(ferraille) = 'OUI' THEN 1 ELSE 0 END as Is_Metal_enabled,
  CASE WHEN UPPER(papier) = 'OUI' THEN 1 ELSE 0 END as Is_Paper_enabled,
  CASE WHEN UPPER(bois) = 'OUI' THEN 1 ELSE 0 END as Is_Wood_enabled,
  0 as Is_Plastic_enabled,  -- 0 in plastique column
  CASE WHEN UPPER(textile) = 'OUI' THEN 1 ELSE 0 END as Is_Textile_enabled,
  CASE WHEN UPPER(dechet_vert) = 'OUI' THEN 1 ELSE 0 END as Is_Vegetation_enabled,
  CASE WHEN UPPER(neon) = 'OUI' THEN 1 ELSE 0 END as Is_Neon_enabled,
  CASE WHEN UPPER(cartouche) = 'OUI' THEN 1 ELSE 0 END as Is_Cartridge_enabled,
  0 as Is_Lamp_Light_enabled,  -- 0 in ampoule column
  CASE WHEN UPPER(pile) = 'OUI' THEN 1 ELSE 0 END as Is_Pile_enabled,
  CASE WHEN UPPER(batterie) = 'OUI' THEN 1 ELSE 0 END as Is_Car_Battery_enabled,
  0 as Is_Miscellanous_Trash_enabled,  -- 0 in divers column
  0 as Is_Pharmacy_enabled,            -- 0 in pharmacie column
  CASE WHEN UPPER(pneus) = 'OUI' THEN 1 ELSE 0 END as Is_Tire_enabled,
  0 as Is_Ressourcerie_enabled         -- 0 in ressourcerie column

FROM `{PROJECT}.{DATASET}.ecopoints`
WHERE lat IS NOT NULL AND lon IS NOT NULL
"""


2. Querying ecopoints with actual columns


In [87]:
# Execute the ecopoints query and report how many centers enable each waste
# flag. Any failure (query or report) is printed and leaves df2 empty.
try:
    df2 = client.query(query2).to_dataframe()
    center_total = len(df2)
    print(f"Retrieved {center_total} recycling centers with actual waste types")

    # Per capability flag, count the centers that accept that waste type
    waste_cols = [name for name in df2.columns if name.startswith('Is_')]
    print("- Waste acceptance in recycling centers:")
    for flag in waste_cols:
        accepted = df2[flag].sum()
        if not accepted > 0:
            continue  # flags that no center enables are left out of the report
        label = flag.replace('Is_', '').replace('_enabled', '')
        label = label.replace('_', ' ').title()
        print(f"   {label}: {accepted}/{center_total} locations")

except Exception as e:
    print(f"Error: {e}")
    df2 = pd.DataFrame()

Retrieved 15 recycling centers with actual waste types
- Waste acceptance in recycling centers:
   Cardboard: 15/15 locations
   Glass: 14/15 locations
   Metal: 14/15 locations
   Paper: 15/15 locations
   Wood: 15/15 locations
   Textile: 9/15 locations
   Vegetation: 14/15 locations
   Neon: 8/15 locations
   Cartridge: 15/15 locations
   Pile: 13/15 locations
   Car Battery: 14/15 locations
   Tire: 2/15 locations


In [None]:
# 3RD QUERY FOR GLASS COLLECTION COLUMNS (TAKE VERRE ONLY)
# Using "localisation des colonnes d’apports volontaires de Nantes Métropole"
#
# Builds `query3` (glass-only rows of the drop-off table), executes it into
# `df3` and prints a short summary. If the query fails, the available
# type_dechet values are listed for debugging and `df3` is left empty.

print("="*60)
print("\n3. Querying glass collection columns (Verre only) using Nantes Métropole data")
print("="*60)

# Name = 'Drop-off points - <column type> - <commune>'.
# The column-type CASE has no "ELSE ''" branch on purpose: a NULL type_colonne
# must yield NULL so that COALESCE can fall back to 'Glass Collection'.
query3 = f"""
SELECT
  ROW_NUMBER() OVER () + 30000 as ID,  # Start from 30000 for glass columns
  CONCAT(
    'Drop-off points - ',
    COALESCE(
      CASE type_colonne
        WHEN 'colonne enterrée' THEN 'Underground'
        WHEN 'colonne aérienne' THEN 'Above-ground'
        ELSE INITCAP(type_colonne)
      END,
      'Glass Collection'
    ),
    CASE
      WHEN commune IS NOT NULL THEN CONCAT(' - ', commune)
      ELSE ' - Nantes'
    END
  ) as Name,
  COALESCE(adresse, 'Nantes Métropole') as Address,
  lat as Latitude,
  lon as Longitude,

  -- Waste type capabilities: ONLY GLASS ENABLED
  0 as Is_Cardboard_enabled,
  0 as Is_Food_enabled,
  1 as Is_Glass_enabled, -- Only glass collection points
  0 as Is_Metal_enabled,
  0 as Is_Paper_enabled,
  0 as Is_Wood_enabled,
  0 as Is_Plastic_enabled,
  0 as Is_Textile_enabled,
  0 as Is_Vegetation_enabled,
  0 as Is_Neon_enabled,
  0 as Is_Cartridge_enabled,
  0 as Is_Lamp_Light_enabled,
  0 as Is_Pile_enabled,
  0 as Is_Battery_enabled,
  0 as Is_Car_Battery_enabled,
  0 as Is_Miscellanous_Trash_enabled,
  0 as Is_Pharmacy_enabled,
  0 as Is_Tire_enabled,
  0 as Is_Ressourcerie_enabled

FROM `{PROJECT}.{DATASET}.location_dropoff_points_nantes`
WHERE
  lat IS NOT NULL
  AND lon IS NOT NULL
  AND LOWER(TRIM(type_dechet)) = 'verre'  # Only glass collection points
"""

# Only the query itself decides whether df3 is kept: a problem while printing
# the summary must not throw away rows that were fetched successfully.
try:
    df3 = client.query(query3).to_dataframe()
except Exception as e:
    print(f"Error querying glass columns: {e}")

    # Debug: Check what types of waste exist in the table
    print("Debug: Checking available waste types in the table...")
    try:
        debug_query = f"""
        SELECT
          type_dechet,
          COUNT(*) as count
        FROM `{PROJECT}.{DATASET}.location_dropoff_points_nantes`
        WHERE type_dechet IS NOT NULL
        GROUP BY type_dechet
        ORDER BY count DESC
        LIMIT 10
        """
        waste_types = client.query(debug_query).to_dataframe()
        print("Available waste types in table:")
        print(waste_types.to_string(index=False))
    except Exception as debug_error:
        print(f"Could not check waste types: {debug_error}")

    df3 = pd.DataFrame()
else:
    print(f"Retrieved {len(df3):,} glass collection columns")
    try:
        # Show summary
        print("="*60)
        print("GLASS COLUMNS SUMMARY:")
        print("="*60)
        print(f"  Total glass columns: {len(df3):,}")

        # Check coordinate validity
        valid_coords = df3['Latitude'].notna().sum()
        print(f"  With valid coordinates: {valid_coords:,}")

        # Show sample
        print("SAMPLE GLASS COLUMNS (first 3):")
        for i in range(min(3, len(df3))):
            row = df3.iloc[i]
            print(f"  {i+1}. {row['Name']}")
            print(f"     Address: {row['Address'][:60]}")
            print(f"     Location: ({row['Latitude']:.6f}, {row['Longitude']:.6f})")
            print(f"     Glass enabled: {'✓' if row['Is_Glass_enabled'] == 1 else '✗'}")

        # Show waste type summary
        print("WASTE TYPE ENABLEMENT (should be Glass only):")
        waste_cols = [col for col in df3.columns if col.startswith('Is_')]
        for col in waste_cols:
            count = df3[col].sum()
            if count > 0:
                waste_name = col.replace('Is_', '').replace('_enabled', '').replace('_', ' ').title()
                print(f"  {waste_name}: {count:,}/{len(df3):,} ({count/len(df3)*100:.1f}%)")
    except Exception as report_error:
        # Reporting is informational only; df3 stays available for the merge.
        print(f"Warning: could not summarise glass columns: {report_error}")


3. Querying glass collection columns (Verre only)
Retrieved 1,079 glass collection columns
GLASS COLUMNS SUMMARY:
  Total glass columns: 1,079
  With valid coordinates: 1,079
SAMPLE GLASS COLUMNS (first 3):
  1. Drop-off points - Underground - Nantes
     Address: Rue de la petite Sensive
     Location: (47.260437, -1.561580)
     Glass enabled: ✓
  2. Drop-off points - Underground - Nantes
     Address: Rue Blaise Pascal
     Location: (47.256204, -1.566761)
     Glass enabled: ✓
  3. Drop-off points - Underground - Nantes
     Address: 2 Rue de Concarneau
     Location: (47.264360, -1.578521)
     Glass enabled: ✓
WASTE TYPE ENABLEMENT (should be Glass only):
  Glass: 1,079/1,079 (100.0%)


In [73]:
# 4TH QUERY FOR NON-GLASS DROP-OFF COLUMNS
# Reads the same drop-off table as query 3 and derives the point name and the
# capability flags from type_dechet (paper/cardboard, recyclable waste,
# household waste).

print("="*60)
print("\n4. Querying non-glass waste columns with waste type names")
print("="*60)

# SQL predicates for the three waste families. They are defined once so the
# ID offset, the name, the flags, the filter and the ordering cannot drift apart.
WASTE_TYPE_SQL = "LOWER(TRIM(type_dechet))"
IS_PAPER_CARDBOARD_SQL = f"{WASTE_TYPE_SQL} LIKE '%papier%carton%'"
IS_RECYCLABLE_SQL = f"{WASTE_TYPE_SQL} LIKE '%déchet recyclable%'"
IS_HOUSEHOLD_SQL = f"{WASTE_TYPE_SQL} LIKE '%ordure ménagère%'"

query4 = f"""
SELECT
  ROW_NUMBER() OVER () +
  CASE
    WHEN {IS_PAPER_CARDBOARD_SQL} THEN 40000
    WHEN {IS_RECYCLABLE_SQL} THEN 50000
    WHEN {IS_HOUSEHOLD_SQL} THEN 60000
    ELSE 70000
  END as ID,

  CONCAT(
    CASE
      WHEN {IS_PAPER_CARDBOARD_SQL} THEN 'Paper/Cardboard'
      WHEN {IS_RECYCLABLE_SQL} THEN 'Recyclable Waste'
      WHEN {IS_HOUSEHOLD_SQL} THEN 'Household Waste'
      ELSE INITCAP(type_dechet)
    END,
    ' Drop-off Point - ',
    COALESCE(commune, 'Nantes'),
    CASE
      WHEN type_colonne IS NOT NULL THEN CONCAT(' (',
        CASE type_colonne
          WHEN 'colonne enterrée' THEN 'Underground'
          WHEN 'colonne aérienne' THEN 'Above-ground'
          ELSE INITCAP(type_colonne)
        END, ')')
      ELSE ''
    END
  ) as Name,

  COALESCE(adresse, 'Nantes Métropole') as Address,
  lat as Latitude,
  lon as Longitude,

  -- Cardboard and paper: paper/cardboard columns and recyclable-waste columns
  CASE WHEN {IS_PAPER_CARDBOARD_SQL} OR {IS_RECYCLABLE_SQL} THEN 1 ELSE 0 END as Is_Cardboard_enabled,
  CASE WHEN {IS_PAPER_CARDBOARD_SQL} OR {IS_RECYCLABLE_SQL} THEN 1 ELSE 0 END as Is_Paper_enabled,

  -- Metal and plastic: recyclable-waste columns only
  CASE WHEN {IS_RECYCLABLE_SQL} THEN 1 ELSE 0 END as Is_Metal_enabled,
  CASE WHEN {IS_RECYCLABLE_SQL} THEN 1 ELSE 0 END as Is_Plastic_enabled,

  -- Glass: only when the waste type itself mentions glass
  CASE WHEN {WASTE_TYPE_SQL} LIKE '%verre%' THEN 1 ELSE 0 END as Is_Glass_enabled,

  -- Household-waste columns are reported as miscellaneous trash
  CASE WHEN {IS_HOUSEHOLD_SQL} THEN 1 ELSE 0 END as Is_Miscellanous_Trash_enabled,

  -- Never enabled for these columns
  0 as Is_Food_enabled,
  0 as Is_Textile_enabled,
  0 as Is_Vegetation_enabled,
  0 as Is_Neon_enabled,
  0 as Is_Cartridge_enabled,
  0 as Is_Lamp_Light_enabled,

  type_dechet as Original_Waste_Type,
  type_colonne as Original_Column_Type,
  commune as Commune

FROM `{PROJECT}.{DATASET}.location_dropoff_points_nantes`
WHERE
  lat IS NOT NULL
  AND lon IS NOT NULL
  AND (
    {IS_PAPER_CARDBOARD_SQL}
    OR {IS_RECYCLABLE_SQL}
    OR {IS_HOUSEHOLD_SQL}
  )
ORDER BY
  CASE
    WHEN {IS_PAPER_CARDBOARD_SQL} THEN 1
    WHEN {IS_RECYCLABLE_SQL} THEN 2
    WHEN {IS_HOUSEHOLD_SQL} THEN 3
    ELSE 4
  END,
  commune
"""


def _waste_category(original_type):
    """Map a raw type_dechet value to the English category used in the report."""
    text = str(original_type).lower()
    if 'papier' in text and 'carton' in text:
        return 'Paper/Cardboard'
    if 'déchet recyclable' in text:
        return 'Recyclable Waste'
    if 'ordure ménagère' in text:
        return 'Household Waste'
    return 'Other'


def _flag_label(flag_col):
    """Turn a flag column name such as 'Is_Car_Battery_enabled' into 'Car Battery'."""
    return flag_col.replace('Is_', '').replace('_enabled', '').replace('_', ' ').title()


# Only a failing query empties df4. The earlier "exact match" retry was dropped:
# it returned at most 100 rows with every row flagged as Cardboard + Paper, and
# it also ran when merely the report below raised, so wrong data could reach the
# combined table. An empty frame matches what the other query cells do.
try:
    df4 = client.query(query4).to_dataframe()
except Exception as e:
    print(f"Error: {e}")
    df4 = pd.DataFrame()
else:
    print(f"Retrieved {len(df4):,} non-glass waste columns")
    try:
        # Show breakdown
        print("BREAKDOWN BY WASTE TYPE:")
        df4['Waste_Category'] = df4['Original_Waste_Type'].apply(_waste_category)
        for waste_type, count in df4['Waste_Category'].value_counts().items():
            percentage = (count / len(df4)) * 100
            print(f"  {waste_type}: {count:,} columns ({percentage:.1f}%)")

        # Show what each type accepts (first row of a category is representative)
        print("CAPABILITIES BY WASTE TYPE:")
        report_flags = [
            c for c in df4.columns
            if c.startswith('Is_')
            and c not in ['Is_Neon_enabled', 'Is_Cartridge_enabled', 'Is_Lamp_Light_enabled']
        ]
        for category in df4['Waste_Category'].unique():
            subset = df4[df4['Waste_Category'] == category]
            if len(subset) == 0:
                continue
            print(f"\n  {category} columns accept:")
            enabled_types = [_flag_label(c) for c in report_flags if subset[c].iloc[0] == 1]
            if enabled_types:
                print(f"    ✓ {', '.join(enabled_types)}")
            else:
                print(f"    ✗ No specific waste types enabled")

        # Show samples: first row of each distinct original type, at most 3
        print("SAMPLES (one of each type):")
        sample_shown = set()
        for _, row in df4.iterrows():
            waste_type = row['Original_Waste_Type']
            if waste_type in sample_shown:
                continue
            sample_shown.add(waste_type)

            print(f"\n  {row.get('Waste_Category', waste_type)}:")
            print(f"    Name: {row['Name']}")
            print(f"    Original Type: {row['Original_Waste_Type']}")
            print(f"    Location: {row['Commune']}")
            print(f"    Coordinates: ({row['Latitude']:.6f}, {row['Longitude']:.6f})")

            enabled = [_flag_label(c) for c in row.index if c.startswith('Is_') and row[c] == 1]
            if enabled:
                print(f"    Accepts: {', '.join(enabled)}")

            if len(sample_shown) >= 3:
                break

        # Show distribution by commune
        print("DISTRIBUTION BY COMMUNE (top 5):")
        for commune, count in df4['Commune'].value_counts().head(5).items():
            print(f"  {commune}: {count:,} columns")
    except Exception as report_error:
        # Reporting is informational only; df4 stays available for the merge.
        print(f"Warning: could not summarise non-glass columns: {report_error}")


4. Querying non-glass waste columns with waste type names
Retrieved 1,490 non-glass waste columns
BREAKDOWN BY WASTE TYPE:
  Household Waste: 843 columns (56.6%)
  Recyclable Waste: 564 columns (37.9%)
  Paper/Cardboard: 83 columns (5.6%)
CAPABILITIES BY WASTE TYPE:

  Paper/Cardboard columns accept:
    ✓ Cardboard, Paper

  Recyclable Waste columns accept:
    ✓ Cardboard, Metal, Paper, Plastic

  Household Waste columns accept:
    ✓ Miscellanous Trash
SAMPLES (one of each type):

  Paper/Cardboard:
    Name: Paper/Cardboard Drop-off Point - Nantes (Above-ground)
    Original Type: Papier-carton
    Location: Nantes
    Coordinates: (47.229835, -1.519756)

  Recyclable Waste:
    Name: Recyclable Waste Drop-off Point - Basse-Goulaine (Underground)
    Original Type: Déchet recyclable
    Location: Basse-Goulaine
    Coordinates: (47.208462, -1.466821)

  Household Waste:
    Name: Household Waste Drop-off Point - Basse-Goulaine (Underground)
    Original Type: Ordure ménagère

In [76]:
# 5TH QUERY FOR ECOSYSTEM POINTS
#
# Reads the already-shaped ecosystem table: ID, name, address, coordinates and
# the six flags it carries (Neon, Cartridge, Lamp_Light, Battery, Car_Battery,
# Pile). Rows lacking a coordinate are excluded.

for banner_line in ("="*60, "\n5. Querying ecosystem collection points", "="*60):
    print(banner_line)

query5 = f"""
SELECT
  ID,
  Name,
  Address,
  Longitude,
  Latitude,
  Is_Neon_enabled,
  Is_Cartridge_enabled,
  Is_Lamp_Light_enabled,
  Is_Battery_enabled,
  Is_Car_Battery_enabled,
  Is_Pile_enabled
FROM `{PROJECT}.{DATASET}.ecosystem_collection_points_with_coords`
WHERE Latitude IS NOT NULL AND Longitude IS NOT NULL
"""

# A failed query is reported and replaced by an empty frame.
try:
    df5 = client.query(query5).to_dataframe()
except Exception as e:
    print(f"Error retrieving ecosystem points: {e}")
    df5 = pd.DataFrame()
else:
    print(f"Retrieved {len(df5):,} Ecosystem collection points")


5. Querying ecosystem collection points
Retrieved 110 Ecosystem collection points


In [None]:
# COMBINE AND CREATE CURRENT TABLE WITH ECOSYSTEM POINTS
#
# Collects the query results (df1..df5) that exist in the notebook namespace
# and are non-empty. `all_dataframes` holds the frames and `available_dfs`
# their names, in the same order.

print("="*60)
print("CREATING CURRENT TRASH COLLECTION POINTS TABLE WITH ECOSYSTEM POINTS")
print("="*60)

# Initialize variables
all_dataframes = []
available_dfs = []

# Check which dataframes exist and are not empty
dataframe_names = ['df1', 'df2', 'df3', 'df4', 'df5']

for df_name in dataframe_names:
    # One namespace lookup; a missing name simply yields None
    candidate = globals().get(df_name)
    if not isinstance(candidate, pd.DataFrame):
        print(f"⚠️ {df_name}: Not found or not a DataFrame")
    elif candidate.empty:
        print(f"⚠️ {df_name}: Empty dataframe")
    else:
        all_dataframes.append(candidate)
        available_dfs.append(df_name)
        print(f"✓ {df_name}: {len(candidate):,} points")

# Merge the collected frames into one table with a fixed column layout, save a
# CSV backup and upload the result to BigQuery (with a pandas_gbq retry).
# Defines: combined_df, total_locations, output_csv, table_id.
if all_dataframes:
    # Show what we're working with
    print(f"\n📊 DATASET COMPOSITION:")
    print(f"  Number of dataframes to combine: {len(all_dataframes)}")

    for i, (df_name, df) in enumerate(zip(available_dfs, all_dataframes), 1):
        print(f"  {df_name}: {len(df)} rows, {len(df.columns)} columns")
        if i == 1:  # Show first few column names for first dataframe
            print(f"    Columns: {df.columns.tolist()[:10]}...")

    # Combine all data; columns missing from a source become NaN for its rows
    combined_df = pd.concat(all_dataframes, ignore_index=True, sort=False)
    print(f"\n✅ Combined dataset: {len(combined_df)} rows, {len(combined_df.columns)} columns")

    # Source IDs use per-source offsets; replace them with one sequential ID
    combined_df['ID'] = range(1, len(combined_df) + 1)

    # Final structure. Wood, Pile, Battery and Car_Battery are included because
    # the source queries produce them; leaving them out silently discarded
    # those capabilities from the uploaded table.
    final_columns = [
        'ID', 'Name', 'Address', 'Longitude', 'Latitude',
        'Is_Cardboard_enabled', 'Is_Food_enabled', 'Is_Glass_enabled',
        'Is_Metal_enabled', 'Is_Paper_enabled', 'Is_Wood_enabled',
        'Is_Plastic_enabled', 'Is_Textile_enabled', 'Is_Vegetation_enabled',
        'Is_Neon_enabled', 'Is_Cartridge_enabled', 'Is_Lamp_Light_enabled',
        'Is_Pile_enabled', 'Is_Battery_enabled', 'Is_Car_Battery_enabled',
        'Is_Miscellanous_Trash_enabled', 'Is_Pharmacy_enabled',
        'Is_Tire_enabled', 'Is_Ressourcerie_enabled'
    ]

    print(f"\n🔄 STANDARDIZING COLUMNS...")
    print(f"  Target columns: {len(final_columns)}")

    # Ensure all columns exist (add missing ones with a neutral default)
    columns_added = []
    for col in final_columns:
        if col in combined_df.columns:
            continue
        if col.startswith('Is_'):
            combined_df[col] = 0
        elif col in ('Longitude', 'Latitude'):
            combined_df[col] = None
        else:
            combined_df[col] = ''
        columns_added.append(col)

    if columns_added:
        print(f"  Added columns: {columns_added}")

    # Standardize coordinate column names
    print(f"\n📍 STANDARDIZING COORDINATES...")

    # Alternate names are matched exactly (case-insensitive). Substring
    # matching would also hit unrelated columns whose name merely contains
    # 'lat' or 'lon'.
    coordinate_aliases = {
        'Latitude': {'lat', 'latitude'},
        'Longitude': {'lon', 'lng', 'long', 'longitude'},
    }
    coord_mapping = {}
    for col in combined_df.columns:
        col_lower = str(col).strip().lower()
        for target, aliases in coordinate_aliases.items():
            if col != target and col_lower in aliases:
                coord_mapping.setdefault(target, []).append(col)

    if coord_mapping:
        print(f"  Found alternate coordinate columns: {coord_mapping}")

        # Fill only the missing values: overwriting the whole column would
        # erase coordinates of sources that already use the standard names.
        for target, alias_cols in coord_mapping.items():
            for alias in alias_cols:
                print(f"  Filling missing {target} from {alias}")
                combined_df[target] = combined_df[target].where(
                    combined_df[target].notna(), combined_df[alias]
                )

        # Drop the alternate columns
        drop_cols = [alias for alias_cols in coord_mapping.values() for alias in alias_cols]
        combined_df = combined_df.drop(columns=drop_cols, errors='ignore')
        print(f"  Dropped alternate columns: {drop_cols}")

    # Convert to proper types
    print(f"\n🔄 CONVERTING DATA TYPES...")

    # Flags: anything missing or non-numeric counts as "not enabled" (0)
    waste_cols = [col for col in combined_df.columns if col.startswith('Is_') and col.endswith('_enabled')]
    for col in waste_cols:
        initial_nulls = combined_df[col].isna().sum()
        combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce').fillna(0).astype(int)
        final_nulls = combined_df[col].isna().sum()
        print(f"  ✓ {col}: {initial_nulls} nulls → {final_nulls} nulls")

    # Coordinates: non-numeric values become NaN
    coord_conversion = {}
    for col in ['Longitude', 'Latitude']:
        if col in combined_df.columns:
            combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')
            valid_count = combined_df[col].notna().sum()
            total_count = len(combined_df)
            percentage = (valid_count / total_count * 100) if total_count > 0 else 0
            coord_conversion[col] = (valid_count, percentage)

    print(f"  Coordinate validity:")
    for col, (valid, pct) in coord_conversion.items():
        print(f"    {col}: {valid:,}/{len(combined_df):,} valid ({pct:.1f}%)")

    # Reorder columns - only keep final columns
    available_columns = [col for col in final_columns if col in combined_df.columns]
    missing_columns = [col for col in final_columns if col not in combined_df.columns]

    if missing_columns:
        print(f"\n⚠️ WARNING: Missing expected columns: {missing_columns}")

    combined_df = combined_df[available_columns]
    print(f"  Final columns: {len(available_columns)}")

    total_locations = len(combined_df)
    print(f"\n✅ CURRENT TABLE CREATED: {total_locations:,} total trash collection points")

    # Calculate statistics
    print("\n📈 CURRENT DATASET STATISTICS:")
    print("-" * 40)

    # Points with coordinates
    has_coords = combined_df['Latitude'].notna() & combined_df['Longitude'].notna()
    coord_count = has_coords.sum()
    coord_pct = (coord_count / total_locations * 100) if total_locations > 0 else 0
    print(f"  • Points with coordinates: {coord_count:,}/{total_locations:,} ({coord_pct:.1f}%)")

    # Points without coordinates (if any)
    no_coords = total_locations - coord_count
    if no_coords > 0:
        print(f"  • Points WITHOUT coordinates: {no_coords:,} ({no_coords/total_locations*100:.1f}%)")

    # Waste type coverage, largest first; flags nobody enables are omitted
    print(f"\n  • Waste type coverage:")

    waste_counts = []
    for col_name in combined_df.columns:
        if not (col_name.startswith('Is_') and col_name.endswith('_enabled')):
            continue
        count = int(combined_df[col_name].sum())
        if count > 0:
            waste_name = col_name.replace('Is_', '').replace('_enabled', '').replace('_', ' ').title()
            percentage = (count / total_locations) * 100 if total_locations > 0 else 0
            waste_counts.append((waste_name, col_name, count, percentage))

    waste_counts.sort(key=lambda x: x[2], reverse=True)

    for waste_name, col_name, count, percentage in waste_counts:
        print(f"    {waste_name:22s}: {count:6,d} points ({percentage:5.1f}%)")

    # Save to CSV for backup
    output_csv = "current_trash_collection_points.csv"
    try:
        combined_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
        print(f"\n💾 Data saved locally: '{output_csv}' ({len(combined_df):,} rows)")

        # Verify CSV was saved correctly
        csv_check = pd.read_csv(output_csv, nrows=5)
        print(f"  CSV verification: {len(csv_check)} rows read successfully")
    except Exception as e:
        print(f"\n❌ Error saving CSV: {e}")

    # UPLOAD TO BIGQUERY
    print(f"\n☁️ UPLOADING TO BIGQUERY...")

    # Use consistent table name
    table_id = f"{PROJECT}.{DATASET}.trash_collection_points"
    print(f"  Table: {table_id}")

    # Check if BigQuery client is available
    if 'client' not in globals():
        print("❌ BigQuery client not found. Creating local backup only.")
        print(f"   Data saved to: {output_csv}")
    else:
        try:
            # WRITE_TRUNCATE replaces the existing table contents
            job_config = bigquery.LoadJobConfig(
                write_disposition="WRITE_TRUNCATE",
                autodetect=True,
                max_bad_records=100
            )

            # Upload the dataframe
            print(f"  Uploading {len(combined_df):,} rows...")
            job = client.load_table_from_dataframe(combined_df, table_id, job_config=job_config)
            job.result()

            # Verify upload
            table = client.get_table(table_id)
            print(f"✅ BigQuery table created: {table_id}")
            print(f"   Rows: {table.num_rows:,}")
            print(f"   Size: {table.num_bytes / (1024*1024):.2f} MB")

            # max_bad_records lets the load succeed while skipping rows, so
            # make any difference visible instead of trusting the job status.
            if table.num_rows != len(combined_df):
                print(f"⚠️ Row count mismatch: uploaded {len(combined_df):,} rows, table has {table.num_rows:,}")

            # Run verification query
            print(f"\n🔍 VERIFICATION QUERY:")

            # Build dynamic verification query based on available columns
            select_parts = ["COUNT(*) as total_points"]

            # Add coordinate check
            select_parts.append("SUM(CASE WHEN Latitude IS NOT NULL AND Longitude IS NOT NULL THEN 1 ELSE 0 END) as points_with_coords")

            # Add checks for each waste type column
            waste_cols = [col for col in combined_df.columns if col.startswith('Is_') and col.endswith('_enabled')]
            for col in waste_cols:
                clean_name = col.replace('Is_', '').replace('_enabled', '')
                select_parts.append(f"SUM({col}) as {clean_name}_points")

            verify_query = f"""
            SELECT
              {', '.join(select_parts)}
            FROM `{table_id}`
            """

            try:
                result = client.query(verify_query).to_dataframe().iloc[0]
                print(f"  • Total points: {result['total_points']:,}")
                print(f"  • With coordinates: {result['points_with_coords']:,}")

                # Show top waste types by count
                print(f"\n  • Top waste type counts:")
                waste_counts = []
                for key, value in result.items():
                    if key.endswith('_points') and key != 'points_with_coords':
                        waste_name = key.replace('_points', '').replace('_', ' ').title()
                        waste_counts.append((waste_name, int(value)))

                # Sort by count descending and show top 10
                waste_counts.sort(key=lambda x: x[1], reverse=True)
                for waste_name, count in waste_counts[:10]:
                    if count > 0:
                        pct = (count / result['total_points'] * 100) if result['total_points'] > 0 else 0
                        print(f"     - {waste_name:18s}: {count:6,d} ({pct:5.1f}%)")

                if len(waste_counts) > 10:
                    print(f"     ... and {len(waste_counts) - 10} more waste types")

            except Exception as query_error:
                print(f"  ❌ Verification query failed: {query_error}")

        except Exception as upload_error:
            print(f"❌ BigQuery upload failed: {upload_error}")
            print(f"   Data saved locally: '{output_csv}'")

            # Try alternative method using pandas_gbq (optional dependency, so
            # it is imported here where a missing package is caught as well)
            try:
                print("\n🔄 Trying alternative CSV upload method...")
                import pandas_gbq

                pandas_gbq.to_gbq(
                    combined_df,
                    destination_table=table_id,
                    project_id=PROJECT,
                    if_exists='replace',
                    progress_bar=False
                )
                print("✅ Upload successful via pandas_gbq!")
            except Exception as e2:
                print(f"❌ Alternative also failed: {e2}")
                print("\n📋 Manual upload instructions:")
                print(f"   1. Go to BigQuery Console")
                print(f"   2. Select dataset: {DATASET}")
                print(f"   3. Create table 'trash_collection_points'")
                print(f"   4. Upload file: {output_csv}")
                print(f"   5. Enable schema autodetection")

else:
    print("❌ No data to combine!")
    print("   Check if df1, df2, df3, df4, df5 are defined and not empty")

# ---- Closing banner and end-of-run report for the combine/upload cell ----
separator = "=" * 60
print("\n" + separator)
print("üéØ UNIFIED TRASH COLLECTION DATABASE CREATION COMPLETE!")
print(separator)

# The report is printed only when `total_locations` exists in the namespace.
if 'total_locations' in locals():
    print("\nSUMMARY:")
    print(f"  ‚Ä¢ Total collection points: {total_locations:,}")
    print(f"  ‚Ä¢ Datasets combined: {len(available_dfs)}")

    # Optional artefacts are mentioned only if they were created.
    if 'table_id' in locals():
        print(f"  ‚Ä¢ BigQuery table: {table_id}")
    if 'output_csv' in locals():
        print(f"  ‚Ä¢ Local backup: {output_csv}")

    print("\nüìä FINAL DATA QUALITY CHECK:")
    if 'combined_df' in locals():
        row_count = len(combined_df)
        print(f"  ‚Ä¢ Rows: {row_count:,}")
        print(f"  ‚Ä¢ Columns: {len(combined_df.columns):,}")

        # Null audit of the columns every collection point needs.
        critical_cols = ['Name', 'Latitude', 'Longitude']
        for col in (c for c in critical_cols if c in combined_df.columns):
            null_count = combined_df[col].isna().sum()
            if null_count != 0:
                pct = (null_count / row_count) * 100
                print(f"  ‚Ä¢ ‚ö†Ô∏è {col}: {null_count:,} nulls ({pct:.1f}%)")
            else:
                print(f"  ‚Ä¢ ‚úÖ {col}: No null values")

        # Mean number of enabled waste flags per point, over every `Is_*` column.
        waste_cols = [col for col in combined_df.columns if col.startswith('Is_')]
        total_waste_points = sum(combined_df[flag].sum() for flag in waste_cols)
        avg_waste_types = total_waste_points / row_count if row_count > 0 else 0
        print(f"  ‚Ä¢ Average waste types per point: {avg_waste_types:.2f}")

        # IDs are expected to be unique; report how many rows share an ID otherwise.
        if 'ID' in combined_df.columns:
            unique_ids = combined_df['ID'].nunique()
            duplicate_ids = row_count - unique_ids
            if duplicate_ids == 0:
                print("  ‚Ä¢ ‚úÖ All IDs are unique")
            else:
                print(f"  ‚Ä¢ ‚ùå ID duplicates: {duplicate_ids}")

print("\n‚úÖ Process completed!")

In [90]:
# COMBINE AND CREATE CURRENT TABLE WITH ECOSYSTEM POINTS

banner = "=" * 60
print(banner)
print("CREATING CURRENT TRASH COLLECTION POINTS TABLE WITH ECOSYSTEM POINTS")
print(banner)

# Each source frame paired with the label used in its progress line.
labelled_sources = [
    (df1, "Food waste points"),
    (df2, "Recycling centers"),
    (df3, "Underground containers"),
    (df4, "Underground containers (other waste types)"),
    (df5, "Ecosystem special waste points"),
]

# Keep only the sources that actually returned rows, in df1..df5 order.
all_dataframes = []
for source_df, label in labelled_sources:
    if source_df.empty:
        continue
    all_dataframes.append(source_df)
    print(f"‚úì {label}: {len(source_df):,}")

if all_dataframes:
    # Stack every non-empty source frame; a column absent from one source is NaN for its rows.
    combined_df = pd.concat(all_dataframes, ignore_index=True, sort=False)

    # (summary label, sample heading, frame, list enabled waste types in the sample?)
    source_frames = [
        ("Food Waste", "Food Waste (df1)", df1, False),
        ("Recycling Centers", "Recycling Centers (df2)", df2, False),
        ("Glass Containers", "Glass Containers (df3)", df3, False),
        ("Other Waste Containers", "Other Waste Containers (df4)", df4, False),
        ("Ecosystem Points", "Ecosystem Points (df5)", df5, True),
    ]

    print(f"\nüìä DATASET COMPOSITION BEFORE PROCESSING:")
    dataset_sources = [
        f"{label}: {len(source_df):,}"
        for label, _heading, source_df, _show_flags in source_frames
        if not source_df.empty
    ]
    print("  " + " | ".join(dataset_sources))

    # Source IDs overlap (each query numbers its own rows), so renumber sequentially.
    combined_df['ID'] = range(1, len(combined_df) + 1)

    # Final column layout of the unified table.
    final_columns = [
        'ID', 'Name', 'Address', 'Longitude', 'Latitude',
        'Is_Cardboard_enabled', 'Is_Food_enabled', 'Is_Glass_enabled',
        'Is_Metal_enabled', 'Is_Paper_enabled', 'Is_Wood_enabled', 'Is_Plastic_enabled',
        'Is_Textile_enabled', 'Is_Vegetation_enabled', 'Is_Neon_enabled',
        'Is_Cartridge_enabled', 'Is_Lamp_Light_enabled', 'Is_Pile_enabled',
        'Is_Battery_enabled', 'Is_Car_Battery_enabled',
        'Is_Miscellanous_Trash_enabled', 'Is_Pharmacy_enabled',
        'Is_Tire_enabled', 'Is_Ressourcerie_enabled'
    ]

    # Ensure all flag and coordinate columns exist (add missing ones).
    for col in final_columns:
        if col not in combined_df.columns:
            if col.startswith('Is_'):
                combined_df[col] = 0
            elif col in ['Longitude', 'Latitude']:
                combined_df[col] = None

    # Some sources may expose coordinates as 'latitude'/'longitude' or 'lat'/'lon'.
    # Aliases are matched on the whole (case-insensitive) column name: a substring
    # test such as "'lat' in name" would also match unrelated columns
    # (e.g. 'population').
    print("\nüìç CHECKING COORDINATE COLUMNS...")
    coord_aliases = {
        'Latitude': ('latitude', 'lat'),
        'Longitude': ('longitude', 'lon', 'long'),
    }
    coord_columns = {
        target.lower(): [
            candidate for candidate in combined_df.columns
            if str(candidate).strip().lower() in aliases
        ]
        for target, aliases in coord_aliases.items()
    }
    print(f"Found coordinate columns: {coord_columns}")

    # Fill gaps in the canonical column from each alias instead of overwriting it,
    # so rows that already carry a canonical coordinate keep their value.
    for target in coord_aliases:
        for alias_col in coord_columns[target.lower()]:
            if alias_col == target:
                continue
            print(f"Copying {alias_col} to {target} column")
            canonical_values = pd.to_numeric(combined_df[target], errors='coerce')
            alias_values = pd.to_numeric(combined_df[alias_col], errors='coerce')
            combined_df[target] = canonical_values.fillna(alias_values)

    # Convert to proper types: flags become ints (missing -> 0), coordinates floats.
    print("\nüîÑ CONVERTING DATA TYPES...")
    for col in combined_df.columns:
        if col.startswith('Is_'):
            combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce').fillna(0).astype(int)
            print(f"  ‚úì Converted {col} to int")
        elif col in ['Longitude', 'Latitude']:
            combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')
            valid_coords = combined_df[col].notna().sum()
            print(f"  ‚úì Converted {col}: {valid_coords:,}/{len(combined_df):,} valid values")

    # Reorder columns - only keep columns that exist.
    available_columns = [col for col in final_columns if col in combined_df.columns]
    combined_df = combined_df[available_columns]

    total_locations = len(combined_df)
    print(f"\n‚úÖ CURRENT TABLE CREATED: {total_locations:,} total trash collection points")

    # Calculate statistics
    print("\nüìà CURRENT DATASET STATISTICS:")

    # Points with coordinates (total_locations > 0 here: only non-empty frames were combined).
    has_coords = combined_df['Latitude'].notna() & combined_df['Longitude'].notna()
    coord_count = has_coords.sum()
    print(f"  ‚Ä¢ Points with coordinates: {coord_count:,}/{total_locations:,} ({coord_count/total_locations*100:.1f}%)")

    # Waste type coverage: one (display label, flag column) pair per `Is_*` column
    # of final_columns, including Wood.
    print(f"  ‚Ä¢ Waste type coverage:")
    waste_types = [
        ('Cardboard', 'Is_Cardboard_enabled'),
        ('Food', 'Is_Food_enabled'),
        ('Glass', 'Is_Glass_enabled'),
        ('Metal', 'Is_Metal_enabled'),
        ('Paper', 'Is_Paper_enabled'),
        ('Wood', 'Is_Wood_enabled'),
        ('Plastic', 'Is_Plastic_enabled'),
        ('Textile', 'Is_Textile_enabled'),
        ('Vegetation', 'Is_Vegetation_enabled'),
        ('Neon', 'Is_Neon_enabled'),
        ('Cartridge', 'Is_Cartridge_enabled'),
        ('Lamp/Light', 'Is_Lamp_Light_enabled'),
        ('Pile', 'Is_Pile_enabled'),
        ('Battery', 'Is_Battery_enabled'),
        ('Car Battery', 'Is_Car_Battery_enabled'),
        ('Miscellanous Trash', 'Is_Miscellanous_Trash_enabled'),
        ('Pharmacy', 'Is_Pharmacy_enabled'),
        ('Tire', 'Is_Tire_enabled'),
        ('Ressourcerie', 'Is_Ressourcerie_enabled')
    ]

    for waste_name, col_name in waste_types:
        if col_name in combined_df.columns:
            count = combined_df[col_name].sum()
            if count > 0:
                percentage = (count / total_locations) * 100
                print(f"    {waste_name:18s}: {count:6,d} points ({percentage:5.1f}%)")

    # Show a couple of rows from each non-empty source.
    print(f"\nüëÄ SAMPLE FROM EACH SOURCE:")
    sample_size = 2

    for _label, heading, source_df, show_waste_types in source_frames:
        if source_df.empty:
            continue
        print(f"\n  {heading}:")
        for _row_index, row in source_df.head(sample_size).iterrows():
            # A missing or null Name must not break the preview: fall back to 'Unknown'.
            raw_name = row.get('Name')
            name = str(raw_name)[:40] if pd.notna(raw_name) else 'Unknown'

            waste_info = ""
            if show_waste_types:
                # List (at most two of) the waste types enabled for this point.
                enabled = [
                    waste_name
                    for waste_name, col_name in waste_types
                    if col_name in row and row[col_name] == 1
                ]
                waste_info = f" ({', '.join(enabled[:2])})" if enabled else ""
            print(f"    ‚Ä¢ {name}...{waste_info}")

    # Save to CSV for backup (utf-8-sig writes a BOM at the start of the file).
    output_csv = "current_trash_collection_points.csv"
    combined_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"\nüíæ Data saved locally: '{output_csv}'")

    # UPLOAD TO BIGQUERY
    print(f"\n‚òÅÔ∏è UPLOADING TO BIGQUERY...")

    # Destination table (spelled 'trash', not 'strash').
    table_id = f"{PROJECT}.{DATASET}.trash_collection_points"

    # WRITE_TRUNCATE replaces the table contents on every run.
    job_config = bigquery.LoadJobConfig(
        write_disposition="WRITE_TRUNCATE",
        autodetect=True,
        max_bad_records=100
    )

    try:
        job = client.load_table_from_dataframe(combined_df, table_id, job_config=job_config)
        job.result()  # block until the load job finishes (raises on failure)

        table = client.get_table(table_id)
        print(f"‚úÖ BigQuery table created: {table_id}")
        print(f"   Rows: {table.num_rows:,}")
        print(f"   Size: {table.num_bytes / (1024*1024):.2f} MB")

        # Verification query: read the counts back from the uploaded table.
        print(f"\nüîç VERIFICATION QUERY:")
        verify_query = f"""
        SELECT
          COUNT(*) as total_points,
          SUM(CASE WHEN Latitude IS NOT NULL AND Longitude IS NOT NULL THEN 1 ELSE 0 END) as points_with_coords,
          SUM(Is_Neon_enabled) as neon_points,
          SUM(Is_Cartridge_enabled) as cartridge_points,
          SUM(Is_Battery_enabled) as battery_points,
          SUM(Is_Pile_enabled) as pile_points,
          SUM(Is_Glass_enabled) as glass_points,
          SUM(Is_Food_enabled) as food_points,
          SUM(Is_Miscellanous_Trash_enabled) as dechetteries,
          SUM(Is_Pharmacy_enabled) as pharmacy_points,
          SUM(Is_Tire_enabled) as tire_points,
          SUM(Is_Ressourcerie_enabled) as ressourcerie_points
        FROM `{table_id}`
        """

        result = client.query(verify_query).to_dataframe().iloc[0]
        print(f"  ‚Ä¢ Total points: {result['total_points']:,}")
        print(f"  ‚Ä¢ With coordinates: {result['points_with_coords']:,}")
        print(f"\n  ‚Ä¢ Special waste points:")
        print(f"     - Neon/Lamp: {result['neon_points']:,}")
        print(f"     - Cartridges: {result['cartridge_points']:,}")
        print(f"     - Batteries: {result['battery_points']:,}")
        print(f"     - Piles: {result['pile_points']:,}")
        print(f"     - Glass: {result['glass_points']:,}")
        print(f"     - Food: {result['food_points']:,}")
        print(f"     - D√©chetteries: {result['dechetteries']:,}")
        print(f"     - Pharmacy: {result['pharmacy_points']:,}")
        print(f"     - Tire: {result['tire_points']:,}")
        print(f"     - Ressourcerie: {result['ressourcerie_points']:,}")

    except Exception as e:
        # Best effort: the CSV backup already exists, so report and try the fallback.
        print(f"‚ùå BigQuery upload failed: {e}")
        print(f"   Data saved locally: '{output_csv}'")

        # Try alternative method
        try:
            print("\nüîÑ Trying alternative CSV upload method via pandas_gbq...")
            import pandas_gbq
            pandas_gbq.to_gbq(
                combined_df,
                destination_table=table_id,
                project_id=PROJECT,
                if_exists='replace',
                progress_bar=True
            )
            print("‚úÖ Upload successful via pandas_gbq!")
        except Exception as e2:
            print(f"‚ùå Alternative also failed: {e2}")

else:
    print("‚ùå No data to combine!")

print("\n" + "=" * 60)
print("üéØ UNIFIED TRASH COLLECTION DATABASE CREATED!")
print("=" * 60)
# Gate the summary on this cell's own result. Testing
# `'total_locations' in locals()` is also satisfied by the value the previous
# combine cell leaves in the kernel, which would print stale totals after a run
# that found no data.
if all_dataframes:
    print(f"Total collection points: {total_locations:,}")
    # all_dataframes holds exactly the non-empty frames among df1..df5.
    print(f"Datasets combined: {len(all_dataframes)}")
    print(f"BigQuery table: {table_id}")
    print(f"Local backup: {output_csv}")

CREATING CURRENT TRASH COLLECTION POINTS TABLE WITH ECOSYSTEM POINTS
✓ Food waste points: 1,644
✓ Recycling centers: 15
✓ Underground containers: 1,079
✓ Underground containers (other waste types): 1,490
✓ Ecosystem special waste points: 110

📊 DATASET COMPOSITION BEFORE PROCESSING:
  Food Waste: 1,644 | Recycling Centers: 15 | Glass Containers: 1,079 | Other Waste Containers: 1,490 | Ecosystem Points: 110

📍 CHECKING COORDINATE COLUMNS...
Found coordinate columns: {'longitude': 'Longitude', 'latitude': 'Latitude'}

🔄 CONVERTING DATA TYPES...
  ✓ Converted Longitude: 4,338/4,338 valid values
  ✓ Converted Latitude: 4,338/4,338 valid values
  ✓ Converted Is_Cardboard_enabled to int
  ✓ Converted Is_Food_enabled to int
  ✓ Converted Is_Glass_enabled to int
  ✓ Converted Is_Metal_enabled to int
  ✓ Converted Is_Paper_enabled to int
  ✓ Converted Is_Wood_enabled to int
  ✓ Converted Is_Plastic_enabled to int
  ✓ Converted Is_Textile_enabled to int
  ‚

In [91]:
import re

# Extend the unified dataset with textile, pharmacy, garage and ressourcerie
# collection points.

print(
    "=" * 60,
    "ADDING MULTIPLE COLLECTION POINTS TO UNIFIED DATASET",
    "=" * 60,
    sep="\n",
)

# No explicit credentials are passed to the client here. To use a specific
# service-account file instead, build `service_account.Credentials` from it and
# pass `credentials=...` to `bigquery.Client`.
client = bigquery.Client(project=PROJECT)

def read_csv_with_fallback(path, encodings=("utf-8-sig", "latin-1"), alt_sep=None):
    """Read a CSV file, trying strict encodings before permissive ones.

    ``latin-1`` can decode any byte sequence, so it must come last: tried first
    it always "succeeds" and silently garbles UTF-8 text. ``utf-8-sig`` reads
    UTF-8 with or without a byte-order mark.

    Args:
        path: Location of the CSV file.
        encodings: Encodings to try, in order.
        alt_sep: Optional alternative delimiter. When the default comma parse
            yields a single column whose header contains ``alt_sep``, the file
            is read again with that delimiter.

    Returns:
        A ``(frame, encoding, sep)`` tuple: the parsed DataFrame, the encoding
        that worked and the delimiter that was finally used.

    Raises:
        ValueError: If none of ``encodings`` can decode/parse the file.
    """
    last_error = None
    for encoding in encodings:
        try:
            frame = pd.read_csv(path, encoding=encoding, on_bad_lines='skip')
        except (UnicodeError, pd.errors.ParserError) as exc:
            # Wrong encoding guess: remember why and try the next one.
            last_error = exc
            continue
        if alt_sep is not None and frame.shape[1] == 1 and alt_sep in str(frame.columns[0]):
            frame = pd.read_csv(path, sep=alt_sep, encoding=encoding, on_bad_lines='skip')
            return frame, encoding, alt_sep
        return frame, encoding, ","
    raise ValueError(
        f"Could not read {path} with any of these encodings: {list(encodings)}"
    ) from last_error


def build_column_mapping(columns, rules):
    """Map source column labels to standard names without creating duplicates.

    Args:
        columns: Iterable of source column labels.
        rules: Ordered list of ``(target, keywords)`` pairs. As in an
            ``if``/``elif`` chain, a column belongs to the first rule one of
            whose keywords is a substring of its lower-cased, stripped label.

    Returns:
        Dict ``{source_label: target}`` with at most one source per target.
        When several columns compete for a target, a label equal to a keyword
        beats a mere substring match, then the earlier column wins. A rename is
        dropped if the target label is already used by a column that stays
        untouched.
    """
    best = {}  # target -> (rank, position, column)
    for position, col in enumerate(columns):
        normalized = str(col).lower().strip()
        for target, keywords in rules:
            if not any(keyword in normalized for keyword in keywords):
                continue
            rank = 0 if normalized in keywords else 1
            if target not in best or (rank, position) < best[target][:2]:
                best[target] = (rank, position, col)
            break  # first matching rule decides, like the if/elif chain
    chosen = sorted(best.items(), key=lambda item: item[1][1])
    mapping = {col: target for target, (_rank, _position, col) in chosen}
    untouched = {str(col) for col in columns if col not in mapping}
    return {col: target for col, target in mapping.items() if target not in untouched}


# 1. LOAD AND CLEAN TEXTILE DATA
print("\n1. LOADING TEXTILE DATA")

textile_file = "Textile_relais.csv"
textile_df = pd.DataFrame()
textile_final = pd.DataFrame()

# Keyword rules used to recognise the textile file's columns (order matters).
TEXTILE_COLUMN_RULES = [
    ('Name', ['name', 'nom', 'lieu']),
    ('Address', ['adresse', 'address', 'street', 'rue']),
    ('Latitude', ['latitude', 'lat']),
    ('Longitude', ['longitude', 'lon', 'long']),
    ('City', ['ville', 'city']),
    ('Postal_Code', ['code_postal', 'zip', 'postal']),
]

if os.path.exists(textile_file):
    try:
        textile_df, textile_encoding, _textile_sep = read_csv_with_fallback(textile_file)
        print(f"   Successfully loaded with {textile_encoding} encoding")

        print(f"   Loaded {len(textile_df)} rows, {len(textile_df.columns)} columns")
        print(f"   Original columns: {textile_df.columns.tolist()}")

        # Standardize column names
        column_mapping = build_column_mapping(textile_df.columns, TEXTILE_COLUMN_RULES)
        print(f"   Column mapping: {column_mapping}")
        textile_df = textile_df.rename(columns=column_mapping)

        # Show sample
        if len(textile_df) > 0:
            print(f"\n   SAMPLE TEXTILE DATA (3 rows):")
            display_cols = [c for c in ['Name', 'Address', 'Latitude', 'Longitude'] if c in textile_df.columns]
            if display_cols:
                print(textile_df[display_cols].head(3).to_string(index=False))

    except Exception as e:
        # Best effort: report the problem and continue without textile points.
        print(f"   ‚ùå Error loading textile data: {e}")
        textile_df = pd.DataFrame()
else:
    print(f"   ‚ö†Ô∏è Textile file not found: {textile_file}")

# 2. LOAD AND CLEAN PHARMACY/GARAGE/RESSOURCERIE DATA
print("\n2. LOADING PHARMACY/GARAGE/RESSOURCERIE DATA")

mixed_file = "pharmacies_garages_ressourceries_nantes.csv"
mixed_df = pd.DataFrame()
mixed_final = pd.DataFrame()

# Keyword rules used to recognise the mixed file's columns (order matters).
MIXED_COLUMN_RULES = [
    ('Name', ['name', 'nom', '√©tablissement', 'etablissement']),
    ('Type', ['type', 'categorie', 'category', 'type_dechet']),
    ('Latitude', ['latitude', 'lat']),
    ('Longitude', ['longitude', 'lon', 'long']),
    ('Address', ['adresse', 'address', 'adresse_complete']),
    ('City', ['ville', 'city']),
    ('Postal_Code', ['code_postal', 'zip', 'postal']),
]

if os.path.exists(mixed_file):
    try:
        # The file may be ';'-delimited; the helper re-reads it that way if needed.
        mixed_df, mixed_encoding, mixed_sep = read_csv_with_fallback(mixed_file, alt_sep=';')
        print(f"   Successfully loaded with {mixed_encoding} encoding")
        if mixed_sep == ';':
            print("   Successfully loaded with ';' delimiter")

        print(f"   Loaded {len(mixed_df)} rows, {len(mixed_df.columns)} columns")
        print(f"   Columns: {mixed_df.columns.tolist()}")

        # Standardize column names
        column_mapping = build_column_mapping(mixed_df.columns, MIXED_COLUMN_RULES)
        if column_mapping:
            print(f"   Renaming columns: {column_mapping}")
            mixed_df = mixed_df.rename(columns=column_mapping)
        else:
            print("   No column mapping needed")

        # Show unique types with their frequencies (first 20, alphabetically)
        if 'Type' in mixed_df.columns:
            type_counts = mixed_df['Type'].value_counts()  # NaN types are excluded
            unique_types = mixed_df['Type'].dropna().unique()
            print(f"\n   UNIQUE TYPES FOUND ({len(unique_types)}):")
            for type_val in sorted(unique_types)[:20]:
                count = type_counts[type_val]
                print(f"     ‚Ä¢ '{type_val}': {count} points")

            if len(unique_types) > 20:
                print(f"     ... and {len(unique_types) - 20} more types")

        # Show sample
        if len(mixed_df) > 0:
            print(f"\n   SAMPLE MIXED DATA (5 rows):")
            display_cols = [c for c in ['Name', 'Type', 'Latitude', 'Longitude', 'Address'] if c in mixed_df.columns]
            if display_cols:
                print(mixed_df[display_cols].head(5).to_string(index=False))

    except Exception as e:
        # Best effort: report the problem (with traceback) and continue without these points.
        print(f"   ‚ùå Error loading mixed data: {e}")
        import traceback
        traceback.print_exc()
        mixed_df = pd.DataFrame()
else:
    print(f"   ‚ö†Ô∏è Mixed file not found: {mixed_file}")

# 3. PROCESS TEXTILE DATA
# Turns the loaded textile frame into rows shaped like the unified table:
# cleaned Name/Address, numeric coordinates and one 0/1 flag per waste type.
if not textile_df.empty:
    print("\n3. PROCESSING TEXTILE DATA")

    textile_clean = textile_df.copy()
    initial_count = len(textile_clean)

    # Ensure the text columns exist, then normalise them to stripped strings.
    for text_col, default_text in (('Name', 'Textile Collection Point'), ('Address', '')):
        if text_col not in textile_clean.columns:
            textile_clean[text_col] = default_text
        textile_clean[text_col] = textile_clean[text_col].fillna(default_text).astype(str).str.strip()

    # Clean coordinates. Only values that were present but could not be parsed
    # count as conversion errors; cells that were already empty are not errors.
    coord_errors = 0
    for coord_col in ('Latitude', 'Longitude'):
        if coord_col in textile_clean.columns:
            converted = pd.to_numeric(textile_clean[coord_col], errors='coerce')
            coord_errors += int((textile_clean[coord_col].notna() & converted.isna()).sum())
            textile_clean[coord_col] = converted

    # Filter out points without valid coordinates
    if 'Latitude' in textile_clean.columns and 'Longitude' in textile_clean.columns:
        textile_clean = textile_clean[textile_clean['Latitude'].notna() & textile_clean['Longitude'].notna()].copy()

    print(f"   Cleaned {initial_count} rows, kept {len(textile_clean)} with valid coordinates")
    if coord_errors > 0:
        print(f"   ‚ö†Ô∏è {coord_errors} coordinate conversion errors")

    # Waste type capabilities: textile only. Every flag of the unified table
    # schema is set explicitly (including Wood, Pile, Battery and Car_Battery)
    # so these rows do not get NaN flags when stacked with the existing points.
    waste_types = {
        'Is_Textile_enabled': 1,
        'Is_Cardboard_enabled': 0,
        'Is_Food_enabled': 0,
        'Is_Glass_enabled': 0,
        'Is_Metal_enabled': 0,
        'Is_Paper_enabled': 0,
        'Is_Plastic_enabled': 0,
        'Is_Vegetation_enabled': 0,
        'Is_Neon_enabled': 0,
        'Is_Cartridge_enabled': 0,
        'Is_Lamp_Light_enabled': 0,
        'Is_Miscellanous_Trash_enabled': 0,
        'Is_Pharmacy_enabled': 0,
        'Is_Tire_enabled': 0,
        'Is_Ressourcerie_enabled': 0,
        'Is_Wood_enabled': 0,
        'Is_Pile_enabled': 0,
        'Is_Battery_enabled': 0,
        'Is_Car_Battery_enabled': 0
    }

    for col, default_value in waste_types.items():
        textile_clean[col] = default_value

    # Add source information
    textile_clean['Source'] = 'Textile_relais.csv'
    textile_clean['Data_Type'] = 'Textile_Collection'

    textile_final = textile_clean
    print(f"   ‚úÖ Processed {len(textile_final)} textile collection points")
else:
    print("\n3. TEXTILE DATA: No data to process")
    textile_final = pd.DataFrame()

# 4. PROCESS MIXED DATA (PHARMACY/GARAGE/RESSOURCERIE)
# Cleans the mixed frame and classifies each point from its `Type` text into
# pharmacy, car repair (tire) and/or ressourcerie, setting the matching flags.
if not mixed_df.empty:
    print("\n4. PROCESSING MIXED DATA")

    mixed_clean = mixed_df.copy()
    initial_count = len(mixed_clean)

    # Ensure the text columns exist, then normalise them to stripped strings.
    for text_col, default_text in (('Name', ''), ('Type', 'unknown'), ('Address', '')):
        if text_col not in mixed_clean.columns:
            mixed_clean[text_col] = default_text
        mixed_clean[text_col] = mixed_clean[text_col].fillna(default_text).astype(str).str.strip()
    # Types are matched case-insensitively; store them lower-cased.
    mixed_clean['Type'] = mixed_clean['Type'].str.lower()

    # Clean coordinates. Only values that were present but could not be parsed
    # count as conversion errors; cells that were already empty are not errors.
    coord_errors = 0
    for coord_col in ('Latitude', 'Longitude'):
        if coord_col in mixed_clean.columns:
            converted = pd.to_numeric(mixed_clean[coord_col], errors='coerce')
            coord_errors += int((mixed_clean[coord_col].notna() & converted.isna()).sum())
            mixed_clean[coord_col] = converted

    # Filter out points without valid coordinates
    if 'Latitude' in mixed_clean.columns and 'Longitude' in mixed_clean.columns:
        mixed_clean = mixed_clean[mixed_clean['Latitude'].notna() & mixed_clean['Longitude'].notna()].copy()

    print(f"   Cleaned {initial_count} rows, kept {len(mixed_clean)} with valid coordinates")
    if coord_errors > 0:
        print(f"   ‚ö†Ô∏è {coord_errors} coordinate conversion errors")

    # Initialize every waste flag of the unified table schema to 0 (including
    # Wood, Pile, Battery and Car_Battery) so these rows do not get NaN flags
    # when stacked with the existing points.
    waste_columns = [
        'Is_Cardboard_enabled', 'Is_Food_enabled', 'Is_Glass_enabled',
        'Is_Metal_enabled', 'Is_Paper_enabled', 'Is_Wood_enabled', 'Is_Plastic_enabled',
        'Is_Textile_enabled', 'Is_Vegetation_enabled', 'Is_Neon_enabled',
        'Is_Cartridge_enabled', 'Is_Lamp_Light_enabled', 'Is_Pile_enabled',
        'Is_Battery_enabled', 'Is_Car_Battery_enabled',
        'Is_Miscellanous_Trash_enabled', 'Is_Pharmacy_enabled',
        'Is_Tire_enabled', 'Is_Ressourcerie_enabled'
    ]

    for col in waste_columns:
        mixed_clean[col] = 0

    # Classify points by type (keyword = substring of the Type text).
    print("\n   CLASSIFYING POINTS BY TYPE:")

    # Pharmacy points
    pharmacy_keywords = ['pharmacy', 'pharmacie', 'm√©dicament', 'medicament', 'drug']
    pharmacy_pattern = '|'.join(pharmacy_keywords)
    pharmacy_mask = mixed_clean['Type'].str.contains(pharmacy_pattern, case=False, na=False)
    mixed_clean.loc[pharmacy_mask, 'Is_Pharmacy_enabled'] = 1
    pharmacy_count = pharmacy_mask.sum()
    print(f"     ‚Ä¢ Pharmacy: {pharmacy_count} points")

    # Car repair/garage points (tire)
    tire_keywords = ['garage', 'car_repair', 'auto', 'voiture', 'tire', 'pneu', 'v√©hicule', 'vehicle']
    tire_pattern = '|'.join(tire_keywords)
    tire_mask = mixed_clean['Type'].str.contains(tire_pattern, case=False, na=False)
    mixed_clean.loc[tire_mask, 'Is_Tire_enabled'] = 1
    tire_count = tire_mask.sum()
    print(f"     ‚Ä¢ Car repair/tire: {tire_count} points")

    # Ressourcerie points
    ressourcerie_keywords = ['ressourcerie', 'recyclerie', 'recup', 'recycl', 'reuse', 'r√©emploi']
    ressourcerie_pattern = '|'.join(ressourcerie_keywords)
    ressourcerie_mask = mixed_clean['Type'].str.contains(ressourcerie_pattern, case=False, na=False)
    mixed_clean.loc[ressourcerie_mask, 'Is_Ressourcerie_enabled'] = 1
    ressourcerie_count = ressourcerie_mask.sum()
    print(f"     ‚Ä¢ Ressourcerie: {ressourcerie_count} points")

    # A point may match several categories: it keeps every matching flag, but
    # gets a single Data_Type label.
    overlap_mask = (pharmacy_mask.astype(int) + tire_mask.astype(int) + ressourcerie_mask.astype(int)) > 1
    overlap_count = overlap_mask.sum()
    if overlap_count > 0:
        print(f"     ‚ö†Ô∏è {overlap_count} points match multiple categories")

    # Data_Type priority: Pharmacy > Ressourcerie > Car Repair > Other.
    # Labels are written from lowest to highest priority so that the highest
    # matching category is the one that remains.
    mixed_clean['Data_Type'] = 'Other'
    mixed_clean.loc[tire_mask, 'Data_Type'] = 'Car_Repair'
    mixed_clean.loc[ressourcerie_mask, 'Data_Type'] = 'Ressourcerie'
    mixed_clean.loc[pharmacy_mask, 'Data_Type'] = 'Pharmacy'

    # Report the points that matched no category (labelled 'Other' above).
    classified_mask = pharmacy_mask | tire_mask | ressourcerie_mask
    unclassified = mixed_clean[~classified_mask]
    if len(unclassified) > 0:
        print(f"\n   ‚ö†Ô∏è UNCLASSIFIED POINTS ({len(unclassified)}):")
        unique_unclassified_types = unclassified['Type'].unique()[:10]
        for t in unique_unclassified_types:
            count = (unclassified['Type'] == t).sum()
            print(f"     ‚Ä¢ '{t}': {count} points")

    # Add source information
    mixed_clean['Source'] = 'pharmacies_garages_ressourceries_nantes.csv'

    mixed_final = mixed_clean
    print(f"\n   ‚úÖ Processed {len(mixed_final)} mixed collection points")

    # Final breakdown
    print(f"\n   FINAL BREAKDOWN:")
    if 'Is_Pharmacy_enabled' in mixed_final.columns:
        print(f"     ‚Ä¢ Pharmacy: {mixed_final['Is_Pharmacy_enabled'].sum()}")
    if 'Is_Tire_enabled' in mixed_final.columns:
        print(f"     ‚Ä¢ Car repair/tire: {mixed_final['Is_Tire_enabled'].sum()}")
    if 'Is_Ressourcerie_enabled' in mixed_final.columns:
        print(f"     ‚Ä¢ Ressourcerie: {mixed_final['Is_Ressourcerie_enabled'].sum()}")

else:
    print("\n4. MIXED DATA: No data to process")
    mixed_final = pd.DataFrame()

# 5. LOAD EXISTING UNIFIED TABLE
print("\n5. LOADING EXISTING UNIFIED TABLE")

# Fully-qualified id of the unified table, and a timestamped sibling name for
# its backup copy.
backup_stamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
existing_table_name = ".".join([PROJECT, DATASET, "trash_collection_points"])
backup_table_name = f"{existing_table_name}_backup_{backup_stamp}"

try:
    # First, check if table exists
    print(f"   Checking table: {existing_table_name}")

    # Try to get table metadata
    try:
        table = client.get_table(existing_table_name)
        print(f"   Table exists with {table.num_rows} rows")

        # Create backup
        backup_query = f"""
        CREATE OR REPLACE TABLE `{backup_table_name}` AS
        SELECT * FROM `{existing_table_name}`
        """
        client.query(backup_query).result()
        print(f"   Created backup: {backup_table_name}")

        # Load existing data
        existing_query = f"SELECT * FROM `{existing_table_name}`"
        existing_df = client.query(existing_query).to_dataframe()

    except Exception as e:
        print(f"   Table doesn't exist or error: {e}")
        print("   Creating new table")
        existing_df = pd.DataFrame()

    if not existing_df.empty:
        print(f"   ‚úÖ Loaded {len(existing_df)} existing rows")

        # Show current ID range
        if 'ID' in existing_df.columns:
            min_id = existing_df['ID'].min()
            max_id = existing_df['ID'].max()
            print(f"   Current ID range: {min_id} to {max_id}")
        else:
            print("   ‚ö†Ô∏è No ID column in existing table")
            # Create ID column if missing
            existing_df['ID'] = range(1, len(existing_df) + 1)

        # Check for missing waste type columns and add them
        required_columns = [
            'ID', 'Name', 'Address', 'Longitude', 'Latitude',
            'Is_Cardboard_enabled', 'Is_Food_enabled', 'Is_Glass_enabled',
            'Is_Metal_enabled', 'Is_Paper_enabled', 'Is_Plastic_enabled',
            'Is_Textile_enabled', 'Is_Vegetation_enabled', 'Is_Neon_enabled',
            'Is_Cartridge_enabled', 'Is_Lamp_Light_enabled',
            'Is_Miscellanous_Trash_enabled', 'Is_Pharmacy_enabled',
            'Is_Tire_enabled', 'Is_Ressourcerie_enabled'
        ]

        print("\n   CHECKING REQUIRED COLUMNS:")
        for col in required_columns:
            if col not in existing_df.columns:
                if col.startswith('Is_'):
                    existing_df[col] = 0
                    print(f"     ‚ö†Ô∏è Added missing: {col} (default 0)")
                else:
                    existing_df[col] = None
                    print(f"     ‚ö†Ô∏è Added missing: {col}")

        # FIX: Identify and mark d√©chetteries
        print("\n   IDENTIFYING D√âCHETTERIES:")

        # Check for d√©chetteries in Name
        d√©chetterie_patterns = [
            'd√©ch√®terie', 'd√©chetterie', 'dechetterie', 'decheterie',
            'recycling center', 'centre de recyclage', 'waste center'
        ]
        pattern = '|'.join([re.escape(p) for p in d√©chetterie_patterns])

        name_mask = existing_df['Name'].astype(str).str.contains(pattern, case=False, na=False)
        d√©chetterie_count = name_mask.sum()

        # Mark as miscellaneous trash enabled
        existing_df.loc[name_mask, 'Is_Miscellanous_Trash_enabled'] = 1

        # Also check if already marked
        already_marked = existing_df['Is_Miscellanous_Trash_enabled'].sum()

        print(f"     ‚Ä¢ Found {d√©chetterie_count} by name pattern")
        print(f"     ‚Ä¢ Total marked as d√©chetteries: {already_marked}")

        if d√©chetterie_count > 0:
            print(f"\n     SAMPLE D√âCHETTERIES:")
            sample = existing_df[name_mask].head(3)
            for i, row in sample.iterrows():
                name = str(row.get('Name', 'Unknown'))[:50]
                print(f"       ‚Ä¢ {name}")

        # Check for ressourceries in existing data
        if 'Is_Ressourcerie_enabled' in existing_df.columns:
            ressourcerie_names = ['ressourcerie', 'recyclerie', 'recup', 'recycl']
            ressourcerie_pattern = '|'.join([re.escape(n) for n in ressourcerie_names])
            ressourcerie_mask = existing_df['Name'].astype(str).str.contains(ressourcerie_pattern, case=False, na=False)

            existing_df.loc[ressourcerie_mask, 'Is_Ressourcerie_enabled'] = 1
            print(f"     ‚Ä¢ Found {ressourcerie_mask.sum()} ressourceries in existing data")

        # Ensure all waste columns are integers
        waste_cols = [col for col in existing_df.columns if col.startswith('Is_') and col.endswith('_enabled')]
        for col in waste_cols:
            existing_df[col] = pd.to_numeric(existing_df[col], errors='coerce').fillna(0).astype(int)

        print("   ‚úÖ Existing data prepared")

    else:
        print("   ‚ÑπÔ∏è No existing data or empty table")

except Exception as e:
    print(f"   ‚ùå Error loading existing table: {e}")
    import traceback
    traceback.print_exc()
    print("   Creating new empty dataframe")
    existing_df = pd.DataFrame()

# 6. COMBINE ALL DATASETS
# Appends the freshly processed points (textile_final, mixed_final) to the rows
# loaded from the existing unified table (existing_df) and builds combined_df.
# New points receive IDs that continue after the highest existing ID.
print("\n6. COMBINING ALL DATASETS")

# Collect all new data
new_datasets = []
if not textile_final.empty:
    new_datasets.append(textile_final)
    print(f"   Textile: {len(textile_final)} points")
if not mixed_final.empty:
    new_datasets.append(mixed_final)
    print(f"   Mixed: {len(mixed_final)} points")

if new_datasets:
    # Always go through pd.concat, even for a single source: it returns a new
    # frame, so the ID assignment below never writes into textile_final or
    # mixed_final themselves.
    new_data_combined = pd.concat(new_datasets, ignore_index=True, sort=False)

    print(f"\n   Total new points to add: {len(new_data_combined)}")

    # Determine next available ID. Falls back to 1 when there is no existing
    # data or when the existing ID column holds no usable number.
    next_id = 1
    if not existing_df.empty and 'ID' in existing_df.columns:
        max_existing_id = pd.to_numeric(existing_df['ID'], errors='coerce').max()
        if pd.notna(max_existing_id):
            next_id = int(max_existing_id) + 1

    print(f"   Next available ID: {next_id}")

    # Assign IDs to new points
    new_data_combined['ID'] = range(next_id, next_id + len(new_data_combined))
    print(f"   Assigned IDs {next_id} to {next_id + len(new_data_combined) - 1}")

    # Ensure both dataframes have compatible columns
    print("\n   ALIGNING COLUMNS:")

    # Define core columns (always needed)
    core_columns = [
        'ID', 'Name', 'Address', 'Longitude', 'Latitude',
        'Is_Cardboard_enabled', 'Is_Food_enabled', 'Is_Glass_enabled',
        'Is_Metal_enabled', 'Is_Paper_enabled', 'Is_Plastic_enabled',
        'Is_Textile_enabled', 'Is_Vegetation_enabled', 'Is_Neon_enabled',
        'Is_Cartridge_enabled', 'Is_Lamp_Light_enabled',
        'Is_Miscellanous_Trash_enabled', 'Is_Pharmacy_enabled',
        'Is_Tire_enabled', 'Is_Ressourcerie_enabled'
    ]

    # Waste-type flags the existing table carries beyond the core list (the
    # queries at the top of this notebook also define e.g. Is_Wood_enabled and
    # Is_Battery_enabled). They are kept so that combining does not silently
    # discard flags that are already stored for existing points.
    extra_waste_columns = [
        col for col in existing_df.columns
        if col.startswith('Is_') and col.endswith('_enabled') and col not in core_columns
    ]
    output_columns = core_columns + extra_waste_columns
    if extra_waste_columns:
        print(f"     Preserving additional waste columns from existing table: {extra_waste_columns}")

    # Remove metadata columns from new data that might conflict
    metadata_cols_to_remove = ['City', 'Postal_Code', 'Source', 'Data_Type', 'Type']
    new_data_combined = new_data_combined.drop(columns=metadata_cols_to_remove, errors='ignore')

    # Add missing columns to both frames: flags default to 0, everything else to None
    frames_to_align = [new_data_combined]
    if not existing_df.empty:
        frames_to_align.append(existing_df)
    for frame in frames_to_align:
        for col in output_columns:
            if col not in frame.columns:
                frame[col] = 0 if col.startswith('Is_') else None

    # Ensure all waste columns are integers (non-numeric / missing values become 0)
    waste_cols = [col for col in output_columns if col.startswith('Is_')]
    for col in waste_cols:
        new_data_combined[col] = pd.to_numeric(new_data_combined[col], errors='coerce').fillna(0).astype(int)
        if not existing_df.empty:
            existing_df[col] = pd.to_numeric(existing_df[col], errors='coerce').fillna(0).astype(int)

    # Combine datasets
    print(f"\n   COMBINING DATASETS:")
    print(f"     Existing data: {len(existing_df)} rows")
    print(f"     New data: {len(new_data_combined)} rows")

    if existing_df.empty:
        combined_df = new_data_combined[output_columns].copy()
        print("     Combined: New data only (no existing)")
    else:
        combined_df = pd.concat([
            existing_df[output_columns],
            new_data_combined[output_columns]
        ], ignore_index=True)
        print("     Combined: Existing + New data")

    # Final cleanup: one row per ID, ordered by ID
    combined_df = combined_df.drop_duplicates(subset=['ID'], keep='first')
    combined_df = combined_df.sort_values('ID').reset_index(drop=True)

    print(f"\n   ‚úÖ Combined dataset: {len(combined_df)} total points")
    print(f"     ‚Ä¢ Previously existing: {len(existing_df)}")
    print(f"     ‚Ä¢ Newly added: {len(new_data_combined)}")

    # Show what was added
    print(f"\n   NEW ADDITIONS SUMMARY:")
    if not textile_final.empty:
        print(f"     ‚Ä¢ Textile: {len(textile_final)} points")

    if not mixed_final.empty:
        # Count by type from mixed data
        type_counts = {}
        if 'Is_Pharmacy_enabled' in mixed_final.columns:
            type_counts['Pharmacy'] = mixed_final['Is_Pharmacy_enabled'].sum()
        if 'Is_Tire_enabled' in mixed_final.columns:
            type_counts['Car Repair/Tire'] = mixed_final['Is_Tire_enabled'].sum()
        if 'Is_Ressourcerie_enabled' in mixed_final.columns:
            type_counts['Ressourcerie'] = mixed_final['Is_Ressourcerie_enabled'].sum()

        for type_name, count in type_counts.items():
            print(f"     ‚Ä¢ {type_name}: {count} points")

else:
    print("   ‚ö†Ô∏è No new data to add")
    if not existing_df.empty:
        combined_df = existing_df
        print(f"   Keeping existing {len(combined_df)} points")
    else:
        combined_df = pd.DataFrame()
        print("   ‚ö†Ô∏è No data at all!")

# 7. FINAL STATISTICS
# Read-only report over combined_df: coverage per waste flag, specialty types,
# dechetterie counts and basic data-quality checks. Nothing is modified here.
if combined_df.empty:
    print("\n7. STATISTICS: No data available")
else:
    total_points = len(combined_df)
    flag_columns = [
        name for name in combined_df.columns
        if name.startswith('Is_') and name.endswith('_enabled')
    ]

    print("\n7. FINAL STATISTICS")
    print("-" * 40)

    print(f"Total collection points: {total_points:,}")

    # Coverage per waste flag; flags with no enabled point are left out
    print(f"\nWASTE TYPE COVERAGE:")
    coverage = {}
    for flag in flag_columns:
        enabled_total = int(combined_df[flag].sum())
        if enabled_total <= 0:
            continue
        label = flag.replace('Is_', '').replace('_enabled', '').replace('_', ' ').title()
        coverage[label] = (enabled_total, (enabled_total / total_points) * 100)

    # Largest counts first
    ranked_coverage = sorted(coverage.items(), key=lambda item: item[1][0], reverse=True)
    for label, (enabled_total, share) in ranked_coverage:
        print(f"  ‚Ä¢ {label:25s}: {enabled_total:5,d} points ({share:5.1f}%)")

    # The newly integrated point types get their own short listing
    print(f"\nSPECIALTY COLLECTION POINTS:")
    for specialty in ('Pharmacy', 'Tire', 'Ressourcerie', 'Textile'):
        flag = f'Is_{specialty}_enabled'
        if flag not in combined_df.columns:
            continue
        enabled_total = int(combined_df[flag].sum())
        if enabled_total > 0:
            share = (enabled_total / total_points) * 100
            print(f"  ‚Ä¢ {specialty:25s}: {enabled_total:5,d} points ({share:5.1f}%)")

    # Dechetterie figures: by flag, by name, and overlapping with ressourceries
    print(f"\nD√âCHETTERIE STATISTICS:")
    if 'Is_Miscellanous_Trash_enabled' in combined_df.columns:
        flagged_total = int(combined_df['Is_Miscellanous_Trash_enabled'].sum())
        flagged_share = (flagged_total / total_points) * 100
        print(f"  ‚Ä¢ Flagged as d√©chetteries: {flagged_total:,} ({flagged_share:.1f}%)")

        # Case-insensitive substring match on the point name
        name_regex = '|'.join(map(re.escape, ['d√©ch√®terie', 'd√©chetterie', 'dechetterie']))
        named_total = combined_df['Name'].astype(str).str.contains(name_regex, case=False, na=False).sum()
        print(f"  ‚Ä¢ With 'D√©ch√®terie' in name: {named_total:,}")

        if 'Is_Ressourcerie_enabled' in combined_df.columns:
            both_flags = (
                (combined_df['Is_Miscellanous_Trash_enabled'] == 1) &
                (combined_df['Is_Ressourcerie_enabled'] == 1)
            )
            both_total = combined_df[both_flags].shape[0]
            print(f"  ‚Ä¢ D√©chetteries with ressourcerie: {both_total:,}")

    print(f"\nDATA QUALITY CHECK:")
    print(f"  ‚Ä¢ Total rows: {total_points:,}")

    # Missing values in the columns every point needs
    for required in ('Name', 'Longitude', 'Latitude'):
        if required not in combined_df.columns:
            continue
        missing = combined_df[required].isna().sum()
        if missing != 0:
            print(f"  ‚Ä¢ ‚ö†Ô∏è {required}: {missing:,} null values ({missing/total_points*100:.1f}%)")
        else:
            print(f"  ‚Ä¢ ‚úÖ {required}: No null values")

    # Missing values summed over all waste flags
    flag_nulls = sum(combined_df[flag].isna().sum() for flag in flag_columns)
    if flag_nulls != 0:
        print(f"  ‚Ä¢ ‚ö†Ô∏è Waste type columns: {flag_nulls:,} total null values")
    else:
        print(f"  ‚Ä¢ ‚úÖ All waste type columns: No null values")

    # ID uniqueness and range
    if 'ID' in combined_df.columns:
        ids = combined_df['ID']
        repeated_ids = ids.duplicated().sum()
        if repeated_ids != 0:
            print(f"  ‚Ä¢ ‚ùå {repeated_ids:,} duplicate IDs!")
        else:
            print(f"  ‚Ä¢ ‚úÖ No duplicate IDs")

        print(f"  ‚Ä¢ ID range: {ids.min()} to {ids.max()}")

# 8. SAVE RESULTS
# Writes combined_df to a CSV file and replaces the unified BigQuery table
# (WRITE_TRUNCATE) with it, then prints a summary of what was produced.
print("\n8. SAVING RESULTS")
print("-" * 40)

if not combined_df.empty:
    # Save to CSV
    output_csv = "trash_collection_points_complete.csv"

    # Ensure proper column order
    core_columns = [
        'ID', 'Name', 'Address', 'Longitude', 'Latitude',
        'Is_Cardboard_enabled', 'Is_Food_enabled', 'Is_Glass_enabled',
        'Is_Metal_enabled', 'Is_Paper_enabled', 'Is_Plastic_enabled',
        'Is_Textile_enabled', 'Is_Vegetation_enabled', 'Is_Neon_enabled',
        'Is_Cartridge_enabled', 'Is_Lamp_Light_enabled',
        'Is_Miscellanous_Trash_enabled', 'Is_Pharmacy_enabled',
        'Is_Tire_enabled', 'Is_Ressourcerie_enabled'
    ]

    # Core columns first (those that exist), followed by any further waste flag
    # still present in combined_df. The table is overwritten below, so a flag
    # column left out here would be lost from the stored data.
    final_columns = [col for col in core_columns if col in combined_df.columns]
    final_columns += [
        col for col in combined_df.columns
        if col.startswith('Is_') and col.endswith('_enabled') and col not in core_columns
    ]
    combined_df = combined_df[final_columns]

    # Save to CSV (utf-8-sig writes a BOM at the start of the file)
    combined_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"‚úÖ CSV saved: '{output_csv}' ({len(combined_df)} rows)")

    # Show CSV sample, read back from disk
    print(f"\nCSV SAMPLE (first 3 rows):")
    point_type_labels = [
        ('Is_Pharmacy_enabled', "Pharmacy"),
        ('Is_Tire_enabled', "Car Repair"),
        ('Is_Ressourcerie_enabled', "Ressourcerie"),
        ('Is_Textile_enabled', "Textile"),
        ('Is_Miscellanous_Trash_enabled', "D√©chetterie"),
    ]
    try:
        csv_preview = pd.read_csv(output_csv, nrows=3)
        for i, row in csv_preview.iterrows():
            point_id = row.get('ID', 'N/A')
            name = str(row.get('Name', 'Unnamed'))[:40]

            # Determine point type from the enabled flags
            point_types = [label for flag, label in point_type_labels if row.get(flag, 0) == 1]

            point_type_str = ", ".join(point_types) if point_types else "Other"
            print(f"  ID {point_id}: {name}... [{point_type_str}]")
    except Exception as e:
        print(f"  Could not read CSV sample: {e}")

    # Upload to BigQuery
    print(f"\nUPLOADING TO BIGQUERY...")

    # Use the original table name for consistency
    bq_table_name = f"{PROJECT}.{DATASET}.trash_collection_points"

    job_config = bigquery.LoadJobConfig(
        write_disposition="WRITE_TRUNCATE",
        autodetect=True,
        max_bad_records=10
    )

    verify_query = f"""
        SELECT
          COUNT(*) as total_points,
          SUM(Is_Textile_enabled) as textile_points,
          SUM(Is_Pharmacy_enabled) as pharmacy_points,
          SUM(Is_Tire_enabled) as tire_points,
          SUM(Is_Ressourcerie_enabled) as ressourcerie_points,
          SUM(Is_Miscellanous_Trash_enabled) as dechetterie_points,
          SUM(CASE WHEN Is_Miscellanous_Trash_enabled = 1 AND Is_Ressourcerie_enabled = 1 THEN 1 ELSE 0 END) as dechetterie_ressourceries
        FROM `{bq_table_name}`
        """

    # Step 1: upload. Only a failure of the load job itself triggers the CSV
    # fallback; verification runs separately so that a verification error is
    # never reported as a failed upload and never causes a second upload.
    upload_succeeded = False
    try:
        # Upload dataframe
        job = client.load_table_from_dataframe(
            combined_df,
            bq_table_name,
            job_config=job_config
        )
        job.result()
        upload_succeeded = True

    except Exception as e:
        print(f"‚ùå BigQuery upload failed: {e}")
        print("\nTrying alternative method...")

        try:
            # Try CSV upload
            with open(output_csv, 'rb') as source_file:
                job = client.load_table_from_file(
                    source_file,
                    bq_table_name,
                    job_config=job_config
                )
                job.result()
            print("‚úÖ Upload successful via CSV file")
            upload_succeeded = True
        except Exception as e2:
            print(f"‚ùå CSV upload also failed: {e2}")
            print("\nYou can manually upload the CSV file to BigQuery:")
            print(f"  1. Go to BigQuery Console")
            print(f"  2. Create or select table: {bq_table_name}")
            print(f"  3. Upload file: {output_csv}")
            print(f"  4. Schema autodetection should work")

    # Step 2: verify what is now stored (only if some upload went through)
    if upload_succeeded:
        try:
            table = client.get_table(bq_table_name)
            print(f"‚úÖ BigQuery table updated: {bq_table_name}")
            print(f"   ‚Ä¢ Rows: {table.num_rows:,}")
            print(f"   ‚Ä¢ Size: {table.num_bytes / (1024*1024):.2f} MB")

            # Run verification query
            print(f"\nVERIFICATION QUERY RESULTS:")
            result = client.query(verify_query).to_dataframe().iloc[0]
            print(f"  ‚Ä¢ Total points: {result['total_points']:,}")
            print(f"  ‚Ä¢ Textile: {result['textile_points']:,}")
            print(f"  ‚Ä¢ Pharmacy: {result['pharmacy_points']:,}")
            print(f"  ‚Ä¢ Car repair/tire: {result['tire_points']:,}")
            print(f"  ‚Ä¢ Ressourcerie: {result['ressourcerie_points']:,}")
            print(f"  ‚Ä¢ D√©chetterie: {result['dechetterie_points']:,}")
            print(f"  ‚Ä¢ D√©chetterie with ressourcerie: {result['dechetterie_ressourceries']:,}")
        except Exception as e:
            print(f"   Verification could not be completed (the upload itself succeeded): {e}")

    # Final summary
    print("\n" + "=" * 80)
    if upload_succeeded:
        print("DATA INTEGRATION COMPLETE!")
    else:
        print("DATA INTEGRATION FINISHED - BIGQUERY UPLOAD FAILED (CSV ONLY)")
    print("=" * 80)

    print(f"\nSUMMARY:")
    print(f"  ‚Ä¢ Total collection points: {len(combined_df):,}")

    if len(combined_df) < 1000:
        print(f"  ‚ö†Ô∏è WARNING: Only {len(combined_df)} points - expected more!")

    print(f"\nNEW TYPES ADDED:")
    new_type_counts = {
        'Textile': textile_final['Is_Textile_enabled'].sum() if not textile_final.empty else 0,
        'Pharmacy': mixed_final['Is_Pharmacy_enabled'].sum() if not mixed_final.empty else 0,
        'Car Repair/Tire': mixed_final['Is_Tire_enabled'].sum() if not mixed_final.empty else 0,
        'Ressourcerie': mixed_final['Is_Ressourcerie_enabled'].sum() if not mixed_final.empty else 0
    }

    for type_name, count in new_type_counts.items():
        if count > 0:
            print(f"  ‚Ä¢ {type_name}: {count:,} points")

    print(f"\nFILES CREATED:")
    print(f"  ‚Ä¢ CSV: {output_csv}")
    if upload_succeeded:
        print(f"  ‚Ä¢ BigQuery Table: {bq_table_name}")
    else:
        print(f"  ‚Ä¢ BigQuery Table: NOT updated (upload failed): {bq_table_name}")

    # backup_table_name is only defined if the backup step ran in this kernel
    if 'backup_table_name' in locals():
        print(f"  ‚Ä¢ Backup: {backup_table_name}")

    print(f"\nNEXT STEPS:")
    print(f"  1. Verify data in BigQuery Console")
    print(f"  2. Test queries on the new waste types")
    print(f"  3. Update any dependent dashboards or applications")

else:
    print("‚ùå No data to save!")

print("\n" + "=" * 80)
print("PROCESS COMPLETE")
print("=" * 80)


ADDING MULTIPLE COLLECTION POINTS TO UNIFIED DATASET

1. LOADING TEXTILE DATA
   Successfully loaded with latin-1 encoding
   Loaded 165 rows, 4 columns
   Original columns: ['Name', 'Adresse', 'Latitude', 'Longitude']
   Column mapping: {'Name': 'Name', 'Adresse': 'Address', 'Latitude': 'Latitude', 'Longitude': 'Longitude'}

   SAMPLE TEXTILE DATA (3 rows):
                                        Name                                    Address   Latitude  Longitude
          Le relais VERTOU - Place du march√©             7 rue de Touraine 44120 VERTOU ¬†47.170327 ¬†-1.470135
Le relais LA HAIE FOUASSI√àRE - Rue de Pibrac   2 rue de Pibrac 44690 LA HAIE-FOUASSI√àRE ¬†47.160871 ¬†-1.427889
                 Le relais ERTOU - D√©cathlon 4 rue des Grands Ch√¢taigniers 44120 VERTOU ¬†47.179730 ¬†-1.502325

2. LOADING PHARMACY/GARAGE/RESSOURCERIE DATA
   Successfully loaded with utf-8 encoding
   Loaded 254 rows, 4 columns
   Columns: ['name', 'type', 'lat', 'lon']
   Renaming columns: {'nam

In [None]:
#TO DO

import re

# ADD TEXTILE, PHARMACY, GARAGE, AND RESSOURCERIE COLLECTION POINTS TO UNIFIED DATASET

print("ADDING MULTIPLE COLLECTION POINTS TO UNIFIED DATASET")

# 1. LOAD AND CLEAN TEXTILE DATA
# Reads the textile CSV (if present) and renames its headers to the unified
# names Name / Address / Latitude / Longitude. On any error textile_df is
# reset to an empty frame so the later steps can test `.empty`.
print("LOADING TEXTILE DATA")

textile_file = "Textile_relais.csv"
textile_df = pd.DataFrame()  # stays empty if the file is missing or unreadable
textile_final = pd.DataFrame()  # filled by the textile processing step

if not os.path.exists(textile_file):
    print(f"Textile file not found: {textile_file}")
else:
    try:
        # Read with latin-1 encoding; malformed lines are skipped
        textile_df = pd.read_csv(textile_file, encoding='latin-1', on_bad_lines='skip')
        print(f"Loaded textile data: {len(textile_df)} rows")

        # Header standardisation: the first rule whose keyword occurs in the
        # lower-cased header decides the target name.
        header_rules = (
            (('name',), 'Name'),
            (('adresse', 'address'), 'Address'),
            (('latitude', 'lat'), 'Latitude'),
            (('longitude', 'lon', 'long'), 'Longitude'),
        )
        column_mapping = {}
        for header in textile_df.columns:
            lowered = str(header).lower()
            for keywords, target in header_rules:
                if any(keyword in lowered for keyword in keywords):
                    column_mapping[header] = target
                    break

        if column_mapping:
            print(f"   Renaming columns: {column_mapping}")
            textile_df = textile_df.rename(columns=column_mapping)

        # Show sample
        if len(textile_df) > 0:
            print(f"SAMPLE TEXTILE DATA:")
            sample_cols = [
                wanted for wanted in ['Name', 'Address', 'Latitude', 'Longitude']
                if wanted in textile_df.columns
            ]
            if sample_cols:
                preview = textile_df[sample_cols].head(3)
                print(preview.to_string(index=False))

    except Exception as e:
        print(f"Error loading textile data: {e}")
        textile_df = pd.DataFrame()

# 2. LOAD AND CLEAN PHARMACY/GARAGE/RESSOURCERIE DATA
# Reads the mixed CSV (if present), renames its headers to the unified names
# and prints the point types it contains. On any error mixed_df is reset to an
# empty frame so the later steps can test `.empty`.
print("\nLOADING PHARMACY/GARAGE/RESSOURCERIE DATA")

mixed_file = "pharmacies_garages_ressourceries_nantes.csv"
mixed_df = pd.DataFrame()
mixed_final = pd.DataFrame()

if not os.path.exists(mixed_file):
    print(f"Mixed file not found: {mixed_file}")
else:
    try:
        mixed_df = pd.read_csv(mixed_file, encoding='utf-8')
        print(f"Loaded mixed data: {len(mixed_df)} rows")
        print(f"   Columns: {mixed_df.columns.tolist()}")

        # Header standardisation: the first rule whose keyword occurs in the
        # lower-cased header decides the target name.
        header_rules = (
            (('name', 'nom'), 'Name'),
            (('type', 'categorie'), 'Type'),
            (('latitude', 'lat'), 'Latitude'),
            (('longitude', 'lon', 'long'), 'Longitude'),
            (('adresse', 'address'), 'Address'),
        )
        column_mapping = {}
        for header in mixed_df.columns:
            lowered = str(header).lower()
            for keywords, target in header_rules:
                if any(keyword in lowered for keyword in keywords):
                    column_mapping[header] = target
                    break

        if column_mapping:
            print(f"   Renaming columns: {column_mapping}")
            mixed_df = mixed_df.rename(columns=column_mapping)

        # Show unique types with the number of points of each
        if 'Type' in mixed_df.columns:
            unique_types = mixed_df['Type'].unique()
            print(f"   Unique types found: {list(unique_types)}")
            for type_val in unique_types:
                points_of_type = (mixed_df['Type'] == type_val).sum()
                print(f"     ‚Ä¢ {type_val}: {points_of_type} points")

        # Show sample
        if len(mixed_df) > 0:
            print(f"SAMPLE MIXED DATA:")
            sample_cols = [
                wanted for wanted in ['Name', 'Type', 'Latitude', 'Longitude', 'Address']
                if wanted in mixed_df.columns
            ]
            if sample_cols:
                preview = mixed_df[sample_cols].head(5)
                print(preview.to_string(index=False))

    except Exception as e:
        print(f"Error loading mixed data: {e}")
        mixed_df = pd.DataFrame()

# 3. PROCESS TEXTILE DATA
# Cleans the loaded textile rows (text fields, coordinates), drops rows without
# usable coordinates and attaches the waste-type flags: textile enabled, every
# other flag disabled. The result is published as textile_final.
if textile_df.empty:
    print("No textile data to process")
else:
    print("PROCESSING TEXTILE DATA")

    textile_clean = textile_df.copy()

    # Text fields: fill gaps with a placeholder and trim whitespace; a missing
    # column is created holding just the placeholder.
    for text_col, placeholder in (('Name', 'Textile Collection Point'), ('Address', '')):
        if text_col in textile_clean.columns:
            textile_clean[text_col] = textile_clean[text_col].fillna(placeholder).astype(str).str.strip()
        else:
            textile_clean[text_col] = placeholder

    # Coordinates: strip everything except digits, dot and minus, then parse;
    # whatever still fails to parse becomes NaN.
    for coord in ('Latitude', 'Longitude'):
        if coord in textile_clean.columns:
            stripped = textile_clean[coord].astype(str).str.replace('[^0-9.-]', '', regex=True)
            textile_clean[coord] = pd.to_numeric(stripped, errors='coerce')

    # Keep only points that have both coordinates
    if {'Latitude', 'Longitude'}.issubset(textile_clean.columns):
        has_both_coords = textile_clean['Latitude'].notna() & textile_clean['Longitude'].notna()
        textile_clean = textile_clean[has_both_coords].copy()

    # Waste type capabilities (TEXTILE ONLY)
    textile_clean['Is_Textile_enabled'] = 1
    for disabled_flag in (
        'Is_Cardboard_enabled', 'Is_Food_enabled', 'Is_Glass_enabled',
        'Is_Metal_enabled', 'Is_Paper_enabled', 'Is_Plastic_enabled',
        'Is_Vegetation_enabled', 'Is_Neon_enabled', 'Is_Cartridge_enabled',
        'Is_Lamp_Light_enabled', 'Is_Miscellanous_Trash_enabled',
        'Is_Pharmacy_enabled', 'Is_Tire_enabled', 'Is_Ressourcerie_enabled',
    ):
        textile_clean[disabled_flag] = 0

    textile_final = textile_clean
    print(f"Processed {len(textile_final)} textile collection points")

# 4. PROCESS MIXED DATA (PHARMACY/GARAGE/RESSOURCERIE)
# Cleans the loaded mixed rows, drops rows without usable coordinates and sets
# the pharmacy / tire / ressourcerie flags from the Type column. The result is
# published as mixed_final.
if not mixed_df.empty:
    print("PROCESSING MIXED DATA")

    mixed_clean = mixed_df.copy()

    # Clean Name
    if 'Name' in mixed_clean.columns:
        mixed_clean['Name'] = mixed_clean['Name'].fillna('').astype(str).str.strip()
    else:
        mixed_clean['Name'] = ''

    # Clean Type (normalize to lowercase)
    if 'Type' in mixed_clean.columns:
        mixed_clean['Type'] = mixed_clean['Type'].fillna('').astype(str).str.lower().str.strip()

    # Clean Address: an existing column is normalised the same way the textile
    # step does it (missing -> '', trimmed); an absent column is created empty.
    if 'Address' in mixed_clean.columns:
        mixed_clean['Address'] = mixed_clean['Address'].fillna('').astype(str).str.strip()
    else:
        mixed_clean['Address'] = ''

    # Clean coordinates (unparseable values become NaN)
    if 'Latitude' in mixed_clean.columns:
        mixed_clean['Latitude'] = pd.to_numeric(mixed_clean['Latitude'], errors='coerce')

    if 'Longitude' in mixed_clean.columns:
        mixed_clean['Longitude'] = pd.to_numeric(mixed_clean['Longitude'], errors='coerce')

    # Filter out points without valid coordinates
    if 'Latitude' in mixed_clean.columns and 'Longitude' in mixed_clean.columns:
        mixed_clean = mixed_clean[mixed_clean['Latitude'].notna() & mixed_clean['Longitude'].notna()].copy()

    # Initialize all waste type columns to 0
    waste_columns = [
        'Is_Cardboard_enabled', 'Is_Food_enabled', 'Is_Glass_enabled',
        'Is_Metal_enabled', 'Is_Paper_enabled', 'Is_Plastic_enabled',
        'Is_Textile_enabled', 'Is_Vegetation_enabled', 'Is_Neon_enabled',
        'Is_Cartridge_enabled', 'Is_Lamp_Light_enabled',
        'Is_Miscellanous_Trash_enabled', 'Is_Pharmacy_enabled',
        'Is_Tire_enabled', 'Is_Ressourcerie_enabled'
    ]

    for col in waste_columns:
        mixed_clean[col] = 0

    # Set appropriate columns based on type (substring match on the Type text)
    if 'Type' in mixed_clean.columns:
        # Pharmacy points
        pharmacy_mask = mixed_clean['Type'].str.contains('pharmacy|pharmacie', case=False, na=False)
        mixed_clean.loc[pharmacy_mask, 'Is_Pharmacy_enabled'] = 1
        print(f"   Pharmacy points: {pharmacy_mask.sum()}")

        # Car repair/garage points (tire)
        tire_mask = mixed_clean['Type'].str.contains('car_repair|garage|repair|tire|pneu', case=False, na=False)
        mixed_clean.loc[tire_mask, 'Is_Tire_enabled'] = 1
        print(f"   Car repair/tire points: {tire_mask.sum()}")

        # Ressourcerie points
        ressourcerie_mask = mixed_clean['Type'].str.contains('ressourcerie|recup|recycl', case=False, na=False)
        mixed_clean.loc[ressourcerie_mask, 'Is_Ressourcerie_enabled'] = 1
        print(f"   Ressourcerie points: {ressourcerie_mask.sum()}")

        # Check for any unclassified types
        unclassified = mixed_clean[~pharmacy_mask & ~tire_mask & ~ressourcerie_mask]
        if len(unclassified) > 0:
            print(f"Unclassified types: {len(unclassified)} points")
            print(f"Types: {unclassified['Type'].unique()}")

    # Publish the processed frame for the combine step
    mixed_final = mixed_clean
    print(f"Processed {len(mixed_final)} mixed collection points")

    # Show breakdown
    print(f"MIXED DATA BREAKDOWN:")
    if 'Is_Pharmacy_enabled' in mixed_final.columns:
        print(f"   ‚Ä¢ Pharmacy: {mixed_final['Is_Pharmacy_enabled'].sum()}")
    if 'Is_Tire_enabled' in mixed_final.columns:
        print(f"   ‚Ä¢ Car repair/tire: {mixed_final['Is_Tire_enabled'].sum()}")
    if 'Is_Ressourcerie_enabled' in mixed_final.columns:
        print(f"   ‚Ä¢ Ressourcerie: {mixed_final['Is_Ressourcerie_enabled'].sum()}")

    # Show sample of mixed data. Numbering counts the printed rows: the index
    # labels are not contiguous once rows without coordinates were dropped.
    print(f"SAMPLE OF PROCESSED MIXED DATA:")
    if len(mixed_final) > 0:
        sample = mixed_final.head(5)
        for position, (_, row) in enumerate(sample.iterrows(), start=1):
            name = row.get('Name', 'Unnamed')
            point_type = row.get('Type', 'unknown')
            print(f"   {position}. {name} ({point_type})")
else:
    print("No mixed data to process")
    mixed_final = pd.DataFrame()

# 5. LOAD EXISTING UNIFIED TABLE
# Loads every row of the unified table into existing_df, adds the newer flag
# columns when absent, marks dechetteries (and dechetteries that are also
# ressourceries) by name, and normalises all waste flags to integers.
# If anything in this step raises, existing_df falls back to an empty frame.
print("\nLOADING EXISTING UNIFIED TABLE")

existing_table = f"{PROJECT}.{DATASET}.trash_collection_points"

try:
    # Query ALL existing data
    existing_query = f"""
    SELECT * FROM `{existing_table}`
    """
    existing_df = client.query(existing_query).to_dataframe()
    print(f"‚úÖ Loaded existing table: {len(existing_df)} rows")

    if len(existing_df) == 0:
        print("‚ö†Ô∏è WARNING: Existing table is empty! Check table name.")

    # Show current ID range
    if 'ID' in existing_df.columns:
        print(f"   Current ID range: {existing_df['ID'].min()} to {existing_df['ID'].max()}")

    # FIX 1: Check and add new columns if they don't exist
    new_columns = ['Is_Pharmacy_enabled', 'Is_Tire_enabled', 'Is_Ressourcerie_enabled']
    for col in new_columns:
        if col not in existing_df.columns:
            existing_df[col] = 0
            print(f"   Added missing waste type column: {col}")

    # FIX 2: SET Is_Miscellanous_Trash_enabled = 1 FOR ALL DECHETTERIES
    print(f"\n   IDENTIFYING D√âCH√àTERIES IN NAME COLUMN...")

    # Check if Is_Miscellanous_Trash_enabled column exists
    if 'Is_Miscellanous_Trash_enabled' not in existing_df.columns:
        existing_df['Is_Miscellanous_Trash_enabled'] = 0
        print(f"   Created missing column: Is_Miscellanous_Trash_enabled")

    # Names as plain strings. A non-string or absent Name column would make
    # `.str.contains` raise, and the except below would then discard the whole
    # loaded table; with this guard name matching simply finds nothing.
    if 'Name' in existing_df.columns:
        existing_names = existing_df['Name'].astype(str)
    else:
        print("   No 'Name' column in existing table - name-based detection skipped")
        existing_names = pd.Series('', index=existing_df.index, dtype=str)

    # Name fragments that identify a dechetterie
    dechetterie_patterns = [
        'D√©ch√®terie', 'D√©chetterie', 'DECHETTERIE', 'dechetterie',
        'D√©ch√®teries', 'D√©chetteries', 'DECHETTERIES', 'dechetteries',
        'Recycling Center', 'RECYCLING CENTER', 'recycling center'
    ]

    # Create a pattern for case-insensitive matching
    pattern = '|'.join([re.escape(fragment) for fragment in dechetterie_patterns])

    # Find all entries whose name contains one of the fragments
    dechetterie_mask = existing_names.str.contains(pattern, case=False, na=False)
    dechetterie_count = dechetterie_mask.sum()

    print(f"   Found {dechetterie_count} entries with 'D√©ch√®terie' in Name")

    if dechetterie_count > 0:
        # Show sample of found dechetteries
        sample_dechetteries = existing_df[dechetterie_mask].head(5)
        print(f"   Sample d√©chetteries found:")
        for i, row in sample_dechetteries.iterrows():
            name = str(row.get('Name', 'Unnamed'))[:60]
            print(f"     ‚Ä¢ {name}")

    # Set Is_Miscellanous_Trash_enabled = 1 for all dechetteries
    existing_df.loc[dechetterie_mask, 'Is_Miscellanous_Trash_enabled'] = 1

    # Count everything marked, including rows that were already flagged
    already_marked = existing_df[existing_df['Is_Miscellanous_Trash_enabled'] == 1].shape[0]
    print(f"   Total d√©chetteries marked: {already_marked}")

    # Check for dechetteries that might also be ressourceries
    if 'Is_Ressourcerie_enabled' in existing_df.columns:
        ressourcerie_names = ['ressourcerie', 'recup', 'recyclerie', 'RESSOURCERIE', 'RECUP', 'RECYCLERIE']
        ressourcerie_pattern = '|'.join([re.escape(name) for name in ressourcerie_names])
        ressourcerie_mask = existing_names.str.contains(ressourcerie_pattern, case=False, na=False)

        # Set Is_Ressourcerie_enabled = 1 for dechetteries that are also ressourceries
        existing_df.loc[dechetterie_mask & ressourcerie_mask, 'Is_Ressourcerie_enabled'] = 1
        dechetterie_ressourcerie_count = (dechetterie_mask & ressourcerie_mask).sum()
        print(f"   {dechetterie_ressourcerie_count} d√©chetteries are also ressourceries")

    # FIX 3: Ensure all waste columns are filled with 0 instead of null.
    # pd.to_numeric(errors='coerce') turns any non-numeric value into NaN first,
    # so the integer cast cannot raise on unexpected content.
    waste_columns = [
        'Is_Cardboard_enabled', 'Is_Food_enabled', 'Is_Glass_enabled',
        'Is_Metal_enabled', 'Is_Paper_enabled', 'Is_Plastic_enabled',
        'Is_Textile_enabled', 'Is_Vegetation_enabled', 'Is_Neon_enabled',
        'Is_Cartridge_enabled', 'Is_Lamp_Light_enabled',
        'Is_Miscellanous_Trash_enabled', 'Is_Pharmacy_enabled',
        'Is_Tire_enabled', 'Is_Ressourcerie_enabled'
    ]

    for col in waste_columns:
        if col in existing_df.columns:
            existing_df[col] = pd.to_numeric(existing_df[col], errors='coerce').fillna(0).astype(int)

    print(f"   Fixed null values in waste type columns")

except Exception as e:
    print(f"‚ùå Error loading existing table: {e}")
    # Print the full traceback so a processing error is not mistaken for a
    # missing table before the notebook continues without existing data.
    import traceback
    traceback.print_exc()
    print("Creating new empty dataframe")
    existing_df = pd.DataFrame()

# 6. COMBINE ALL DATASETS
# Stack the newly prepared textile / mixed frames, number them after the
# highest existing ID, align their columns with existing_df and build
# combined_df restricted to the columns listed in common_columns.
print("\nCOMBINING ALL DATASETS")

all_new_data = []
if not textile_final.empty:
    all_new_data.append(textile_final)
    print(f"   Textile data ready: {len(textile_final)} points")
if not mixed_final.empty:
    all_new_data.append(mixed_final)
    print(f"   Mixed data ready: {len(mixed_final)} points")

if all_new_data:
    # Combine all new data. The single-frame case takes a copy so that the ID
    # assignment below cannot write into textile_final / mixed_final itself.
    if len(all_new_data) == 1:
        new_data_combined = all_new_data[0].copy()
    else:
        new_data_combined = pd.concat(all_new_data, ignore_index=True, sort=False)

    print(f"\n   Total new data to add: {len(new_data_combined)} points")

    # Continue numbering after the largest existing ID; fall back to 1 when
    # there is no existing data or the ID column holds no usable value.
    next_id = 1
    if not existing_df.empty and 'ID' in existing_df.columns:
        max_existing_id = existing_df['ID'].max()
        if pd.notna(max_existing_id):
            next_id = int(max_existing_id) + 1

    print(f"   Next available ID: {next_id}")

    # Assign IDs to new points
    new_data_combined['ID'] = range(next_id, next_id + len(new_data_combined))
    print(f"   Assigned IDs {next_id} to {next_id + len(new_data_combined) - 1} to new points")

    # Remove metadata columns from the new data BEFORE computing the column
    # union; otherwise the alignment loop below would add them straight back
    # (as all-None columns) to both frames.
    metadata_columns_to_remove = ['City', 'Source_Files', 'Original_Waste_Types', 'Type']
    for col in metadata_columns_to_remove:
        if col in new_data_combined.columns:
            new_data_combined = new_data_combined.drop(columns=[col])
            print(f"   Removed metadata column from new data: {col}")

    # Ensure both dataframes have same columns
    all_columns = set(existing_df.columns.tolist() if not existing_df.empty else [])
    all_columns.update(new_data_combined.columns.tolist())

    # Add missing columns to each dataframe
    for col in all_columns:
        if col not in existing_df.columns and not existing_df.empty:
            existing_df[col] = None
        if col not in new_data_combined.columns:
            new_data_combined[col] = None

    # Define column order (ONLY essential columns)
    # NOTE(review): the food-waste query at the top of the notebook also emits
    # Is_Wood / Is_Pile / Is_Battery / Is_Car_Battery flags; they are not listed
    # here, so they are not carried into combined_df - confirm this is intended.
    common_columns = [
        'ID', 'Name', 'Address', 'Longitude', 'Latitude',
        'Is_Cardboard_enabled', 'Is_Food_enabled', 'Is_Glass_enabled',
        'Is_Metal_enabled', 'Is_Paper_enabled', 'Is_Plastic_enabled',
        'Is_Textile_enabled', 'Is_Vegetation_enabled', 'Is_Neon_enabled',
        'Is_Cartridge_enabled', 'Is_Lamp_Light_enabled',
        'Is_Miscellanous_Trash_enabled', 'Is_Pharmacy_enabled',
        'Is_Tire_enabled', 'Is_Ressourcerie_enabled'
    ]

    # Keep only columns that exist in at least one of the two frames
    existing_cols = [col for col in common_columns if col in existing_df.columns or col in new_data_combined.columns]

    # Ensure all waste columns are integers (0 or 1); missing values become 0
    for col in existing_cols:
        if col.startswith('Is_') and col.endswith('_enabled'):
            if col in existing_df.columns:
                existing_df[col] = existing_df[col].fillna(0).astype(int)
            if col in new_data_combined.columns:
                new_data_combined[col] = new_data_combined[col].fillna(0).astype(int)

    # Combine - Make sure we keep ALL existing data
    print(f"\nüîó COMBINING DATA:")
    print(f"   Existing data shape: {existing_df.shape}")
    print(f"   New data shape: {new_data_combined.shape}")

    if existing_df.empty:
        combined_df = new_data_combined[existing_cols].copy()
        print(f"   Combined: New data only (no existing data)")
    else:
        combined_df = pd.concat([
            existing_df[existing_cols],
            new_data_combined[existing_cols]
        ], ignore_index=True, sort=False)
        print(f"   Combined: Existing + New data")

    print(f"‚úÖ Combined dataset: {len(combined_df)} total points")
    print(f"   - Existing points: {len(existing_df) if not existing_df.empty else 0}")
    print(f"   - New points: {len(new_data_combined)}")

    # Show what was added
    print(f"\nüìä NEW DATA ADDED:")
    if not textile_final.empty:
        print(f"   ‚Ä¢ Textile points: {len(textile_final)}")
    if not mixed_final.empty:
        # Guard each flag so a mixed file lacking one of them cannot raise here
        for label, flag_col in (
            ("Pharmacy points", 'Is_Pharmacy_enabled'),
            ("Car repair/tire points", 'Is_Tire_enabled'),
            ("Ressourcerie points", 'Is_Ressourcerie_enabled'),
        ):
            if flag_col in mixed_final.columns:
                print(f"   ‚Ä¢ {label}: {mixed_final[flag_col].sum()}")

else:
    print("‚ö†Ô∏è No new data to add")
    combined_df = existing_df

# 7. UPDATE STATISTICS
# Print coverage figures for combined_df: every flag that is set at least once,
# the three specialty flags, and a cross-check of the Is_Miscellanous_Trash
# flag against a name-based regex count.
if not combined_df.empty:
    print("\nUPDATED STATISTICS:")
    print(f"   Total collection points: {len(combined_df):,}")

    if len(combined_df) < 1000:
        print(f"   ‚ö†Ô∏è WARNING: Only {len(combined_df)} points - data may be lost!")

    # Count by waste type (flags that are never set are left out)
    waste_stats = {}
    for col in combined_df.columns:
        if col.startswith('Is_') and col.endswith('_enabled'):
            count = int(combined_df[col].sum())
            if count > 0:
                waste_name = col.replace('Is_', '').replace('_enabled', '').replace('_', ' ').title()
                waste_stats[waste_name] = count

    print(f"\n   Waste/Specialty type coverage:")
    for waste, count in sorted(waste_stats.items()):
        percentage = (count / len(combined_df)) * 100
        print(f"     ‚Ä¢ {waste:20s}: {count:6,d} points ({percentage:5.1f}%)")

    # Special statistics for new types
    print(f"\nNEW SPECIALTY TYPES:")
    new_types = ['Pharmacy', 'Tire', 'Ressourcerie']
    for type_name in new_types:
        col_name = f'Is_{type_name}_enabled'
        if col_name in combined_df.columns:
            count = int(combined_df[col_name].sum())
            if count > 0:
                percentage = (count / len(combined_df)) * 100
                print(f"     ‚Ä¢ {type_name:20s}: {count:6,d} points ({percentage:5.1f}%)")

    # Show recycling-centre (Is_Miscellanous_Trash_enabled) statistics
    if 'Is_Miscellanous_Trash_enabled' in combined_df.columns:
        dechetterie_count = int(combined_df['Is_Miscellanous_Trash_enabled'].sum())
        print(f"\nD√âCHETTERIE STATISTICS:")
        print(f"   ‚Ä¢ Total d√©chetteries: {dechetterie_count:,}")
        print(f"   ‚Ä¢ Percentage of total: {(dechetterie_count/len(combined_df))*100:.1f}%")

        # Also check by Name pattern for verification.
        # The list uses an ASCII variable name (like dechetterie_count above):
        # the previous spelling contained characters that are not legal in a
        # Python identifier.
        # NOTE(review): the non-ASCII text inside these literals looks
        # mis-decoded; confirm the file encoding before relying on the match.
        dechetterie_patterns = ['D√©ch√®terie', 'D√©chetterie', 'DECHETTERIE', 'dechetterie']
        pattern = '|'.join(re.escape(p) for p in dechetterie_patterns)
        name_based_count = combined_df['Name'].str.contains(pattern, case=False, na=False).sum()
        print(f"   ‚Ä¢ By Name pattern: {name_based_count:,}")

        if 'Is_Ressourcerie_enabled' in combined_df.columns:
            # Rows carrying both the recycling-centre and the ressourcerie flag
            dechetterie_ressourcerie = combined_df[
                (combined_df['Is_Miscellanous_Trash_enabled'] == 1) &
                (combined_df['Is_Ressourcerie_enabled'] == 1)
            ].shape[0]
            print(f"   ‚Ä¢ D√©chetteries with ressourcerie: {dechetterie_ressourcerie:,}")

# 8. SAVE TO BIGQUERY
# Write combined_df to a local CSV, load it into BigQuery (replacing the table)
# and, once an upload has succeeded, run read-only verification queries.
print("\nSAVING UPDATED DATASET")

# Use the same table name
new_table = f"{PROJECT}.{DATASET}.trash_collection_points_complete"
output_csv = "trash_collection_points_complete.csv"

if not combined_df.empty:
    # Sort by ID
    combined_df = combined_df.sort_values('ID').reset_index(drop=True)

    # Ensure all waste columns are integers
    waste_columns = [col for col in combined_df.columns if col.startswith('Is_') and col.endswith('_enabled')]
    for col in waste_columns:
        combined_df[col] = combined_df[col].fillna(0).astype(int)

    # Save to CSV
    combined_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"CSV saved: '{output_csv}' ({len(combined_df)} rows)")

    # Show sample of the CSV file (re-read from disk so the file itself is checked)
    print(f"FIRST 5 ROWS OF CSV FILE:")
    try:
        saved_csv = pd.read_csv(output_csv, nrows=5)

        # First flag that is set decides the label; order = display priority
        point_type_priority = (
            ('Is_Pharmacy_enabled', "Pharmacy"),
            ('Is_Tire_enabled', "Car Repair/Tire"),
            ('Is_Ressourcerie_enabled', "Ressourcerie"),
            ('Is_Textile_enabled', "Textile"),
            ('Is_Miscellanous_Trash_enabled', "D√©chetterie"),
        )

        for i, row in saved_csv.head(5).iterrows():
            # A blank Name is read back as NaN (a float) and cannot be sliced
            raw_name = row.get('Name', 'Unnamed')
            name = (raw_name if isinstance(raw_name, str) else 'Unnamed')[:50]
            point_id = row.get('ID', 'N/A')

            point_type = "Other"
            for flag_col, label in point_type_priority:
                if row.get(flag_col, 0) == 1:
                    point_type = label
                    break

            print(f"   ID {point_id}: {name}... ({point_type})")
    except Exception as e:
        print(f"   Could not read CSV for sample: {e}")

    # Upload to BigQuery
    print(f"\nUPLOADING TO BIGQUERY: {new_table}")

    job_config = bigquery.LoadJobConfig(
        write_disposition="WRITE_TRUNCATE",
        autodetect=True,
        max_bad_records=100
    )

    # Verification query
    verify_query = f"""
        SELECT
          COUNT(*) as total_points,
          SUM(Is_Textile_enabled) as textile_points,
          SUM(Is_Pharmacy_enabled) as pharmacy_points,
          SUM(Is_Tire_enabled) as tire_points,
          SUM(Is_Ressourcerie_enabled) as ressourcerie_points,
          SUM(Is_Miscellanous_Trash_enabled) as dechetteries,
          SUM(CASE WHEN Is_Miscellanous_Trash_enabled = 1 AND Is_Ressourcerie_enabled = 1 THEN 1 ELSE 0 END) as dechetterie_ressourceries,
          -- Also count by Name pattern for verification
          SUM(CASE WHEN UPPER(Name) LIKE '%D√âCH√àTERIE%' OR UPPER(Name) LIKE '%DECHETTERIE%' THEN 1 ELSE 0 END) as name_based_dechetteries
        FROM `{new_table}`
        """

    # Check data quality
    quality_query = f"""
        SELECT
          COUNT(*) as total_rows,
          SUM(CASE WHEN Is_Miscellanous_Trash_enabled IS NULL THEN 1 ELSE 0 END) as null_misc_trash,
          SUM(CASE WHEN Is_Pharmacy_enabled IS NULL THEN 1 ELSE 0 END) as null_pharmacy,
          SUM(CASE WHEN Is_Tire_enabled IS NULL THEN 1 ELSE 0 END) as null_tire,
          SUM(CASE WHEN Is_Ressourcerie_enabled IS NULL THEN 1 ELSE 0 END) as null_ressourcerie
        FROM `{new_table}`
        """

    # Step 1: upload. Only a failure of the load itself may trigger the
    # pandas_gbq fallback; verification errors are handled separately below so
    # they can no longer be misreported as an upload failure (and cause a
    # second full upload of a table that was already written).
    upload_succeeded = False
    try:
        job = client.load_table_from_dataframe(combined_df, new_table, job_config=job_config)
        job.result()
        upload_succeeded = True
    except Exception as e:
        print(f"BigQuery upload failed: {e}")

        # Try alternative
        try:
            print("\nTrying alternative CSV upload...")
            import pandas_gbq  # optional dependency, only needed for the fallback
            pandas_gbq.to_gbq(
                combined_df,
                destination_table=new_table,
                project_id=PROJECT,
                if_exists='replace',
                progress_bar=True
            )
            print("Upload successful via pandas_gbq!")
            upload_succeeded = True
        except Exception as e2:
            print(f"Alternative failed: {e2}")

    # Step 2: read-only verification of whatever is now in the table
    if upload_succeeded:
        try:
            table = client.get_table(new_table)
            print(f"BigQuery table created/updated: {new_table}")
            print(f"   Rows: {table.num_rows:,}")
            print(f"   Size: {table.num_bytes / (1024*1024):.2f} MB")

            result = client.query(verify_query).to_dataframe().iloc[0]
            print(f"\nVERIFICATION:")
            print(f"   ‚Ä¢ Total points: {result['total_points']:,}")
            print(f"   ‚Ä¢ Textile points: {result['textile_points']:,}")
            print(f"   ‚Ä¢ Pharmacy points: {result['pharmacy_points']:,}")
            print(f"   ‚Ä¢ Car repair/tire points: {result['tire_points']:,}")
            print(f"   ‚Ä¢ Ressourcerie points: {result['ressourcerie_points']:,}")
            print(f"   ‚Ä¢ D√©chetteries (flagged): {result['dechetteries']:,}")
            print(f"   ‚Ä¢ D√©chetteries (by name): {result['name_based_dechetteries']:,}")
            print(f"   ‚Ä¢ D√©chetteries with ressourcerie: {result['dechetterie_ressourceries']:,}")

            quality_result = client.query(quality_query).to_dataframe().iloc[0]
            print(f"\nDATA QUALITY CHECK (NULL VALUES):")
            print(f"   ‚Ä¢ Total rows: {quality_result['total_rows']:,}")
            print(f"   ‚Ä¢ Null Is_Miscellanous_Trash_enabled: {quality_result['null_misc_trash']:,}")
            print(f"   ‚Ä¢ Null Is_Pharmacy_enabled: {quality_result['null_pharmacy']:,}")
            print(f"   ‚Ä¢ Null Is_Tire_enabled: {quality_result['null_tire']:,}")
            print(f"   ‚Ä¢ Null Is_Ressourcerie_enabled: {quality_result['null_ressourcerie']:,}")

            if (quality_result['null_misc_trash'] == 0 and
                quality_result['null_pharmacy'] == 0 and
                quality_result['null_tire'] == 0 and
                quality_result['null_ressourcerie'] == 0):
                print(f"   ‚úÖ All waste type columns have 0 null values")
            else:
                print(f"   ‚ö†Ô∏è Some waste type columns have null values")
        except Exception as e:
            print(f"Post-upload verification failed: {e}")
else:
    print("No data to save!")

# FINAL SUMMARY
# Recap of combined_df (totals, specialty counts, recycling-centre counts) and a
# re-read of the CSV written earlier to confirm no flag column contains nulls.
print("\nDATA INTEGRATION COMPLETE!")

if not combined_df.empty:
    print(f"\nTotal collection points: {len(combined_df):,}")

    if len(combined_df) < 4000:
        print(f"‚ö†Ô∏è WARNING: Expected ~4000+ points, but got only {len(combined_df)}")
        print(f"   Check if existing data was loaded correctly from: {existing_table}")

    # Name-based regex, built once and reused for the in-memory frame and the
    # CSV check below. The list uses an ASCII variable name because the previous
    # spelling contained characters that are not legal in a Python identifier.
    # NOTE(review): the non-ASCII text inside these literals looks mis-decoded;
    # confirm the file encoding before relying on the match.
    dechetterie_patterns = ['D√©ch√®terie', 'D√©chetterie', 'DECHETTERIE', 'dechetterie']
    pattern = '|'.join(re.escape(p) for p in dechetterie_patterns)

    print(f"\nNew specialty types added:")
    if 'Is_Textile_enabled' in combined_df.columns:
        textile_count = int(combined_df['Is_Textile_enabled'].sum())
        print(f"  ‚Ä¢ Textile: {textile_count:,}")

    if not mixed_final.empty:
        print(f"  FROM MIXED DATA FILE:")
        if 'Is_Pharmacy_enabled' in mixed_final.columns:
            pharmacy_count = int(mixed_final['Is_Pharmacy_enabled'].sum())
            print(f"    ‚Ä¢ Pharmacy: {pharmacy_count:,}")
        if 'Is_Tire_enabled' in mixed_final.columns:
            tire_count = int(mixed_final['Is_Tire_enabled'].sum())
            print(f"    ‚Ä¢ Car repair/tire: {tire_count:,}")
        if 'Is_Ressourcerie_enabled' in mixed_final.columns:
            ressourcerie_count = int(mixed_final['Is_Ressourcerie_enabled'].sum())
            print(f"    ‚Ä¢ Ressourcerie: {ressourcerie_count:,}")

    # Show recycling-centre (Is_Miscellanous_Trash_enabled) information
    if 'Is_Miscellanous_Trash_enabled' in combined_df.columns:
        dechetterie_count = int(combined_df['Is_Miscellanous_Trash_enabled'].sum())
        print(f"\nD√âCHETTERIES:")
        print(f"  ‚Ä¢ Total d√©chetteries: {dechetterie_count:,}")

        # Also show count by name pattern
        name_based_count = combined_df['Name'].str.contains(pattern, case=False, na=False).sum()
        print(f"  ‚Ä¢ With 'D√©ch√®terie' in Name: {name_based_count:,}")

        if 'Is_Ressourcerie_enabled' in combined_df.columns:
            dechetterie_ressourcerie = combined_df[
                (combined_df['Is_Miscellanous_Trash_enabled'] == 1) &
                (combined_df['Is_Ressourcerie_enabled'] == 1)
            ].shape[0]
            print(f"  ‚Ä¢ D√©chetteries with ressourcerie: {dechetterie_ressourcerie:,}")

    print(f"\nBigQuery table: {new_table}")
    print(f"Local backup: {output_csv}")

    # Final verification: read the CSV back from disk and inspect it
    print(f"\nFINAL VERIFICATION - CSV CHECK:")
    try:
        final_check = pd.read_csv(output_csv)

        # Check for null values in waste columns
        print(f"\n   NULL VALUE CHECK IN CSV:")
        waste_cols = [col for col in final_check.columns if col.startswith('Is_') and col.endswith('_enabled')]
        all_good = True

        for col in waste_cols:
            null_count = final_check[col].isna().sum()
            if null_count > 0:
                print(f"     ‚ùå {col}: {null_count:,} null values")
                all_good = False
            else:
                print(f"     ‚úÖ {col}: No null values")

        if all_good:
            print(f"\n   ‚úÖ All waste type columns in CSV have 0 null values")
        else:
            print(f"\n   ‚ö†Ô∏è Some waste type columns in CSV have null values")

        # Show recycling-centre count in CSV
        if 'Is_Miscellanous_Trash_enabled' in final_check.columns:
            csv_dechetteries = final_check['Is_Miscellanous_Trash_enabled'].sum()
            print(f"\n   D√âCHETTERIES IN CSV:")
            print(f"     ‚Ä¢ Total flagged: {int(csv_dechetteries):,}")

            # Also check by name
            name_based_count = final_check['Name'].str.contains(pattern, case=False, na=False).sum()
            print(f"     ‚Ä¢ With 'D√©ch√®terie' in Name: {name_based_count:,}")

            if 'Is_Ressourcerie_enabled' in final_check.columns:
                csv_dechetterie_ressourcerie = final_check[
                    (final_check['Is_Miscellanous_Trash_enabled'] == 1) &
                    (final_check['Is_Ressourcerie_enabled'] == 1)
                ].shape[0]
                print(f"     ‚Ä¢ With ressourcerie: {csv_dechetterie_ressourcerie:,}")

        # Show column structure
        print(f"\n   CSV COLUMN STRUCTURE:")
        print(f"     Total columns: {len(final_check.columns)}")
        print(f"     Waste type columns: {len(waste_cols)}")
        print(f"     First few columns: {final_check.columns[:6].tolist()}")
        print(f"     Column names (waste types):")
        for col in waste_cols:
            print(f"       - {col}")

    except Exception as e:
        print(f"   Error checking CSV: {e}")

ADDING MULTIPLE COLLECTION POINTS TO UNIFIED DATASET
LOADING TEXTILE DATA
Loaded textile data: 165 rows
   Renaming columns: {'Name': 'Name', 'Adresse': 'Address', 'Latitude': 'Latitude', 'Longitude': 'Longitude'}
SAMPLE TEXTILE DATA:
                                        Name                                    Address   Latitude  Longitude
          Le relais VERTOU - Place du march√©             7 rue de Touraine 44120 VERTOU ¬†47.170327 ¬†-1.470135
Le relais LA HAIE FOUASSI√àRE - Rue de Pibrac   2 rue de Pibrac 44690 LA HAIE-FOUASSI√àRE ¬†47.160871 ¬†-1.427889
                 Le relais ERTOU - D√©cathlon 4 rue des Grands Ch√¢taigniers 44120 VERTOU ¬†47.179730 ¬†-1.502325

LOADING PHARMACY/GARAGE/RESSOURCERIE DATA
Loaded mixed data: 254 rows
   Columns: ['name', 'type', 'lat', 'lon']
   Renaming columns: {'name': 'Name', 'type': 'Type', 'lat': 'Latitude', 'lon': 'Longitude'}
   Unique types found: ['pharmacy', 'car_repair', 'ressourcerie']
     ‚Ä¢ pharmacy: 139 points
     ‚Ä¢ ca

In [None]:
# CREATE DETAILED ANALYSIS
# Opening banner, then section 1: how many rows of combined_df carry each
# Facility_Type value and what share of all rows that is.

print("=" * 60, "DETAILED ANALYSIS FOR TRASH COLLECTION POINTS", "=" * 60, sep="\n")

# 1. IMPROVED FACILITY TYPE BREAKDOWN
print("\n1. FACILITY TYPE BREAKDOWN:", "-" * 40, sep="\n")

# Row count of combined_df: the denominator for the percentages below
total_locations = len(combined_df)
type_counts = combined_df['Facility_Type'].value_counts()

print("   Type                              Count   Percentage", "   " + "-" * 44, sep="\n")

# value_counts() yields the types from most to least frequent
total_count = 0
for type_name, count in type_counts.items():
    total_count += count
    percentage = (count / total_locations) * 100
    print(f"   {type_name:30} {count:6,}     {percentage:5.1f}%")

# Summary statistics
print("\n   SUMMARY:")
print(f"   ‚Ä¢ Total facilities: {total_locations:,}")
top3_share = type_counts.head(3).sum() / total_locations * 100
print(f"   ‚Ä¢ Top 3 types cover {top3_share:.1f}% of all facilities")
most_common_type = type_counts.index[0]
most_common_count = type_counts.iloc[0]
print(f"   ‚Ä¢ Most common: {most_common_type} ({most_common_count:,} locations)")

# 2. WASTE TYPE ACCEPTANCE
# For every Is_*_enabled flag set on at least one row, record
# (display name, number of rows, percentage of total_locations), sort by count
# and print the ten most common with a text bar.
print("\n2. WASTE TYPE ACCEPTANCE ANALYSIS:")
print("-"*40)

# Get all waste columns that exist
waste_cols = [col for col in combined_df.columns
              if col.startswith('Is_') and col.endswith('_enabled')]

# Calculate acceptance rates
waste_stats = []
for col in waste_cols:
    count = int(combined_df[col].sum())
    if count > 0:
        percentage = (count / total_locations) * 100
        waste_name = col.replace('Is_', '').replace('_enabled', '').replace('_', ' ').title()
        waste_stats.append((waste_name, count, percentage))

# Sort by count, most common first
waste_stats.sort(key=lambda x: x[1], reverse=True)

print("   Waste Type                    Count   Acceptance Rate")
print("   " + "-"*50)

# Display with visual indicators
top_waste_types = waste_stats[:10]  # Show top 10
for name, count, pct in top_waste_types:
    # One bar character per 3 percentage points, capped at 30 characters
    bar_length = int(pct / 3)
    bar = '‚ñà' * min(bar_length, 30)
    print(f"   {name:25} {count:6,}    {pct:5.1f}% {bar}")

# Insights are derived from the ranking just computed instead of being
# hard-coded, so they stay true when the underlying data changes.
print(f"\n   KEY INSIGHTS:")
if waste_stats:
    top_name, _top_count, top_pct = waste_stats[0]
    print(f"   ‚Ä¢ {top_name} collection is the most available service")
    if len(waste_stats) > 1:
        second_name, _second_count, second_pct = waste_stats[1]
        print(f"   ‚Ä¢ {second_name} collection is the second most available ({second_pct:.1f}% vs {top_pct:.1f}%)")
print(f"   ‚Ä¢ {len([x for x in waste_stats if x[2] > 50])} waste types are available at >50% of locations")
print(f"   ‚Ä¢ {len([x for x in waste_stats if x[2] < 5])} specialized services are available at <5% of locations")

# 3. FACILITY SPECIALIZATION ANALYSIS
print("\n3. FACILITY SPECIALIZATION:", "-" * 40, sep="\n")

# Row-wise total of the flag columns listed in waste_cols
combined_df['Num_Waste_Types'] = combined_df[waste_cols].sum(axis="columns")

# Categorize by specialization level
def categorize_specialization(num_types):
    """Return the specialization label for a count of accepted waste types.

    0, 1 and 2 map to "No waste types", "Single-type" and "Dual-type";
    any other value up to and including 5 maps to "Multi-type (3-5)";
    everything else maps to "Versatile (6+)".
    """
    # Exact matches are checked first, in ascending order
    for exact_value, label in (
        (0, "No waste types"),
        (1, "Single-type"),
        (2, "Dual-type"),
    ):
        if num_types == exact_value:
            return label
    return "Multi-type (3-5)" if num_types <= 5 else "Versatile (6+)"

# Label every row with its specialization level, then tabulate the labels
combined_df['Specialization_Level'] = combined_df['Num_Waste_Types'].map(categorize_specialization)
specialization_counts = combined_df['Specialization_Level'].value_counts()

print("   Specialization Level        Count   Percentage", "   " + "-" * 44, sep="\n")

# One table row per level, most frequent level first
for level, count in specialization_counts.items():
    percentage = (count / total_locations) * 100
    print(f"   {level:25} {count:6,}     {percentage:5.1f}%")

# 4. CORRELATION ANALYSIS BETWEEN FACILITY TYPES AND WASTE TYPES
# For each of the five most frequent facility types, list the three waste
# flags with the highest acceptance rate among rows of that type.
print("\n4. CORRELATION ANALYSIS:", "-" * 40, sep="\n")

# Top facility types and their waste acceptance profiles
top_facility_types = type_counts.head(5).index.tolist()

print("   Top Facility Types and Their Waste Acceptance Patterns:", "   " + "-" * 55, sep="\n")

for facility_type in top_facility_types:
    facilities_of_type = combined_df[combined_df['Facility_Type'] == facility_type]
    n_of_type = len(facilities_of_type)
    if n_of_type == 0:
        continue

    # (display name, % of this type's rows with the flag set) per flag column
    name_rate_pairs = (
        (
            col.replace('Is_', '').replace('_enabled', '').replace('_', ' ').title(),
            facilities_of_type[col].sum() / n_of_type * 100,
        )
        for col in waste_cols
    )
    waste_acceptance = {name: rate for name, rate in name_rate_pairs if rate > 0}

    # Highest three rates first
    top_wastes = sorted(waste_acceptance.items(), key=lambda item: item[1], reverse=True)[:3]
    top_waste_str = ", ".join(f"{name} ({rate:.0f}%)" for name, rate in top_wastes)

    print(f"   ‚Ä¢ {facility_type:25} {n_of_type:4,} facilities")
    print(f"     Accepts: {top_waste_str}")

# 5. GEOGRAPHIC DISTRIBUTION ANALYSIS
# Label the coordinate spread of the three most frequent facility types using
# the standard deviation of their Latitude / Longitude values.
print("\n5. GEOGRAPHIC DISTRIBUTION:", "-" * 40, sep="\n")

if combined_df['Latitude'].notna().any() and combined_df['Longitude'].notna().any():
    print("   Facility Density Analysis:")

    # Top 3 facility types only
    for facility_type in top_facility_types[:3]:
        facilities_of_type = combined_df[combined_df['Facility_Type'] == facility_type]
        with_coords = facilities_of_type['Latitude'].notna().sum()
        if with_coords <= 0:
            continue

        lat_std = facilities_of_type['Latitude'].std()
        lon_std = facilities_of_type['Longitude'].std()

        # Simple spread indicator: thresholds are on the raw coordinate std
        spread = (
            "Widespread" if (lat_std > 0.05 or lon_std > 0.05)
            else "Moderate spread" if (lat_std > 0.02 or lon_std > 0.02)
            else "Concentrated"
        )

        print(f"   ‚Ä¢ {facility_type:25} {spread:15} ({with_coords:,} with coordinates)")

# 6. SERVICE COVERAGE GAP ANALYSIS
# List the waste types found at fewer than 10% of locations and, for the first
# five of them, the two facility types that most often offer them.
print("\n6. SERVICE COVERAGE GAP ANALYSIS:")
print("-"*40)

print("   Rare Service Combinations:")

# Get all waste types with low coverage
low_coverage_types = [name for name, count, pct in waste_stats if pct < 10]

# Map each display name back to its real flag column. Rebuilding the column
# name from the lower-cased display name (e.g. 'Is_lamp_light_enabled') never
# matched the mixed-case columns (e.g. 'Is_Lamp_Light_enabled'), so the
# per-type detail lines below were silently skipped for every entry.
display_name_to_col = {
    col.replace('Is_', '').replace('_enabled', '').replace('_', ' ').title(): col
    for col in waste_cols
}

if low_coverage_types:
    print(f"   ‚Ä¢ {len(low_coverage_types)} waste types have <10% coverage:")
    for waste_type in low_coverage_types[:5]:  # Show first 5
        # Find which facilities offer this service
        col_name = display_name_to_col.get(waste_type)
        if col_name is not None and col_name in combined_df.columns:
            facilities_with_service = combined_df[combined_df[col_name] == 1]
            facility_types = facilities_with_service['Facility_Type'].value_counts().head(2)
            facility_str = ", ".join([f"{typ} ({cnt})" for typ, cnt in facility_types.items()])
            print(f"     - {waste_type:20} found in: {facility_str}")

# 7. RECOMMENDATIONS BASED ON ANALYSIS
# Closing recommendations and headline figures, all taken from values computed
# in the earlier sections (waste_stats, type_counts, specialization_counts).
print("\n7. RECOMMENDATIONS:")
print("-"*40)

print("   Based on the analysis, consider:")
# Name whichever waste types actually exceed 50% acceptance rather than
# asserting a fixed pair, so the statement cannot contradict the table above.
majority_types = [name for name, _count, pct in waste_stats if pct > 50]
if majority_types:
    print(f"   1. {' and '.join(majority_types)} collection have excellent coverage (>50%)")
else:
    print(f"   1. No waste type is accepted at more than 50% of locations")
print(f"   2. {type_counts.index[0]} facilities are most common - could serve as model for expansion")
print(f"   3. Specialized services (pharmacy, car repair, ressourcerie) have limited coverage")
print(f"   4. {specialization_counts.get('Versatile (6+)', 0):,} facilities offer 6+ waste types")

# Check if versatile facilities have coordinates
if 'Num_Waste_Types' in combined_df.columns:
    versatile_facilities = combined_df[combined_df['Num_Waste_Types'] >= 6]
    if len(versatile_facilities) > 0:
        with_coords = versatile_facilities['Latitude'].notna().sum()
        print(f"   5. {with_coords}/{len(versatile_facilities)} versatile facilities have coordinates")

print("\n" + "="*60)
print("ANALYSIS COMPLETE - KEY FINDINGS:")
print("="*60)

# Create summary statistics
print(f"\nüìä SUMMARY STATISTICS:")
print(f"   ‚Ä¢ Total collection points: {total_locations:,}")
print(f"   ‚Ä¢ Average waste types per facility: {combined_df['Num_Waste_Types'].mean():.1f}")
print(f"   ‚Ä¢ Most versatile facility accepts: {combined_df['Num_Waste_Types'].max()} waste types")
print(f"   ‚Ä¢ Facility types: {len(type_counts)} distinct categories")

# Facility type distribution
print(f"\nüè¢ FACILITY TYPE DISTRIBUTION:")
top_3_total = type_counts.head(3).sum()
print(f"   ‚Ä¢ Top 3 types: {', '.join(type_counts.head(3).index.tolist())}")
print(f"   ‚Ä¢ Cover {top_3_total:,} facilities ({top_3_total/total_locations*100:.1f}% of total)")

# Waste type availability; placeholder tuple when no flag is set anywhere
top_waste = waste_stats[0] if waste_stats else ("None", 0, 0)
print(f"\nüóëÔ∏è  WASTE TYPE AVAILABILITY:")
print(f"   ‚Ä¢ Most available: {top_waste[0]} ({top_waste[2]:.1f}% of facilities)")
print(f"   ‚Ä¢ Total waste types tracked: {len(waste_stats)}")

print("\n" + "="*60)

DETAILED ANALYSIS FOR TRASH COLLECTION POINTS

1. FACILITY TYPE BREAKDOWN:
----------------------------------------
   Type                              Count   Percentage
   --------------------------------------------
   Food Waste Collection           1,644      35.3%
   Recycling Center                1,422      30.5%
   Glass Collection                1,079      23.1%
   Pharmacy                          139       3.0%
   Dual Recycling Point              131       2.8%
   Car Repair/Tire                   108       2.3%
   Collection Point                   69       1.5%
   Other Facility                     38       0.8%
   Single-Type Collection             24       0.5%
   Ressourcerie                        7       0.2%

   SUMMARY:
   ‚Ä¢ Total facilities: 4,661
   ‚Ä¢ Top 3 types cover 88.9% of all facilities
   ‚Ä¢ Most common: Food Waste Collection (1,644 locations)

2. WASTE TYPE ACCEPTANCE ANALYSIS:
----------------------------------------
   Waste Type                 