In [1]:
import requests
import os
import io
import sys
import pandas as pd
from google.cloud import bigquery
from google.api_core.exceptions import NotFound, BadRequest

In [2]:
# Point de collecte d√©chets alimentaires (biod√©chet) de Nantes
BASE_URL = "https://data.nantesmetropole.fr/api/records/1.0/search/"

params = {
    "dataset": "244400404_point-collecte-dechets-alimentaires-biodechet-nantes",
    "rows": 10000}

response = requests.get(BASE_URL, params=params)
response.raise_for_status()
data = response.json()

# Extract records
records = []
for record in data['records']:
    fields = record['fields']
    records.append(fields)

df = pd.DataFrame(records)
print(f"Total records: {len(df)}")
print(df.head())

Total records: 2782
    micro_quartier                      adresse domaine identifiant  \
0      La Botti√®re        2 Rue de la Basinerie  Public    BIOD0582   
1      Le Landreau           18 Rue Anita Conti   Priv√©    BIOD0514   
2  Rte de Ste Luce  276 Route de Sainte de Luce  Public    BIOD0410   
3  Rte de Ste Luce     288 Route de Sainte Luce  Public    BIOD0503   
4  Rte de Ste Luce           1 Rue Henri Loiret  Public    BIOD0632   

                    date_mes commune fournisseur  \
0  2022-12-12T01:00:00+00:00  Nantes        SUEZ   
1  2022-12-20T01:00:00+00:00  Nantes        SUEZ   
2  2022-12-01T01:00:00+00:00  Nantes        SUEZ   
3  2022-12-01T01:00:00+00:00  Nantes        SUEZ   
4  2022-12-01T01:00:00+00:00  Nantes        SUEZ   

                                geo_point_2d plaque_id  
0    [47.23819860017184, -1.517678599864239]       NaN  
1   [47.238733199907195, -1.511478400639151]       NaN  
2   [47.24174179981523, -1.5074455997902827]       NaN  
3   [47.24

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2782 entries, 0 to 2781
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   micro_quartier  2781 non-null   object
 1   adresse         2782 non-null   object
 2   domaine         2752 non-null   object
 3   identifiant     2775 non-null   object
 4   date_mes        1464 non-null   object
 5   commune         2782 non-null   object
 6   fournisseur     611 non-null    object
 7   geo_point_2d    2782 non-null   object
 8   plaque_id       220 non-null    object
dtypes: object(9)
memory usage: 195.7+ KB


In [4]:
# Check what's in the geo_point_2d column
print(f"Type of first element: {type(df['geo_point_2d'][0])}")
print(f"First element: {df['geo_point_2d'][0]}")
print(f"First element length: {len(df['geo_point_2d'][0])}")

Type of first element: <class 'list'>
First element: [47.23819860017184, -1.517678599864239]
First element length: 2


In [5]:
def clean_duplicates(df, strategy='coordinates'):
    """
    Remove duplicates based on different strategies

    Parameters:
    -----------
    strategy: 'coordinates', 'address', or 'strict'
    """
    df_clean = df.copy()

    # Extract coordinates

    df_clean['lat'] = df_clean['geo_point_2d'].apply(lambda x: round(x[0], 6) if isinstance(x, list) else None)
    df_clean['lon'] = df_clean['geo_point_2d'].apply(lambda x: round(x[1], 6) if isinstance(x, list) else None)
    df_clean['adresse_clean'] = df_clean['adresse'].str.lower().str.strip()

    # Choose deduplication strategy

    if strategy == 'coordinates':
        # Keep first entry for each unique coordinate
        df_deduped = df_clean.drop_duplicates(subset=['lat', 'lon'], keep='first')

    elif strategy == 'address':
        # Keep first entry for each unique address/commune
        df_deduped = df_clean.drop_duplicates(subset=['adresse_clean', 'commune'], keep='first')

    elif strategy == 'strict':
        # Keep first entry for exact matches (excluding geo_point_2d list)
        cols = [col for col in df_clean.columns if col not in ['geo_point_2d']]
        df_deduped = df_clean.drop_duplicates(subset=cols, keep='first')

    else:
        raise ValueError("Strategy must be 'coordinates', 'address', or 'strict'")

    # Clean up temporary columns

    df_deduped = df_deduped.drop(columns=['lat', 'lon', 'adresse_clean'], errors='ignore')

    print(f"Original rows: {len(df)}")
    print(f"After {strategy} deduplication: {len(df_deduped)}")
    print(f"Removed {len(df) - len(df_deduped)} duplicates")

    return df_deduped

# Try different strategies
print("=== DEDUPLICATION STRATEGIES ===\n")
for strategy in ['coordinates', 'address', 'strict']:
    df_clean = clean_duplicates(df, strategy=strategy)
    print()

# For our use case, it's useful coordinate-based deduplication:

print("Original shape:", df.shape)
df_clean = clean_duplicates(df, strategy='coordinates')
print("\nCleaned data shape:", df_clean.shape)
print("\nFirst few rows of cleaned data:")
print(df_clean[['adresse', 'commune', 'geo_point_2d']].head())

=== DEDUPLICATION STRATEGIES ===

Original rows: 2782
After coordinates deduplication: 1644
Removed 1138 duplicates

Original rows: 2782
After address deduplication: 1690
Removed 1092 duplicates

Original rows: 2782
After strict deduplication: 2782
Removed 0 duplicates

Original shape: (2782, 9)
Original rows: 2782
After coordinates deduplication: 1644
Removed 1138 duplicates

Cleaned data shape: (1644, 9)

First few rows of cleaned data:
                       adresse commune  \
0        2 Rue de la Basinerie  Nantes   
1           18 Rue Anita Conti  Nantes   
2  276 Route de Sainte de Luce  Nantes   
3     288 Route de Sainte Luce  Nantes   
4           1 Rue Henri Loiret  Nantes   

                                geo_point_2d  
0    [47.23819860017184, -1.517678599864239]  
1   [47.238733199907195, -1.511478400639151]  
2   [47.24174179981523, -1.5074455997902827]  
3   [47.24231100038424, -1.5062445999470977]  
4  [47.243990000355396, -1.5045800003122118]  


In [6]:
df_clean.info

<bound method DataFrame.info of                    micro_quartier                      adresse    domaine  \
0                     La Botti√®re        2 Rue de la Basinerie     Public   
1                     Le Landreau           18 Rue Anita Conti      Priv√©   
2                 Rte de Ste Luce  276 Route de Sainte de Luce     Public   
3                 Rte de Ste Luce     288 Route de Sainte Luce     Public   
4                 Rte de Ste Luce           1 Rue Henri Loiret     Public   
...                           ...                          ...        ...   
2769          St Jacques - Pirmil          149 rue Bonne Garde     Public   
2772  Gde Gr√®neraie - Clos Toreau            12 rue des Herses     Public   
2774      St Jacques - Ripossi√®re          35 rue Ledru Rollin     Public   
2776  Gde Gr√®neraie - Clos Toreau          11 route de Clisson  Priv√© NMH   
2779                     Malakoff       10 rue de l'Angleterre     Public   

     identifiant                   da

In [7]:
# It's a list of two numbers [lon, lat]

df_clean['lat'] = df_clean['geo_point_2d'].apply(lambda x: float(x[0]) if isinstance(x, list)
and len(x) > 0 else None)

df_clean['lon'] = df_clean['geo_point_2d'].apply(lambda x: float(x[1]) if isinstance(x, list)
and len(x) > 1 else None)
print("Successfully extracted coordinates as [lon, lat]")

Successfully extracted coordinates as [lon, lat]


In [8]:
# Save df with name alimentary garbage (index = False)

df_clean.to_csv('alimentary_garbage.csv', index=False)
print("DataFrame saved as 'alimentary_garbage.csv'")

DataFrame saved as 'alimentary_garbage.csv'


In [9]:
# CSV UPLOAD METHOD for df_clean (deduplicated data)
# Set credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/dariaserbichenko/code/DariaSerb/key-gcp/trash-optimizer-479913-91e59ecc96c9.json"

# BigQuery setup
PROJECT = "trash-optimizer-479913"
DATASET = "nantes"
TABLE = "alimentary_garbage_clean"  # Changed table name to distinguish from original
table_id = f"{PROJECT}.{DATASET}.{TABLE}"

# Initialize client
client = bigquery.Client(project=PROJECT)
print(f"‚úÖ BigQuery client initialized successfully")

# Check dataset
dataset_ref = f"{PROJECT}.{DATASET}"
try:
    dataset = client.get_dataset(dataset_ref)
    print(f"‚úÖ Dataset '{DATASET}' exists")
except NotFound:
    print(f"üìÅ Creating dataset '{DATASET}'...")
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = "EU"
    dataset = client.create_dataset(dataset, timeout=30)
    print(f"‚úÖ Dataset created")

print(f"üìä Original data: {len(df)} rows")
print(f"üìä Clean data to upload: {len(df_clean)} rows, {len(df_clean.columns)} columns")
print(f"üìà Duplicates removed: {len(df) - len(df_clean)} rows")

# Clean data (updated for df_clean's structure)
def clean_dataframe_for_bq(df_input):
    """Basic cleaning for BigQuery, preserving coordinate structure"""
    df_clean_bq = df_input.copy()

    print("üßπ Cleaning DataFrame for BigQuery...")

    # 1. Fix column names
    original_cols = df_clean_bq.columns.tolist()
    df_clean_bq.columns = [str(col).replace(' ', '_').replace('-', '_').replace('.', '_').lower()
                          for col in df_clean_bq.columns]

    print(f"   Renamed columns: {dict(zip(original_cols, df_clean_bq.columns))}")

    # 2. Handle geo_point_2d - keep as string or extract coordinates
    if 'geo_point_2d' in df_clean_bq.columns:
        print("   Processing geo_point_2d column...")

        # Option 1: Keep as string (if you want to preserve the list structure as text)
        df_clean_bq['geo_point_2d_str'] = df_clean_bq['geo_point_2d'].astype(str)

        # Option 2: Extract latitude and longitude as separate columns
        try:
            df_clean_bq['latitude'] = df_clean_bq['geo_point_2d'].apply(
                lambda x: float(x[0]) if isinstance(x, list) and len(x) > 0 else None
            )
            df_clean_bq['longitude'] = df_clean_bq['geo_point_2d'].apply(
                lambda x: float(x[1]) if isinstance(x, list) and len(x) > 1 else None
            )
            print(f"   Extracted coordinates: {df_clean_bq['latitude'].notna().sum()} valid lat/lon pairs")
        except Exception as e:
            print(f"   Warning: Could not extract coordinates: {e}")

    # 3. Convert other lists/dicts to strings
    for col in df_clean_bq.columns:
        if col != 'geo_point_2d':  # Skip the original geo_point_2d
            if df_clean_bq[col].apply(lambda x: isinstance(x, (list, dict, tuple))).any():
                df_clean_bq[col] = df_clean_bq[col].astype(str)
                print(f"   Converted {col} to string (contains lists/dicts)")

    # 4. Fill NaN values
    for col in df_clean_bq.columns:
        if df_clean_bq[col].dtype == 'object':
            df_clean_bq[col] = df_clean_bq[col].fillna('')
        elif pd.api.types.is_numeric_dtype(df_clean_bq[col]):
            # For numeric columns, you might want to keep NaN or fill with 0
            # df_clean_bq[col] = df_clean_bq[col].fillna(0)  # Uncomment if needed
            pass

    # 5. Remove the original list column if we created string version
    if 'geo_point_2d' in df_clean_bq.columns and 'geo_point_2d_str' in df_clean_bq.columns:
        df_clean_bq = df_clean_bq.drop(columns=['geo_point_2d'])
        print("   Dropped original geo_point_2d column (kept string version)")

    print(f"   Final columns: {list(df_clean_bq.columns)}")

    return df_clean_bq

# Apply cleaning
df_bq_ready = clean_dataframe_for_bq(df_clean)

# Check data types
print("\nüîç Data types after cleaning:")
print(df_bq_ready.dtypes)

# Convert DataFrame to CSV in memory
print("\nüì§ Converting DataFrame to CSV...")
csv_buffer = io.StringIO()
df_bq_ready.to_csv(csv_buffer, index=False, encoding='utf-8')
csv_content = csv_buffer.getvalue().encode('utf-8')

print(f"   CSV size: {len(csv_content) / 1024:.2f} KB")

# Create job configuration
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # Replaces entire table
    autodetect=True,  # Let BigQuery detect schema
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,  # Skip header
    max_bad_records=100,  # Allow some bad records
    encoding='UTF-8',
    allow_quoted_newlines=True  # Important for text fields
)

print(f"\n‚¨ÜÔ∏è  Uploading {len(df_bq_ready)} cleaned rows to {table_id}...")

# Upload from CSV
try:
    # Create file-like object
    file_obj = io.BytesIO(csv_content)

    # Submit job
    job = client.load_table_from_file(
        file_obj,
        table_id,
        job_config=job_config
    )

    print("   Job submitted. Waiting for completion...")
    job.result()  # Wait for completion

    # Verify
    table = client.get_table(table_id)
    print(f"\nüéâ SUCCESS!")
    print(f"   Table: {table_id}")
    print(f"   Rows uploaded: {table.num_rows:,}")
    print(f"   Size: {table.num_bytes / (1024*1024):.2f} MB")

    # Show table schema
    print(f"\nüìã Table schema:")
    for field in table.schema:
        print(f"   - {field.name}: {field.field_type}")

except Exception as e:
    print(f"\n‚ùå Upload failed: {e}")
    print("Trying alternative approach...")

    # Alternative: Use pandas_gbq for better error handling
    try:
        print("Trying pandas_gbq...")
        import pandas_gbq

        df_bq_ready.to_gbq(
            destination_table=table_id,
            project_id=PROJECT,
            if_exists='replace',
            progress_bar=True
        )
        print("‚úÖ Success with pandas_gbq!")

    except Exception as e2:
        print(f"‚ùå Also failed: {e2}")

        # Last resort: Save to local CSV and inspect
        local_csv = "debug_cleaned_data.csv"
        df_bq_ready.to_csv(local_csv, index=False, encoding='utf-8')
        print(f"üìÅ Saved data to {local_csv} for debugging")

‚úÖ BigQuery client initialized successfully
‚úÖ Dataset 'nantes' exists
üìä Original data: 2782 rows
üìä Clean data to upload: 1644 rows, 11 columns
üìà Duplicates removed: 1138 rows
üßπ Cleaning DataFrame for BigQuery...
   Renamed columns: {'micro_quartier': 'micro_quartier', 'adresse': 'adresse', 'domaine': 'domaine', 'identifiant': 'identifiant', 'date_mes': 'date_mes', 'commune': 'commune', 'fournisseur': 'fournisseur', 'geo_point_2d': 'geo_point_2d', 'plaque_id': 'plaque_id', 'lat': 'lat', 'lon': 'lon'}
   Processing geo_point_2d column...
   Extracted coordinates: 1644 valid lat/lon pairs
   Dropped original geo_point_2d column (kept string version)
   Final columns: ['micro_quartier', 'adresse', 'domaine', 'identifiant', 'date_mes', 'commune', 'fournisseur', 'plaque_id', 'lat', 'lon', 'geo_point_2d_str', 'latitude', 'longitude']

üîç Data types after cleaning:
micro_quartier       object
adresse              object
domaine              object
identifiant          object
d

In [10]:
# D√©ch√®teries-√©copoints de Nantes M√©tropoles

BASE_URL = "https://data.nantesmetropole.fr/api/explore/v2.1/catalog/datasets/244400404_decheteries-ecopoints-nantes-metropole/records"

# To get all records, we use limit=-1
params = {"limit": -1}

response = requests.get(BASE_URL, params=params)
response.raise_for_status()
data = response.json()

records = data.get('results', [])
print(f"Total records retrieved: {len(records)}")

Total records retrieved: 15


In [11]:
df1 = pd.DataFrame(response.json()['results'])

In [12]:
print(df1.head(5))

   identifiant                                    nom        type code_postal  \
0         1015                       Ecopoint Auvours    Ecopoint       44000   
1         2152          D√©ch√®terie de Saint S√©bastien  D√©ch√®terie       44230   
2         3139                D√©ch√®terie de Carquefou  D√©ch√®terie       44470   
3         3144  D√©ch√®terie de Saint Aignan Grand Lieu  D√©ch√®terie       44860   
4         1012                     Ecopoint Chantenay    Ecopoint       44100   

                     commune                              adresse bois carton  \
0                     Nantes                    20 Rue du Bourget  oui    oui   
1  Saint-S√©bastien-sur-Loire                   Rue de la Pyramide  oui    oui   
2                  Carquefou                    Route du Prouzeau  oui    oui   
3     Saint-Aignan-Grandlieu                    Route de la For√™t  oui    oui   
4                     Nantes  42 Boulevard Mar√©chal Alphonse Juin  oui    oui   

  deee pne

In [13]:
print(df1.columns)

Index(['identifiant', 'nom', 'type', 'code_postal', 'commune', 'adresse',
       'bois', 'carton', 'deee', 'pneus', 'verre', 'mobilier', 'extincteur',
       'batterie', 'gravat', 'encombrant', 'ferraille', 'huile_moteur',
       'papier', 'placoplatre', 'textile', 'dechet_vert', 'pile', 'cartouche',
       'neon', 'dechet_dangereux', 'bouteille_gaz', 'polystyrene',
       'huile_alimentaire', 'ressourcerie', 'horaire_ressourcerie',
       'geo_point_2d'],
      dtype='object')


In [33]:
# The MOST IMPORTANT check - IDs should be unique

print("Checking ID uniqueness (identifiant column):")

if 'identifiant' in df1.columns:
    total_ids = len(df1['identifiant'])
    unique_ids = df1['identifiant'].nunique()
    duplicate_id_count = df1['identifiant'].duplicated().sum()

    print(f"Total IDs: {total_ids}")
    print(f"Unique IDs: {unique_ids}")
    print(f"Duplicate IDs: {duplicate_id_count}")

    if duplicate_id_count > 0:
        print("DUPLICATE IDs FOUND:")
        dup_ids = df1[df1['identifiant'].duplicated(keep=False)]
        for id_val in dup_ids['identifiant'].unique():
            id_rows = dup_ids[dup_ids['identifiant'] == id_val]
            print(f"\nID {id_val} appears {len(id_rows)} times:")
            for _, row in id_rows.iterrows():
                print(f"  - {row.get('nom', 'Unknown')} | {row.get('adresse', 'No address')}")
else:
    print("No 'identifiant' column found")

Checking ID uniqueness (identifiant column):
Total IDs: 15
Unique IDs: 15
Duplicate IDs: 0


In [34]:
# Extract coordinates from dictionaries

df1['lon'] = df1['geo_point_2d'].apply(
    lambda x: float(x['lon']) if isinstance(x, dict) and 'lon' in x else None
)
df1['lat'] = df1['geo_point_2d'].apply(
    lambda x: float(x['lat']) if isinstance(x, dict) and 'lat' in x else None
)

print(f"Successfully extracted coordinates for {df1['lon'].notna().sum()} rows")

Successfully extracted coordinates for 15 rows


In [35]:
# Save df with name ecopoints (index = False)

df1.to_csv('ecopoints.csv', index=False)
print("DataFrame saved as 'ecopoints.csv'")

DataFrame saved as 'ecopoints.csv'


In [36]:
# CSV UPLOAD METHOD
# Set credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/dariaserbichenko/code/DariaSerb/key-gcp/trash-optimizer-479913-91e59ecc96c9.json"

# BigQuery setup
PROJECT = "trash-optimizer-479913"
DATASET = "nantes"
TABLE = "ecopoints"
table_id = f"{PROJECT}.{DATASET}.{TABLE}"

# Initialize client
client = bigquery.Client(project=PROJECT)
print(f"‚úÖ BigQuery client initialized successfully")

# Check dataset
dataset_ref = f"{PROJECT}.{DATASET}"
try:
    dataset = client.get_dataset(dataset_ref)
    print(f"‚úÖ Dataset '{DATASET}' exists")
except NotFound:
    print(f"üìÅ Creating dataset '{DATASET}'...")
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = "EU"
    dataset = client.create_dataset(dataset, timeout=30)
    print(f"‚úÖ Dataset created")

print(f"üìä Data to upload: {len(df1)} rows, {len(df1.columns)} columns")

# Prepare DataFrame - ensure no lists/dicts
df1_clean = df1.copy()
for col in df1_clean.columns:
    if df1_clean[col].apply(lambda x: isinstance(x, (list, dict))).any():
        print(f"   Converting {col} to string")
        df1_clean[col] = df1_clean[col].astype(str)

# Convert DataFrame to CSV in memory
print("Converting DataFrame to CSV...")
csv_buffer = io.StringIO()
df1_clean.to_csv(csv_buffer, index=False, encoding='utf-8')
csv_content = csv_buffer.getvalue().encode('utf-8')

# Create job configuration
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",
    autodetect=True,
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,  # Skip header
    max_bad_records=100,
    encoding='UTF-8'
)

print(f"‚¨ÜÔ∏è  Uploading {len(df_clean)} rows...")

# Upload from CSV
try:
    # Create file-like object
    file_obj = io.BytesIO(csv_content)

    # Submit job
    job = client.load_table_from_file(
        file_obj,
        table_id,
        job_config=job_config
    )

    print("   Job submitted. Waiting...")
    job.result()  # Wait for completion

    # Verify
    table = client.get_table(table_id)
    print(f"\n‚úÖ SUCCESS!")
    print(f"   Table: {table_id}")
    print(f"   Rows uploaded: {table.num_rows:,}")
    print(f"   Size: {table.num_bytes / (1024*1024):.2f} MB")

except Exception as e:
    print(f"‚ùå Upload failed: {e}")

‚úÖ BigQuery client initialized successfully
‚úÖ Dataset 'nantes' exists
üìä Data to upload: 15 rows, 34 columns
   Converting geo_point_2d to string
Converting DataFrame to CSV...
‚¨ÜÔ∏è  Uploading 1644 rows...
   Job submitted. Waiting...

‚úÖ SUCCESS!
   Table: trash-optimizer-479913.nantes.ecopoints
   Rows uploaded: 15
   Size: 0.00 MB


In [37]:
import io

# Annuaire des d√©ch√®teries des d√©chets m√©nagers et assimil√©s en Pays de la Loire
# Direct CSV download URL from the web page
CSV_URL = "https://data.nantesmetropole.fr/explore/dataset/837810944_annuairedesdecheteriesdma_pdl@data-teo-paysdelaloire/download/?format=csv&timezone=Europe/Berlin&lang=fr&use_labels_for_header=true&csv_separator=%3B"

# Download the CSV
print("Downloading CSV data...")
response = requests.get(CSV_URL)
response.raise_for_status()

# Read CSV directly from the response content
df2 = pd.read_csv(io.StringIO(response.content.decode('utf-8')), sep=';')
print(f"Dataset loaded: {len(df2)} records")
print("\nColumns:", list(df2.columns))
print("\nFirst few rows:")
print(df2.head())

Downloading CSV data...
Dataset loaded: 328 records

Columns: ['C_REGION', 'L_REGION', 'C_DEPT', 'N_DEPT', 'ANNEE', 'C_SERVICE', 'N_SERVICE', 'TEL_SERVICE', 'D_OUV', 'AD1_SITE', 'AD2_SITE', 'insee_commune_actuel', 'N_COMM_SITE', 'CP_SITE', 'epci', 'LOV_MO_GEST', 'GPS_Y', 'GPS_X', 'C_ACTEUR', 'N_ACTEUR', 'L_TYP_ACTEUR', 'AD1_ACTEUR', 'AD2_ACTEUR', 'CP_ACTEUR', 'L_VILLE_ACTEUR', 'TEL_ACTEUR', 'position', 'D_MODIF', 'ORIGINE_DECHET_ACC', 'GPS_LONG', 'GPS_LAT']

First few rows:
   C_REGION          L_REGION  C_DEPT            N_DEPT  ANNEE  C_SERVICE  \
0        52  Pays de la Loire      44  Loire-Atlantique   2025       2459   
1        52  Pays de la Loire      44  Loire-Atlantique   2025       2494   
2        52  Pays de la Loire      44  Loire-Atlantique   2025       2500   
3        52  Pays de la Loire      44  Loire-Atlantique   2025       2508   
4        52  Pays de la Loire      44  Loire-Atlantique   2025       5163   

                              N_SERVICE  TEL_SERVICE    D_

In [38]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328 entries, 0 to 327
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   C_REGION              328 non-null    int64  
 1   L_REGION              328 non-null    object 
 2   C_DEPT                328 non-null    int64  
 3   N_DEPT                328 non-null    object 
 4   ANNEE                 328 non-null    int64  
 5   C_SERVICE             328 non-null    int64  
 6   N_SERVICE             328 non-null    object 
 7   TEL_SERVICE           258 non-null    float64
 8   D_OUV                 328 non-null    object 
 9   AD1_SITE              328 non-null    object 
 10  AD2_SITE              63 non-null     object 
 11  insee_commune_actuel  327 non-null    float64
 12  N_COMM_SITE           327 non-null    object 
 13  CP_SITE               327 non-null    float64
 14  epci                  325 non-null    object 
 15  LOV_MO_GEST           3

In [40]:
print("QUICK df2 DUPLICATE CHECK")

# Simple exact duplicate check
print(f"Total rows: {len(df2)}")
print(f"Exact duplicates: {df2.duplicated().sum()}")
print(f"Unique rows: {df2.drop_duplicates().shape[0]}")

# Check key columns
print("Key column duplicates:")
key_columns = ['C_SERVICE', 'N_SERVICE', 'AD1_SITE', 'GPS_LAT', 'GPS_LONG']
for col in key_columns:
    if col in df2.columns:
        dup_count = df2[col].duplicated().sum()
        unique_count = df2[col].nunique()
        print(f"  {col}: {dup_count} duplicates ({unique_count} unique values)")

QUICK df2 DUPLICATE CHECK
Total rows: 328
Exact duplicates: 0
Unique rows: 328
Key column duplicates:
  C_SERVICE: 0 duplicates (328 unique values)
  N_SERVICE: 0 duplicates (328 unique values)
  AD1_SITE: 9 duplicates (319 unique values)
  GPS_LAT: 0 duplicates (328 unique values)
  GPS_LONG: 0 duplicates (328 unique values)


In [41]:
df2[['lat', 'lon']] = df2['position'].str.split(',', expand=True).astype(float)

print(f"Successfully extracted coordinates for {df2['lon'].notna().sum()} rows")

Successfully extracted coordinates for 328 rows


In [42]:
df2.info

<bound method DataFrame.info of      C_REGION          L_REGION  C_DEPT            N_DEPT  ANNEE  C_SERVICE  \
0          52  Pays de la Loire      44  Loire-Atlantique   2025       2459   
1          52  Pays de la Loire      44  Loire-Atlantique   2025       2494   
2          52  Pays de la Loire      44  Loire-Atlantique   2025       2500   
3          52  Pays de la Loire      44  Loire-Atlantique   2025       2508   
4          52  Pays de la Loire      44  Loire-Atlantique   2025       5163   
..        ...               ...     ...               ...    ...        ...   
323        52  Pays de la Loire      72            Sarthe   2025      65448   
324        52  Pays de la Loire      85            Vend√©e   2025       4040   
325        52  Pays de la Loire      85            Vend√©e   2025       4386   
326        52  Pays de la Loire      85            Vend√©e   2025       4757   
327        52  Pays de la Loire      85            Vend√©e   2025     118549   

               

In [43]:
# Save df with name collection centres Pays dde la Loire region (index = False)

df2.to_csv('collection_centres_PdL_region.csv', index=False)
print("DataFrame saved as 'collection_centres_PdL_region.csv'")

DataFrame saved as 'collection_centres_PdL_region.csv'


In [44]:
# CSV UPLOAD METHOD TO BIGQUERY
# Set credentials

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/dariaserbichenko/code/DariaSerb/key-gcp/trash-optimizer-479913-91e59ecc96c9.json"

# BigQuery setup
PROJECT = "trash-optimizer-479913"
DATASET = "nantes"
TABLE = "collection_centres_pdl"
table_id = f"{PROJECT}.{DATASET}.{TABLE}"

print("\n" + "=" * 60)
print("UPLOADING TO BIGQUERY")
print("=" * 60)

# Initialize client
client = bigquery.Client(project=PROJECT)
print(f"‚úÖ BigQuery client initialized successfully")

# Check dataset
dataset_ref = f"{PROJECT}.{DATASET}"
try:
    dataset = client.get_dataset(dataset_ref)
    print(f"‚úÖ Dataset '{DATASET}' exists")
    print(f"   Location: {dataset.location}")
except NotFound:
    print(f"üìÅ Creating dataset '{DATASET}'...")
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = "EU"
    dataset = client.create_dataset(dataset, timeout=30)
    print(f"‚úÖ Dataset created")
    print(f"   Location: {dataset.location}")

# Display DataFrame info
print(f"\nüìä Data to upload:")
print(f"   Rows: {len(df2):,}")
print(f"   Columns: {len(df2.columns)}")
print(f"   Columns: {list(df2.columns)}")

# Check if we need to split the position column
if 'position' in df2.columns and 'lat' not in df2.columns and 'lon' not in df2.columns:
    print("\nüîç Splitting 'position' column into lat/lon...")
    # Split coordinates if format is "lat,lon"
    df2[['lat', 'lon']] = df2['position'].str.split(',', expand=True)
    df2['lat'] = pd.to_numeric(df2['lat'], errors='coerce')
    df2['lon'] = pd.to_numeric(df2['lon'], errors='coerce')
    print(f"‚úÖ Coordinates split: {df2['lat'].notna().sum()} valid coordinates")

# Prepare DataFrame - ensure no lists/dicts
df2_clean = df2.copy()

# Clean column names for BigQuery compatibility
df2_clean.columns = df2_clean.columns.str.replace('[^a-zA-Z0-9_]', '_', regex=True)
print(f"\nüßπ Cleaning data for BigQuery...")

conversions = 0
for col in df2_clean.columns:
    # Convert lists/dicts to strings
    if df2_clean[col].apply(lambda x: isinstance(x, (list, dict, tuple))).any():
        df2_clean[col] = df2_clean[col].astype(str)
        conversions += 1
        print(f"   Converted {col} to string")

# Fill NaN values for string columns
nan_count = df2_clean.isna().sum().sum()
if nan_count > 0:
    print(f"   Found {nan_count} NaN values")
    for col in df2_clean.columns:
        if df2_clean[col].dtype == 'object':
            df2_clean[col] = df2_clean[col].fillna('')

print(f"   Cleaned shape: {df2_clean.shape}")

# Convert DataFrame to CSV in memory
print("\nüìÑ Converting DataFrame to CSV in memory...")
csv_buffer = io.StringIO()
df2_clean.to_csv(csv_buffer, index=False, encoding='utf-8')
csv_content = csv_buffer.getvalue().encode('utf-8')

# Create job configuration
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # Will replace existing table
    autodetect=True,                     # Let BigQuery detect schema
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,                 # Skip header row
    max_bad_records=100,                 # Allow some bad records
    encoding='UTF-8'
)

print(f"\n‚¨ÜÔ∏è  Uploading {len(df2_clean):,} rows to table '{TABLE}'...")

# Upload from CSV
try:
    # Create file-like object
    file_obj = io.BytesIO(csv_content)

    # Submit job
    job = client.load_table_from_file(
        file_obj,
        table_id,
        job_config=job_config
    )

    print("   Job submitted. Waiting for completion...")
    job.result()  # Wait for completion

    # Verify upload
    table = client.get_table(table_id)
    print(f"\n‚úÖ SUCCESS!")
    print(f"   Table: {table_id}")
    print(f"   Rows uploaded: {table.num_rows:,}")
    print(f"   Table size: {table.num_bytes / (1024*1024):.2f} MB")
    print(f"   Created: {table.created.strftime('%Y-%m-%d %H:%M:%S')}")

    # Show schema preview
    print(f"\nüìê Schema preview (first 5 columns):")
    for i, field in enumerate(table.schema[:5], 1):
        print(f"   {i}. {field.name:20} : {field.field_type}")

    if len(table.schema) > 5:
        print(f"   ... and {len(table.schema) - 5} more columns")

except Exception as e:
    print(f"\n‚ùå Upload failed: {e}")

    # Try alternative method
    print("\nüîÑ Trying alternative upload method...")
    try:
        # Try direct DataFrame upload
        direct_job_config = bigquery.LoadJobConfig(
            write_disposition="WRITE_TRUNCATE",
            autodetect=True,
            max_bad_records=100
        )

        direct_job = client.load_table_from_dataframe(df2_clean, table_id, job_config=direct_job_config)
        direct_job.result()

        table = client.get_table(table_id)
        print(f"‚úÖ Direct upload successful!")
        print(f"   Rows uploaded: {table.num_rows:,}")

    except Exception as e2:
        print(f"‚ùå Alternative method also failed: {e2}")
        print("\nüí° You can:")
        print("1. Check the saved CSV file: 'collection_centres_PdL_region.csv'")
        print("2. Upload it manually via Google Cloud Console")
        print("3. Or check for data format issues")

try:
    # List all tables in dataset
    tables = list(client.list_tables(DATASET))
    table_names = [t.table_id for t in tables]

    print(f"Tables in dataset '{DATASET}':")
    for name in sorted(table_names):
        if name == TABLE:
            print(f"   ‚úÖ {name} (just uploaded)")
        else:
            print(f"   ‚Ä¢ {name}")

    if TABLE not in table_names:
        print(f"\n‚ö†Ô∏è  Warning: Table '{TABLE}' not found in dataset!")

except Exception as e:
    print(f"Error listing tables: {e}")


UPLOADING TO BIGQUERY
‚úÖ BigQuery client initialized successfully
‚úÖ Dataset 'nantes' exists
   Location: EU

üìä Data to upload:
   Rows: 328
   Columns: 33
   Columns: ['C_REGION', 'L_REGION', 'C_DEPT', 'N_DEPT', 'ANNEE', 'C_SERVICE', 'N_SERVICE', 'TEL_SERVICE', 'D_OUV', 'AD1_SITE', 'AD2_SITE', 'insee_commune_actuel', 'N_COMM_SITE', 'CP_SITE', 'epci', 'LOV_MO_GEST', 'GPS_Y', 'GPS_X', 'C_ACTEUR', 'N_ACTEUR', 'L_TYP_ACTEUR', 'AD1_ACTEUR', 'AD2_ACTEUR', 'CP_ACTEUR', 'L_VILLE_ACTEUR', 'TEL_ACTEUR', 'position', 'D_MODIF', 'ORIGINE_DECHET_ACC', 'GPS_LONG', 'GPS_LAT', 'lat', 'lon']

üßπ Cleaning data for BigQuery...
   Found 533 NaN values
   Cleaned shape: (328, 33)

üìÑ Converting DataFrame to CSV in memory...

‚¨ÜÔ∏è  Uploading 328 rows to table 'collection_centres_pdl'...
   Job submitted. Waiting for completion...

‚úÖ SUCCESS!
   Table: trash-optimizer-479913.nantes.collection_centres_pdl
   Rows uploaded: 328
   Table size: 0.13 MB
   Created: 2025-12-02 12:49:35

üìê Schema p

In [None]:
# Localisation des colonnes d‚Äôapports volontaires de Nantes M√©tropole

BASE_URL = "https://data.nantesmetropole.fr/api/explore/v2.1/catalog/datasets/244400404_localisation-des-colonnes-apports-volontaires-de-nantes-metropole/records"

all_records = []
limit = 100  # Records per page
offset = 0
total_count = None

while True:

# Build URL with current offset
    url = f"{BASE_URL}?limit={limit}&offset={offset}"
    print(f"  Fetching {limit} records from offset {offset}...")

    response = requests.get(url)
    response.raise_for_status()
    data = response.json()

# Get total count on first request
    if total_count is None:
        total_count = data.get('total_count', 0)
        print(f"  Total records available: {total_count}")

# Add records from this page
    page_records = data.get('results', [])
    all_records.extend(page_records)

    # Update offset
    offset += limit

# Stop if we have all records or if page is empty
    if not page_records or offset >= total_count:
        break

# Create DataFrame
df3 = pd.DataFrame(all_records)

  Fetching 100 records from offset 0...
  Total records available: 2575
  Fetching 100 records from offset 100...
  Fetching 100 records from offset 200...
  Fetching 100 records from offset 300...
  Fetching 100 records from offset 400...
  Fetching 100 records from offset 500...
  Fetching 100 records from offset 600...
  Fetching 100 records from offset 700...
  Fetching 100 records from offset 800...
  Fetching 100 records from offset 900...
  Fetching 100 records from offset 1000...
  Fetching 100 records from offset 1100...
  Fetching 100 records from offset 1200...
  Fetching 100 records from offset 1300...
  Fetching 100 records from offset 1400...
  Fetching 100 records from offset 1500...
  Fetching 100 records from offset 1600...
  Fetching 100 records from offset 1700...
  Fetching 100 records from offset 1800...
  Fetching 100 records from offset 1900...
  Fetching 100 records from offset 2000...
  Fetching 100 records from offset 2100...
  Fetching 100 records from offset

In [21]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2575 entries, 0 to 2574
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id_colonne            1424 non-null   object 
 1   id_colonne_ancien     2571 non-null   object 
 2   type_colonne          2575 non-null   object 
 3   type_dechet           2575 non-null   object 
 4   adresse               2574 non-null   object 
 5   mot_directeur         2403 non-null   object 
 6   volume_colonne        2370 non-null   float64
 7   volume_fosse          1438 non-null   float64
 8   matiere               362 non-null    object 
 9   modele                1097 non-null   object 
 10  fournisseur           1973 non-null   object 
 11  prehension            2237 non-null   object 
 12  type_avaloir          1032 non-null   object 
 13  date_mise_en_place    1261 non-null   object 
 14  date_mise_en_service  987 non-null    object 
 15  numero_serie         

In [28]:
print(df3.columns)

Index(['id_colonne', 'id_colonne_ancien', 'type_colonne', 'type_dechet',
       'adresse', 'mot_directeur', 'volume_colonne', 'volume_fosse', 'matiere',
       'modele', 'fournisseur', 'prehension', 'type_avaloir',
       'date_mise_en_place', 'date_mise_en_service', 'numero_serie',
       'investisseur', 'domanialite', 'changement_colonne',
       'nouvelle_convention', 'operateur_collecte', 'commune', 'pole',
       'observation', 'gid', 'globalid', 'geo_point_2d', 'lon', 'lat'],
      dtype='object')


In [None]:
print("QUICK df3 DUPLICATE CHECK")

# Create a string version for duplicate checking

df3_str = df3.copy()

# Convert any dictionary columns to strings
for col in df3_str.columns:
    if df3_str[col].apply(lambda x: isinstance(x, dict)).any():
        df3_str[col] = df3_str[col].astype(str)

# Now check duplicates
print(f"Total rows: {len(df3)}")
print(f"Exact duplicates: {df3_str.duplicated().sum()}")
print(f"Unique rows: {df3_str.drop_duplicates().shape[0]}")

# Check key columns that ACTUALLY exist in df3
print("\nKey column duplicates (actual df3 columns):")
actual_key_columns = ['id_colonne', 'adresse', 'commune', 'type_dechet', 'type_colonne']
for col in actual_key_columns:
    if col in df3.columns:
        dup_count = df3[col].duplicated().sum()
        unique_count = df3[col].nunique()
        print(f"  {col}: {dup_count} duplicates ({unique_count} unique values)")

QUICK df3 DUPLICATE CHECK (FIXED)
Total rows: 2575
Exact duplicates: 0
Unique rows: 2575

Key column duplicates (actual df3 columns):
  id_colonne: 1151 duplicates (1423 unique values)
  adresse: 1206 duplicates (1368 unique values)
  commune: 2551 duplicates (24 unique values)
  type_dechet: 2570 duplicates (5 unique values)
  type_colonne: 2573 duplicates (2 unique values)


In [26]:
df3['geo_point_2d'][0]

{'lon': -1.623654076876046, 'lat': 47.181190184213065}

In [27]:
# Extract coordinates from dictionaries

df3['lon'] = df3['geo_point_2d'].apply(
    lambda x: float(x['lon']) if isinstance(x, dict) and 'lon' in x else None
)
df3['lat'] = df3['geo_point_2d'].apply(
    lambda x: float(x['lat']) if isinstance(x, dict) and 'lat' in x else None
)

print(f"Successfully extracted coordinates for {df3['lon'].notna().sum()} rows")

Successfully extracted coordinates for 2575 rows


In [30]:
# Save df with name location dropoff points nantes (index = False)

df3.to_csv('location_dropoff_points_nantes.csv', index=False)
print("DataFrame saved as 'location_dropoff_points_nantes.csv'")

DataFrame saved as 'location_dropoff_points_nantes.csv'


In [32]:
# CSV UPLOAD METHOD TO BIGQUERY
# Set credentials

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/dariaserbichenko/code/DariaSerb/key-gcp/trash-optimizer-479913-91e59ecc96c9.json"

# BigQuery setup
PROJECT = "trash-optimizer-479913"
DATASET = "nantes"
TABLE = "location_dropoff_points_nantes"
table_id = f"{PROJECT}.{DATASET}.{TABLE}"

print("\n" + "=" * 60)
print("UPLOADING TO BIGQUERY")
print("=" * 60)

# Initialize client
client = bigquery.Client(project=PROJECT)
print(f"‚úÖ BigQuery client initialized successfully")

# Check dataset
dataset_ref = f"{PROJECT}.{DATASET}"
try:
    dataset = client.get_dataset(dataset_ref)
    print(f"‚úÖ Dataset '{DATASET}' exists")
    print(f"   Location: {dataset.location}")
except NotFound:
    print(f"üìÅ Creating dataset '{DATASET}'...")
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = "EU"
    dataset = client.create_dataset(dataset, timeout=30)
    print(f"‚úÖ Dataset created")
    print(f"   Location: {dataset.location}")

# Display DataFrame info
print(f"\nüìä Data to upload:")
print(f"   Rows: {len(df3):,}")
print(f"   Columns: {len(df3.columns)}")
print(f"   Columns: {list(df3.columns)}")

# Check if we need to split the position column
if 'position' in df3.columns and 'lat' not in df3.columns and 'lon' not in df3.columns:
    print("\nüîç Splitting 'position' column into lat/lon")
    # Split coordinates if format is "lat,lon"
    df3[['lat', 'lon']] = df3['position'].str.split(',', expand=True)
    df3['lat'] = pd.to_numeric(df3['lat'], errors='coerce')
    df3['lon'] = pd.to_numeric(df3['lon'], errors='coerce')
    print(f"‚úÖ Coordinates split: {df2['lat'].notna().sum()} valid coordinates")

# Prepare DataFrame - ensure no lists/dicts
df3_clean = df3.copy()

# Clean column names for BigQuery compatibility
df3_clean.columns = df3_clean.columns.str.replace('[^a-zA-Z0-9_]', '_', regex=True)
print(f"\nüßπ Cleaning data for BigQuery")

conversions = 0
for col in df3_clean.columns:
    # Convert lists/dicts to strings
    if df3_clean[col].apply(lambda x: isinstance(x, (list, dict, tuple))).any():
        df3_clean[col] = df3_clean[col].astype(str)
        conversions += 1
        print(f"   Converted {col} to string")

# Fill NaN values for string columns
nan_count = df3_clean.isna().sum().sum()
if nan_count > 0:
    print(f"   Found {nan_count} NaN values")
    for col in df3_clean.columns:
        if df3_clean[col].dtype == 'object':
            df3_clean[col] = df3_clean[col].fillna('')

print(f"   Cleaned shape: {df3_clean.shape}")

# Convert DataFrame to CSV in memory
print("\nüìÑ Converting DataFrame to CSV in memory...")
csv_buffer = io.StringIO()
df3_clean.to_csv(csv_buffer, index=False, encoding='utf-8')
csv_content = csv_buffer.getvalue().encode('utf-8')

# Create job configuration
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",  # Will replace existing table
    autodetect=True,                     # Let BigQuery detect schema
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,                 # Skip header row
    max_bad_records=100,                 # Allow some bad records
    encoding='UTF-8'
)

print(f"\n‚¨ÜÔ∏è  Uploading {len(df3_clean):,} rows to table '{TABLE}'...")

# Upload from CSV
try:
    # Create file-like object
    file_obj = io.BytesIO(csv_content)

    # Submit job
    job = client.load_table_from_file(
        file_obj,
        table_id,
        job_config=job_config
    )

    print("   Job submitted. Waiting for completion...")
    job.result()  # Wait for completion

    # Verify upload
    table = client.get_table(table_id)
    print(f"\n‚úÖ SUCCESS!")
    print(f"   Table: {table_id}")
    print(f"   Rows uploaded: {table.num_rows:,}")
    print(f"   Table size: {table.num_bytes / (1024*1024):.2f} MB")
    print(f"   Created: {table.created.strftime('%Y-%m-%d %H:%M:%S')}")

    # Show schema preview
    print(f"\nüìê Schema preview (first 5 columns):")
    for i, field in enumerate(table.schema[:5], 1):
        print(f"   {i}. {field.name:20} : {field.field_type}")

    if len(table.schema) > 5:
        print(f"   ... and {len(table.schema) - 5} more columns")

except Exception as e:
    print(f"\n‚ùå Upload failed: {e}")

    # Try alternative method
    print("\nüîÑ Trying alternative upload method...")
    try:
        # Try direct DataFrame upload
        direct_job_config = bigquery.LoadJobConfig(
            write_disposition="WRITE_TRUNCATE",
            autodetect=True,
            max_bad_records=100
        )

        direct_job = client.load_table_from_dataframe(df3_clean, table_id, job_config=direct_job_config)
        direct_job.result()

        table = client.get_table(table_id)
        print(f"‚úÖ Direct upload successful!")
        print(f"   Rows uploaded: {table.num_rows:,}")

    except Exception as e2:
        print(f"‚ùå Alternative method also failed: {e2}")
        print("\nüí° You can:")
        print("1. Check the saved CSV file: 'location_dropoff_points_nantes.csv'")
        print("2. Upload it manually via Google Cloud Console")
        print("3. Or check for data format issues")

try:
    # List all tables in dataset
    tables = list(client.list_tables(DATASET))
    table_names = [t.table_id for t in tables]

    print(f"Tables in dataset '{DATASET}':")
    for name in sorted(table_names):
        if name == TABLE:
            print(f"   ‚úÖ {name} (just uploaded)")
        else:
            print(f"   ‚Ä¢ {name}")

    if TABLE not in table_names:
        print(f"\n‚ö†Ô∏è  Warning: Table '{TABLE}' not found in dataset!")

except Exception as e:
    print(f"Error listing tables: {e}")


UPLOADING TO BIGQUERY
‚úÖ BigQuery client initialized successfully
‚úÖ Dataset 'nantes' exists
   Location: EU

üìä Data to upload:
   Rows: 2,575
   Columns: 29
   Columns: ['id_colonne', 'id_colonne_ancien', 'type_colonne', 'type_dechet', 'adresse', 'mot_directeur', 'volume_colonne', 'volume_fosse', 'matiere', 'modele', 'fournisseur', 'prehension', 'type_avaloir', 'date_mise_en_place', 'date_mise_en_service', 'numero_serie', 'investisseur', 'domanialite', 'changement_colonne', 'nouvelle_convention', 'operateur_collecte', 'commune', 'pole', 'observation', 'gid', 'globalid', 'geo_point_2d', 'lon', 'lat']

üßπ Cleaning data for BigQuery
   Converted geo_point_2d to string
   Found 25440 NaN values
   Cleaned shape: (2575, 29)

üìÑ Converting DataFrame to CSV in memory...

‚¨ÜÔ∏è  Uploading 2,575 rows to table 'location_dropoff_points_nantes'...
   Job submitted. Waiting for completion...

‚úÖ SUCCESS!
   Table: trash-optimizer-479913.nantes.location_dropoff_points_nantes
   Rows upl

In [None]:
# Create unified table in BigQuery combining all three datasets
# Set credentials

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/dariaserbichenko/code/DariaSerb/key-gcp/trash-optimizer-479913-91e59ecc96c9.json"

PROJECT = "trash-optimizer-479913"
DATASET = "nantes"
UNIFIED_TABLE = "all_trash_locations"
unified_table_id = f"{PROJECT}.{DATASET}.{UNIFIED_TABLE}"

client = bigquery.Client(project=PROJECT)

# Simple SQL query - FIXED
create_unified_table_query = f"""
CREATE OR REPLACE TABLE `{PROJECT}.{DATASET}.{UNIFIED_TABLE}` AS

-- From alimentary_garbage_clean
SELECT
  'alimentary' as trash_type,
  identifiant as nom,
  adresse,
  lat as latitude,
  lon as longitude
FROM `{PROJECT}.{DATASET}.alimentary_garbage_clean`
WHERE lat IS NOT NULL AND lon IS NOT NULL

UNION ALL

-- From ecopoints
SELECT
  'ecopoints' as trash_type,
  nom,
  adresse,
  lat as latitude,
  lon as longitude
FROM `{PROJECT}.{DATASET}.ecopoints`
WHERE lat IS NOT NULL AND lon IS NOT NULL

UNION ALL

-- From collection_centres_pdl
SELECT
  'collection_centres' as trash_type,
  N_SERVICE as nom,
  AD1_ACTEUR as adresse,
  lat as latitude,
  lon as longitude
FROM `{PROJECT}.{DATASET}.collection_centres_pdl`
WHERE lat IS NOT NULL AND lon IS NOT NULL
"""

print("Creating unified table...")
try:
    # Execute query
    job = client.query(create_unified_table_query)
    job.result()
    print("‚úÖ Unified table created successfully!")

    # Get table info
    table = client.get_table(unified_table_id)
    print(f"\nTable info:")
    print(f"   Name: {unified_table_id}")
    print(f"   Rows: {table.num_rows}")
    print(f"   Size: {table.num_bytes / (1024*1024):.2f} MB")

except Exception as e:
    print(f"‚ùå Error: {e}")

Creating unified table...
‚úÖ Unified table created successfully!

Table info:
   Name: trash-optimizer-479913.nantes.all_trash_locations
   Rows: 1987
   Size: 0.13 MB
