In [2]:
import requests
import pandas as pd
from google.cloud import bigquery
from google.colab import auth
from google.api_core.exceptions import NotFound, BadRequest

In [7]:
# Point de collecte déchets alimentaires (biodéchet) de Nantes
BASE_URL = "https://data.nantesmetropole.fr/api/records/1.0/search/"

params = {
    "dataset": "244400404_point-collecte-dechets-alimentaires-biodechet-nantes",
    "rows": 10000,
    "sort": "-commune",
}

response = requests.get(BASE_URL, params=params)
response.raise_for_status()
data = response.json()

# Extract records
records = []
for record in data['records']:
    fields = record['fields']
    records.append(fields)

df = pd.DataFrame(records)
print(f"Total records: {len(df)}")
print(df.head())

Total records: 2782
    micro_quartier                      adresse domaine identifiant  \
0      La Bottière        2 Rue de la Basinerie  Public    BIOD0582   
1      Le Landreau           18 Rue Anita Conti   Privé    BIOD0514   
2  Rte de Ste Luce  276 Route de Sainte de Luce  Public    BIOD0410   
3  Rte de Ste Luce     288 Route de Sainte Luce  Public    BIOD0503   
4  Rte de Ste Luce           1 Rue Henri Loiret  Public    BIOD0632   

                    date_mes commune fournisseur  \
0  2022-12-12T01:00:00+00:00  Nantes        SUEZ   
1  2022-12-20T01:00:00+00:00  Nantes        SUEZ   
2  2022-12-01T01:00:00+00:00  Nantes        SUEZ   
3  2022-12-01T01:00:00+00:00  Nantes        SUEZ   
4  2022-12-01T01:00:00+00:00  Nantes        SUEZ   

                                geo_point_2d plaque_id  
0    [47.23819860017184, -1.517678599864239]       NaN  
1   [47.238733199907195, -1.511478400639151]       NaN  
2   [47.24174179981523, -1.5074455997902827]       NaN  
3   [47.2423

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2782 entries, 0 to 2781
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   micro_quartier  2781 non-null   object
 1   adresse         2782 non-null   object
 2   domaine         2752 non-null   object
 3   identifiant     2775 non-null   object
 4   date_mes        1464 non-null   object
 5   commune         2782 non-null   object
 6   fournisseur     611 non-null    object
 7   geo_point_2d    2782 non-null   object
 8   plaque_id       220 non-null    object
dtypes: object(9)
memory usage: 195.7+ KB


In [10]:
# Check what's in the geo_point_2d column
print(f"Type of first element: {type(df['geo_point_2d'][0])}")
print(f"First element: {df['geo_point_2d'][0]}")
print(f"First element length: {len(df['geo_point_2d'][0])}")

Type of first element: <class 'list'>
First element: [47.23819860017184, -1.517678599864239]
First element length: 2


In [12]:
# It's a list of two numbers [lon, lat]

df['lon'] = df['geo_point_2d'].apply(lambda x: float(x[0]) if isinstance(x, list)
and len(x) > 0 else None)

df['lat'] = df['geo_point_2d'].apply(lambda x: float(x[1]) if isinstance(x, list)
and len(x) > 1 else None)
print("Successfully extracted coordinates as [lon, lat]")


Successfully extracted coordinates as [lon, lat]


In [13]:
df = df[['adresse', 'lon', 'lat']]

In [14]:
df

Unnamed: 0,adresse,lon,lat
0,2 Rue de la Basinerie,47.238199,-1.517679
1,18 Rue Anita Conti,47.238733,-1.511478
2,276 Route de Sainte de Luce,47.241742,-1.507446
3,288 Route de Sainte Luce,47.242311,-1.506245
4,1 Rue Henri Loiret,47.243990,-1.504580
...,...,...,...
2777,2 rue de Suisse,47.216644,-1.521431
2778,3 rue de Madrid,47.214020,-1.532369
2779,10 rue de l'Angleterre,47.214726,-1.527392
2780,10 rue de l'Angleterre,47.214726,-1.527392


In [15]:
# Save df with name alimentary garbage (index = False)

df.to_csv('alimentary_garbage.csv', index=False)
print("DataFrame saved as 'alimentary_garbage.csv'")

DataFrame saved as 'alimentary_garbage.csv'


In [19]:
# Upload to BigQuery
# Authenticate the user for Google Cloud services

auth.authenticate_user()

# BigQuery setup
PROJECT = "trash-optimizer-479913"
DATASET = "nantes"
TABLE = "alimentary_garbage"
table_id = f"{PROJECT}.{DATASET}.{TABLE}"

# Initialize client
client = bigquery.Client(project=PROJECT)  # Specify project
print(f"BigQuery client initialized successfully")

# Check if the dataset exists, and create it if it doesn't
dataset_ref = f"{PROJECT}.{DATASET}"

try:
    client.get_dataset(dataset_ref)
    print(f"Dataset '{DATASET}' exists in project '{PROJECT}'")
except NotFound:
    print(f"Dataset '{DATASET}' not found. Creating dataset...")
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = "EU"
    client.create_dataset(dataset, timeout=30)
    print(f"Dataset '{DATASET}' created successfully.")

# Create the job configuration
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",
    autodetect=True,
    max_bad_records=100  # Allow up to 100 bad records
)

print(f"Uploading {len(df)} rows with max_bad_records=100")

# Submit the job

job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
job.result()


# Get table info after upload
table = client.get_table(table_id)

print(f"Successfully uploaded the dataframe to BigQuery")
print(f"Table: {table_id}")
print(f"Rows uploaded: {table.num_rows}")
print(f"Table size: {table.num_bytes / (1024*1024):.2f} MB")

BigQuery client initialized successfully
Dataset 'nantes' exists in project 'trash-optimizer-479913'
Uploading 2782 rows with max_bad_records=100
Successfully uploaded the dataframe to BigQuery!
Table: trash-optimizer-479913.nantes.alimentary_garbage
Rows uploaded: 2782
Table size: 0.11 MB


In [8]:
# Déchèteries-écopoints de Nantes Métropoles

BASE_URL = "https://data.nantesmetropole.fr/api/explore/v2.1/catalog/datasets/244400404_decheteries-ecopoints-nantes-metropole/records"

# To get all records, we use limit=-1
params = {"limit": -1}

response = requests.get(BASE_URL, params=params)
response.raise_for_status()
data = response.json()

records = data.get('results', [])
print(f"Total records retrieved: {len(records)}")

Total records retrieved: 15


In [9]:
df1 = pd.DataFrame(response.json()['results'])

In [10]:
print(df1.head())

   identifiant                                    nom        type code_postal  \
0         1015                       Ecopoint Auvours    Ecopoint       44000   
1         2152          Déchèterie de Saint Sébastien  Déchèterie       44230   
2         3139                Déchèterie de Carquefou  Déchèterie       44470   
3         3144  Déchèterie de Saint Aignan Grand Lieu  Déchèterie       44860   
4         1012                     Ecopoint Chantenay    Ecopoint       44100   

                     commune                              adresse bois carton  \
0                     Nantes                    20 Rue du Bourget  oui    oui   
1  Saint-Sébastien-sur-Loire                   Rue de la Pyramide  oui    oui   
2                  Carquefou                    Route du Prouzeau  oui    oui   
3     Saint-Aignan-Grandlieu                    Route de la Forêt  oui    oui   
4                     Nantes  42 Boulevard Maréchal Alphonse Juin  oui    oui   

  deee pneus  ... pile car

In [11]:
# Extract coordinates from dictionaries

df1['lon'] = df1['geo_point_2d'].apply(
    lambda x: float(x['lon']) if isinstance(x, dict) and 'lon' in x else None
)
df1['lat'] = df1['geo_point_2d'].apply(
    lambda x: float(x['lat']) if isinstance(x, dict) and 'lat' in x else None
)

print(f"Successfully extracted coordinates for {df1['lon'].notna().sum()} rows")

Successfully extracted coordinates for 15 rows


In [12]:
# Save df with name ecopoints (index = False)

df1.to_csv('ecopoints.csv', index=False)
print("DataFrame saved as 'ecopoints.csv'")

DataFrame saved as 'ecopoints.csv'


In [13]:
# Upload to BigQuery for df1 (ecopoints)
# Authenticate the user for Google Cloud services

auth.authenticate_user()

# BigQuery setup - Changed table name to "ecopoints"
PROJECT = "trash-optimizer-479913"
DATASET = "nantes"
TABLE = "ecopoints"
table_id = f"{PROJECT}.{DATASET}.{TABLE}"

# Initialize client
client = bigquery.Client(project=PROJECT)
print(f"BigQuery client initialized successfully")

# Check if the dataset exists, and create it if it doesn't
dataset_ref = f"{PROJECT}.{DATASET}"
try:
    dataset = client.get_dataset(dataset_ref)
    print(f"Dataset '{DATASET}' exists in project '{PROJECT}'")

except NotFound:
    print(f"Dataset '{DATASET}' not found. Creating dataset...")
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = "EU"  # Set location when creating dataset
    dataset = client.create_dataset(dataset, timeout=30)
    print(f"Dataset '{DATASET}' created successfully in location: {dataset.location}")


# Create the job configuration
job_config = bigquery.LoadJobConfig(
    write_disposition="WRITE_TRUNCATE",
    autodetect=True,
    max_bad_records=100,  # Allow up to 100 bad records
    )

print(f"Uploading {len(df1)} rows to table '{TABLE}' with max_bad_records=100")

# Submit the job

job = client.load_table_from_dataframe(df1, table_id, job_config=job_config)
job.result()

# Get table info after upload
table = client.get_table(table_id)

print(f"Successfully uploaded dataframe to BigQuery!")
print(f"Table: {table_id}")
print(f"Rows uploaded: {table.num_rows:,}")
print(f"sTable size: {table.num_bytes / (1024*1024):.2f} MB")

BigQuery client initialized successfully
Dataset 'nantes' exists in project 'trash-optimizer-479913'
Uploading 15 rows to table 'ecopoints' with max_bad_records=100
Successfully uploaded dataframe to BigQuery!
   Table: trash-optimizer-479913.nantes.ecopoints
   Rows uploaded: 15
   Table size: 0.00 MB


In [14]:
# Annuaire des déchèteries des déchets ménagers et assimilés en Pays de la Loire

BASE_URL = "https://data.nantesmetropole.fr/api"

# Query parameters - get all records
params = {
    "limit": -1  # Get all records
}

response = requests.get(BASE_URL, params=params)
response.raise_for_status()
data = response.json()

# Create DataFrame
df2 = pd.DataFrame(data['results'])
print(f"Dataset loaded: {len(df2)} records")
print("\nColumns:", list(df2.columns))
print("\nFirst few rows:")
print(df2.head())

HTTPError: 404 Client Error: Not Found for url: https://data.nantesmetropole.fr/api/explore/v2.1/catalog/datasets/244400404_annuaire-decheteries-dechets-menagers-assimiles-pays-loire/records?limit=-1

In [None]:
# Clean the dataframe before upload (important step!)
def clean_dataframe_for_bigquery(df):
    """Clean dataframe for BigQuery compatibility"""
    df_clean = df.copy()

    # 1. Fix column names (BigQuery doesn't like special characters)
    df_clean.columns = df_clean.columns.str.replace('[^a-zA-Z0-9_]', '_', regex=True)

    # 2. Convert problematic types (lists, dicts) to strings
    for col in df_clean.columns:
        # Check if column contains lists, dicts, or None
        if df_clean[col].apply(lambda x: isinstance(x, (list, dict, tuple))).any():
            print(f"Converting {col} to string (contains lists/dicts)")
            df_clean[col] = df_clean[col].astype(str)

        # Replace NaN/None with empty string for string columns
        if df_clean[col].dtype == 'object':
            df_clean[col] = df_clean[col].fillna('')

    return df_clean

# Clean the dataframe
print(f"Cleaning dataframe with {len(df1)} rows...")
df1_clean = clean_dataframe_for_bigquery(df1)
print(f"Original columns: {list(df1.columns)}")
print(f"Cleaned columns: {list(df1_clean.columns)}")


# Task 2

In [None]:
def get_addresses_by_trash_class(trash_classes, project_id="trash-optimizer-479913", dataset_name="nantes"):

    """
    Query BigQuery for addresses with specific trash classes

    Parameters:
    - trash_classes: list of strings, e.g., ['Verre', 'Papier', 'Emballages']
    - project_id: Google Cloud project ID
    - dataset_name: BigQuery dataset name

    Returns:
    - DataFrame with columns: address, latitude, longitude, trash_class, and other relevant info
    """

    from google.cloud import bigquery
    import pandas as pd

    # Initialize BigQuery client
    client = bigquery.Client(project=project_id)

    # First, let's check what tables we have and their schemas
    print("Checking available tables in dataset...")

    try:
        # List all tables
        tables = list(client.list_tables(dataset_name))
        table_names = [table.table_id for table in tables]

        print(f"Available tables: {table_names}")

        # Let's check each table to see which has trash class information
        tables_with_classes = []

        for table_name in table_names:
            try:
                # Get table schema
                table_ref = client.get_table(f"{project_id}.{dataset_name}.{table_name}")
                columns = [field.name for field in table_ref.schema]

                # Check for columns that might contain trash class info
                possible_class_columns = []
                for col in columns:
                    if any(keyword in col.lower() for keyword in ['type', 'categorie', 'class', 'dechet', 'trash', 'waste']):
                        possible_class_columns.append(col)

                if possible_class_columns:
                    tables_with_classes.append({
                        'table_name': table_name,
                        'columns': columns,
                        'class_columns': possible_class_columns
                    })

            except Exception as e:
                continue

        print(f"\nTables that might have trash class info: {len(tables_with_classes)}")

        if not tables_with_classes:
            print("No tables found with trash class information.")
            return pd.DataFrame()

        # Let's examine each candidate table
        all_results = []

        for table_info in tables_with_classes:
            table_name = table_info['table_name']
            class_columns = table_info['class_columns']

            print(f"\nChecking table: {table_name}")
            print(f"Possible class columns: {class_columns}")

            # Try to query each possible class column
            for class_col in class_columns:
                try:
                    # Build query based on available columns
                    # First check if we have coordinate columns
                    table_ref = client.get_table(f"{project_id}.{dataset_name}.{table_name}")
                    all_columns = [field.name for field in table_ref.schema]

                    # Look for coordinate columns
                    coord_cols = []
                    for col in all_columns:
                        if any(coord in col.lower() for coord in ['lat', 'lon', 'latitude', 'longitude']):
                            coord_cols.append(col)

                    if len(coord_cols) < 2:
                        print(f"  Not enough coordinate columns in {table_name}")
                        continue

                    # Build SELECT clause
                    select_cols = []

                    # Address column
                    address_cols = [col for col in all_columns if any(keyword in col.lower()
                                                                    for keyword in ['adresse', 'address', 'location', 'lieu'])]
                    if address_cols:
                        select_cols.append(f"`{address_cols[0]}` as address")
                    else:
                        select_cols.append("NULL as address")

                    # Coordinates
                    lat_col = next((col for col in coord_cols if 'lat' in col.lower()), None)
                    lon_col = next((col for col in coord_cols if 'lon' in col.lower()), None)

                    if lat_col and lon_col:
                        select_cols.append(f"`{lat_col}` as latitude")
                        select_cols.append(f"`{lon_col}` as longitude")
                    else:
                        select_cols.append("NULL as latitude")
                        select_cols.append("NULL as longitude")

                    # Trash class
                    select_cols.append(f"`{class_col}` as trash_class")

                    # Table name for reference
                    select_cols.append(f"'{table_name}' as source_table")

                    # Build WHERE clause for trash classes
                    where_conditions = []
                    for trash_class in trash_classes:
                        # Use ILIKE for case-insensitive matching
                        where_conditions.append(f"LOWER(`{class_col}`) LIKE LOWER('%{trash_class}%')")

                    where_clause = " OR ".join(where_conditions)

                    # Build full query
                    query = f"""
                    SELECT {', '.join(select_cols)}
                    FROM `{project_id}.{dataset_name}.{table_name}`
                    WHERE ({where_clause})
                    AND `{lat_col}` IS NOT NULL
                    AND `{lon_col}` IS NOT NULL
                    """

                    print(f"  Querying column: {class_col}")

                    # Execute query
                    query_job = client.query(query)
                    results = query_job.result()

                    # Convert to DataFrame
                    df_results = results.to_dataframe()

                    if not df_results.empty:
                        print(f"    Found {len(df_results)} matching records")
                        all_results.append(df_results)

                except Exception as e:
                    print(f"    Error querying {class_col}: {str(e)[:100]}...")
                    continue

        # Combine all results
        if all_results:
            final_df = pd.concat(all_results, ignore_index=True)

            # Clean up the DataFrame
            # Remove duplicates (same coordinates and class)
            final_df = final_df.drop_duplicates(subset=['latitude', 'longitude', 'trash_class'])

            print(f"Found {len(final_df)} unique locations for trash classes: {trash_classes}")
            print(f"\nColumns: {list(final_df.columns)}")

            if not final_df.empty:
                print("First few results:")
                print(final_df[['address', 'trash_class', 'latitude', 'longitude']].head())

            return final_df

        else:
            print(f"No locations found for trash classes: {trash_classes}")
            return pd.DataFrame()

    except Exception as e:
        print(f"Error: {e}")
        return pd.DataFrame()