# Grafos

## Autores

| Nome | nUSP |
| :--- | :--- |
| Guilherme de Abreu Barreto | 12543033 |
| Lucas Eduardo Gulka Pulcinelli | 12547336 |
| Vinicio Yusuke Hayashibara | 13642797 |

In [1]:
import json
from dataclasses import dataclass
from typing import final

import pandas as pd
import psycopg2
from pyvis import network as net
from sqlalchemy import create_engine, text
from sqlalchemy.exc import ProgrammingError
from sqlalchemy.orm import (
    Mapped,
    declared_attr,
    declarative_base,
    mapped_column as column,
    sessionmaker,
)
from tqdm import tqdm

In [2]:
# Connection settings for the local PostgreSQL instance.
# NOTE(review): credentials are hardcoded; acceptable only for a throwaway
# local database.
DEFAULT_DATABASE = "postgres"  # maintenance database used to (re)create the target
FLIGHTS_DATABASE = "flights"  # database that will hold the graph
USER = "postgres"
PASSWORD = "postgres"
HOST = "localhost"
PORT = 5432
DRIVER = "postgresql+psycopg2"

# Engine bound to the maintenance database. PORT is now part of the URL, so
# changing the constant above actually takes effect.
engine = create_engine(
    f"{DRIVER}://{USER}:{PASSWORD}@{HOST}:{PORT}/{DEFAULT_DATABASE}", echo=True
)

In [3]:
# Recreate the flights database from scratch. The connection runs in
# AUTOCOMMIT mode because DROP DATABASE / CREATE DATABASE cannot run inside a
# transaction block.
with engine.connect().execution_options(isolation_level="AUTOCOMMIT") as conn:
    try:
        # Disconnect every session still attached to the target database so
        # the DROP below is not blocked by them. The database name is passed
        # as a bound parameter instead of being spliced into the SQL text.
        conn.execute(
            text(
                """
                SELECT pg_terminate_backend(pid)
                FROM pg_stat_activity
                WHERE datname = :database;
                """
            ),
            {"database": FLIGHTS_DATABASE},
        )
    except ProgrammingError as error:
        # Best effort: report the problem and still attempt the drop/create.
        print(f"Could not terminate existing connections: {error}")
    # Identifiers cannot be bound as parameters, so the database name is
    # interpolated here; it comes from a constant defined in this notebook.
    conn.execute(text(f"DROP DATABASE IF EXISTS {FLIGHTS_DATABASE};"))
    conn.execute(text(f"CREATE DATABASE {FLIGHTS_DATABASE};"))

2025-11-17 23:32:59,859 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-11-17 23:32:59,860 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-11-17 23:32:59,862 INFO sqlalchemy.engine.Engine select current_schema()
2025-11-17 23:32:59,863 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-11-17 23:32:59,865 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-11-17 23:32:59,866 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-11-17 23:32:59,869 INFO sqlalchemy.engine.Engine BEGIN (implicit; DBAPI should not BEGIN due to autocommit mode)
2025-11-17 23:32:59,870 INFO sqlalchemy.engine.Engine 
        SELECT pg_terminate_backend(pid)
        FROM pg_stat_activity
        WHERE datname = 'flights';
        
2025-11-17 23:32:59,871 INFO sqlalchemy.engine.Engine [generated in 0.00216s] {}
2025-11-17 23:32:59,874 INFO sqlalchemy.engine.Engine DROP DATABASE IF EXISTS flights;
2025-11-17 23:32:59,875 INFO sqlalchemy.engine.Engine [generated in 0.00071s] {}
2025-11-17 23:3

In [4]:
# Release the pooled connections of the engine being replaced before rebinding
# the name, so no idle session stays open against the previous database.
engine.dispose()

# From here on, `engine` targets the freshly created flights database.
engine = create_engine(
    f"{DRIVER}://{USER}:{PASSWORD}@{HOST}:{PORT}/{FLIGHTS_DATABASE}", echo=True
)

In [5]:
# Install and activate the Apache AGE extension, then put `ag_catalog` first
# on the search path.
with engine.connect().execution_options(isolation_level="AUTOCOMMIT") as conn:
    # Work on the underlying DBAPI connection so the statements go straight to
    # the driver.
    raw_conn = conn.connection

    # The cursor is used as a context manager so it is closed even if one of
    # the statements raises.
    with raw_conn.cursor() as cursor:
        cursor.execute("CREATE EXTENSION IF NOT EXISTS age;")
        # LOAD and SET are session-level settings.
        # NOTE(review): later cells presumably rely on the pool handing back
        # this same DBAPI connection — confirm, or move these statements into
        # a per-connection hook.
        cursor.execute("LOAD 'age';")
        cursor.execute('SET search_path = ag_catalog, "$user", public;')
        cursor.execute("SHOW search_path;")

        search_path = cursor.fetchone()

    print("Current search_path:", search_path[0])

2025-11-17 23:32:59,984 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-11-17 23:32:59,985 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-11-17 23:32:59,986 INFO sqlalchemy.engine.Engine select current_schema()
2025-11-17 23:32:59,987 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-11-17 23:32:59,988 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-11-17 23:32:59,988 INFO sqlalchemy.engine.Engine [raw sql] {}
Current search_path: ag_catalog, "$user", public


In [6]:
# Name of the AGE graph used by the rest of the notebook.
GRAPH = "openflights"

# Start from an empty graph: drop the existing one (if a namespace with that
# name is present) and create it again, all inside a single transaction.
with engine.begin() as conn:
    existing_namespaces = conn.execute(
        text(f"SELECT * FROM pg_namespace WHERE nspname = '{GRAPH}'")
    ).fetchall()

    if existing_namespaces:
        conn.execute(text(f"SELECT drop_graph('{GRAPH}', true)"))

    conn.execute(text(f"SELECT create_graph('{GRAPH}');"))

2025-11-17 23:33:00,054 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-11-17 23:33:00,056 INFO sqlalchemy.engine.Engine SELECT * FROM pg_namespace WHERE nspname = 'openflights'
2025-11-17 23:33:00,057 INFO sqlalchemy.engine.Engine [generated in 0.00083s] {}
2025-11-17 23:33:00,058 INFO sqlalchemy.engine.Engine SELECT create_graph('openflights');
2025-11-17 23:33:00,059 INFO sqlalchemy.engine.Engine [generated in 0.00067s] {}
2025-11-17 23:33:00,072 INFO sqlalchemy.engine.Engine COMMIT


In [7]:
# Column name -> pandas dtype for the header-less airports CSV, in file order.
airport_columns = dict(
    index='int64',
    name='string',
    city='string',
    country='string',
    iata='string',
    icao='string',
    lat='float64',
    lon='float64',
    altitude='int64',
    timezone='string',
    dst='string',
    tz='string',
    type='string',
    source='string',
)

# Read the file, key the frame by the `index` column and replace every missing
# value (the CSV spells them `\N`) with the literal string 'NULL'.
airports_df = (
    pd.read_csv(
        'data/airports.csv',
        names=list(airport_columns),
        header=None,
        na_values=['\\N'],
        dtype=airport_columns,
    )
    .set_index('index')
    .fillna('NULL')
)
airports_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7698 entries, 1 to 14110
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      7698 non-null   string 
 1   city      7698 non-null   string 
 2   country   7698 non-null   string 
 3   iata      7698 non-null   string 
 4   icao      7698 non-null   string 
 5   lat       7698 non-null   float64
 6   lon       7698 non-null   float64
 7   altitude  7698 non-null   int64  
 8   timezone  7698 non-null   string 
 9   dst       7698 non-null   string 
 10  tz        7698 non-null   string 
 11  type      7698 non-null   string 
 12  source    7698 non-null   string 
dtypes: float64(2), int64(1), string(10)
memory usage: 842.0 KB


In [8]:
def _quote_cypher(value) -> str:
    """Return ``value`` as a double-quoted Cypher string literal.

    Backslashes and double quotes are escaped so arbitrary text cannot end the
    literal early.
    """
    escaped = str(value).replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


# Columns rendered as bare numbers; every other column becomes a quoted string.
AIRPORT_NUMERIC_COLUMNS = ('lat', 'lon', 'altitude')


def _airport_node(index, row) -> str:
    """Build the ``(a_<index>:Airport {...})`` pattern for one airport row."""
    properties = [f"index: {index}"]
    for column_name, value in row.items():
        if column_name in AIRPORT_NUMERIC_COLUMNS:
            rendered = f"{value}"
        else:
            rendered = _quote_cypher(value)
        properties.append(f"{column_name}: {rendered}")
    return f"(a_{index}:Airport {{{', '.join(properties)}}})"


batch_size = 100
total_batches = (len(airports_df) + batch_size - 1) // batch_size

# Silence SQL echo while inserting; `finally` restores it even when a batch
# fails, so later cells keep their logging.
engine.echo = False
try:
    for batch_num in tqdm(range(total_batches)):
        start_idx = batch_num * batch_size
        end_idx = min(start_idx + batch_size, len(airports_df))
        batch_df = airports_df.iloc[start_idx:end_idx]

        nodes = [_airport_node(index, row) for index, row in batch_df.iterrows()]

        # One transaction and one CREATE statement per batch.
        # NOTE(review): a value containing `$$` would still terminate the
        # dollar-quoted Cypher body.
        with engine.begin() as conn:
            conn.execute(text(
                f"""
                SELECT * FROM cypher('{GRAPH}', $$
                CREATE {",".join(nodes)}
                $$) AS (result agtype);
                """
            ))
finally:
    engine.echo = True

print("Airport creation completed!")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77/77 [00:02<00:00, 27.75it/s]


In [9]:
# Column name -> pandas dtype for the header-less routes CSV, in file order.
# The nullable `Int64` dtype is used for the integer columns.
route_columns = dict(
    airline='string',
    airline_id='Int64',
    source='string',
    source_id='Int64',
    dest='string',
    dest_id='Int64',
    codeshare='string',
    stops='Int64',
    equipment='string',
)

# `\N` in the file marks a missing value.
routes_df = pd.read_csv(
    'data/routes.csv',
    names=list(route_columns),
    header=None,
    na_values=['\\N'],
    dtype=route_columns,
)
routes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67663 entries, 0 to 67662
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   airline     67663 non-null  string
 1   airline_id  67184 non-null  Int64 
 2   source      67663 non-null  string
 3   source_id   67443 non-null  Int64 
 4   dest        67663 non-null  string
 5   dest_id     67442 non-null  Int64 
 6   codeshare   14597 non-null  string
 7   stops       67663 non-null  Int64 
 8   equipment   67645 non-null  string
dtypes: Int64(4), string(5)
memory usage: 4.9 MB


In [10]:
def _cypher_literal(value, quoted=False) -> str:
    """Render ``value`` for use inside a Cypher property map.

    Missing values become ``NULL``. With ``quoted=True`` the value is emitted
    as a double-quoted string with backslashes and double quotes escaped;
    otherwise it is emitted bare.
    """
    if pd.isna(value):
        return 'NULL'
    if quoted:
        escaped = str(value).replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'
    return str(value)


def _airport_match(variable, airport_id, code) -> str:
    """Build the MATCH clause that binds ``variable`` to one route endpoint.

    The numeric airport id is matched against the `index` property written on
    every Airport node. When the id is missing, the code is matched against
    `icao` if it has four characters and against `iata` otherwise.
    NOTE(review): assumes `source_id`/`dest_id` use the same numbering as the
    airports `index` column — confirm against the data files.
    """
    if pd.notna(airport_id):
        return f"MATCH ({variable}:Airport {{index: {airport_id}}})"
    code_property = 'icao' if pd.notna(code) and len(code) == 4 else 'iata'
    code_literal = _cypher_literal(code, quoted=True)
    return f"MATCH ({variable}:Airport {{{code_property}: {code_literal}}})"


def _route_query(row) -> str:
    """Build the Cypher statement that creates one ROUTE relationship."""
    return f"""
    {_airport_match('source', row['source_id'], row['source'])}
    {_airport_match('dest', row['dest_id'], row['dest'])}
    CREATE (source)-[r:ROUTE {{
        airline: {_cypher_literal(row['airline'], quoted=True)},
        airline_id: {_cypher_literal(row['airline_id'])},
        source_id: {_cypher_literal(row['source_id'])},
        dest_id: {_cypher_literal(row['dest_id'])},
        codeshare: {_cypher_literal(row['codeshare'], quoted=True)},
        stops: {_cypher_literal(row['stops'])},
        equipment: {_cypher_literal(row['equipment'], quoted=True)}
    }}]->(dest)
    """


batch_size = 100
total_batches = (len(routes_df) + batch_size - 1) // batch_size

# Silence SQL echo while inserting; `finally` restores it even on failure.
engine.echo = False
try:
    for batch_num in tqdm(range(total_batches)):
        start_idx = batch_num * batch_size
        end_idx = min(start_idx + batch_size, len(routes_df))
        batch_df = routes_df.iloc[start_idx:end_idx]

        with engine.begin() as conn:
            for _, row in batch_df.iterrows():
                query = _route_query(row)
                try:
                    # Each statement runs inside its own SAVEPOINT. Without
                    # it, one failing statement aborts the whole PostgreSQL
                    # transaction and every other route in the batch is lost.
                    with conn.begin_nested():
                        conn.execute(text(
                            f"""
                            SELECT * FROM cypher('{GRAPH}', $$
                            {query}
                            $$) AS (result agtype);
                            """
                        ))
                except Exception as e:
                    print(f"Error executing query: {e}")
                    print(f"Problematic query: {query}")
                    continue
finally:
    engine.echo = True

print("Routes creation completed!")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 677/677 [05:22<00:00,  2.10it/s]

Routes creation completed!





In [26]:
# Cypher query listing routes from airports in São Paulo to airports in
# Brasília with zero, one, or two intermediate airports. The three branches
# are combined with UNION ALL and each returns the same six columns:
# airport names along the path, airline and `stops` value of every leg,
# `total_stops` (the constant 0 in every branch), `num_stops` (number of
# intermediate airports) and a human-readable `route_type` label.
# Every leg must have `stops = 0`, and intermediate airports may not be in
# either endpoint city. The Cypher itself has no ORDER BY; sorting is done by
# the SQL statement that wraps this query.
query = """
MATCH path1 = (sp:Airport {city: "São Paulo"})-[r1:ROUTE]->(bsb:Airport {city: "Brasília"})
WHERE r1.stops = 0
UNWIND nodes(path1) AS ap
WITH path1, collect(ap.name) AS airports
UNWIND relationships(path1) AS rel
WITH path1, airports, collect(rel.airline) AS airlines, collect(rel.stops) AS stops_per_leg
RETURN 
    airports,
    airlines,
    stops_per_leg,
    0 AS total_stops,
    0 AS num_stops,
    "Direct" AS route_type

UNION ALL

MATCH path2 = (sp:Airport {city: "São Paulo"})-[r2a:ROUTE]->(stop1:Airport)-[r2b:ROUTE]->(bsb:Airport {city: "Brasília"})
WHERE r2a.stops = 0 AND r2b.stops = 0
  AND stop1.city <> "São Paulo" AND stop1.city <> "Brasília"
UNWIND nodes(path2) AS ap
WITH path2, collect(ap.name) AS airports
UNWIND relationships(path2) AS rel
WITH path2, airports, collect(rel.airline) AS airlines, collect(rel.stops) AS stops_per_leg
RETURN 
    airports,
    airlines,
    stops_per_leg,
    0 AS total_stops,
    1 AS num_stops,
    "1 Stop" AS route_type

UNION ALL

MATCH path3 = (sp:Airport {city: "São Paulo"})-[r3a:ROUTE]->(stop1:Airport)-[r3b:ROUTE]->(stop2:Airport)-[r3c:ROUTE]->(bsb:Airport {city: "Brasília"})
WHERE r3a.stops = 0 AND r3b.stops = 0 AND r3c.stops = 0
  AND stop1.city <> "São Paulo" AND stop1.city <> "Brasília"
  AND stop2.city <> "São Paulo" AND stop2.city <> "Brasília"
  AND stop1 <> stop2
UNWIND nodes(path3) AS ap
WITH path3, collect(ap.name) AS airports
UNWIND relationships(path3) AS rel
WITH path3, airports, collect(rel.airline) AS airlines, collect(rel.stops) AS stops_per_leg
RETURN 
    airports,
    airlines,
    stops_per_leg,
    0 AS total_stops,
    2 AS num_stops,
    "2 Stops" AS route_type
"""

# Wrap the Cypher text in AGE's cypher() call; the six agtype columns mirror
# the RETURN list of the query, and the ordering is applied by the outer SQL.
routes_sql = text(f"""
        SELECT * FROM cypher('{GRAPH}', $$
        {query}
        $$) AS (airports agtype, airlines agtype, stops_per_leg agtype, total_stops agtype, num_stops agtype, route_type agtype)
        ORDER BY num_stops, total_stops;
    """)

# Print one block per route found, followed by a 50-dash separator line.
with engine.connect() as conn:
    result = conn.execute(routes_sql)

    for row in result:
        print(
            f"Route Type: {row.route_type}",
            f"Airports: {row.airports}",
            f"Airlines: {row.airlines}",
            f"Stops per leg: {row.stops_per_leg}",
            f"Total stops: {row.total_stops}",
            "-" * 50,
            sep="\n",
        )

2025-11-18 00:11:40,655 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-11-18 00:11:40,657 INFO sqlalchemy.engine.Engine 
        SELECT * FROM cypher('openflights', $$
        
MATCH path1 = (sp:Airport {city: "São Paulo"})-[r1:ROUTE]->(bsb:Airport {city: "Brasília"})
WHERE r1.stops = 0
UNWIND nodes(path1) AS ap
WITH path1, collect(ap.name) AS airports
UNWIND relationships(path1) AS rel
WITH path1, airports, collect(rel.airline) AS airlines, collect(rel.stops) AS stops_per_leg
RETURN 
    airports,
    airlines,
    stops_per_leg,
    0 AS total_stops,
    0 AS num_stops,
    "Direct" AS route_type

UNION ALL

MATCH path2 = (sp:Airport {city: "São Paulo"})-[r2a:ROUTE]->(stop1:Airport)-[r2b:ROUTE]->(bsb:Airport {city: "Brasília"})
WHERE r2a.stops = 0 AND r2b.stops = 0
  AND stop1.city <> "São Paulo" AND stop1.city <> "Brasília"
UNWIND nodes(path2) AS ap
WITH path2, collect(ap.name) AS airports
UNWIND relationships(path2) AS rel
WITH path2, airports, collect(rel.airline) AS airlines