In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook.
# "local[*]" runs Spark in local mode using all available cores.
spark = (
    SparkSession.builder
    .appName("ApiDataExtration")
    .config("spark.master", "local[*]")
    .getOrCreate()
)



In [7]:
from sodapy import Socrata

# Reference URL of the dataset's query endpoint. It is not used by the sodapy
# client below, which builds its own request from the domain and dataset id.
url = 'https://data.cityofchicago.org/api/v3/views/85ca-t3if/query.json'

# Anonymous client (app token = None). sodapy's default request timeout is
# 10 seconds, which is easily exceeded by a single 100,000-row request, so an
# explicit, more generous timeout (in seconds) is passed here.
client = Socrata("data.cityofchicago.org", None, timeout=60)

# Fetch up to 100,000 records of dataset "85ca-t3if" as a list of dicts.
results = client.get("85ca-t3if", limit=100000)




In [5]:
# Display the first record to inspect the keys and value types returned by the API.
results[0]

{'crash_record_id': '8874ac8edcf59711dc4af0148a1f70fff8a287eb0698128dba8e60c5864087db020e970280012d9d2ce3556e2890ab2e8c7a7a1f00bfa6ea6b953c0e4164b6d1',
 'crash_date': '2025-08-07T00:00:00.000',
 'posted_speed_limit': '30',
 'traffic_control_device': 'NO CONTROLS',
 'device_condition': 'OTHER',
 'weather_condition': 'CLEAR',
 'lighting_condition': 'DARKNESS',
 'first_crash_type': 'PARKED MOTOR VEHICLE',
 'trafficway_type': 'NOT DIVIDED',
 'alignment': 'STRAIGHT AND LEVEL',
 'roadway_surface_cond': 'DRY',
 'road_defect': 'NO DEFECTS',
 'report_type': 'ON SCENE',
 'crash_type': 'INJURY AND / OR TOW DUE TO CRASH',
 'hit_and_run_i': 'Y',
 'damage': 'OVER $1,500',
 'date_police_notified': '2025-08-07T00:22:00.000',
 'prim_contributory_cause': 'IMPROPER OVERTAKING/PASSING',
 'sec_contributory_cause': 'IMPROPER LANE USAGE',
 'street_no': '5234',
 'street_direction': 'W',
 'street_name': 'MONTROSE AVE',
 'beat_of_occurrence': '1623',
 'num_units': '3',
 'most_severe_injury': 'NO INDICATION OF I

In [None]:
from pyspark.sql.functions import col
from pyspark.sql.types import ArrayType, StructType, StructField, DoubleType, IntegerType, StringType, TimestampType

# Target schema, declared by hand (instead of inferred) so that column names
# and types are consistent from run to run.
schema = StructType([
    StructField("crash_record_id", StringType(), True),
    StructField("crash_date", StringType(), True),
    StructField("posted_speed_limit", StringType(), True),
    StructField("traffic_control_device", StringType(), True),
    StructField("device_condition", StringType(), True),
    StructField("weather_condition", StringType(), True),
    StructField("lighting_condition", StringType(), True),
    StructField("first_crash_type", StringType(), True),
    StructField("trafficway_type", StringType(), True),
    StructField("alignment", StringType(), True),
    StructField("roadway_surface_cond", StringType(), True),
    StructField("road_defect", StringType(), True),
    StructField("report_type", StringType(), True),
    StructField("crash_type", StringType(), True),
    StructField("hit_and_run_i", StringType(), True),
    StructField("damage", StringType(), True),
    StructField("date_police_notified", StringType(), True),
    StructField("prim_contributory_cause", StringType(), True),
    StructField("sec_contributory_cause", StringType(), True),
    StructField("street_no", StringType(), True),
    StructField("street_direction", StringType(), True),
    StructField("street_name", StringType(), True),
    StructField("beat_of_occurrence", IntegerType(), True),
    StructField("num_units", IntegerType(), True),
    StructField("most_severe_injury", StringType(), True),
    StructField("injuries_total", IntegerType(), True),
    StructField("injuries_fatal", IntegerType(), True),
    StructField("injuries_incapacitating", IntegerType(), True),
    StructField("injuries_non_incapacitating", IntegerType(), True),
    StructField("injuries_reported_not_evident", IntegerType(), True),
    StructField("injuries_no_indication", IntegerType(), True),
    StructField("injuries_unknown", IntegerType(), True),
    StructField("crash_hour", StringType(), True),
    StructField("crash_day_of_week", StringType(), True),
    StructField("crash_month", StringType(), True),
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True),
    StructField("location", StructType([
        StructField("type", StringType(), True),
        StructField("coordinates", ArrayType(DoubleType()), True)
    ]), True),
    StructField(":@computed_region_rpca_8um6", StringType(), True)
])

# The API returns scalar values as strings (e.g. 'num_units': '3'), and
# createDataFrame verifies each value against the schema: a str is rejected
# for an IntegerType/DoubleType field. So the records are first loaded with
# those columns typed as strings, and converted to the target types afterwards.
raw_schema = StructType([
    StructField(field.name, StringType(), field.nullable)
    if isinstance(field.dataType, (IntegerType, DoubleType))
    else field
    for field in schema.fields
])

# Build the raw DataFrame from the list of dicts returned by the API.
# Keys that are not listed in the schema are ignored; missing keys become null.
df_raw = spark.createDataFrame(results, schema=raw_schema)

# Cast every column to the type declared in `schema`. Column names are wrapped
# in backticks because one of them contains special characters (":@...").
# NOTE: a value that cannot be parsed becomes null, or raises if Spark's ANSI
# mode is enabled.
df = df_raw.select([
    col(f"`{field.name}`").cast(field.dataType).alias(field.name)
    for field in schema.fields
])

df.printSchema()