In [87]:
#!pip install requests
#!pip install psycopg2

Collecting psycopg2
  Downloading psycopg2-2.9.9-cp311-cp311-win_amd64.whl.metadata (4.5 kB)
Downloading psycopg2-2.9.9-cp311-cp311-win_amd64.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   - -------------------------------------- 0.1/1.2 MB 1.3 MB/s eta 0:00:01
   ----------- ---------------------------- 0.3/1.2 MB 4.3 MB/s eta 0:00:01
   --------------------------------- ------ 1.0/1.2 MB 7.7 MB/s eta 0:00:01
   ---------------------------------------- 1.2/1.2 MB 7.4 MB/s eta 0:00:00
Installing collected packages: psycopg2
Successfully installed psycopg2-2.9.9


In [28]:
import requests
import pytz
from datetime import datetime, timedelta
import os
import tempfile
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, when,concat, lit,regexp_replace
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import concat_ws
import json
import psycopg2

# Make the Spark driver and the Python workers use the interpreter running this kernel.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Absolute path of the PostgreSQL JDBC driver JAR (resolved relative to the working directory).
jar_path = os.path.abspath("./postgresql-42.7.4.jar")

# Stop a session left over from an earlier run of this cell before building a new one
# (presumably so the "spark.jars" setting below is applied to a fresh session).
# WARNING: stopping the session invalidates every DataFrame created from it. After
# re-running this cell, re-run the downstream cells as well; otherwise their actions
# fail with "Cannot call methods on a stopped SparkContext".
try:
    spark.stop()
except NameError:
    # First run in this kernel: there is no previous session to stop.
    pass
except Exception as stop_error:
    # Best effort: failing to stop the old session must not prevent creating a new one.
    print("Could not stop the previous Spark session:", stop_error)

# Create the Spark session with the JDBC driver JAR registered.
spark = SparkSession.builder \
    .appName('pySparkSetup') \
    .config("spark.jars", jar_path) \
    .getOrCreate()

In [22]:
# JDBC connection settings used by Spark.
# Values are taken from the standard libpq environment variables (PGHOST, PGPORT,
# PGDATABASE, PGUSER, PGPASSWORD) so credentials do not have to live in the notebook.
# The fallbacks are the previous hardcoded local-development values.
url_db = "jdbc:postgresql://{host}:{port}/{database}".format(
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
    database=os.environ.get("PGDATABASE", "postgres"),
)
properties = {
    "user": os.environ.get("PGUSER", "postgres"),          # PostgreSQL user
    "password": os.environ.get("PGPASSWORD", "postgres"),  # PostgreSQL password
    "driver": "org.postgresql.Driver"
}

In [23]:
# Direct psycopg2 connection to PostgreSQL.
# Connection values are read from the standard libpq environment variables
# (PGHOST, PGDATABASE, PGUSER, PGPASSWORD) so credentials are not hardcoded;
# the fallbacks are the previous local-development values.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    database=os.environ.get("PGDATABASE", "postgres"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "postgres")
)


In [24]:
# Reference time zone for the observation window.
timezone = pytz.timezone('US/Pacific')

# Window end is the current moment; window start is seven days earlier.
date_now = datetime.now(timezone)
last_days = date_now + timedelta(days=-7)

# Render both bounds as timestamp strings carrying a numeric UTC offset (e.g. -0700).
date_now_str, last_days_str = (
    bound.strftime("%Y-%m-%dT%H:%M:%S%z") for bound in (date_now, last_days)
)
print("date_now_str", date_now_str)
print("last_days_str", last_days_str)

# These two strings serve as the start/end bounds when querying
# https://api.weather.gov/stations/station_id/observations/

date_now_str 2024-09-06T09:44:48-0700
last_days_str 2024-08-30T09:44:48-0700


In [25]:
# Request parameters. Change station_id here to ingest a different station; when the
# notebook is driven by an external job (e.g. AWS Glue or a state machine) the
# station id is expected to arrive as a job argument instead.

station_id= "0128W" #"0112W" 


# Build the URL with the parameters
url_ep = f"https://api.weather.gov/stations/{station_id}/observations?start={last_days_str}&end={date_now_str}"


print("URL:", url_ep)

# Fetch the data. The timeout prevents the cell from hanging forever if the API stalls.
response = requests.get(url_ep, timeout=30)

# Anything other than 200 is treated as a failure.
if response.status_code != 200:
    print("Error", response.status_code)
    print(response.text)
    # Fail loudly: continuing would leave `df` undefined, or silently reuse a `df`
    # left in the kernel by an earlier successful run.
    raise RuntimeError(f"Observation request failed with HTTP {response.status_code}")

data = response.json()

# Write the observation features to a temporary JSON file so Spark can load them from disk.
with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode='w') as temp_file:
    json.dump(data['features'], temp_file)
    temp_filename = temp_file.name

# Read the JSON file. Spark evaluates lazily, so the temporary file must stay on disk
# for as long as `df` (or anything derived from it) is used; it is not deleted here.
df = spark.read.json(temp_filename)



URL: https://api.weather.gov/stations/0128W/observations?start=2024-08-30T09:44:48-0700&end=2024-09-06T09:44:48-0700


In [26]:
# Helper (wrapped as a Spark UDF) that follows the station link found in
# properties.station and returns the station name, time zone and coordinates.
def get_station_info(station_url, timeout=10):
    """Fetch station metadata from a station URL.

    Args:
        station_url: URL of the station resource.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A 4-tuple ``(station_name, time_zone, latitude, longitude)``.
        Coordinates are returned as strings. Every element is ``"Unknown"``
        when the request fails, the status is not 200, or the payload cannot
        be read; individual fields missing from the payload default to
        ``"Unknown"`` as well.
    """
    unknown = ("Unknown", "Unknown", "Unknown", "Unknown")  # Default values
    try:
        # Fetch data; the timeout keeps a stalled request from blocking the caller.
        response = requests.get(station_url, timeout=timeout)
        # Anything other than 200 is treated as a failure.
        if response.status_code != 200:
            return unknown

        station_data = response.json()
        station_props = station_data.get('properties', {})

        # Station name and time zone
        station_name = station_props.get('name', "Unknown")
        time_zone = station_props.get('timeZone', "Unknown")

        # GeoJSON positions are ordered [longitude, latitude], so the two values
        # are swapped here to return them in (latitude, longitude) order.
        coordinates = station_data.get('geometry', {}).get('coordinates', ["Unknown", "Unknown"])
        longitude, latitude = coordinates[0], coordinates[1]

        return (station_name, time_zone, str(latitude), str(longitude))
    except Exception:
        # Best effort: any network or parsing problem yields the default values.
        return unknown



In [29]:
# Second name for the raw DataFrame, so the transformations below start from df_aux
# while df keeps referring to the data as it was loaded.
df_aux = df

# Spark UDF wrapping get_station_info; its result is declared as an array of strings.
get_station_info_udf = udf(get_station_info, ArrayType(StringType()))

df_selected = (
    # Flatten the nested observation fields into top-level columns.
    df_aux.select(
        col("id"),
        col("properties.timestamp").alias("timestamp"),
        col("properties.temperature.value").alias("temperature_value"),
        col("properties.temperature.unitCode").alias("temperature_unit"),
        col("properties.windSpeed.value").alias("wind_speed_value"),
        col("properties.windSpeed.unitCode").alias("wind_speed_unit"),
        col("properties.relativeHumidity.value").alias("humidity_value"),
        col("properties.relativeHumidity.unitCode").alias("humidity_unit"),
        col("properties.station").alias("station_url"),
    )
    # Look up the station metadata through the UDF.
    .withColumn("station_info", get_station_info_udf(col("station_url")))
    # Expose elements 0..3 of the UDF result as individual columns.
    .withColumn("station_name", col("station_info").getItem(0))
    .withColumn("time_zone", col("station_info").getItem(1))
    .withColumn("latitude", col("station_info").getItem(2))
    .withColumn("longitude", col("station_info").getItem(3))
    # Tag every row with the station id that was requested.
    .withColumn("station_id", lit(station_id))
    # Strip the "wmoUnit:" prefix from the unit codes.
    .withColumn("temperature_unit", regexp_replace(col("temperature_unit"), "wmoUnit:", ""))
    .withColumn("humidity_unit", regexp_replace(col("humidity_unit"), "wmoUnit:", ""))
    .withColumn("wind_speed_unit", regexp_replace(col("wind_speed_unit"), "wmoUnit:", ""))
    # The intermediate array column is no longer needed.
    .drop("station_info")
)


In [30]:
# Create public.weather_information if it does not exist yet, then release the
# cursor and the connection. The try/finally blocks guarantee both are closed even
# when a statement fails. Because the connection is closed here, the cell that opens
# `conn` must be re-run before this cell can be run again.

# Query that checks whether the table exists.
table_exists_sql = """
    SELECT EXISTS (
        SELECT 1
        FROM information_schema.tables 
        WHERE table_schema = 'public' 
        AND table_name = 'weather_information'
    );
"""

# DDL for the target table.
# NOTE(review): the column name "createt_at" looks like a typo for "created_at"; it is
# kept unchanged because renaming it would diverge from tables that already exist.
create_table_sql = """
        CREATE TABLE public.weather_information (
            station_id varchar NOT NULL,
            station_name varchar NULL,
            station_timezone varchar NULL,
            latitude float8 NULL,
            longitude float8 NULL,
            observation_timestamp varchar NULL,
            temperature varchar NULL,
            temperature_unit varchar NULL,
            wind_speed varchar NULL,
            wind_speed_unit varchar NULL,
            humidity varchar NULL,
            humidity_unit varchar NULL,
            createt_at timestamp(0) NULL DEFAULT now(),
            station_url varchar NULL,
            id varchar NULL
        );
    """

try:
    # Create a cursor
    cursor = conn.cursor()
    try:
        # Check whether the table exists
        cursor.execute(table_exists_sql)
        table_exists = cursor.fetchone()[0]

        if not table_exists:
            cursor.execute(create_table_sql)
            conn.commit()
            print("Table created successfully.")
        else:
            print("Table exists.")
    finally:
        cursor.close()
finally:
    # Close the connection
    conn.close()

Table created successfully.


In [31]:
# Map each DataFrame column to its column in public.weather_information.
# Coordinates are cast to double (64-bit) to match the float8 table columns; a cast to
# "float" would truncate them to 32-bit precision first. "Unknown" becomes NULL.
df_sql = df_selected.select(
    col("station_id"),
    col("station_name"),
    col("time_zone").alias("station_timezone"),
    when(col("longitude") == "Unknown", None).otherwise(col("longitude").cast("double")).alias("longitude"), 
    when(col("latitude") == "Unknown", None).otherwise(col("latitude").cast("double")).alias("latitude"), 
    col("timestamp").alias("observation_timestamp"),
    col("temperature_value").alias("temperature"),
    col("temperature_unit"),
    col("wind_speed_value").alias("wind_speed"),
    col("wind_speed_unit"),
    col("humidity_value").alias("humidity"),
    col("humidity_unit"),
    col("station_url"),
    col("id")
)

# Subquery returning the latest observation_timestamp already stored for this station;
# it decides which rows still have to be inserted.
# The JDBC "table" argument cannot take bind parameters, so single quotes in station_id
# are doubled to keep the value inside the SQL string literal.
safe_station_id = station_id.replace("'", "''")
query = "(SELECT MAX(observation_timestamp) as max_timestamp FROM public.weather_information WHERE station_id = '"+safe_station_id+"' ) AS temp"


# Run the query through JDBC
df_max_timestamp = spark.read.jdbc(url=url_db, table=query, properties=properties)


max_timestamp = df_max_timestamp.collect()[0]["max_timestamp"]

# Keep only the rows whose timestamp is greater than the latest one in the table.
# When the table has no rows for this station, the whole DataFrame is inserted.
# NOTE: observation_timestamp is stored as varchar, so this is a string comparison.
if max_timestamp is None:
    df_filtered=df_sql
else:
    df_filtered = df_sql.filter(col("observation_timestamp") > max_timestamp)

# Show the filtered DataFrame
df_filtered.show()

# Append the rows to the database table (schema-qualified, matching the query above)
df_filtered.write.jdbc(url=url_db, table="public.weather_information", mode="append", properties=properties)


Py4JJavaError: An error occurred while calling o1095.showString.
: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:64)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:481)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
py4j.ClientServerConnection.run(ClientServerConnection.java:106)
java.base/java.lang.Thread.run(Thread.java:832)

The currently active SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:64)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:481)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
py4j.ClientServerConnection.run(ClientServerConnection.java:106)
java.base/java.lang.Thread.run(Thread.java:832)
         
	at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:120)
	at org.apache.spark.SparkContext.broadcastInternal(SparkContext.scala:1545)
	at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1530)
	at org.apache.spark.sql.execution.datasources.json.JsonFileFormat.buildReader(JsonFileFormat.scala:99)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues(FileFormat.scala:138)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues$(FileFormat.scala:129)
	at org.apache.spark.sql.execution.datasources.TextBasedFileFormat.buildReaderWithPartitionValues(FileFormat.scala:285)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:548)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:537)
	at org.apache.spark.sql.execution.FileSourceScanExec.doExecute(DataSourceScanExec.scala:576)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:527)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:455)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:454)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:498)
	at org.apache.spark.sql.execution.BaseLimitExec.inputRDDs(limit.scala:168)
	at org.apache.spark.sql.execution.BaseLimitExec.inputRDDs$(limit.scala:167)
	at org.apache.spark.sql.execution.LocalLimitExec.inputRDDs(limit.scala:208)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.python.EvalPythonExec.doExecute(EvalPythonExec.scala:88)
	at org.apache.spark.sql.execution.python.EvalPythonExec.doExecute$(EvalPythonExec.scala:87)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.doExecute(BatchEvalPythonExec.scala:34)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:527)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:455)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:454)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:498)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:195)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:364)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:498)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4177)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3161)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4167)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4165)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4165)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3161)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3382)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:284)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:323)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:64)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:832)
