In [0]:
import requests
import uuid
from datetime import datetime
from pyspark.sql.functions import col, lit, to_timestamp,udf, explode,from_unixtime
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType, ArrayType
from delta.tables import *

In [0]:
# NOTE(review): get_weather below is decorated with @logs, which is not defined
# in any cell visible here; presumably it is provided by the Logger notebook
# referenced on the next line. With this %run disabled, a fresh
# "Restart & Run All" would hit a NameError at the @logs line unless Logger was
# loaded some other way -- confirm, and re-enable if needed. (On Databricks,
# %run is expected to be the only command in its cell.)
#%run "/Users/evashrestha99@gmail.com/Weather pipeline task/Logger"

In [0]:
# Spark schema for the JSON document returned by the OpenWeatherMap
# `/data/2.5/weather` endpoint. It is used as the return type of the fetch UDF,
# so only the fields listed here survive; any other keys in the response are
# dropped.
#
# Field types must match the Python types produced by `response.json()`:
# a plain Python UDF does not coerce between int and float, so a Python float
# returned for an IntegerType field (or a Python int for a FloatType field) is
# silently stored as null instead of raising.
weather_schema = StructType([
    StructField('coord', StructType([
        StructField('lon', FloatType(), True),
        StructField('lat', FloatType(), True)
    ])),
    StructField('weather', ArrayType(StructType([
                StructField('id', IntegerType(), True),
                StructField('main', StringType(), True),
                StructField('description', StringType(), True),
                StructField('icon', StringType(), True)
    ]), True)),
    StructField('base', StringType(), True),
    StructField('main', StructType([
        StructField('temp', FloatType(), True),
        StructField('feels_like', FloatType(), True),
        StructField('temp_min', FloatType(), True),
        StructField('temp_max', FloatType(), True),
        StructField('pressure', IntegerType(), True),
        StructField('humidity', IntegerType(), True),
        StructField('sea_level', IntegerType(), True),
        StructField('grnd_level', IntegerType(), True)
    ])),
    StructField('visibility', IntegerType(), True),
    StructField('wind', StructType([
        # Wind speed is reported as a decimal value (e.g. 3.6); declaring it
        # as IntegerType made the column null for every non-integral reading.
        StructField('speed', FloatType(), True),
        StructField('deg', IntegerType(), True),
        StructField('gust', FloatType(), True)
    ])),
    StructField('clouds', StructType([
        # Cloudiness is a whole-number percentage in the API response;
        # declaring it as FloatType turned the integer values into nulls.
        StructField('all', IntegerType(), True)
    ])),
    # Unix epoch seconds of the observation.
    StructField('dt', IntegerType(), True),
    StructField('sys', StructType([
        StructField('country', StringType(), True),
        StructField('sunrise', IntegerType(), True),
        StructField('sunset', IntegerType(), True)
    ])),
    StructField('timezone', IntegerType(), True),
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('cod', IntegerType(), True)
])

In [0]:
#fetch raw data from api

def _align_json_with_schema(value, data_type):
    """Recursively shape a parsed-JSON value to fit a Spark data type.

    Only lossless adjustments are made: a Python ``int`` destined for a
    ``FloatType`` field is converted to ``float`` (a plain Python UDF would
    otherwise store it as null). Structs are reduced to the schema's fields,
    with missing keys mapped to ``None``. Values whose shape does not match
    the schema are returned untouched so Spark handles them as before.
    """
    if value is None:
        return None
    if isinstance(data_type, StructType):
        if not isinstance(value, dict):
            return value
        return {
            field.name: _align_json_with_schema(value.get(field.name), field.dataType)
            for field in data_type.fields
        }
    if isinstance(data_type, ArrayType):
        if not isinstance(value, list):
            return value
        return [_align_json_with_schema(item, data_type.elementType) for item in value]
    # bool is a subclass of int; leave it alone so it is not turned into 0.0/1.0.
    if isinstance(data_type, FloatType) and isinstance(value, int) and not isinstance(value, bool):
        return float(value)
    return value


@logs
def get_weather(df):
    """Fetch the current weather for every city id in ``df``.

    Args:
        df: Spark DataFrame with an ``id`` column holding OpenWeatherMap city
            ids; one API request is issued per row.

    Returns:
        A tuple ``(df, start, end)`` where ``df`` has a single struct column
        ``result`` shaped like ``weather_schema``, and ``start`` / ``end`` are
        ``datetime`` objects built from the smallest and largest ``result.dt``
        (epoch seconds, converted with ``datetime.fromtimestamp`` and therefore
        expressed in the driver's local time zone).

    Raises:
        ValueError: if no row produced a ``dt`` value (empty input, or every
            API call returned an error payload).

    Note:
        The function is wrapped by ``@logs``, which is defined outside this
        section; callers may therefore pass additional arguments that the
        decorator handles.
    """
    import os  # local import: keeps this cell self-contained

    # SECURITY NOTE(review): the API key is committed in plain text. It is kept
    # as a fallback so the notebook keeps working, but it should be rotated and
    # supplied via the OPENWEATHER_API_KEY environment variable or a secret
    # scope. Resolved here on the driver and captured by the UDF closure.
    api_key = os.environ.get('OPENWEATHER_API_KEY', '44719617124a11a924063c357bb44bc9')

    def fetch_weather_data(city_id):
        # A timeout prevents one stalled HTTP call from hanging an executor task.
        response = requests.get(
            'https://api.openweathermap.org/data/2.5/weather',
            params={'id': city_id, 'appid': api_key},
            timeout=30,
        )
        return _align_json_with_schema(response.json(), weather_schema)

    # The UDF performs a network call, so tell Spark not to assume repeated
    # invocations return the same value.
    fetch_weather_udf = udf(fetch_weather_data, weather_schema).asNondeterministic()

    # Spark evaluates lazily: without caching, each action below and every
    # downstream action would call the API again for every row. cache() lets
    # the aggregation materialise the responses for reuse (best effort; cached
    # partitions can still be evicted and recomputed).
    df = df.withColumn('result', fetch_weather_udf(col('id'))).select('result').cache()

    # One aggregation (one Spark job) for both bounds instead of two.
    bounds = df.selectExpr("min(result.dt)", "max(result.dt)").first()
    earliest, latest = bounds[0], bounds[1]
    if earliest is None or latest is None:
        raise ValueError("No weather observations were returned (no 'dt' values in the API results)")

    start = datetime.fromtimestamp(earliest)
    end = datetime.fromtimestamp(latest)

    return df, start, end

In [0]:
# Load the cities Delta table, keep 5 rows and cast `id` to int.
# No ordering is applied before the limit, so which 5 rows are kept is up to Spark.
cities_path = "dbfs:/user/hive/warehouse/cities"
df = (
    spark.read.format("delta")
    .load(cities_path)
    .limit(5)
    .withColumn('id', col('id').cast('int'))
)
display(df)

In [0]:
%sql
-- Drop the weather_raw table if it already exists.
DROP TABLE IF EXISTS weather_raw

In [0]:
# Recursively delete the weather_raw directory under the Hive warehouse path.
weather_raw_path = "/user/hive/warehouse/weather_raw"
dbutils.fs.rm(weather_raw_path, recurse=True)

In [0]:
# Fetch raw weather for the selected cities. get_weather is declared as
# get_weather(df); the two leading arguments ('RAW', 'weather_raw') are
# presumably consumed by the @logs decorator, which is defined outside the
# visible cells -- TODO confirm what it does with them and what it returns,
# since the undecorated function returns a (df, start, end) tuple.
raw_df = get_weather('RAW', 'weather_raw', df)
