In [0]:
import pandas as pd
import requests
from datetime import datetime, timedelta

# Location of the pipe-delimited accidents extract
file_path = "/Volumes/workspace/default/miningaccidents/Accidents.txt"  # Replace with your local or cloud path

# Every field is read as text; latin1 is used to avoid decoding errors
read_options = {"sep": "|", "quotechar": '"', "encoding": "latin1", "dtype": str}
df = pd.read_csv(file_path, **read_options)

# Normalise the headers: trimmed, lower-case, underscores instead of spaces
df.columns = list(map(lambda name: name.strip().lower().replace(" ", "_"), df.columns))

# Parse the two date columns (unparseable values become NaT), then discard
# rows that lack a mine id or an accident date
df = df.assign(
    accident_dt=pd.to_datetime(df['accident_dt'], errors='coerce'),
    return_to_work_dt=pd.to_datetime(df['return_to_work_dt'], errors='coerce'),
).dropna(subset=["mine_id", "accident_dt"])

# Preview cleaned data
# print(df.head())

# Hand the cleaned pandas frame to Spark and persist it as a Delta table
table_name = "mine_accidents"
spark_df = spark.createDataFrame(df)
spark_df.write.format("delta").mode("overwrite").saveAsTable(table_name)

print(f"🎉 Data saved as Delta table: {table_name}")

In [0]:
# Ten most frequent accident types, most common first
accident_type_counts = df['accident_type'].value_counts()
common_accidents = accident_type_counts.iloc[:10]

display(common_accidents)

In [0]:
# Not Required
# # Getting address data
# #/Volumes/workspace/default/miningaccidents/AddressOfRecord.txt

# import pandas as pd

# # Path to your extracted file
# file_path_add = "/Volumes/workspace/default/miningaccidents/AddressOfRecord.txt"  # Replace with your local or cloud path

# # Load the file using pipe delimiter and quoted values
# df_add = pd.read_csv(
#     file_path_add,
#     sep="|",
#     quotechar='"',
#     encoding="latin1",  # Use 'latin1' encoding to handle decoding errors
#     dtype=str  # Start with all fields as strings
# )

# # Strip whitespace from column names and convert to lowercase
# df_add.columns = [col.strip().lower().replace(" ", "_") for col in df_add.columns]

# # print(df_add.head())
# df_add.display()

In [0]:
#not required
# df_add = df_add.fillna('')
# # Combine zip_cd and state into one column
# df_add['zip_state'] = df_add.apply(lambda row: f"{row['zip_cd']}, {row['state']}", axis=1)
# # Replace empty cells with blanks
# display(df_add['zip_state'])

In [0]:
# Getting mines data location co-ordinates

import pandas as pd

# Path to the extracted Mines file
file_path_add = "/Volumes/workspace/default/miningaccidents/Mines.txt" # Replace with your local or cloud path

# Load the file using pipe delimiter and quoted values
df_add = pd.read_csv(
    file_path_add,
    sep="|",
    quotechar='"',
    encoding="latin1",  # Use 'latin1' encoding to handle decoding errors
    dtype=str  # Start with all fields as strings
)

# Strip whitespace from column names and convert to lowercase
df_add.columns = [col.strip().lower().replace(" ", "_") for col in df_add.columns]

# print(df_add.head())
# A pandas DataFrame has no .display() method (that exists only on Spark
# DataFrames), so render it with the notebook's display() helper instead.
display(df_add)

# Convert Pandas DataFrame to Spark DataFrame
spark_df_mine_locations = spark.createDataFrame(df_add)

# Save as Delta table in the default workspace
table_name = "mine_locations"
spark_df_mine_locations.write.format("delta").mode("overwrite").saveAsTable(table_name)

print(f"🎉 Data saved as Delta table: {table_name}")

In [0]:
%python
from pyspark.sql.functions import to_date

# Accident records written by the accidents load cell
mine_accidents_df = spark.table("default.mine_accidents")

# Mine records; 'coal_metal_ind' and 'mine_id' are renamed up front so they
# do not conflict with the accident columns after the join
mine_locations_df = (
    spark.table("default.mine_locations")
    .withColumnRenamed("coal_metal_ind", "coal_metal_ind_loc")
    .withColumnRenamed("mine_id", "mine_id_loc")
)

# An accident row is kept only when both the mine id and the controller id
# match a mine record (inner join)
join_conditions = [
    mine_accidents_df["mine_id"] == mine_locations_df["mine_id_loc"],
    mine_accidents_df["controller_id"] == mine_locations_df["current_controller_id"],
]
combined_df = mine_accidents_df.join(mine_locations_df, on=join_conditions, how='inner')

# Reduce 'accident_dt' to a plain date
combined_df = combined_df.withColumn("accident_dt", to_date("accident_dt"))

# Remove any previous copy of the table, then write the joined data as Delta
spark.sql("DROP TABLE IF EXISTS default.combined_mine_data")
combined_df.write.format("delta").mode("overwrite").saveAsTable("default.combined_mine_data")

print("🎉 Data saved as Delta table: default.combined_mine_data")

In [0]:
# Load the combined_mine_data table
combined_mine_data_df = spark.table("default.combined_mine_data")

# Distinct mine location / accident date combinations for fiscal years after 2020.
# accident_dt and fiscal_yr are part of the projection because later cells read
# both columns from result_df (weather lookups per accident date, fiscal-year
# listing); without them those cells fail on a fresh run.
query = """
SELECT DISTINCT latitude, longitude, current_mine_name, accident_dt, fiscal_yr
FROM default.combined_mine_data
WHERE fiscal_yr > 2020
"""

# Execute the query
result_df = spark.sql(query)

# Display the result
display(result_df)

In [0]:
# Row count of result_df; count() is a Spark action, so this executes the query
print(result_df.count())

In [0]:
# Requires the accidents frame (df) and the mines frame (df_add) from the load cells.
# Inner-join them on their shared 'mine_id' column.
df_combined = df.merge(df_add, how='inner', on='mine_id')

# Show the joined frame
display(df_combined)

In [0]:
# not required

# import pandas as pd
# from geopy.geocoders import Nominatim
# from geopy.exc import GeocoderTimedOut, GeocoderServiceError
# import time

# # Example DataFrame (remove this if you're using your actual df_add)
# # df_add = pd.DataFrame({'zip_cd': ['35040', '10001'], 'state': ['Alabama', 'New York']})

# # Step 1: Combine ZIP code and state into one column
# df_add['zip_state'] = df_add.apply(lambda row: f"{row['zip_cd']}, {row['state']}", axis=1)

# # Step 2: Initialize geolocator
# geolocator = Nominatim(user_agent="geoapi")

# # Step 3: Define geocoding function with retry and caching
# geo_cache = {}

# def get_lat_long(location):
#     if location in geo_cache:
#         return geo_cache[location]
    
#     try:
#         loc = geolocator.geocode(location)
#         time.sleep(1)  # Respect Nominatim rate limits

#         if loc:
#             result = (loc.latitude, loc.longitude)
#         else:
#             result = (None, None)
        
#         geo_cache[location] = result
#         return result
    
#     except (GeocoderTimedOut, GeocoderServiceError):
#         time.sleep(2)
#         return get_lat_long(location)

# # Step 4: Apply the function and split into new columns
# df_add[['latitude', 'longitude']] = df_add['zip_state'].apply(get_lat_long).apply(pd.Series)

# # Step 5: Preview the DataFrame
# display(df_add.head())


In [0]:
# not required

# %pip install opencage

# import pandas as pd
# from opencage.geocoder import OpenCageGeocode
# import time

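# # Your OpenCage API key: load it from an environment variable or secret store instead of
# # hardcoding it (the key that was committed here should be revoked and rotated)
# key = os.environ["OPENCAGE_API_KEY"]  # requires `import os`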
# geocoder = OpenCageGeocode(key)

# # Combine ZIP and state
# df_add['zip_state'] = df_add.apply(lambda row: f"{row['zip_cd']}, {row['state']}", axis=1)

# # Caching to avoid repeat lookups
# geo_cache = {}

# def get_lat_long(location):
#     if location in geo_cache:
#         return geo_cache[location]
    
#     try:
#         result = geocoder.geocode(location)
#         time.sleep(0.2)  # avoid hitting rate limits

#         if result and len(result):
#             lat = result[0]['geometry']['lat']
#             lng = result[0]['geometry']['lng']
#             geo_cache[location] = (lat, lng)
#             return lat, lng
#         else:
#             geo_cache[location] = (None, None)
#             return None, None

#     except Exception as e:
#         print(f"Error for {location}: {e}")
#         return None, None

# # Apply to DataFrame
# df_add[['latitude', 'longitude']] = df_add['zip_state'].apply(get_lat_long).apply(pd.Series)

# # Preview
# df_add.head()



In [0]:
# # %python
# # %pip install tabulate
# import tabulate
# import json
# import pandas as pd

# # Path to the downloaded file
# file_path = '/Volumes/workspace/default/miningaccidents/lite.json'  # Update this if needed

# # Step 1: Open and load JSON from the file
# with open(file_path, 'r', encoding='utf-8') as f:
#     stations = json.load(f)

# # # Load the data again in case the previous state is not preserved
# df = pd.read_json(file_path)

# # Flatten the 'name' column
# df_name = pd.json_normalize(df['name'], sep='_')
# df = pd.concat([df, df_name], axis=1)

# # Flatten the 'identifiers' column
# df_identifiers = pd.json_normalize(df['identifiers'], sep='_')
# df = pd.concat([df, df_identifiers], axis=1)

# # Flatten the 'location' column
# df_location = pd.json_normalize(df['location'], sep='_')
# df = pd.concat([df, df_location], axis=1)

# # Flatten the 'inventory' column
# df_inventory = pd.json_normalize(df['inventory'], sep='_')
# df = pd.concat([df, df_inventory], axis=1)

# # Drop the original nested columns
# df = df.drop(columns=['name', 'identifiers', 'location', 'inventory'])

# # Display the first 5 rows of the flattened DataFrame
# # print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

# # Print the column names and their data types
# #print(df.info())
# df.display()

In [0]:
from geopy.distance import geodesic

def find_nearest_station(mine_coord, df):
    """Return the row of ``df`` that lies closest to ``mine_coord``.

    Args:
        mine_coord: ``(latitude, longitude)`` pair of the reference point.
        df: pandas DataFrame with ``latitude`` and ``longitude`` columns.
            It is not modified.

    Returns:
        pandas.Series: the nearest row, extended with a ``distance_km`` entry
        holding its geodesic distance to ``mine_coord`` in kilometres.

    Raises:
        ValueError: if ``df`` has no rows.
    """
    if df.empty:
        raise ValueError("find_nearest_station() requires at least one station row")

    distances_km = df.apply(
        lambda row: geodesic(mine_coord, (row['latitude'], row['longitude'])).km,
        axis=1,
    )
    # assign() builds a new frame, so the caller's DataFrame is left untouched
    ranked = df.assign(distance_km=distances_km).sort_values('distance_km')
    return ranked.iloc[0]

# Example lookup for a single mine location.
# NOTE(review): `df` has to hold station rows with 'latitude' and 'longitude'
# columns at this point — confirm which cell is expected to build it.
mine_location = (41.305, -94.453333)
nearest_station = find_nearest_station(mine_location, df)
print(nearest_station)




In [0]:
# %pip install pandas==2.0.3
%restart_python
# Update the meteostat library
# %pip install --upgrade meteostat

In [0]:
%python
# Update the meteostat library
# %pip install --upgrade meteostat

from meteostat import Point, Daily
from datetime import datetime
import pandas as pd

# Define the location and date range
station = Point(41.305, -94.453333)  # Example mine coordinates (same point as the nearest-station example)
# `--` does not start a comment in Python; the alternative date that trailed
# this line made the cell a SyntaxError, so it is kept as a real comment.
start = datetime(2021, 2, 24)  # other date tried: 2008-05-13
end = datetime(2021, 2, 24)

# Fetch daily weather data
data = Daily(station, start, end)
weather_df = data.fetch()

# OPTIONAL: Replace missing values with None (if needed)
weather_df = weather_df.where(pd.notnull(weather_df), None)

# Display the DataFrame
print(weather_df)

# 44.146667	-72.481111
# 2021-02-24

In [0]:
from meteostat import Point, Daily
import pandas as pd
from datetime import datetime

# Two sample accidents (id, date, latitude, longitude) used to exercise the weather lookup
accidents = pd.DataFrame(
    [
        (1, '2023-07-15', 39.7392, -104.9903),
        (2, '2023-11-02', 40.01499, -105.2705),
        # 44.14, 72.48
    ],
    columns=['accident_id', 'date', 'latitude', 'longitude'],
)

# Function to get weather
def get_weather(lat, lon, date):
    """Look up the daily Meteostat record for one location and day.

    Args:
        lat: Latitude passed to ``Point``.
        lon: Longitude passed to ``Point``.
        date: Day to fetch, as a ``YYYY-MM-DD`` string.

    Returns:
        dict with the keys ``temp_c``, ``precip_mm``, ``snow_cm`` and
        ``wind_kmh``. The values come from the first fetched row; all four are
        None when nothing was returned or the lookup raised.
    """
    # result key -> column of the fetched frame
    metric_columns = {
        'temp_c': 'tavg',
        'precip_mm': 'prcp',
        'snow_cm': 'snow',
        'wind_kmh': 'wspd',
    }
    location = Point(lat, lon)
    day = datetime.strptime(date, "%Y-%m-%d")  # start and end are the same day

    try:
        observations = Daily(location, day, day).fetch()
        if observations.empty:
            return dict.fromkeys(metric_columns)
        return {
            label: observations[column].iloc[0]
            for label, column in metric_columns.items()
        }
    except Exception as e:
        print(f"Error fetching weather for {lat}, {lon} on {date}: {e}")
        return dict.fromkeys(metric_columns)

# Look up the weather for every accident row (one get_weather call per row)
def _weather_for_accident(accident):
    """Return the get_weather() dict for a single accident row."""
    return get_weather(accident['latitude'], accident['longitude'], accident['date'])

weather_data = accidents.apply(_weather_for_accident, axis=1)

# Expand the per-row dicts into columns and place them next to the accident columns
weather_df = pd.json_normalize(weather_data)
accidents_with_weather = pd.concat([accidents, weather_df], axis=1)

print(accidents_with_weather)


In [0]:
# Fiscal years present in result_df, newest first.
# NOTE(review): this needs a 'fiscal_yr' column on result_df — confirm that the
# query which builds result_df selects it.
fiscal_year_values = result_df.select("fiscal_yr").distinct()
distinct_fiscal_years = fiscal_year_values.orderBy("fiscal_yr", ascending=False)

# Show the list of fiscal years
display(distinct_fiscal_years)

In [0]:
%python
from meteostat import Point, Daily
from datetime import datetime
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, DoubleType, StringType

# Function to fetch weather details
def fetch_weather_for_row(lat, lon, date_str):
    """Fetch one day of Meteostat data for a location and classify it.

    The function is registered as a Spark UDF, so it accepts the column values
    as Spark hands them over: coordinates may be numeric strings and the day
    may be a ``datetime.date`` rather than text.

    Args:
        lat: Latitude as a number or numeric string.
        lon: Longitude as a number or numeric string.
        date_str: Day to look up, either a string starting with ``YYYY-MM-DD``
            or a ``datetime.date`` / ``datetime.datetime``.

    Returns:
        tuple: ``(temp_c, precip_mm, snow_cm, wind_kmh, weather_summary)``.
        The four measurements are plain Python floats, or None when missing.
        ``weather_summary`` is "Bad", "Good", or "Unknown" (bad input, no data,
        no usable measurement, or a lookup error).
    """
    unknown = (None, None, None, None, "Unknown")

    def _as_float(value):
        """Return ``value`` as a Python float, or None when missing or NaN."""
        if value is None:
            return None
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        # NaN is the only float that is not equal to itself
        return None if number != number else number

    try:
        latitude = _as_float(lat)
        longitude = _as_float(lon)
        if latitude is None or longitude is None or date_str is None:
            return unknown

        if isinstance(date_str, str):
            # Tolerate a trailing time part such as "2021-02-24 00:00:00"
            day = datetime.strptime(date_str[:10], "%Y-%m-%d")
        else:
            # date / datetime objects: keep only the calendar day
            day = datetime(date_str.year, date_str.month, date_str.day)

        data = Daily(Point(latitude, longitude), day, day).fetch()
        if data.empty:
            return unknown

        # Convert numpy scalars / NaN into float / None so the values fit the
        # nullable DoubleType fields of the UDF's return schema
        temp_c = _as_float(data['tavg'].iloc[0])
        precip_mm = _as_float(data['prcp'].iloc[0])
        snow_cm = _as_float(data['snow'].iloc[0])
        wind_kmh = _as_float(data['wspd'].iloc[0])

        if all(value is None for value in (temp_c, precip_mm, snow_cm, wind_kmh)):
            # Nothing was measured, so the day cannot be classified
            return unknown

        # Classify weather; a missing measurement never triggers "Bad"
        is_bad = (
            (temp_c is not None and (temp_c < 5 or temp_c > 30))
            or (precip_mm is not None and precip_mm >= 2)
            or (snow_cm is not None and snow_cm > 0)
            or (wind_kmh is not None and wind_kmh >= 40)
        )
        weather_summary = "Bad" if is_bad else "Good"

        return (temp_c, precip_mm, snow_cm, wind_kmh, weather_summary)

    except Exception as e:
        print(f"Error fetching weather for {lat}, {lon} on {date_str}: {e}")
        return unknown

# Struct returned by the UDF: four nullable doubles followed by the summary label
measurement_fields = ["temp_c", "precip_mm", "snow_cm", "wind_kmh"]
schema = StructType(
    [StructField(field_name, DoubleType(), True) for field_name in measurement_fields]
    + [StructField("weather_summary", StringType(), True)]
)

# Wrap the Python function as a UDF that yields that struct
fetch_weather_udf = udf(fetch_weather_for_row, schema)

# Compute the struct per row from latitude, longitude and accident date
weather_struct = fetch_weather_udf(
    result_df["latitude"], result_df["longitude"], result_df["accident_dt"]
)
result_df = result_df.withColumn("weather_data", weather_struct)

# Promote every struct field to a top-level column, then drop the struct itself
struct_field_paths = [f"weather_data.{field.name}" for field in schema.fields]
result_df = result_df.select("*", *struct_field_paths).drop("weather_data")

# Preview updated dataframe
display(result_df)

# Optional: Save to CSV
# result_df.write.csv("accident_data_with_weather.csv", header=True)

In [0]:
import requests

# One-off Open-Meteo archive lookup for a single location and day
latitude = 44.146667
longitude = -72.481111
date_str = "2019-09-19"

# Open-Meteo API URL
url = (
    f"https://archive-api.open-meteo.com/v1/archive?"
    f"latitude={latitude}&longitude={longitude}"
    f"&start_date={date_str}&end_date={date_str}"
    "&daily=temperature_2m_max,precipitation_sum,snowfall_sum,windspeed_10m_max"
    "&timezone=UTC"
)

# Bound the wait so a stalled connection cannot hang the notebook
response = requests.get(url, timeout=30)

if response.status_code == 200:
    data = response.json()
    daily = data['daily']

    temp_c = daily['temperature_2m_max'][0]
    precip_mm = daily['precipitation_sum'][0]
    snow_cm = daily['snowfall_sum'][0]
    wind_kmh = daily['windspeed_10m_max'][0]

    # Classify weather. A measurement may come back as null (None); guard each
    # comparison so a missing value neither raises nor counts as "Bad".
    if all(value is None for value in (temp_c, precip_mm, snow_cm, wind_kmh)):
        weather_summary = "Unknown"
    elif (
        (temp_c is not None and (temp_c < 5 or temp_c > 30))
        or (precip_mm is not None and precip_mm >= 2)
        or (snow_cm is not None and snow_cm > 0)
        or (wind_kmh is not None and wind_kmh >= 40)
    ):
        weather_summary = "Bad"
    else:
        weather_summary = "Good"

    print(f"Date: {date_str}")
    print(f"Location: ({latitude}, {longitude})")
    print(f"Max Temp (°C): {temp_c}")
    print(f"Precipitation (mm): {precip_mm}")
    print(f"Snowfall (cm): {snow_cm}")
    print(f"Max Wind Speed (km/h): {wind_kmh}")
    print(f"Weather Summary: {weather_summary}")

else:
    print(f"API Error: {response.status_code}")


In [0]:
%pip install pandas tqdm requests


In [0]:
%python
import requests
import pandas as pd
from tqdm import tqdm  # Progress bar

# Assuming result_df is a PySpark DataFrame
# Convert PySpark DataFrame to Pandas DataFrame
# (toPandas() collects every row of result_df into driver memory)
result_pd_df = result_df.toPandas()

# Function to fetch weather details for each row
def fetch_weather(row, timeout=30):
    """Fetch and classify one day of Open-Meteo archive weather for a row.

    Args:
        row: Mapping / pandas row providing ``latitude``, ``longitude`` and
            ``accident_dt`` (the day to look up).
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block the whole apply.

    Returns:
        pandas.Series: ``[temp_c, precip_mm, snow_cm, wind_kmh, weather_summary]``.
        Measurements the API reports as null stay None. ``weather_summary`` is
        "Bad", "Good", or "Unknown" (API error, exception, or no usable
        measurement).
    """
    lat = row['latitude']
    lon = row['longitude']
    date_str = row['accident_dt']
    unknown = [None, None, None, None, "Unknown"]

    try:
        # Open-Meteo API URL
        url = (
            f"https://archive-api.open-meteo.com/v1/archive?"
            f"latitude={lat}&longitude={lon}"
            f"&start_date={date_str}&end_date={date_str}"
            "&daily=temperature_2m_max,precipitation_sum,snowfall_sum,windspeed_10m_max"
            "&timezone=UTC"
        )
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            print(f"API error {response.status_code} for {lat}, {lon} on {date_str}")
            return pd.Series(unknown)

        daily_data = response.json()['daily']
        temp_c = daily_data['temperature_2m_max'][0]
        precip_mm = daily_data['precipitation_sum'][0]
        snow_cm = daily_data['snowfall_sum'][0]
        wind_kmh = daily_data['windspeed_10m_max'][0]

        if all(value is None for value in (temp_c, precip_mm, snow_cm, wind_kmh)):
            # Nothing was reported, so the day cannot be classified
            return pd.Series(unknown)

        # Classify weather; each comparison is guarded so one null measurement
        # no longer raises and throws away the values that were reported
        is_bad = (
            (temp_c is not None and (temp_c < 5 or temp_c > 30))
            or (precip_mm is not None and precip_mm >= 2)
            or (snow_cm is not None and snow_cm > 0)
            or (wind_kmh is not None and wind_kmh >= 40)
        )
        weather_summary = "Bad" if is_bad else "Good"

        return pd.Series([temp_c, precip_mm, snow_cm, wind_kmh, weather_summary])

    except Exception as e:
        print(f"Error fetching weather for {lat}, {lon} on {date_str}: {e}")
        return pd.Series(unknown)

# Run fetch_weather once per row (with a progress bar) and store the five
# returned values in their own columns
weather_columns = ['temp_c', 'precip_mm', 'snow_cm', 'wind_kmh', 'weather_summary']
tqdm.pandas(desc="Fetching Weather")
result_pd_df[weather_columns] = result_pd_df.progress_apply(fetch_weather, axis=1)

# Convert back to PySpark DataFrame if needed
result_df = spark.createDataFrame(result_pd_df)

# Preview updated DataFrame
display(result_df)

# Optionally save result
# result_df.toPandas().to_csv("result_df_with_weather.csv", index=False)

In [0]:
from pyspark.sql import functions as F
import requests
from tqdm import tqdm

# Distinct (latitude, longitude, accident date) combinations in result_df
location_date_columns = ["latitude", "longitude", "accident_dt"]
unique_locations_df = result_df.select(*location_date_columns).distinct()

# Pull them to the driver as a list of Rows so the API can be called per combination
unique_locations = unique_locations_df.collect()



# One tuple per unique location/date:
# (lat, lon, date, temp_c, precip_mm, snow_cm, wind_kmh, weather_summary)
weather_data = []

for row in tqdm(unique_locations, desc="Fetching Weather"):
    lat = row['latitude']
    lon = row['longitude']
    date_str = row['accident_dt']

    # Values recorded when the lookup fails or reports nothing usable
    temp_c = precip_mm = snow_cm = wind_kmh = None
    weather_summary = "Unknown"

    try:
        # API Call
        url = (
            f"https://archive-api.open-meteo.com/v1/archive?"
            f"latitude={lat}&longitude={lon}"
            f"&start_date={date_str}&end_date={date_str}"
            "&daily=temperature_2m_max,precipitation_sum,snowfall_sum,windspeed_10m_max"
            "&timezone=UTC"
        )
        # Bound the wait so one stalled request cannot hang the whole loop
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            daily_data = response.json()['daily']
            temp_c = daily_data['temperature_2m_max'][0]
            precip_mm = daily_data['precipitation_sum'][0]
            snow_cm = daily_data['snowfall_sum'][0]
            wind_kmh = daily_data['windspeed_10m_max'][0]

            # Classify weather; null measurements are skipped instead of
            # raising, and a day with no measurement at all stays "Unknown"
            if any(value is not None for value in (temp_c, precip_mm, snow_cm, wind_kmh)):
                is_bad = (
                    (temp_c is not None and (temp_c < 5 or temp_c > 30))
                    or (precip_mm is not None and precip_mm >= 2)
                    or (snow_cm is not None and snow_cm > 0)
                    or (wind_kmh is not None and wind_kmh >= 40)
                )
                weather_summary = "Bad" if is_bad else "Good"
        else:
            # API failure: keep the "Unknown" defaults, but say why
            print(f"API error {response.status_code} for {lat}, {lon} on {date_str}")

    except Exception as e:
        # Discard anything partially read and report the failure instead of
        # swallowing it silently
        temp_c = precip_mm = snow_cm = wind_kmh = None
        weather_summary = "Unknown"
        print(f"Error fetching weather for {lat}, {lon} on {date_str}: {e}")

    # Append result
    weather_data.append((lat, lon, date_str, temp_c, precip_mm, snow_cm, wind_kmh, weather_summary))

