In [2]:
import pandas as pd

# Load the refinery dataset.
file_path = 'EIA_Refinery_Data.csv'
refineries_data = pd.read_csv(file_path)

# The dataset has no single column for a refinery's total production capacity,
# so the capacity-related process columns below are summed per refinery and
# used as an estimate. Many of these columns contain missing values; columns
# with too many nulls or non-relevant data are deliberately left out.
#
# NOTE(review): 'Catalytic Recorming' looks like a typo of 'Reforming'. It is
# kept as-is because the name must match the CSV header exactly - confirm
# against the file before changing it.
capacity_columns = [
    'Vacuum Dist',
    'Catalytic Cracking',
    'Hydro Cracking',
    'Catalytic Recorming',
    'Alkylates, Isomerization',
    'Desulfurization',
    'Fluid and Delayed Coking',
    'Asphalt and Road Oil',
]

# Estimated total capacity per refinery. DataFrame.sum skips NaN by default,
# so a missing value in any capacity column counts as zero for that refinery.
refineries_data['Total Capacity'] = refineries_data[capacity_columns].sum(axis=1)

# Keep only the refineries located in PADD III.
padd_iii_data = refineries_data[refineries_data['PADD'] == 3]

# Estimated capacity of every PADD III refinery (kept for reference; it is
# intentionally NOT the centroid denominator, see below).
total_capacity = padd_iii_data['Total Capacity'].sum()

# A refinery without coordinates cannot contribute to the numerator (its
# lat * capacity product is NaN and is skipped by sum()), so it must not
# contribute to the denominator either; otherwise the centroid is pulled
# toward (0, 0). Restrict both sides to rows with both coordinates present.
located_refineries = padd_iii_data.dropna(subset=['Latitude', 'Longitude'])
located_weights = located_refineries['Total Capacity']
located_capacity = located_weights.sum()

# Fail loudly instead of silently producing NaN coordinates from 0 / 0.
if located_capacity == 0:
    raise ValueError(
        "Cannot compute a capacity-weighted centroid for PADD III: "
        "no refinery has both coordinates and a non-zero estimated capacity."
    )

# Capacity-weighted average of the latitude and longitude coordinates.
weighted_avg_lat = (located_refineries['Latitude'] * located_weights).sum() / located_capacity
weighted_avg_lon = (located_refineries['Longitude'] * located_weights).sum() / located_capacity

weighted_avg_lat, weighted_avg_lon


(30.111955893502792, -94.00019300317739)