# Optimizing for CO2 storage from CO2 Emitters

## Setup

In [109]:
import numpy as np
import pandas as pd
from pulp import *
from geopy.distance import distance
from geopy.distance import great_circle
import random
import pickle

In [112]:
def facility_cleaning(fac_df, emiss_df):
    facilitiesB = fac_df.rename(inplace=False,
                                    columns={'Facility Name':'facility_name',
                                             'City':'city',
                                             'Primary NAICS Code':'naics_code',
                                             'Industry Type (subparts)':'industry_subparts',
                                             'Industry Type (sectors)':'industry_sectors'})

    emsCA2019loc = pd.merge(emiss_df, facilitiesB[['Facility Id',
                                                    'facility_name',
    #                                               'Facility Name2',
                                                   'city',
                                                   'Zip Code',
                                                   'Address',
                                                   'County',
                                                   'Latitude',
                                                   'Longitude',
    #                                               'Primary NAICS Code2',
                                                   'industry_subparts',
                                                   'industry_sectors']],
                            how='left', on='Facility Id')
    emsCA2019locagg = emsCA2019loc[['Facility Id','facility_name','Latitude','Longitude',
                                    'Unit CO2 emissions (non-biogenic)']].dropna().groupby(
                                        ['Facility Id','facility_name']).agg({'Latitude':'min',
                                                            'Longitude':'min',
                                                            'Unit CO2 emissions (non-biogenic)':'sum'})
    emsCA2019locagg.reset_index(inplace=True)
    emsCA2019locagg = emsCA2019locagg.rename(columns={'Facility Id':'facility_id',
                                                      'facility_name':'facility_name',
                                                      'Latitude':'lat',
                                                      'Longitude':'lon',
                                                      'Unit CO2 emissions (non-biogenic)':'annual_emissions'})
    emsCA2019locagg = emsCA2019locagg[emsCA2019locagg['annual_emissions'] > 0]
    facility_latlon = emsCA2019locagg[['lat', 'lon']].to_records(index=False)
    facility_ids = emsCA2019locagg['facility_id'].tolist()
    facility_names = emsCA2019locagg['facility_name'].tolist()
    supply = emsCA2019locagg['annual_emissions'].tolist()
    return facility_latlon, facility_ids, facility_names, supply


def get_distances(facility_latlon, pools_latlon):
    pools_distance_matrix = []
    for i in range(len(facility_latlon)):
        distances = []
        for j in range(len(pools_latlon)):
            dist = great_circle(facility_latlon[i], pools_latlon[j]).miles
            distances.append(dist)
        pools_distance_matrix.append(distances)
    return pools_distance_matrix


def get_complex_costs(distance_matrix, pool_volumes, emitter_volumes):
    complex_costs = []
    for i in range(len(distance_matrix)):
        new = []
        for j in range(len(distance_matrix[i])):
            # cost = 2 * ($2*miles)*(volume/25) + 2,000,000
            # cost of truck = $2 per mile X2 for round trip
            # number of trucks = volume / 25
            # cost = cost of trunk * number of trucks + 2,000,000
            volume = min(pool_volumes[j], emitter_volumes[i])
            cost = 4 * distance_matrix[i][j] * volume/25 + 2000000
            new.append(cost)
        complex_costs.append(new)
    return complex_costs


def run_optimization(n_receivers, n_suppliers, supply, demand, costs, distances, facility_names, facility_latlon, pools_ids, pools_latlon, industry_str, year):
    print("Running optimization for {} industry and {} year timeframe".format(industry_str, year))
    supply = [i*year for i in supply]
    
    p_prob = LpProblem('Unbalaced_Transportation_Problem', LpMinimize)
    #costs = pools_distance_matrix_complexcost
    routes = [(i, j) for i in range(n_suppliers) for j in range(n_receivers)]

    x = LpVariable.dicts('X', routes, lowBound=0)
    p_prob += lpSum([x[i, j] * costs[i][j] for i in range(n_suppliers) for j in range(n_receivers)])

    for i in range(n_suppliers):
        p_prob += lpSum([x[i, j] for j in range(n_receivers)]) == supply[i]

    for j in range(n_receivers):
        p_prob += lpSum([x[i, j] for i in range(n_suppliers)]) <= demand[j]

    # Solving problem
    p_prob.solve()
    print("Problem successfully solved")
    
    volumes_moved = [i.varValue for i in p_prob.variables()]
    volume_matrix = []
    k=0
    while k < n_suppliers:
        vols = []
        for i in range(n_receivers):
            j = i + k*n_receivers
            vols.append(volumes_moved[j])
        volume_matrix.append(vols)
        k+=1
    print("Volume matrix created")
        
    results = []
    for i in range(n_suppliers):
        f_name = facility_names[i]
        f_loc = facility_latlon[i]
        for j in range(n_receivers):
            vol = volume_matrix[i][j]
            if vol > 0:
                w_id = pools_ids[j]
                w_loc = pools_latlon[j]
                dist = distances[i][j]
                cost = costs[i][j]
                cpt = cost/vol
                r = [f_name, f_loc, w_id, w_loc, dist, cost, vol, cpt, industry_str, year]
                results.append(r)
    results_df = pd.DataFrame(results, columns = ['facility_name', 'facility_location','pool_id','pool_location','distance','cost','volume', 'cost_per_ton', 'industry', 'timeframe'])
    print("Results created")
    print()
    print("--------------------------------------------")
    
    return results_df


## Data import and cleaning

In [115]:
wells = pd.read_csv("data/wells/AllWells_20210915.csv")

# Oil & Gas, Dry Gas (no liquids), Gas and "Liquefied Gas"
wells_oilgas = wells[ (wells['WellType'] == 'OG')  |
                      (wells['WellType'] == 'DG')  |
                      (wells['WellType'] == 'GAS') |
                      (wells['WellType'] == 'LG') ]
wells_latlon = wells_oilgas[['Latitude', 'Longitude']].to_records(index=False)
wells_ids = wells_oilgas['API'].tolist()
pooled_wells = pd.read_csv("data/pool_volumes.csv")
pools_latlon = pooled_wells[['Latitude', 'Longitude']].to_records(index=False)
pools_ids = pooled_wells['pool_id'].tolist()
pools_volumes = pooled_wells['totalco2'].tolist()
n_pools = len(pools_ids)

In [116]:
emissions = pd.read_csv(r'./data/Emissions by Unit and Fuel Type - csv/UNIT_DATA-Table 1.csv')
emsCA2019 = emissions[emissions['Reporting Year'] == 2019]
facilities = pd.read_excel(r'./data/ghgp_data_2019.xlsx',
                          sheet_name='Direct Emitters', skiprows=3)
# One Michegan emitter lists its CA headquarters. Retag it as Michigan
facilities.loc[facilities['Facility Id'] == 1000594, 'State'] = 'MI'

## Prepare data for optimization

In [117]:
facilities = facilities[(facilities.State == 'CA')]
facilities_cement = facilities[facilities['Cement Production'] > 0]
facilities_oil = facilities[facilities['Petroleum Refining'] > 0]
facilities_energy = facilities[facilities['Electricity Generation'] > 0]
facilities_steel = facilities[facilities['Iron and Steel Production'] > 0]

facility_latlon, facility_ids, facility_names, supply = facility_cleaning(facilities, emsCA2019)
n_facilities = len(facility_ids)

facility_c_latlon, facility_c_ids, facility_c_names, supply_c = facility_cleaning(facilities_cement, emsCA2019)
n_facilities_c = len(facility_c_ids)

facility_o_latlon, facility_o_ids, facility_o_names, supply_o = facility_cleaning(facilities_oil, emsCA2019)
n_facilities_o = len(facility_o_ids)

facility_e_latlon, facility_e_ids, facility_e_names, supply_e = facility_cleaning(facilities_energy, emsCA2019)
n_facilities_e = len(facility_e_ids)

facility_s_latlon, facility_s_ids, facility_s_names, supply_s = facility_cleaning(facilities_steel, emsCA2019)
n_facilities_s = len(facility_s_ids)

In [118]:
distances_c = get_distances(facility_c_latlon, pools_latlon)
complex_costs_c = get_complex_costs(distances_c, pools_volumes, supply_c)

distances_o = get_distances(facility_o_latlon, pools_latlon)
complex_costs_o = get_complex_costs(distances_o, pools_volumes, supply_o)

distances_e = get_distances(facility_e_latlon, pools_latlon)
complex_costs_e = get_complex_costs(distances_e, pools_volumes, supply_e)

distances_s = get_distances(facility_s_latlon, pools_latlon)
complex_costs_s = get_complex_costs(distances_s, pools_volumes, supply_s)

In [126]:
industry_list = ['cement', 'oil_refining', 'steel', 'energy']
year_list = [1, 5, 10, 20, 30]

inputs_list = []
for i in year_list:
    cement = [n_facilities_c, supply_c, complex_costs_c, distances_c, facility_c_names, facility_c_latlon, 'cement', i]
    oil = [n_facilities_o, supply_o, complex_costs_o, distances_o, facility_o_names, facility_o_latlon, 'oil_refining', i]
    energy = [n_facilities_e, supply_e, complex_costs_e, distances_e, facility_e_names, facility_e_latlon, 'energy', i]
    steel = [n_facilities_s, supply_s, complex_costs_s, distances_s, facility_s_names, facility_s_latlon, 'steel', i]
    inputs_list.append(cement)
    inputs_list.append(oil)
    inputs_list.append(steel)
    inputs_list.append(energy)

    

## Run optimization and write results

In [127]:
detailed_results = pd.DataFrame()

for i in inputs_list:
    results = run_optimization(n_pools,
                               i[0],
                               i[1],
                               pools_volumes,
                               i[2],
                               i[3],
                               i[4],
                               i[5],
                               pools_ids,
                               pools_latlon,
                               i[6],
                               i[7])
    detailed_results = detailed_results.append(results)
    

Running optimization for cement industry and 1 year timeframe
Problem successfully solved
Volume matrix created
Results created

--------------------------------------------
Running optimization for oil_refining industry and 1 year timeframe
Problem successfully solved
Volume matrix created
Results created

--------------------------------------------
Running optimization for steel industry and 1 year timeframe
Problem successfully solved
Volume matrix created
Results created

--------------------------------------------
Running optimization for energy industry and 1 year timeframe
Problem successfully solved
Volume matrix created
Results created

--------------------------------------------
Running optimization for cement industry and 5 year timeframe
Problem successfully solved
Volume matrix created
Results created

--------------------------------------------
Running optimization for oil_refining industry and 5 year timeframe
Problem successfully solved
Volume matrix created
Results

In [128]:
list_results = detailed_results.values.tolist()
file_name = 'data/results/full_detailed_results.pkl'
open_file = open(file_name, "wb")
pickle.dump(list_results, open_file)
open_file.close()

html_results = detailed_results.to_html()
html_file = open("data/results/full_detailed_results.html", "w")
html_file.write(html_results)
html_file.close()

In [129]:
summary_results = detailed_results.groupby(by=['industry','timeframe']).agg(
    volume_transferred=('volume', 'sum'),
    total_cost=('cost', 'sum'),
    total_distance=('distance', 'sum')  
)
summary_results['cost_per_ton'] = summary_results['total_cost']/summary_results['volume_transferred']


In [130]:
total_pool_volume = sum(pools_volumes)

summary_results['percent_pool_volume_used'] = summary_results['volume_transferred'] / total_pool_volume
summary_results

Unnamed: 0_level_0,Unnamed: 1_level_0,volume_transferred,total_cost,total_distance,cost_per_ton,percent_pool_volume_used
industry,timeframe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cement,1,18252.4,205241800.0,18411.660806,11244.64744,1e-05
cement,5,91262.0,321840000.0,26933.056665,3526.550229,5.2e-05
cement,10,182524.0,395847600.0,33726.292742,2168.74279,0.000105
cement,20,365048.0,497125900.0,43055.013564,1361.809806,0.00021
cement,30,547572.0,564558500.0,48317.452634,1031.021508,0.000315
energy,1,27951620.0,4109521000.0,158966.726577,147.022679,0.01607
energy,5,139758100.0,5622285000.0,199146.946142,40.228694,0.080348
energy,10,279516200.0,6435594000.0,216748.092135,23.024048,0.160697
energy,20,559032300.0,6362631000.0,225899.307623,11.381509,0.321393
energy,30,838548500.0,6660400000.0,229121.32921,7.942773,0.48209


In [131]:
list_summary_results = summary_results.values.tolist()
file_name = 'data/results/summary_results.pkl'
open_file = open(file_name, "wb")
pickle.dump(list_summary_results, open_file)
open_file.close()

html_summary_results = detailed_results.to_html()
html_file = open("data/results/summary_results.html", "w")
html_file.write(html_summary_results)
html_file.close()