In [1]:
import numpy as np
import pandas as pd
from pulp import *
from geopy.distance import distance
from geopy.distance import great_circle
import random
import pickle

In [13]:
def facility_cleaning(fac_df, emiss_df):
    """Join unit-level emissions to facility locations and total them per facility.

    Parameters
    ----------
    fac_df : pandas.DataFrame
        Facility table. Must contain 'Facility Id', 'Facility Name', 'City',
        'Zip Code', 'Address', 'County', 'Latitude', 'Longitude',
        'Industry Type (subparts)' and 'Industry Type (sectors)'.
    emiss_df : pandas.DataFrame
        Emissions table. Must contain 'Facility Id' and
        'Unit CO2 emissions (non-biogenic)'.

    Returns
    -------
    tuple
        ``(facility_latlon, facility_ids, facility_names, supply)`` where
        ``facility_latlon`` is a record array with fields ``lat`` and ``lon``
        and the other three are plain lists, all in the same row order.
        Only facilities whose summed emissions are strictly positive are kept.
    """
    emission_col = 'Unit CO2 emissions (non-biogenic)'

    # Normalise the facility column names used downstream.
    column_aliases = {'Facility Name': 'facility_name',
                      'City': 'city',
                      'Primary NAICS Code': 'naics_code',
                      'Industry Type (subparts)': 'industry_subparts',
                      'Industry Type (sectors)': 'industry_sectors'}
    fac_renamed = fac_df.rename(columns=column_aliases)

    facility_columns = ['Facility Id',
                        'facility_name',
                        'city',
                        'Zip Code',
                        'Address',
                        'County',
                        'Latitude',
                        'Longitude',
                        'industry_subparts',
                        'industry_sectors']
    # Left join: emission rows without a matching facility get NaN location
    # fields and are removed by the dropna() below.
    located = emiss_df.merge(fac_renamed[facility_columns],
                             how='left', on='Facility Id')

    complete_rows = located[['Facility Id', 'facility_name',
                             'Latitude', 'Longitude', emission_col]].dropna()
    per_facility = (
        complete_rows
        .groupby(['Facility Id', 'facility_name'])
        .agg({'Latitude': 'min',
              'Longitude': 'min',
              emission_col: 'sum'})
        .reset_index()
        .rename(columns={'Facility Id': 'facility_id',
                         'Latitude': 'lat',
                         'Longitude': 'lon',
                         emission_col: 'annual_emissions'})
    )

    emitters = per_facility[per_facility['annual_emissions'] > 0]

    return (emitters[['lat', 'lon']].to_records(index=False),
            emitters['facility_id'].tolist(),
            emitters['facility_name'].tolist(),
            emitters['annual_emissions'].tolist())


def get_distances(facility_latlon, pools_latlon):
    """Build the facility-by-pool great-circle distance matrix, in miles.

    Parameters
    ----------
    facility_latlon : sequence
        Indexable collection of points accepted by ``geopy``'s ``great_circle``.
    pools_latlon : sequence
        Indexable collection of points accepted by ``geopy``'s ``great_circle``.

    Returns
    -------
    list of list of float
        ``matrix[f][p]`` is the distance in miles between
        ``facility_latlon[f]`` and ``pools_latlon[p]``.
    """
    n_facilities = len(facility_latlon)
    n_pools = len(pools_latlon)
    return [
        [great_circle(facility_latlon[f], pools_latlon[p]).miles
         for p in range(n_pools)]
        for f in range(n_facilities)
    ]


def get_complex_costs(distance_matrix, distance_divisor=25, fixed_offset=2000000):
    """Convert a distance matrix into a cost matrix.

    Every entry ``d`` becomes ``d / distance_divisor + fixed_offset``. The
    defaults reproduce the values that were previously hard-coded (25 and
    2000000); what those figures represent is not recorded here.

    NOTE(review): this is an affine transform of distance with a positive
    slope. ``run_optimization`` forces every supplier to ship exactly its
    supply, so the total shipped volume is fixed and the offset only adds a
    constant to the objective. The optimal routing therefore looks like it
    would match the distance-only run -- confirm this is intended.

    Parameters
    ----------
    distance_matrix : iterable of iterable of numbers
        Row-major distances.
    distance_divisor : number, optional
        Value each distance is divided by (default 25).
    fixed_offset : number, optional
        Constant added to every entry (default 2000000).

    Returns
    -------
    list of list of float
        New matrix with the same shape as ``distance_matrix``.
    """
    return [
        [d / distance_divisor + fixed_offset for d in row]
        for row in distance_matrix
    ]


def run_optimization(n_receivers, n_suppliers, supply, demand, costs, distances, facility_names, facility_latlon, pools_ids, pools_latlon, filename_str):
    """Solve the supplier-to-receiver transportation LP and save the used routes.

    The model minimises ``sum(x[i, j] * costs[i][j])`` subject to each
    supplier ``i`` shipping exactly ``supply[i]`` and each receiver ``j``
    accepting at most ``demand[j]``.

    Parameters
    ----------
    n_receivers : int
        Number of receivers (columns of ``costs`` / ``distances``).
    n_suppliers : int
        Number of suppliers (rows of ``costs`` / ``distances``).
    supply : sequence of numbers
        Amount each supplier must ship (equality constraint).
    demand : sequence of numbers
        Upper bound on what each receiver can accept.
    costs : sequence of sequences
        ``costs[i][j]`` is the objective coefficient for route (i, j).
    distances : sequence of sequences
        ``distances[i][j]`` is only copied into the output rows.
    facility_names, facility_latlon : sequences
        Per-supplier label and location, copied into the output rows.
    pools_ids, pools_latlon : sequences
        Per-receiver id and location, copied into the output rows.
    filename_str : str
        Base name; results are written to ``data/results/<name>.pkl`` and
        ``data/results/<name>.html``.

    Returns
    -------
    list of list
        One ``[facility_name, facility_location, pool_id, pool_location,
        distance, volume]`` row per route with a strictly positive volume.
    """
    import warnings

    p_prob = LpProblem('Unbalaced_Transportation_Problem', LpMinimize)
    routes = [(i, j) for i in range(n_suppliers) for j in range(n_receivers)]

    x = LpVariable.dicts('X', routes, lowBound=0)
    p_prob += lpSum([x[i, j] * costs[i][j] for (i, j) in routes])

    # Every supplier ships exactly its supply.
    for i in range(n_suppliers):
        p_prob += lpSum([x[i, j] for j in range(n_receivers)]) == supply[i]

    # No receiver takes more than its capacity.
    for j in range(n_receivers):
        p_prob += lpSum([x[i, j] for i in range(n_suppliers)]) <= demand[j]

    # Solving problem
    p_prob.solve()

    status = LpStatus[p_prob.status]
    if status != 'Optimal':
        # Keep going (best effort) but make it obvious the numbers are suspect,
        # e.g. when total supply exceeds total demand.
        warnings.warn(
            "run_optimization: solver finished with status '{}'; "
            "saved volumes may not be a valid solution".format(status),
            stacklevel=2)

    # Read each volume through its (i, j) key. p_prob.variables() is ordered
    # by variable *name*, so 'X_(0,_10)' sorts before 'X_(0,_2)' and a
    # positional reshape assigns volumes to the wrong routes once an index
    # reaches 10.
    results = []
    for i in range(n_suppliers):
        f_name = facility_names[i]
        f_loc = facility_latlon[i]
        for j in range(n_receivers):
            vol = x[i, j].varValue
            # varValue is None when the solver produced no value for a variable.
            if vol is not None and vol > 0:
                results.append([f_name, f_loc, pools_ids[j], pools_latlon[j],
                                distances[i][j], vol])

    results_html = pd.DataFrame(
        results,
        columns=['facility_name', 'facility_location', 'pool_id',
                 'pool_location', 'distance', 'volume']).to_html()

    with open('data/results/{}.pkl'.format(filename_str), "wb") as pkl_file:
        pickle.dump(results, pkl_file)

    # Was `"...{}.html".filename_str`, which raised AttributeError.
    with open("data/results/{}.html".format(filename_str), "w") as html_file:
        html_file.write(results_html)

    return results



In [17]:
# Full well inventory.
wells = pd.read_csv("data/wells/AllWells_20210915.csv")

# Keep only Oil & Gas (OG), Dry Gas / no liquids (DG), Gas (GAS)
# and "Liquefied Gas" (LG) wells.
wells_oilgas = wells[wells['WellType'].isin(['OG', 'DG', 'GAS', 'LG'])]
wells_latlon = wells_oilgas[['Latitude', 'Longitude']].to_records(index=False)
wells_ids = wells_oilgas['API'].tolist()

# Pooled wells: one row per pool with its location and 'totalco2' volume.
pooled_wells = pd.read_csv("data/wells/pool_volumes.csv")
pools_latlon = pooled_wells[['Latitude', 'Longitude']].to_records(index=False)
pools_ids = pooled_wells['poolID'].tolist()
pools_volumes = pooled_wells['totalco2'].tolist()
n_pools = len(pools_ids)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [18]:
# Unit-level emissions, restricted to reporting year 2019.
emissions = pd.read_csv(r'./data/Emissions by Unit and Fuel Type - csv/UNIT_DATA-Table 1.csv')
emsCA2019 = emissions[emissions['Reporting Year'] == 2019]
facilities = pd.read_excel(r'./data/ghgp_data_2019.xlsx',
                          sheet_name='Direct Emitters', skiprows=3)
# One Michigan emitter lists its CA headquarters. Retag it as Michigan
facilities.loc[facilities['Facility Id'] == 1000594, 'State'] = 'MI'

facilities = facilities[(facilities.State == 'CA')]
facilities_cement = facilities[facilities['Cement Production'] > 0]

# All CA facilities.
facility_latlon, facility_ids, facility_names, supply = facility_cleaning(facilities, emsCA2019)
n_facilities = len(facility_ids)

# Cement-only subset. This previously passed `facilities`, which made the
# "cement" inputs identical to the all-facilities inputs.
facility_c_latlon, facility_c_ids, facility_c_names, supply_c = facility_cleaning(facilities_cement, emsCA2019)
n_facilities_c = len(facility_c_ids)

In [14]:
# Facility-to-pool great-circle distances (miles) for the all-facilities set,
# plus the cost matrix derived from them.
distances = get_distances(facility_latlon, pools_latlon)
complex_costs = get_complex_costs(distances)

In [15]:
# Same two matrices for the facility_c_* inputs used by the cement-only runs.
distances_c = get_distances(facility_c_latlon, pools_latlon)
complex_costs_c = get_complex_costs(distances_c)

In [None]:
## ALL RESULTS WITH COMPLEX COSTS
# Keyword arguments make the receiver/supplier ordering explicit.
results = run_optimization(n_receivers=n_pools,
                           n_suppliers=n_facilities,
                           supply=supply,
                           demand=pools_volumes,
                           costs=complex_costs,
                           distances=distances,
                           facility_names=facility_names,
                           facility_latlon=facility_latlon,
                           pools_ids=pools_ids,
                           pools_latlon=pools_latlon,
                           filename_str='all_facilities_complexcosts')




In [None]:
## CEMENT ONLY WITH COMPLEX COSTS
# Keyword arguments make the receiver/supplier ordering explicit.
results_c = run_optimization(n_receivers=n_pools,
                             n_suppliers=n_facilities_c,
                             supply=supply_c,
                             demand=pools_volumes,
                             costs=complex_costs_c,
                             distances=distances_c,
                             facility_names=facility_c_names,
                             facility_latlon=facility_c_latlon,
                             pools_ids=pools_ids,
                             pools_latlon=pools_latlon,
                             filename_str='cementonly_complexcosts')

In [None]:
## ALL RESULTS WITH DISTANCE ONLY
# Raw distances are used as the cost matrix. The output name was previously
# 'all_facilities_complexcosts', which overwrote the complex-cost run's files.
results = run_optimization(n_receivers=n_pools,
                           n_suppliers=n_facilities,
                           supply=supply,
                           demand=pools_volumes,
                           costs=distances,
                           distances=distances,
                           facility_names=facility_names,
                           facility_latlon=facility_latlon,
                           pools_ids=pools_ids,
                           pools_latlon=pools_latlon,
                           filename_str='all_facilities_distanceonly')

In [None]:
## CEMENT ONLY WITH DISTANCE ONLY
# Raw distances are used as the cost matrix. The output name was previously
# 'cementonly_complexcosts', which overwrote the complex-cost run's files.
results_c = run_optimization(n_receivers=n_pools,
                             n_suppliers=n_facilities_c,
                             supply=supply_c,
                             demand=pools_volumes,
                             costs=distances_c,
                             distances=distances_c,
                             facility_names=facility_c_names,
                             facility_latlon=facility_c_latlon,
                             pools_ids=pools_ids,
                             pools_latlon=pools_latlon,
                             filename_str='cementonly_distanceonly')