# OCO2 - Count Cities and Plants inside a capture zone around the peak
Project for **Data For Good**, season 7. 

---

## Introduction


In [119]:
import pandas as pd
import geopandas as gpd
import numpy as np
from numpy import exp, loadtxt, pi, sqrt
import math
import matplotlib
import matplotlib.pyplot as plt
import swiftclient
import json
from io import StringIO
import folium
from folium import plugins
import geopy
from shapely.geometry import Polygon
from geopy.distance import VincentyDistance

config_path = "../configs/config.json"

We establish the connexion with the swift server to pull and push ressources.

In [120]:
with open(config_path) as json_data_file:
    config = json.load(json_data_file)
    
def swift_con(config):
    user=config['swift_storage']['user']
    key=config['swift_storage']['key']
    auth_url=config['swift_storage']['auth_url']
    tenant_name=config['swift_storage']['tenant_name']
    auth_version=config['swift_storage']['auth_version']
    options = config['swift_storage']['options']
    return swiftclient.Connection(user=user,
                                  key=key,
                                  authurl=auth_url,
                                  os_options=options,
                                  tenant_name=tenant_name,
                                  auth_version=auth_version)

conn = swift_con(config)

## Retrieve Data

### Gaussian Peak Detection

The CSV contains the resulting detected peaks of the Gaussian method implemented in FC's paper, reproduced by Benoit. The CSV is stored on the Swift server.

We retrieve here the August 2018 data as an exemple.

In [121]:
from datetime import datetime
def to_date(a):
    return datetime.strptime(str(a), '%Y%m%d%H%M%S%f')

csv = conn.get_object("oco2", "/datasets/oco-2/peaks-detected/result_for_oco2_1808.csv")[1]
peak_fc = pd.read_csv(StringIO(str(csv, 'utf-8')), sep=";")
peak_fc['date'] = peak_fc['sounding_id'].apply(to_date)
peak_fc['sigma'] = peak_fc['sigma'].apply(abs)
peak_fc['amplitude'] = peak_fc['amplitude'].apply(abs)

peak_fc = gpd.GeoDataFrame(peak_fc, geometry=gpd.points_from_xy(peak_fc.longitude, peak_fc.latitude)).copy()
peak_fc.crs = {'init': 'epsg:4326'}

peak_fc.head()

Unnamed: 0,sounding_id,latitude,longitude,orbit,slope,intercept,amplitude,sigma,delta,R,windspeed_u,windspeed_v,surface_pressure,tcwv,gCO2_per_s,ktCO2_per_h,date,geometry
0,2018080101035604,25.425072,-177.34549,21709,0.00043,404.902899,14.694226,3.987167,1.470254,0.677812,-7.759225,-0.739198,1018.198181,41.952118,1775510.0,6.391835,2018-08-01 01:03:56.040,POINT (-177.34549 25.42507)
1,2018080101060803,32.777554,-179.428162,21709,0.002886,404.622407,1.493849,7.887665,0.075556,0.532213,-5.420354,-0.992332,1025.610962,32.914429,129603.7,0.466573,2018-08-01 01:06:08.030,POINT (-179.42816 32.77755)
2,2018080101062937,33.919857,-179.779083,21709,-0.006755,404.41603,11.847855,3.346435,1.412432,0.592854,-4.035365,-0.879741,1026.469604,32.334263,32855.5,0.11828,2018-08-01 01:06:29.370,POINT (-179.77908 33.91986)
3,2018080102530302,58.858116,143.781662,21710,0.003719,399.445056,0.018193,5.552475,0.001307,0.730703,-3.837832,0.812698,995.894531,23.648272,721.4001,0.002597,2018-08-01 02:53:03.020,POINT (143.78166 58.85812)
4,2018080104244674,35.680077,130.219467,21711,-0.004196,405.367275,9.859749,24.716153,0.159146,0.577045,-2.515345,-1.818006,1005.533569,43.828854,451558.8,1.625612,2018-08-01 04:24:46.740,POINT (130.21947 35.68008)


### Inventory Data

The CSV is stored on the GitHub.

In [122]:
path_invent = "https://raw.githubusercontent.com/dataforgoodfr/batch7_satellite_ges/master/dataset/Output%20inventory%20data/Merge%20of%20peaks/CO2_emissions_peaks_merged_2018.csv"
invent = pd.read_csv(path_invent, sep=",", index_col=0)

invent = gpd.GeoDataFrame(invent, geometry=gpd.points_from_xy(invent.longitude, invent.latitude))
invent.crs = {'init': 'epsg:4326'}

invent = invent[invent['longitude'].notna()]
invent = invent[invent['latitude'].notna()]
invent = invent[invent['CO2/CO2e emissions (in tonnes per year)'].notna()]

invent_types = invent['CO2/CO2e emissions source'].unique()
print("Types of inventory: ")
print(invent_types)

invent.head()

Types of inventory: 
['City' 'Gas power plant' 'Oil power plant' 'Coal power plant'
 'Cement and Lime' 'Glass' 'Power and heat' 'Ceramics' 'Pulp and paper'
 'Chemicals' 'Iron and steel' 'Combustion' 'Non ferrous metals'
 'Mineral oil' 'Coke ovens']


Unnamed: 0,latitude,longitude,Data source,CO2/CO2e emissions source,CO2/CO2e emissions (in tonnes per year),CO2 or CO2e,geometry
0,43.653226,-79.383184,Opendatasoft,City,16151019.0,CO2,POINT (-79.38318 43.65323)
4,45.802578,9.086356,Opendatasoft,City,3728678.0,CO2,POINT (9.08636 45.80258)
5,37.6689,-122.0808,Opendatasoft,City,861854.0,CO2e,POINT (-122.08080 37.66890)
6,35.689634,139.692101,Opendatasoft,City,27611000.0,CO2e,POINT (139.69210 35.68963)
11,-10.249091,-48.324286,Opendatasoft,City,589055.31,CO2e,POINT (-48.32429 -10.24909)


## Inventory Capture

In [125]:
def get_direction_from_uv(u, v):
    ''' Retrieve the heading of a vector'''
    direction = 180/math.pi * math.atan2(u,v)+180
    return direction

def get_wind_norm_from_uv(u, v):
    ''' Retrieve the magitude of a vector'''
    return math.sqrt(pow(u,2)+pow(v,2))

def get_new_coord(lat, lon, d, b):
    ''' Calculate the arrival point of a vector, given a starting point, a distance and a direcion'''
    origin = geopy.Point(lat, lon)
    point = VincentyDistance(kilometers=d).destination(origin, b)
    return [point[1], point[0]]

def capture_zone(lat, lon, u, v, angle=50):
    ''' Calculates the capture zone around a point, given the point, a wind vector and a angme to shape the zone'''
    wind_heading = get_direction_from_uv(u, v)
    wind_norm = np.log(get_wind_norm_from_uv(u,v)*3.6 +1)

    # BACK LINE
    # 1st point (back - 6h wind)
    point_1 = get_new_coord(lat, lon, wind_norm*6, wind_heading+180)
    # 2nd point (back - 6h wind - 50°)
    point_2 = get_new_coord(lat, lon, wind_norm*6, wind_heading+180-angle)
    # 3rd point (back - 6h wind - 50°)
    point_3 = get_new_coord(lat, lon, wind_norm*6, wind_heading+180+angle)

    # FRONT LINE
    # 4th point (front - 24h wind - 20°)
    point_4 = get_new_coord(lat, lon, wind_norm*24, wind_heading+angle)
    # 5th point (front - 24h wind - 20°)
    point_5 = get_new_coord(lat, lon, wind_norm*24, wind_heading-angle)
    # 6th point (front - 24h wind)
    point_6 = get_new_coord(lat, lon, wind_norm*24, wind_heading)

    points = [point_1, point_2, point_4, point_6, point_5, point_3, point_1]
    return points

def capture_df(row):
    ''' Apply the capture zone function to a dataset row'''
    return Polygon(capture_zone(row["latitude"], row["longitude"], row['windspeed_u'], row['windspeed_v']))

def join_and_count_one(peaks, invent, title):
    ''' Spacially join 3 datasets (Polygon, Point and Point)'''
    #intersect cities
    peaks_intersect_invent  = gpd.sjoin(peaks, invent.loc[:, ['CO2/CO2e emissions (in tonnes per year)','geometry']], how='left', op='intersects').rename(columns={"index_right": "index_"+str(title)})
    peaks_intersect_ag = peaks_intersect_invent.groupby([peaks_intersect_invent.index, 'sounding_id'])["index_"+str(title)].apply(list).reset_index()
    peaks_intersect_ag["number_"+str(title)] = peaks_intersect_ag["index_"+str(title)].apply(lambda x: np.count_nonzero(~np.isnan(x)))
    peaks_meters_invent = peaks.merge(peaks_intersect_ag, how='left', on='sounding_id')
    
    peaks_meters_invent = peaks_meters_invent.drop(columns=['level_0'])
    return peaks_meters_invent

### Gaussian-only Peaks

We set a capture zone and spatially join the data for FC's peaks:

In [126]:
# Remove far east and west to avoid side effects on captures zones 
peaks_fc = peak_fc.loc[(peak_fc["longitude"] < 170) & (peak_fc["longitude"] > -170), :]
peaks_fc['geometry'] = peaks_fc.apply(capture_df, axis=1)

for cl in invent_types:
    invent_pt = invent[invent['CO2/CO2e emissions source'] == cl]
    peaks_fc = join_and_count_one(peaks_fc, invent_pt, cl)
    
peaks_fc.head(50)

  point = VincentyDistance(kilometers=d).destination(origin, b)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_fc['geometry'] = peaks_fc.apply(capture_df, axis=1)


Unnamed: 0,sounding_id,latitude,longitude,orbit,slope,intercept,amplitude,sigma,delta,R,...,index_Iron and steel,number_Iron and steel,index_Combustion,number_Combustion,index_Non ferrous metals,number_Non ferrous metals,index_Mineral oil,number_Mineral oil,index_Coke ovens,number_Coke ovens
0,2018080102530302,58.858116,143.781662,21710,0.003719,399.445056,0.018193,5.552475,0.001307,0.730703,...,[nan],0,[nan],0,[nan],0,[nan],0,[nan],0
1,2018080104244674,35.680077,130.219467,21711,-0.004196,405.367275,9.859749,24.716153,0.159146,0.577045,...,[nan],0,[nan],0,[nan],0,[nan],0,[nan],0
2,2018080104244836,35.752354,130.197632,21711,-0.004529,405.341883,7.635088,6.52577,0.466759,0.511861,...,[nan],0,[nan],0,[nan],0,[nan],0,[nan],0
3,2018080104244906,35.789249,130.185776,21711,-0.004221,405.311837,7.751707,9.587837,0.322542,0.533806,...,[nan],0,[nan],0,[nan],0,[nan],0,[nan],0
4,2018080104244976,35.826172,130.17395,21711,-0.003825,405.271937,9.293184,16.454924,0.225309,0.5144,...,[nan],0,[nan],0,[nan],0,[nan],0,[nan],0
5,2018080104245173,35.966625,130.125778,21711,-0.004961,405.187786,8.763453,17.915269,0.195147,0.616669,...,[nan],0,[nan],0,[nan],0,[nan],0,[nan],0
6,2018080104245475,36.112236,130.080261,21711,-0.005632,405.134489,5.867463,20.226835,0.115726,0.714837,...,[nan],0,[nan],0,[nan],0,[nan],0,[nan],0
7,2018080104245535,36.149014,130.068329,21711,-0.006073,405.120973,5.166552,11.382239,0.181085,0.597837,...,[nan],0,[nan],0,[nan],0,[nan],0,[nan],0
8,2018080104245738,36.229542,130.045258,21711,-0.006534,405.086223,3.223339,12.760039,0.100778,0.614451,...,[nan],0,[nan],0,[nan],0,[nan],0,[nan],0
9,2018080104245808,36.266342,130.033279,21711,-0.006272,405.075965,1.815272,10.704547,0.067652,0.518306,...,[nan],0,[nan],0,[nan],0,[nan],0,[nan],0


In [134]:
peaks_fc.describe()

Unnamed: 0,sounding_id,latitude,longitude,orbit,slope,intercept,amplitude,sigma,delta,R,...,number_Glass,number_Power and heat,number_Ceramics,number_Pulp and paper,number_Chemicals,number_Iron and steel,number_Combustion,number_Non ferrous metals,number_Mineral oil,number_Coke ovens
count,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,...,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0
mean,2018082000000000.0,20.842071,0.092224,21940.067416,-0.001568,404.168031,15.990081,16.838018,0.397463,0.60168,...,0.004815,0.022472,0.007223,0.008828,0.006822,0.000401,0.038122,0.0,0.000803,0.0
std,931405200.0,26.653732,98.411842,135.59239,0.006666,2.021748,16.216757,6.740656,0.426329,0.083893,...,0.074813,0.236002,0.135699,0.112997,0.128112,0.020032,0.4285,0.0,0.028324,0.0
min,2018080000000000.0,-40.021149,-167.962051,21710.0,-0.031395,399.106487,0.003036,2.027053,0.000341,0.500107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2018081000000000.0,-3.690342,-91.384123,21809.0,-0.006118,402.655696,5.555167,11.603454,0.1497,0.534788,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2018082000000000.0,30.75905,18.615467,21957.0,-0.002877,404.493086,11.937472,16.121828,0.289604,0.578015,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2018082000000000.0,41.005142,66.940214,22057.0,0.00389,405.954873,21.111619,21.397333,0.496026,0.649437,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2018083000000000.0,72.424393,169.941254,22159.0,0.025194,408.913894,162.436539,33.197182,5.603435,0.899008,...,2.0,7.0,4.0,2.0,5.0,1.0,9.0,0.0,1.0,0.0


## CSV save

In [114]:
for i in range (1801, 1813):
    csv = conn.get_object("oco2", "/datasets/oco-2/peaks-detected/result_for_oco2_"+str(i)+".csv")[1]
    peak_fc = pd.read_csv(StringIO(str(csv, 'utf-8')), sep=";")
    peak_fc['date'] = peak_fc['sounding_id'].apply(to_date)
    peak_fc['sigma'] = peak_fc['sigma'].apply(abs)
    peak_fc['amplitude'] = peak_fc['amplitude'].apply(abs)
    peak_fc = gpd.GeoDataFrame(peak_fc, geometry=gpd.points_from_xy(peak_fc.longitude, peak_fc.latitude)).copy()
    peak_fc.crs = {'init': 'epsg:4326'}
    
    peaks_fc = peak_fc.loc[(peak_fc["longitude"] < 170) & (peak_fc["longitude"] > -170), :]
    peaks_fc['geometry'] = peaks_fc.apply(capture_df, axis=1)

    for cl in invent_types:
        invent_pt = invent[invent['CO2/CO2e emissions source'] == cl]
        peaks_fc = join_and_count_one(peaks_fc, invent_pt, cl)
        
    peaks_fc.to_csv(r'../dataset/peaks_and_invent/peaks_and_invent_'+str(i)+'.csv', index = False)


  point = VincentyDistance(kilometers=d).destination(origin, b)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_fc['geometry'] = peaks_fc.apply(capture_df, axis=1)
  point = VincentyDistance(kilometers=d).destination(origin, b)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_fc['geometry'] = peaks_fc.apply(capture_df, axis=1)
  point = VincentyDistance(kilometers=d).destination(origin, b)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

In [115]:
path_test = "../dataset/peaks_and_invent/peaks_and_invent_1808.csv"
test = pd.read_csv(path_test, sep=",", index_col=0)
test.head()

Unnamed: 0_level_0,latitude,longitude,orbit,slope,intercept,amplitude,sigma,delta,R,windspeed_u,...,index_Iron and steel,number_Iron and steel,index_Combustion,number_Combustion,index_Non ferrous metals,number_Non ferrous metals,index_Mineral oil,number_Mineral oil,index_Coke ovens,number_Coke ovens
sounding_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018080102530302,58.858116,143.781662,21710,0.003719,399.445056,0.018193,5.552475,0.001307,0.730703,-3.837832,...,[nan],0,[nan],0,[nan],0,[nan],0,[nan],0
2018080104244674,35.680077,130.219467,21711,-0.004196,405.367275,9.859749,24.716153,0.159146,0.577045,-2.515345,...,[nan],0,[nan],0,[nan],0,[nan],0,[nan],0
2018080104244836,35.752354,130.197632,21711,-0.004529,405.341883,7.635088,6.52577,0.466759,0.511861,-2.536525,...,[nan],0,[nan],0,[nan],0,[nan],0,[nan],0
2018080104244906,35.789249,130.185776,21711,-0.004221,405.311837,7.751707,9.587837,0.322542,0.533806,-2.545387,...,[nan],0,[nan],0,[nan],0,[nan],0,[nan],0
2018080104244976,35.826172,130.17395,21711,-0.003825,405.271937,9.293184,16.454924,0.225309,0.5144,-2.552335,...,[nan],0,[nan],0,[nan],0,[nan],0,[nan],0
