# Open Stack Swift storage of the project

In [1]:
# default_exp datasets

## Module Installation

In [2]:
#!pip install --user python-swiftclient python-keystoneclient --upgrade
#!pip install nbdev

## Class Datasets
Using a config file for credentials

In [3]:
#export
import swiftclient
import json
import glob
import os
#from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm
import pandas as pd

class Datasets:
    """
    Utility class to access the Open Stack Swift storage of the project.
    """
    config = None # Dict configuration
    conn = None # swiftclient.Connection object
    container_name = 'oco2'
    
    def __init__(self, config_file):
        """
        Constructor
        :param config_file: str, Path to config file
        :return:
        """
        # Load config
        with open(config_file) as json_data_file:
            self.config = json.load(json_data_file)
        self.conn = self.swift_con()

    def swift_con(self, config=None):
        """
        Connect to Open Stack Swift
        :param config: dict, Config dictionary.
        :return: swiftclient.Connection
        """
        if config is None:
            config = self.config
        user=config['swift_storage']['user']
        key=config['swift_storage']['key']
        auth_url=config['swift_storage']['auth_url']
        tenant_name=config['swift_storage']['tenant_name']
        auth_version=config['swift_storage']['auth_version']
        options = config['swift_storage']['options']
        self.conn = swiftclient.Connection(user=user,
                                      key=key,
                                      authurl=auth_url,
                                      os_options=options,
                                      tenant_name=tenant_name,
                                      auth_version=auth_version)
        return self.conn

    def upload(self, mask='c:\datasets\*.csv', prefix="/Trash/",content_type='text/csv', recursive=False):
        """
        Upload files to Open Stack Swift
        :param mask: str, Mask for seraching file to upload.
        :param prefix: str, Prefix in destination. Useful to mimic folders.
        :param content_type: str, Content type on the destination.
        :param recursive: boolean, To allow search in sub-folder.
        :return:
        """
        for file in tqdm(glob.glob(mask, recursive=recursive)):
            with open(file, 'rb') as one_file:
                    upload_to = prefix+ os.path.basename(file)
                    #print('Copy from',file,'to',upload_to)
                    self.conn.put_object(self.container_name, upload_to,
                                                    contents= one_file.read(),
                                                    content_type=content_type) # 'text/csv'
    def get_files_urls(self, pattern=""):
        result=[]
        objects = self.conn.get_container(self.container_name)[1]
        for data in objects:
            if pattern in data['name']:
                url = self.config['swift_storage']['base_url']+data['name']
                result.append(url)
        return result

    def delete_files(self, pattern="/Trash/", dry_run=True):
        if dry_run:
            print('Nothing will be deleted. Use dry_run=False to delete.')
        for data in self.conn.get_container(self.container_name)[1]:
            file = data['name']
            if pattern in file:
                print('deleting', file)
                if not dry_run:
                    self.conn.delete_object(self.container_name, file)
                   

    def get_containers(self):
        return self.conn.get_account()[1]
    def get_container(self, container_name='oco2'):
        return self.conn.get_container(container_name)[1]

    def get_url_from_sounding_id(self, sounding_id):
        return config['swift_storage']['base_url']+'/datasets/oco-2/peaks-detected-details/peak_data-si_'+sounding_id+'.json'
        
    def get_dataframe(self, url):
        """
        Read the url of a file and load it with Pandas
        :param url: str, URL of the file to load.
        :return: DataFrame
        """
        # TODO : Switch to GeoPandas
        df = None
        extension = url.split('.')[-1].lower()
        if extension == 'csv':
            df = pd.read_csv(url)
            if 'sounding_id' in df.columns:
                df['sounding_id']= df['sounding_id'].astype(str)
        elif extension == 'json':
            df = pd.read_json(url)
        return df
    
    def get_gaussian_param(self, sounding_id, df_all_peak):
        df_param = df_all_peak.query("sounding_id==@sounding_id")
        if len(df_param)<1:
            print('ERROR : sounding_id not found in dataframe !')
            return {'slope' : 1,'intercept' : 1,'amplitude' : 1,'sigma': 1,'delta': 1,'R' : 1}
        param_index = df_param.index[0]

        gaussian_param = {
            'slope' : df_param.loc[param_index, 'slope'],
            'intercept' : df_param.loc[param_index, 'intercept'],
            'amplitude' : df_param.loc[param_index, 'amplitude'],
            'sigma': df_param.loc[param_index, 'sigma'],
            'delta': df_param.loc[param_index, 'delta'],
            'R' : df_param.loc[param_index, 'R'],
        }
        return gaussian_param

# Examples
## Connection

In [4]:
config = '../configs/config.json'
datasets = Datasets(config)

## Get containers names

In [5]:
for container in datasets.get_containers():
    print('Container name:', container['name'])

Container name: oco2


## List files

In [6]:
datasets.get_files_urls('html')

['https://storage.gra.cloud.ovh.net/v1/AUTH_2aaacef8e88a4ca897bb93b984bd04dd/oco2//map/peaks_and_sources.html']

### Get files objects

In [7]:
objects = datasets.get_container('oco2')
for data in objects:
    if 'oco2_1504' in data['name']:
        print('{0}\t{1}\t{2}'.format(data['name'], data['bytes'], data['last_modified']))


/datasets/oco-2/peaks-detected/result_for_oco2_1504.csv	21241	2020-05-05T17:13:30.884370
/datasets/oco-2/soudings/oco2_1504.csv.xz	75186100	2020-05-03T07:48:47.793680


## Upload files

In [8]:
datasets.upload(mask='../*.md', prefix="/Trash/",content_type='text/text')
# datasets.("/media/data-nvme/dev/datasets/OCO2/csv/*.csv", "/datasets/oco-2/peaks-detected/", 'text/csv')
# datasets.("/media/data-nvme/dev/datasets/OCO2/csv/*.json", "/datasets/oco-2/peaks-detected-details/", 'application/json')

100%|██████████| 3/3 [00:00<00:00,  4.89it/s]


In [9]:
datasets.get_files_urls('/Trash/')

['https://storage.gra.cloud.ovh.net/v1/AUTH_2aaacef8e88a4ca897bb93b984bd04dd/oco2//Trash/CONTRIBUTING.md',
 'https://storage.gra.cloud.ovh.net/v1/AUTH_2aaacef8e88a4ca897bb93b984bd04dd/oco2//Trash/README-old.md',
 'https://storage.gra.cloud.ovh.net/v1/AUTH_2aaacef8e88a4ca897bb93b984bd04dd/oco2//Trash/README.md']

### Upload HTML
Setting content type to 'text/html' allow the file to be display by browsers, without downloading.

In [10]:
#datasets.upload("chemin/peaks_and_sources.html", "/Trash/", 'text/html')

## Delete files

In [11]:
datasets.delete_files("/Trash/", dry_run=False)

deleting /Trash/CONTRIBUTING.md
deleting /Trash/README-old.md
deleting /Trash/README.md


In [12]:
df=datasets.get_dataframe('https://storage.gra.cloud.ovh.net/v1/AUTH_2aaacef8e88a4ca897bb93b984bd04dd/oco2//datasets/oco-2/peaks-detected-details/peak_data-si_2018082505142073.json')
df.head(3)

Unnamed: 0,orbit,sounding_id,latitude,longitude,xco2,xco2_uncert,windspeed_u,windspeed_v,surface_pressure_apriori,surface_pressure,altitude,land_water_indicator,land_fraction,latitude_orig,longitude_orig,distance,xco2_enhancement
0,22061,2018082505140535,35.290813,117.64283,400.87854,0.559905,-2.961818,-0.434006,983.626465,981.835144,218.204407,0,100,-43.749119,135.989822,-99.458018,-0.413101
1,22061,2018082505140503,35.292336,117.637512,400.439972,0.567133,-2.958889,-0.431859,981.793701,984.246216,234.751266,0,100,-43.749119,135.989822,-99.182306,-0.851669
2,22061,2018082505140608,35.297531,117.647171,400.820587,0.667865,-2.963,-0.423906,984.632263,984.782532,209.161057,0,100,-43.749119,135.989822,-98.822275,-0.471054


## Calcul de toutes les distances entre les lignes
1) ne conserve que les variables concernées par le calcul de distance
2) standardisation des data
3) calculer les distances
4) après nous sommes dans une problématique de clustering

In [13]:
lst = list(df.columns)
print(lst)


['orbit', 'sounding_id', 'latitude', 'longitude', 'xco2', 'xco2_uncert', 'windspeed_u', 'windspeed_v', 'surface_pressure_apriori', 'surface_pressure', 'altitude', 'land_water_indicator', 'land_fraction', 'latitude_orig', 'longitude_orig', 'distance', 'xco2_enhancement']


In [14]:
for x in ['orbit', 'sounding_id', 'latitude', 'longitude']:
    lst.remove(x)


In [15]:
lst

['xco2',
 'xco2_uncert',
 'windspeed_u',
 'windspeed_v',
 'surface_pressure_apriori',
 'surface_pressure',
 'altitude',
 'land_water_indicator',
 'land_fraction',
 'latitude_orig',
 'longitude_orig',
 'distance',
 'xco2_enhancement']

In [16]:
data = df[lst]

In [17]:
#### Scaled data has zero mean and unit variance:

In [18]:
from sklearn import preprocessing
import numpy as np
scaler = preprocessing.StandardScaler().fit(data)

z = scaler.transform(data)
print('- scaled')
print(z)

print('- std', np.std(z))
print('means', np.mean(z))



- scaled
[[-0.48320878 -1.00370044 -1.11069355 ...  0.         -1.95489176
  -0.48320878]
 [-0.77402376 -0.92000131 -1.07759097 ...  0.         -1.94980217
  -0.77402376]
 [-0.52163739  0.24649317 -1.12404289 ...  0.         -1.94315609
  -0.52163739]
 ...
 [-0.57384689  2.01126288  3.18149104 ...  0.          1.71373452
  -0.57384689]
 [-0.31051272  0.24957435  3.22139626 ...  0.          1.71479238
  -0.31051272]
 [-0.94544493  2.04753506  3.31048253 ...  0.          1.72562806
  -0.94544493]]
- std 0.9576845844606717
means -0.07692307692307639


#### Calcul de toutes les distances 

In [19]:
from scipy.spatial import distance
d = distance.cdist(z, z, 'euclidean')
print(d)


[[0.         0.49029123 1.2736413  ... 8.02971629 7.56535312 8.15091223]
 [0.49029123 0.         1.25292643 ... 7.99528889 7.56971423 8.08997042]
 [1.2736413  1.25292643 0.         ... 7.56564926 7.38062547 7.68543933]
 ...
 [8.02971629 7.99528889 7.56564926 ... 0.         1.80244384 0.54380052]
 [7.56535312 7.56971423 7.38062547 ... 1.80244384 0.         2.01214729]
 [8.15091223 8.08997042 7.68543933 ... 0.54380052 2.01214729 0.        ]]


In [20]:
from scipy.spatial import distance_matrix
d2 = distance_matrix(z, z)
print(d2)

[[0.         0.49029123 1.2736413  ... 8.02971629 7.56535312 8.15091223]
 [0.49029123 0.         1.25292643 ... 7.99528889 7.56971423 8.08997042]
 [1.2736413  1.25292643 0.         ... 7.56564926 7.38062547 7.68543933]
 ...
 [8.02971629 7.99528889 7.56564926 ... 0.         1.80244384 0.54380052]
 [7.56535312 7.56971423 7.38062547 ... 1.80244384 0.         2.01214729]
 [8.15091223 8.08997042 7.68543933 ... 0.54380052 2.01214729 0.        ]]


- distance between vectors

In [21]:
d3 = distance_matrix([z[0] ], [z[1] ])
print(d3)

[[0.49029123]]


In [22]:
[z[0] ]

[array([-0.48320878, -1.00370044, -1.11069355, -2.11417437, -0.15252158,
        -0.33083719,  0.13828169, -0.1490712 ,  0.14310656, -1.        ,
         0.        , -1.95489176, -0.48320878])]

In [23]:
from nbdev.export import notebook2script
notebook2script()

Converted 03_25_OCO2_Data_Exploration.ipynb.
Converted 04_01_OCO2_Work_Base.ipynb.
Converted 04_04_OCO2_China_Peaks.ipynb.
Converted 04_15_OCO2_Laiwu_Peak_Detection.ipynb.
Converted CO2_emissions_Inventory_data.ipynb.
Converted Find_Peaks_with_LSTM_autoencoders.ipynb.
Converted Laiwu_Plume-more_data.ipynb.
Converted Laiwu_Plume-more_data_CD_exploration_selection_peaks.ipynb.
Converted Laiwu_Plume.ipynb.
Converted Untitled.ipynb.
Converted WIP_OCO2_Capture.ipynb.
Converted WIP_OCO2_Peaks_Wind.ipynb.
Converted WIP_OCO2_Peaks_Wind_Visualization.ipynb.
Converted index.ipynb.
Converted oco2peak-datasets-Distances.ipynb.
Converted oco2peak-datasets.ipynb.
Converted oco2peak-find_peak.ipynb.
Converted oco2peak-map.ipynb.
Converted oco2peak-nc4_convert.ipynb.
Converted oco2peak-swift_utils.ipynb.
Converted show_map.ipynb.
Converted view_peak.ipynb.
