# Open Stack Swift storage of the project

### Recherche de points de mesure du satellite qui sont isolés, hors de la tendance de fond.
Cette recherche correspond aux noise points clustering  Dbscan, c'est à dire les points qui n'appartiennent à aucun clusters, en effet :

    - DBSCAN repose sur le concept de densité : 
    un cluster est une zone de l’espace où la densité d’observations est importante. En sortie, l’algorithme génère autant de clusters que de zones de l’espace de forte densité. Les points isolés sont considérés comme des outliers (valeurs aberrantes). Ce sont ces points qui nous intéressent.

In [None]:
# default_exp datasets

## Module Installation

In [None]:
#!pip install --user python-swiftclient python-keystoneclient --upgrade
#!pip install nbdev

## Class Datasets
Using a config file for credentials

In [1]:
#export
import swiftclient
import json
import glob
import os
#from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm
import pandas as pd

class Datasets:
    """
    Utility class to access the Open Stack Swift storage of the project.
    """
    config = None # Dict configuration
    conn = None # swiftclient.Connection object
    container_name = 'oco2'
    
    def __init__(self, config_file):
        """
        Constructor
        :param config_file: str, Path to config file
        :return:
        """
        # Load config
        with open(config_file) as json_data_file:
            self.config = json.load(json_data_file)
        self.conn = self.swift_con()

    def swift_con(self, config=None):
        """
        Connect to Open Stack Swift
        :param config: dict, Config dictionary.
        :return: swiftclient.Connection
        """
        if config is None:
            config = self.config
        user=config['swift_storage']['user']
        key=config['swift_storage']['key']
        auth_url=config['swift_storage']['auth_url']
        tenant_name=config['swift_storage']['tenant_name']
        auth_version=config['swift_storage']['auth_version']
        options = config['swift_storage']['options']
        self.conn = swiftclient.Connection(user=user,
                                      key=key,
                                      authurl=auth_url,
                                      os_options=options,
                                      tenant_name=tenant_name,
                                      auth_version=auth_version)
        return self.conn

    def upload(self, mask='c:\datasets\*.csv', prefix="/Trash/",content_type='text/csv', recursive=False):
        """
        Upload files to Open Stack Swift
        :param mask: str, Mask for seraching file to upload.
        :param prefix: str, Prefix in destination. Useful to mimic folders.
        :param content_type: str, Content type on the destination.
        :param recursive: boolean, To allow search in sub-folder.
        :return:
        """
        for file in tqdm(glob.glob(mask, recursive=recursive)):
            with open(file, 'rb') as one_file:
                    upload_to = prefix+ os.path.basename(file)
                    #print('Copy from',file,'to',upload_to)
                    self.conn.put_object(self.container_name, upload_to,
                                                    contents= one_file.read(),
                                                    content_type=content_type) # 'text/csv'
    def get_files_urls(self, pattern=""):
        result=[]
        objects = self.conn.get_container(self.container_name)[1]
        for data in objects:
            if pattern in data['name']:
                url = self.config['swift_storage']['base_url']+data['name']
                result.append(url)
        return result

    def delete_files(self, pattern="/Trash/", dry_run=True):
        if dry_run:
            print('Nothing will be deleted. Use dry_run=False to delete.')
        for data in self.conn.get_container(self.container_name)[1]:
            file = data['name']
            if pattern in file:
                print('deleting', file)
                if not dry_run:
                    self.conn.delete_object(self.container_name, file)
                   

    def get_containers(self):
        return self.conn.get_account()[1]
    def get_container(self, container_name='oco2'):
        return self.conn.get_container(container_name)[1]

    def get_url_from_sounding_id(self, sounding_id):
        return config['swift_storage']['base_url']+'/datasets/oco-2/peaks-detected-details/peak_data-si_'+sounding_id+'.json'
        
    def get_dataframe(self, url):
        """
        Read the url of a file and load it with Pandas
        :param url: str, URL of the file to load.
        :return: DataFrame
        """
        # TODO : Switch to GeoPandas
        df = None
        extension = url.split('.')[-1].lower()
        if extension == 'csv':
            df = pd.read_csv(url)
            df['sounding_id']= df['sounding_id'].astype(str)
        elif extension == 'json':
            df = pd.read_json(url)
        return df
    
    def get_gaussian_param(self, sounding_id, df_all_peak):
        df_param = df_all_peak.query("sounding_id==@sounding_id")
        if len(df_param)<1:
            print('ERROR : sounding_id not found in dataframe !')
            return {'slope' : 1,'intercept' : 1,'amplitude' : 1,'sigma': 1,'delta': 1,'R' : 1}
        param_index = df_param.index[0]

        gaussian_param = {
            'slope' : df_param.loc[param_index, 'slope'],
            'intercept' : df_param.loc[param_index, 'intercept'],
            'amplitude' : df_param.loc[param_index, 'amplitude'],
            'sigma': df_param.loc[param_index, 'sigma'],
            'delta': df_param.loc[param_index, 'delta'],
            'R' : df_param.loc[param_index, 'R'],
        }
        return gaussian_param

## Connection

In [2]:
config = '../configs/config.json'
datasets = Datasets(config)

## Get containers names

In [3]:
for container in datasets.get_containers():
    print('Container name:', container['name'])

Container name: oco2


## List files

In [4]:
datasets.get_files_urls('html')

[]

### Get files objects

In [5]:
objects = datasets.get_container('oco2')
for data in objects:
    if 'oco2_1504' in data['name']:
        print('{0}\t{1}\t{2}'.format(data['name'], data['bytes'], data['last_modified']))


## Upload files

In [6]:
datasets.upload(mask='../*.md', prefix="/Trash/",content_type='text/text')
# datasets.("/media/data-nvme/dev/datasets/OCO2/csv/*.csv", "/datasets/oco-2/peaks-detected/", 'text/csv')
# datasets.("/media/data-nvme/dev/datasets/OCO2/csv/*.json", "/datasets/oco-2/peaks-detected-details/", 'application/json')

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.76it/s]


In [7]:
datasets.get_files_urls('/Trash/')

['https://storage.gra.cloud.ovh.net/v1/AUTH_2...d/oco2//Trash/CONTRIBUTING.md',
 'https://storage.gra.cloud.ovh.net/v1/AUTH_2...d/oco2//Trash/README.md']

### Upload HTML
Setting content type to 'text/html' allow the file to be display by browsers, without downloading.

In [8]:
#datasets.upload("chemin/peaks_and_sources.html", "/Trash/", 'text/html')

## Delete files

In [9]:
datasets.delete_files("/Trash/", dry_run=False)

deleting /Trash/CONTRIBUTING.md
deleting /Trash/README.md


In [10]:
df=datasets.get_dataframe('https://storage.gra.cloud.ovh.net/v1/AUTH_2aaacef8e88a4ca897bb93b984bd04dd/oco2//datasets/oco-2/peaks-detected-details/peak_data-si_2018082505142073.json')
df.head(3)

Unnamed: 0,orbit,sounding_id,latitude,longitude,xco2,xco2_uncert,windspeed_u,windspeed_v,surface_pressure_apriori,surface_pressure,altitude,land_water_indicator,land_fraction,latitude_orig,longitude_orig,distance,xco2_enhancement
0,22061,2018082505140535,35.290813,117.64283,400.87854,0.559905,-2.961818,-0.434006,983.626465,981.835144,218.204407,0,100,-43.749119,135.989822,-99.458018,-0.413101
1,22061,2018082505140503,35.292336,117.637512,400.439972,0.567133,-2.958889,-0.431859,981.793701,984.246216,234.751266,0,100,-43.749119,135.989822,-99.182306,-0.851669
2,22061,2018082505140608,35.297531,117.647171,400.820587,0.667865,-2.963,-0.423906,984.632263,984.782532,209.161057,0,100,-43.749119,135.989822,-98.822275,-0.471054


## Calcul de toutes les distances entre les lignes
1) ne conserve que les variables concernées par le calcul de distance
2) standardisation des data
3) calculer les distances
4) après nous sommes dans une problématique de clustering

In [11]:
lst = list(df.columns)
print(lst)


['orbit', 'sounding_id', 'latitude', 'longitude', 'xco2', 'xco2_uncert', 'windspeed_u', 'windspeed_v', 'surface_pressure_apriori', 'surface_pressure', 'altitude', 'land_water_indicator', 'land_fraction', 'latitude_orig', 'longitude_orig', 'distance', 'xco2_enhancement']


In [12]:
for x in ['orbit', 'sounding_id', 'latitude', 'longitude']:
    lst.remove(x)


- Les variables entrantes dans le calcul de la densité des point

In [15]:
lst
data = df[lst]

['xco2',
 'xco2_uncert',
 'windspeed_u',
 'windspeed_v',
 'surface_pressure_apriori',
 'surface_pressure',
 'altitude',
 'land_water_indicator',
 'land_fraction',
 'latitude_orig',
 'longitude_orig',
 'distance',
 'xco2_enhancement']

#### Scaled data has zero mean and unit variance:
- standardisation du dataset

In [17]:
from sklearn import preprocessing
import numpy as np
scaler = preprocessing.StandardScaler().fit(data)

z = scaler.transform(data)
print('- scaled')
print(z)

print('- std', np.std(z))
print('means', np.mean(z))



- scaled
[[-0.48320878 -1.00370044 -1.11069355 ...  0.         -1.95489176
  -0.48320878]
 [-0.77402376 -0.92000131 -1.07759097 ...  0.         -1.94980217
  -0.77402376]
 [-0.52163739  0.24649317 -1.12404289 ...  0.         -1.94315609
  -0.52163739]
 ...
 [-0.57384689  2.01126288  3.18149104 ...  0.          1.71373452
  -0.57384689]
 [-0.31051272  0.24957435  3.22139626 ...  0.          1.71479238
  -0.31051272]
 [-0.94544493  2.04753506  3.31048253 ...  0.          1.72562806
  -0.94544493]]
- std 0.9576845844606717
means -0.07692307692307639


### recherche des points isolés
1) recherche des hyperparamètres optimaux   
2) clustering et extraction des points isolés

In [18]:
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics


### recherche d'un parametre eps optimal

In [23]:
def do_dbcluster(z, eps=0.9, min_samples=10):
    ''' Dbscan clustering '''
    '''
    input:
    z : standardized data
    eps : Epsilon, mesure du voisinage 
    (deux points sont voisins quand ils sont à une distance plus petite que epsilon l un de l autre
    min_samples : nombre de points minimal pour créer un cluster
    
    par défault < 10 pts sont des points isolés 
    
    output:
    db : model dbscan
    class_member_mask : mask des points isoles
    '''
    
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(z)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    print('Estimated number of clusters: %d' % n_clusters_, ', eps:', eps)
    print('Estimated number of noise points: %d' % n_noise_, ', eps:', eps)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(z, labels))

    core_samples_mask[db.core_sample_indices_] = True
    k = -1 
    class_member_mask = (labels == k)

    return db, class_member_mask

In [29]:
# Compute DBSCAN
tab_eps = [0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9]
min_samples = 5
for eps in tab_eps:
    print('eps', eps)
    _, _ = do_dbcluster(z, eps, min_samples)

eps 0.5
Estimated number of clusters: 14 , eps: 0.5
Estimated number of noise points: 346 , eps: 0.5
Silhouette Coefficient: -0.271
eps 0.6
Estimated number of clusters: 15 , eps: 0.6
Estimated number of noise points: 223 , eps: 0.6
Silhouette Coefficient: -0.077
eps 0.7
Estimated number of clusters: 8 , eps: 0.7
Estimated number of noise points: 149 , eps: 0.7
Silhouette Coefficient: -0.000
eps 0.75
Estimated number of clusters: 7 , eps: 0.75
Estimated number of noise points: 107 , eps: 0.75
Silhouette Coefficient: 0.166
eps 0.8
Estimated number of clusters: 4 , eps: 0.8
Estimated number of noise points: 89 , eps: 0.8
Silhouette Coefficient: 0.221
eps 0.85
Estimated number of clusters: 4 , eps: 0.85
Estimated number of noise points: 77 , eps: 0.85
Silhouette Coefficient: 0.228
eps 0.9
Estimated number of clusters: 3 , eps: 0.9
Estimated number of noise points: 67 , eps: 0.9
Silhouette Coefficient: 0.301


#### Choix stratégique, 
- les plus simple consiste a afficher les points isoles et les autres sur un graphique en deux couleurs

In [31]:
#### Les points isolés pour un eps de ..
#### a verifier sur un graphique
if True:
    eps = 0.85
    min_samples = 5
    db, class_member_mask = do_dbcluster(z, eps, min_samples)
       


Estimated number of clusters: 4 , eps: 0.85
Estimated number of noise points: 77 , eps: 0.85
Silhouette Coefficient: 0.228


In [32]:
if True:
    df_isole = pd.DataFrame()
    df_incluster = pd.DataFrame()

    for index, r in df.iterrows():
        if class_member_mask[index]:
            df_isole = df_isole.append(r, ignore_index=True)
        else:
            df_incluster = df_incluster.append(r, ignore_index=True)
    print('les points dans les clusters', df_incluster.shape)        
    print('les points isoles', df_isole.shape)        
    display(df_isole.head())



les points dans les clusters (475, 17)
les points isoles (77, 17)


Unnamed: 0,altitude,distance,land_fraction,land_water_indicator,latitude,latitude_orig,longitude,longitude_orig,orbit,sounding_id,surface_pressure,surface_pressure_apriori,windspeed_u,windspeed_v,xco2,xco2_enhancement,xco2_uncert
0,207.826263,-95.514451,91.0,3.0,35.32592,-43.749119,117.63578,135.989822,22061.0,2018083000000000.0,980.419373,984.789673,-2.952792,-0.382215,402.204193,0.912552,0.560314
1,305.329376,-94.031567,100.0,0.0,35.33762,-43.749119,117.625359,135.989822,22061.0,2018083000000000.0,976.453674,974.018433,-2.945526,-0.365604,403.342834,2.051193,0.651326
2,349.941772,-92.452384,100.0,0.0,35.353027,-43.749119,117.629517,135.989822,22061.0,2018083000000000.0,973.209717,969.121826,-2.945274,-0.342623,401.489441,0.1978,0.712861
3,299.793091,-90.4721,100.0,0.0,35.367142,-43.749119,117.607788,135.989822,22061.0,2018083000000000.0,975.678406,974.635864,-2.931464,-0.323926,403.928772,2.637131,0.745471
4,223.182571,-87.075481,100.0,0.0,35.399319,-43.749119,117.61174,135.989822,22061.0,2018083000000000.0,985.541565,983.111206,-2.928306,-0.276691,401.913239,0.621597,0.710786


#### Affichage des points isole, et les autres sur un graphique

In [None]:
from nbdev.export import *
notebook2script()

Converted 00_core.ipynb.
Converted datasets.ipynb.
Converted find_peak.ipynb.
Converted index.ipynb.
Converted map.ipynb.
