In [None]:
import sys
import os

import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import cartopy
import cartopy.crs as ccrs
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
import pyproj
import pyresample
from pyresample import create_area_def, load_area, data_reduce, utils, AreaDefinition
from pyresample.geometry import SwathDefinition
from pyresample.kd_tree import resample_nearest 

from sklearn import svm
from sklearn.linear_model import SGDOneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import make_pipeline


%matplotlib inline
#%matplotlib notebook

In [None]:
# Make the course lab's helper module importable by appending its folder to
# the module search path (absolute, machine-specific location).
sys.path.append(
    '/home/mario/Documents/Coursera'
    '/Unsupervised/week1/Labs/Lab2/Files/home/jovyan/work'
)
# NOTE(review): a star import can silently overwrite names imported above;
# importing only the helpers that are actually needed would be safer.
from utils import *
          

In [None]:
# Set the malloc trim threshold in this process's environment before the
# cluster is created.
# NOTE(review): presumably meant to be inherited by the dask worker processes
# so that freed memory is returned to the OS - confirm.
os.environ["MALLOC_TRIM_THRESHOLD_"] = "0"#"65536"

from dask.distributed import Client, progress, LocalCluster

# Start a local dask cluster with default settings and connect a client to it.
cluster = LocalCluster()
client = Client(cluster)
# Last expression of the cell: shows the client summary.
client

In [None]:
# Folder holding the input NetCDF files (absolute, machine-specific path).
dataDir = '/home/mario/Data/CMSAF/ssims/F16/'
# Name of one file in dataDir; only referenced by the commented-out
# single-file loaders in the following cells.
fileID = 'BTRin20140909000000324SSF1601GL.nc'

In [None]:
# Open every *.nc file found in dataDir as one combined dataset (no group
# argument is given). The commented line is the single-file alternative.
#ds = xr.open_dataset(dataDir+fileID)
ds = xr.open_mfdataset(dataDir+'*.nc')

In [None]:
# Display the summary of the combined dataset.
ds

In [None]:
# NetCDF groups to read from every file.
scenes_list = ['scene_env1', 'scene_env2']

# One multi-file dataset per group, with the files joined along 'time'.
# (A single-file variant would use xr.open_dataset(dataDir+fileID, group=...).)
scene_BT = [
    xr.open_mfdataset(
        dataDir+'*.nc',
        combine='nested',
        concat_dim='time',
        group=scene_group,
    )
    for scene_group in scenes_list
]

In [None]:
# Display the dataset read from the second group ('scene_env2').
scene_BT[1]

In [None]:
# Join the per-group datasets along the channel dimension.
# drop_vars([]) removes nothing; it only acts as a placeholder list.
ds_BT = xr.concat(scene_BT, dim='scene_channel').drop_vars([])

# For each auxiliary field keep only the entry at index 0 of its first axis.
# NOTE(review): this assumes the fields are identical along that axis (e.g.
# the same geolocation for both groups) - confirm.
for aux_name in ('lat', 'lon', 'eia', 'sft', 'qc_fov', 'laz'):
    ds_BT[aux_name] = ds_BT[aux_name][0, :, :]


In [None]:
# Display the summary of the concatenated dataset.
ds_BT

In [None]:
# Attach the time coordinate of `ds`, keep scene channels 11, 12, 14 and 15,
# and mask (NaN) every point where sft differs from 0.
# NOTE(review): presumably sft == 0 flags the surface type of interest -
# confirm against the product documentation.
ds_aux = ds_BT.assign_coords(time=ds.time).sel(
    scene_channel=[11,12,14,15]).where(ds_BT.sft==0)

# Central frequency of the selected channels, taken from `ds`.
# NOTE(review): the scene_channel values are used as indices on the third
# axis of central_freq - confirm that labels and positions coincide.
ds_aux['central_freq'] = ds['central_freq'][0,0,ds_aux['scene_channel']]

# ds_work is just another name for ds_aux (no copy is made).
ds_work = ds_aux #.drop_dims(drop_dims = ['date','channel'])

In [None]:
# Display the summary of the working dataset.
ds_work

In [None]:
def defineArea(corners, proj_id, datum, xsize=200, ysize=200):
    """Build a pyresample AreaDefinition covering a lon/lat bounding box.

    Parameters
    ----------
    corners : mapping
        Must provide 'min_lon', 'max_lon', 'min_lat', 'max_lat' (bounding
        box, degrees) and 'lat_0', 'lon_0' (projection origin, degrees).
    proj_id : str
        Projection name, used both as the 'proj' parameter and as the
        projection id of the area (e.g. 'eqc').
    datum : str
        Datum name passed to the projection (e.g. 'WGS84').
    xsize, ysize : int, optional
        Number of grid columns / rows of the area (default 200 x 200).

    Returns
    -------
    AreaDefinition
        Area whose extent is the projected bounding box.

    Raises
    ------
    KeyError
        If one of the required keys is missing from `corners`.
    """
    lon_bbox = [corners['min_lon'], corners['max_lon']]
    lat_bbox = [corners['min_lat'], corners['max_lat']]

    # The projection origin comes from `corners`. It used to be hard-coded to
    # lat_0=-15 / lon_0=60 (left over from an earlier region), which silently
    # ignored the values supplied by the caller.
    # NOTE(review): 'a' (sphere radius) is given together with a datum; check
    # that this combination is really intended.
    area_dict = dict(datum=datum,
                     lat_0=float(corners['lat_0']),
                     lon_0=float(corners['lon_0']),
                     proj=proj_id, units='m', a=6370997.0)

    # Project the two opposite corners of the box to get the extent in metres.
    prj = pyproj.Proj(area_dict)
    x, y = prj(lon_bbox, lat_bbox)

    area_id = 'granule'
    area_name = 'modis swath 5min granule'
    area_extent = (x[0], y[0], x[1], y[1])
    print(area_extent)
    area_def = AreaDefinition(area_id, area_name, proj_id,
                              area_dict, xsize, ysize, area_extent)
    return area_def



In [None]:

# Area of interest: bounding box plus projection origin, all in degrees.
# Previously used region:
#corners = {"min_lon": 25 , "max_lon": 75, "min_lat": -30 , "max_lat": 0, "lat_0": 60, "lon_0":-15}
corners = dict(
    min_lon=-95, max_lon=20,
    min_lat=3, max_lat=50,
    lat_0=27, lon_0=-57,
)
proj_id = 'eqc'
datum = 'WGS84'
area_interest = defineArea(corners, proj_id, datum)

# Area definition read from areas.yaml, used for the plots
# ('worldeqc30km70' is the alternative that was tried).
area_def_world = load_area('areas.yaml', 'worldeqc30km')


In [None]:
def get_TB_frame(ds, area_interest, channel, begin_t=None, end_t=None):
    """Reduce one brightness-temperature channel to the swath points near a grid.

    Parameters
    ----------
    ds : dataset exposing `lon`, `lat` and `tb`; tb is indexed as
        tb[:, channel, :].
    area_interest : area object providing get_lonlats() for the target grid.
    channel : index along the second axis of `ds.tb`.
    begin_t, end_t : currently unused.

    Returns
    -------
    tuple
        (lons, lats, data) as returned by
        data_reduce.swath_from_lonlat_grid with radius_of_influence=3000.
    """
    # Swath geolocation and the values of the requested channel.
    swath = SwathDefinition(lons=ds.lon.values, lats=ds.lat.values)
    swath_lons, swath_lats = swath.get_lonlats()
    channel_values = ds.tb[:, channel, :].values

    # Lon/lat of the target grid.
    target_lons, target_lats = area_interest.get_lonlats()

    return data_reduce.swath_from_lonlat_grid(
        target_lons, target_lats,
        swath_lons, swath_lats, channel_values,
        radius_of_influence=3000)

In [None]:
def basicMapPlotScat(x,y,data,namefile, area):
    """Show a scatter map of `data` in the cartopy CRS derived from `area`.

    Parameters
    ----------
    x, y : point coordinates, interpreted as PlateCarree coordinates.
    data : values used to colour the points (colour scale fixed to 130-270).
    namefile : unused while the savefig call stays disabled.
    area : object providing to_cartopy_crs() for the map projection.

    The figure is displayed with plt.show(); nothing is returned.
    """
    map_crs = area.to_cartopy_crs()
    plate_carree = ccrs.PlateCarree()

    fig = plt.figure(figsize=(8, 6))
    ax = plt.axes(projection=map_crs)
    ax.add_feature(cartopy.feature.LAND, zorder=0, edgecolor='black')
    ax.set_global()
    ax.gridlines()
    ax.set_title("TB")

    # Second, labelled set of grid lines drawn in lon/lat coordinates.
    labelled_grid = ax.gridlines(
        crs=plate_carree, linewidth=0.1, color='black',
        alpha=0.5, linestyle='--', draw_labels=True)
    labelled_grid.xformatter = LONGITUDE_FORMATTER
    labelled_grid.yformatter = LATITUDE_FORMATTER

    # Coloured points; alternative colour limits tried before: 3-18, 180-270.
    scatter_options = dict(
        c=data, s=0.15, cmap="viridis",
        transform=plate_carree,
        vmin=130, vmax=270,
    )
    points = ax.scatter(x, y, **scatter_options)
    colorbar = fig.colorbar(points)
    colorbar.set_label("Brightness temperature [K]")

    plt.show()
    # To write the figure to disk instead:
    #plt.savefig(namefile+'.png', bbox_inches='tight', dpi=150)
    
def basicMapPlotScat1(x,y,data,namefile, area):
    """Scatter `data` on an orthographic globe and save it as '<namefile>.png'.

    Parameters
    ----------
    x, y : point coordinates passed to transform_points as Geodetic
        coordinates.
    data : values used to colour the points (colour scale fixed to 130-270).
    namefile : output path without the '.png' extension.
    area : unused; the view is fixed to ccrs.Orthographic(60, -15).

    The figure is written at 300 dpi; nothing is returned.
    """
    fig = plt.figure()

    globe_view = ccrs.Orthographic(60,-15)
    ax = plt.axes(projection=globe_view)

    # NOTE(review): this rotated-pole CRS is created but never used below.
    crs = ccrs.RotatedPole(pole_longitude=177.5, pole_latitude=37.5)
    lonlat = ccrs.Geodetic()

    ax.add_feature(cartopy.feature.LAND, zorder=0, edgecolor='black')

    # Convert the points to the map projection up front, so the scatter call
    # needs no transform argument.
    projected = globe_view.transform_points(lonlat, x, y)

    ax.set_global()
    ax.gridlines()

    # Coloured points; alternative colour limits tried before: 3-18, 180-270.
    points = ax.scatter(
        projected[:,0], projected[:,1],
        c=data, s=0.15, cmap="viridis",
        vmin=130, vmax=270,
    )
    colorbar = fig.colorbar(points)
    colorbar.set_label("Temp. Bright [K]")

    plt.tight_layout()
    plt.savefig(namefile+'.png', bbox_inches='tight', dpi=300)

In [None]:
# One map per channel position 0..3, written to scene_channel_<n>.png.
# The reduction uses the area loaded from areas.yaml; area_interest is only
# handed to the plotting helper.
for channel in range(4):
    reduced_lon_scene, reduced_lat_scene, reduced_data_scene = get_TB_frame(
        ds_work, area_def_world, channel)

    basicMapPlotScat1(
        reduced_lon_scene, reduced_lat_scene, reduced_data_scene,
        f'scene_channel_{channel}', area_interest)

In [None]:
# Some histograms:

#ds_tb_log = np.log10(ds_work.tb[:,0,:]) 
#ds_work.tb[:,0,:].plot.hist(bins=20,)
#ds_tb_log.plot.hist(bins=30,)

In [None]:
def bigHistogram(da, numbins=20):
    """Histogram of all finite values in `da`, accumulated slice by slice.

    The counts are accumulated one `scene_across_track` slice at a time, and
    every slice is binned over the SAME range (global min .. global max of
    `da`). Binning each slice over its own min/max, as done before, summed
    counts that belonged to different bin edges and did not match the
    returned bin positions; it also failed on a slice containing only NaN.

    Parameters
    ----------
    da : array-like object exposing `.values`, `.isel(scene_across_track=i)`
        and `da["scene_across_track"]` (e.g. an xarray DataArray).
    numbins : int, optional
        Number of equal-width bins (default 20).

    Returns
    -------
    hist : numpy.ndarray of int
        Counts per bin; NaN values are not counted.
    mybins : numpy.ndarray of float
        Midpoints of the bins.

    Raises
    ------
    ValueError
        If `da` holds no finite value.
    """
    datamin = np.nanmin(da.values)
    datamax = np.nanmax(da.values)
    if not (np.isfinite(datamin) and np.isfinite(datamax)):
        raise ValueError('bigHistogram needs at least one finite value in da')

    n_slices = len(da["scene_across_track"])
    hist = np.zeros(numbins, dtype=np.int64)
    edges = None
    for i in range(n_slices):
        # With an explicit range, values outside it (including NaN) are
        # simply not counted.
        counts, edges = np.histogram(
            da.isel(scene_across_track=i).values.ravel(),
            bins=numbins, range=(datamin, datamax))
        hist += counts
        print('Step '+str(i + 1)+' of '+str(n_slices)+' done!')

    # Midpoints derived from the edges actually used by np.histogram.
    mybins = 0.5 * (edges[:-1] + edges[1:])

    return hist, mybins


In [None]:
# channels: 
# 0 => 19 GHz, H
# 1 => 19 GHz, V
# 2 => 37 GHz, H
# 3 => 37 GHz, V

In [None]:
# Position 3 on the second axis of tb (37 GHz V according to the channel list
# in the previous cell). Time steps containing only NaN are dropped and the
# result is re-chunked along time.
da = ds_work.tb[:,3,:].dropna(
    dim='time', how='all').chunk(
    chunks={'time':45000})

# Accumulated histogram of that channel.
numbins = 20
hist, bins = bigHistogram(da, numbins=numbins)

In [None]:
# Plot histogram using seaborn:
# the pre-computed counts are passed as weights of the bin positions, so
# seaborn re-bins `bins` into numbins bars instead of reading raw data.
plt.figure()
sns.histplot(x=bins, weights=hist, discrete=False, bins=numbins)
plt.xlabel('Temperature Brightness [K] ')
plt.grid(visible=True)
plt.title('Distribution of Temp. Brightness in channel 37V')
# Written to the current working directory.
plt.savefig('hist_TB_channel37V.png',dpi =150) 

In [None]:
# Positions 0 and 3 on the second axis of tb (plotted below as 19H and 37V).
# Each one is filtered on its own, dropping time steps that are all NaN.
# NOTE(review): the two arrays may therefore keep different time steps -
# confirm they still line up before pairing them.
da0 = ds_work.tb[:,0,:].dropna(
    dim='time', how='all').chunk(
    chunks={'time':45000})
da1 = ds_work.tb[:,3,:].dropna(
    dim='time', how='all').chunk(
    chunks={'time':45000})


In [None]:
# Display the summary of the first selected channel.
da0

In [None]:
# Scatter of the two channels after flattening (time, scene_across_track)
# into a single 'index' dimension; every sample becomes one point.
plt.figure()
plt.scatter(da0.stack(index=("time","scene_across_track")), 
           da1.stack(index=("time","scene_across_track")))
plt.xlabel('Temperature Brightness [K], 19H')
plt.ylabel('Temperature Brightness [K], 37V')
plt.grid(visible=True)
plt.title('Scatter plot 19H vs 37V')
#plt.show()
# Written to the current working directory.
plt.savefig('scatter_19H_37V.png',dpi =150) 

In [None]:
# All channels of tb, without the time steps that contain only NaN.
ds_tb = ds_work.tb[:,:,:].dropna(
    dim='time', how='all')
# Display its summary.
ds_tb

In [None]:
# Flatten tb into a table with one row per (time, scene_across_track) sample
# and one column per scene channel, dropping the rows that are entirely NaN.
# The pipeline is evaluated ONCE; it used to run twice, the first time only
# to obtain the number of rows.
dataframe_TB = (
    ds_work.tb[:,:,:]
    .stack(index=('time','scene_across_track'))
    .transpose("index", "scene_channel")
    .dropna(how='all', dim='index')
    .to_pandas()
)

# Replace the (time, scene_across_track) index by a running sample number.
nrows = dataframe_TB.shape[0]
newIndex = np.arange(nrows)
dataframe_TB = dataframe_TB.set_index(keys=newIndex)
dataframe_TB.index.name = 'example'
dataframe_TB #.to_csv('eigenVal.csv')

In [None]:
# Work from the CSV copy of the table. The file is written only when it does
# not exist yet, so a fresh run no longer depends on a file left behind by an
# earlier session, and an existing file is never overwritten.
tb_csv_path = 'dataframe_TB.csv'
if not os.path.exists(tb_csv_path):
    dataframe_TB.to_csv(tb_csv_path)

dataframe_TB = pd.read_csv(tb_csv_path)
# The stored index comes back as an ordinary column; drop it and keep the
# default running index under the same name.
del dataframe_TB['example']
dataframe_TB.index.name = 'example'
dataframe_TB

In [None]:
# Hold out 20 % of the samples; the fixed random_state makes the split repeatable.
X_train, X_test = train_test_split(dataframe_TB, test_size=0.2, random_state=42)

In [None]:
# Split the held-out part again: 80 % stays in X_test, 20 % becomes the base
# of the synthetic outliers. X_test is overwritten, so running this cell
# twice shrinks it a second time.
X_test, X_outliers = train_test_split(X_test, test_size=0.2, random_state=42)

In [None]:
# Turn the held-out rows into synthetic outliers: consecutive segments of
# 100000 rows are shifted by a constant offset, all columns alike.
# The segments are now contiguous and the last one runs to the end of the
# frame. The former slices (100001:200000, ...) skipped rows 100000, 200000,
# ..., 600000 and hard-coded the total length 645857.
# NOTE: the shift is applied in place, so running this cell twice shifts twice.
outlier_offsets = [3, -3, 5, -5, 10, -10, 15]
segment_len = 100000

# Work on an explicit copy so the assignment does not target a view of X_test.
X_outliers = X_outliers.copy()
for seg, offset in enumerate(outlier_offsets):
    start = seg * segment_len
    is_last = seg == len(outlier_offsets) - 1
    stop = None if is_last else start + segment_len
    X_outliers.iloc[start:stop, :] = X_outliers.iloc[start:stop, :] + offset

In [None]:
# Fit the model.
# An exact kernel model, svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1),
# was tried before; the pipeline below replaces it with a Nystroem kernel
# approximation followed by a linear one-class SVM trained with SGD.
nu = 0.05
gamma = 2.0
random_state = 42

transform = Nystroem(gamma=gamma, random_state=random_state)
clf_sgd = SGDOneClassSVM(
    nu=nu,
    shuffle=True,
    fit_intercept=True,
    random_state=random_state,
    tol=1e-4,
    verbose=1,
)
pipe_sgd = make_pipeline(transform, clf_sgd)
pipe_sgd.fit(X_train)

# Predictions are +1 (inlier) or -1 (outlier).
y_pred_train_sgd = pipe_sgd.predict(X_train)
y_pred_test_sgd = pipe_sgd.predict(X_test)
y_pred_outliers_sgd = pipe_sgd.predict(X_outliers)

# Errors: regular samples flagged as outliers, and synthetic outliers
# accepted as inliers.
n_error_train_sgd = int(np.count_nonzero(y_pred_train_sgd == -1))
n_error_test_sgd = int(np.count_nonzero(y_pred_test_sgd == -1))
n_error_outliers_sgd = int(np.count_nonzero(y_pred_outliers_sgd == 1))

In [None]:
def covariance(da):
    """Covariance of every ordered pair of scene channels in `da`.

    Parameters
    ----------
    da : xarray DataArray with a `scene_channel` coordinate and the
        dimensions 'time' and 'scene_across_track'.

    Returns
    -------
    list
        One covariance value per (channel1, channel2) pair, with channel1
        varying slowest. The list has n*n entries for n channels and can be
        reshaped into an n x n matrix.
    """

    def _flattened(channel):
        # All samples of one channel along a single chunked 'index' dimension.
        return da.sel(scene_channel=channel).stack(
            index=('time','scene_across_track')).chunk(
            chunks={'index':1000000})

    listMatrices = [
        xr.cov(_flattened(channel1), _flattened(channel2),
               dim='index').compute().values
        for channel1 in da.scene_channel
        for channel2 in da.scene_channel
    ]

    print("Computed variances: ")
    print(listMatrices)

    return listMatrices

In [None]:
# With xarray option 1 (only diagonal terms):

#ds_cov = xr.cov(ds_tb, ds_tb, dim = 'index')
#ds_cov

# With xarray option 2 (full matrix):
covList = covariance(ds_tb)

# covariance() returns one entry per ordered channel pair, so the matrix side
# is the number of scene channels (previously hard-coded to 4).
n_channels = ds_tb.sizes['scene_channel']
covMatrix = np.asarray(covList).reshape((n_channels, n_channels))


In [None]:
# A covariance matrix is symmetric, so use the symmetric solver: it returns
# real eigenvalues and orthonormal eigenvectors. eigh sorts eigenvalues in
# ascending order; reverse them so that component 0 carries the largest
# variance (np.linalg.eig gives no ordering guarantee at all).
eigenVal, eigenVec = np.linalg.eigh(covMatrix)
descending = np.argsort(eigenVal)[::-1]
eigenVal = eigenVal[descending]
eigenVec = eigenVec[:, descending]

In [None]:
# Eigenvalues placed on the diagonal of a square matrix, wrapped as a
# labelled array so it can be exported like the other matrices.
eigVal_DataArray = xr.DataArray(data=np.diag(eigenVal), 
                                dims=['channel_latentSpace','channel_latentSpace_T'])
eigVal_DataArray

In [None]:
# Covariance matrix as a labelled array (rows and columns are both channels).
cov_DataArray = xr.DataArray(data=covMatrix, 
                             dims=['scene_channel','scene_channel_T'])
cov_DataArray

In [None]:
# Eigenvector matrix as a labelled array: rows follow the original channels,
# columns are the components. The 'scene_channel' dimension name is what lets
# the projection further below contract against the data.
eigenVec_DataArray = xr.DataArray(data=eigenVec, 
                                  dims=['scene_channel','scene_channel_reduced'])
eigenVec_DataArray

In [None]:
# Write the three matrices as CSV files into the current working directory.
cov_DataArray.to_pandas().to_csv('covariance.csv')
eigenVec_DataArray.to_pandas().to_csv('eigenVec.csv')
eigVal_DataArray.to_pandas().to_csv('eigenVal.csv')

In [None]:

# Earlier attempt based on DataArray.dot, kept for reference:
#ds_T = ds_tb.stack(
#    index=('time','scene_across_track')).chunk(
#    chunks={'index':1000000}).dot(w_DataArray)
#ds_T

# Scores: the flattened samples multiplied by the eigenvector matrix.
# xr.dot sums over the dimension both operands share.
# NOTE(review): the data are not mean-centred before this projection -
# confirm this is intended.
ds_T = xr.dot(ds_tb.stack(
    index=('time','scene_across_track')).chunk(
    chunks={'index':1000000}), 
              eigenVec_DataArray)
ds_T

In [None]:
# Scatter of two score components. The labels, title and file name are built
# from the plotted component numbers so they cannot disagree with the data
# (they used to say "Scores_3" while component 1 was plotted).
score_x, score_y = 0, 1

plt.figure()
plt.scatter(ds_T[:,score_x],
            ds_T[:,score_y])
plt.xlabel(f'Scores_{score_x} [Units]')
plt.ylabel(f'Scores_{score_y} [Units]')
plt.grid(visible=True)
plt.title(f'Scatter plot Scores_{score_x} vs Scores_{score_y}')
#plt.show()
plt.savefig(f'scatter_Scores_{score_x}_Scores_{score_y}.png',dpi =150)

In [None]:


# 3-D scatter of the first three score components.
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(ds_T[:,0], 
           ds_T[:,1], ds_T[:,2])

ax.set_xlabel('Scores_0 [Units]')
ax.set_ylabel('Scores_1 [Units]')
ax.set_zlabel('Scores_2 [Units]')

#plt.grid(visible=True)
#plt.title('Scatter plot Scores_0_1_2')
plt.show()

In [None]:
# Scores as a table, without the rows that are entirely NaN. The conversion
# is evaluated ONCE; it used to run twice, the first time only to obtain the
# number of rows.
dataframe_scores = ds_T.dropna(how='all', dim='index').to_pandas()

# Replace the (time, scene_across_track) index by a running sample number.
nrows = dataframe_scores.shape[0]
newIndex = np.arange(nrows)
dataframe_scores = dataframe_scores.set_index(keys=newIndex)
dataframe_scores.index.name = 'example'
dataframe_scores #.to_csv('eigenVal.csv')

In [None]:
# Write the scores (with the 'example' index) to the current working directory.
dataframe_scores.to_csv('scores.csv')

In [None]:
#scores = pd.read_csv('scores.csv')
#scores
# Display the first score column.
dataframe_scores.iloc[:,0]

In [None]:
# Plot histogram using seaborn:
# distribution of the first score column in 20 bins.
plt.figure()
sns.histplot(data = dataframe_scores.iloc[:,0], bins=20)
plt.xlabel('Score_0')
plt.grid(visible=True)
plt.title('Distribution of Score 0')
# Written to the current working directory.
plt.savefig('hist_Score0.png',dpi =150) 

In [None]:
# Joint distribution of the first two score columns.
# jointplot is a figure-level function that creates its own figure, so no
# plt.figure() is opened beforehand (that only left an empty figure behind).
# Labels, grid, title and saving all go through the returned grid object.
joint_grid = sns.jointplot(data=dataframe_scores.iloc[:,0:2], x=0, y=1)
joint_grid.set_axis_labels('Score_0', 'Score_1')
joint_grid.ax_joint.grid(visible=True)

joint_figure = joint_grid.ax_joint.figure
joint_figure.suptitle('Distribution of Score 0 and 1')
# Leave room for the title above the marginal plot.
joint_figure.subplots_adjust(top=0.93)
joint_figure.savefig('JoinPlot_Score0_1.png',dpi =150)

In [None]:
# 2-D histogram of the first two score columns.
# displot is a figure-level function that creates its own figure, so no
# plt.figure() is opened beforehand (that only left an empty figure behind).
# Labels, grid, title and saving all go through the returned grid object.
dist_grid = sns.displot(data=dataframe_scores.iloc[:,0:2], x=0, y=1)
dist_grid.set_axis_labels('Score_0', 'Score_1')
dist_grid.ax.grid(visible=True)
dist_grid.ax.set_title('Distribution of Score 0 and 1')
dist_grid.ax.figure.savefig('hist2D_Score0_1.png',dpi =150)

In [None]:
# Read the exported scores back; the saved 'example' index returns as a column.
scores = pd.read_csv('scores.csv')

In [None]:
# Display the re-loaded scores table.
scores