In [None]:
import iris

In [None]:
import requests
from requests import HTTPError
from typing import List, Optional
from bs4 import BeautifulSoup

In [None]:
site = 'http://37.128.186.209/LAURA/ERA5/30year'
def get_html_data_list(site: str) -> List[str]:
    """Collect the ``ERA5*`` links listed on an HTML index page.

    Args:
        site: URL of the page to scan for anchors.

    Returns:
        The ``href`` of every ``<a>`` element whose target starts with
        ``ERA5``, in page order. The list is empty when the server answers
        with an error status.
    """
    result: List[str] = []
    html = requests.get(site)
    try:
        html.raise_for_status()
    except HTTPError:
        print(f'cannot get html from {site} because of {html.status_code}')
        # The body of an error response is not a listing; do not scrape it.
        return result
    soup = BeautifulSoup(html.text, 'html.parser')
    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href attribute yield None and are skipped.
        if href is not None and href.startswith('ERA5'):
            result.append(href)
    return result


In [None]:
# Fetch the ERA5 link names from the 30-year index page.
# Note: despite its name, `html` holds the returned list of href strings.
html = get_html_data_list('http://37.128.186.209/LAURA/ERA5/30year')

In [None]:
def make_url(base_url: str, filename: str) -> str:
    """Join a base URL and a file name with a single ``/`` between them.

    Trailing slashes on ``base_url`` are dropped before joining; ``filename``
    is used as given.
    """
    return '/'.join((base_url.rstrip('/'), filename))

In [None]:
from pathlib import Path
def make_dest_path(filename: str) -> str:
    """Return the download path for ``filename`` inside ``<cwd>/temp``.

    The ``temp`` folder is created in the current working directory if it
    does not exist yet.
    """
    download_dir = Path.cwd().joinpath('temp')
    download_dir.mkdir(exist_ok=True)
    return f'{download_dir}/{filename}'

In [None]:
from os import path
def download_file(site: str, dest_path: str) -> Optional[str]:
    """Stream a remote file to ``dest_path`` unless it is already there.

    Args:
        site: URL of the file to download.
        dest_path: Local path the file is written to.

    Returns:
        ``dest_path`` when the file is present locally after the call, or
        ``None`` when the server answered with an error status or the
        transfer failed.
    """
    # Look at the local copy first so a re-run does not open a connection.
    if path.exists(dest_path):
        print(f'file {dest_path} already downloaded')
        return dest_path
    # Write to a side file and rename it at the end, so an interrupted
    # transfer is never mistaken for a finished download on the next run.
    partial_path = f'{dest_path}.part'
    with requests.get(site, stream=True) as r:
        try:
            r.raise_for_status()
        except HTTPError:
            print(f'cannot download file from {site} because of {r.status_code}')
            # Do not store the body of an error response as if it were data.
            return None
        try:
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
            Path(partial_path).replace(dest_path)
        except Exception as e:
            print(f'cannot download file because of {e}')
            if path.exists(partial_path):
                Path(partial_path).unlink()
            return None
    return dest_path

In [None]:
# Download one archive directly. The destination is a bare file name, so it
# is resolved relative to the current working directory.
filename = download_file('http://37.128.186.209/LAURA/ERA5/30year/ERA5_30year_2mTemp.tar.gz','ERA5_30year_2mTemp.tar.gz')

In [None]:
def make_state_file(filetype: str) -> str:
    """Return the path of the state CSV for ``filetype`` inside ``<cwd>/temp``.

    Only the ``temp`` folder is created (if missing); the CSV file itself is
    not touched.
    """
    state_dir = Path.cwd().joinpath('temp')
    state_dir.mkdir(exist_ok=True)
    return f'{state_dir}/state_{filetype}.csv'

In [None]:
def insert_into_state_file(state_file_path : str, filename: str, status: str) -> None:
    """Append a ``filename,status`` row to the state CSV.

    Args:
        state_file_path: Path of the CSV file; created if it does not exist.
        filename: Name of the file the status refers to. When ``None`` no row
            is written.
        status: Status text stored next to the file name.
    """
    # The guard uses the function's own argument rather than a notebook
    # global, so the function does not depend on which cells have been run.
    if filename is None:
        return
    # Append mode: each call adds a row instead of truncating earlier ones.
    with open(state_file_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow([f'{filename}', f'{status}'])
    print(f'inserted status {status} for file {filename}')

In [None]:
def make_unpacked_folder(filename: str) -> str:
    """Create and return ``<cwd>/unpacked_temp/<name>`` for an archive.

    Args:
        filename: Archive file name; a trailing ``.tar.gz`` is removed to
            form the folder name.

    Returns:
        The folder path as a string. Missing parent folders are created.
    """
    suffix = '.tar.gz'
    # str.rstrip() strips a *set of characters*, which would also eat
    # trailing letters of the stem (e.g. "..._data"); cut the exact suffix.
    if filename.endswith(suffix):
        filename = filename[:-len(suffix)]
    target = Path.cwd() / 'unpacked_temp' / filename
    target.mkdir(mode=0o777, parents=True, exist_ok=True)
    return str(target)

In [None]:
def unpack_file(compressed_file: str, destination: str) -> None:
    """Extract a ``.tar.gz`` archive into ``destination``.

    Nothing is extracted when ``destination`` already contains entries.

    Args:
        compressed_file: Path of the gzip-compressed tar archive.
        destination: Existing folder to extract into.

    Raises:
        ValueError: If an archive member would be written outside
            ``destination``.
    """
    if len(os.listdir(destination)) > 0:
        print(f'file {compressed_file} already unpacked in destination {destination}')
        return
    dest_root = os.path.realpath(destination)
    with tarfile.open(compressed_file, mode='r:gz') as tar:
        members = tar.getmembers()
        # The archive comes from a download; check every member name before
        # writing anything so "../" or absolute paths cannot escape.
        for member in members:
            target = os.path.realpath(os.path.join(dest_root, member.name))
            if os.path.commonpath([dest_root, target]) != dest_root:
                raise ValueError(f'refusing to extract {member.name} outside of {destination}')
        tar.extractall(path=destination, members=members)
#     write state unpacked
    print (f'file extracted in {destination}')

In [None]:
def get_child_unpacked_folder(unpacked_folder: str) -> str:
    """Return the path of the single entry inside ``unpacked_folder``.

    Raises:
        ValueError: If the folder holds zero entries or more than one.
    """
    children = os.listdir(unpacked_folder)
    if len(children) == 1:
        (only_child,) = children
        return '/'.join([unpacked_folder, only_child])
    raise ValueError(f'{unpacked_folder} contains more than a child or is empty')

In [None]:
from typing import Generator
def get_zipped_nc_file(folder: str) -> Generator[Path, None, None]:
    """Lazily yield the ``*.gz`` entries directly inside ``folder``."""
    folder_path = Path(folder)
    return folder_path.glob('*.gz')


In [None]:
import gzip
def unzip_nc_file(filepath: str) -> str:
    """Decompress a gzip file next to itself and return the new path.

    Args:
        filepath: Path of the compressed file; must end in ``.gz``.

    Returns:
        ``filepath`` without its ``.gz`` suffix, i.e. the file written.

    Raises:
        ValueError: If ``filepath`` does not end in ``.gz``. Without this
            check the output path would equal the input path and opening it
            for writing would truncate the input.
    """
    suffix = '.gz'
    if not filepath.endswith(suffix):
        raise ValueError(f'file; {filepath} is not compressed')
    # Slice the exact suffix off: str.rstrip('.gz') strips any trailing run
    # of the characters '.', 'g' and 'z', mangling names such as "fizz.gz".
    output_filepath = filepath[:-len(suffix)]
    with gzip.open(filepath, 'rb') as f, open(output_filepath, 'wb') as w:
        while True:
            piece = f.read(1024)
            if not piece:
                break
            w.write(piece)
    return output_filepath

In [None]:
def delete_compressed_nc_file(filepath: str) -> Optional[str]:
    """Remove a ``.gz`` file and return the path that was removed.

    Raises:
        ValueError: If ``filepath`` does not end in ``.gz``; nothing is
            deleted in that case.
    """
    if not filepath.endswith('.gz'):
        raise ValueError(f'file; {filepath} is not compressed')
    os.remove(filepath)
    return filepath

In [None]:
def wait_for_decompress(filepath: str) -> None:
    """Block until the folder ``filepath`` contains no ``*.gz`` entries.

    The folder is polled once per second; a message is printed on every poll
    that still finds compressed files, and once more when none are left.
    """
    while list(Path(filepath).glob('*.gz')):
        print (f'there are still netcdf files to decompress')
        sleep(1)
    print(f'decompressed all netcdfs ------------------------------------------')

In [None]:
from iris.experimental.equalise_cubes import equalise_attributes

def merge_cubes(cube_list):
    """Equalise the attributes of ``cube_list`` and merge it into one cube.

    ``equalise_attributes`` is applied to ``cube_list`` itself before
    ``merge_cube()`` is called on it; the merged result is returned.
    """
    equalise_attributes(cube_list)
    merged = cube_list.merge_cube()
    return merged


In [None]:
def make_output_nc_filepath(folder_with_nc: str) -> str:
    """Build the merged-file path ``<cwd>/results/<parent folder name>.nc``.

    The file name is taken from the second-to-last ``/``-separated component
    of ``folder_with_nc``. The ``results`` folder is created if missing.
    """
    results_dir = Path.cwd().joinpath('results')
    results_dir.mkdir(mode=0o777, exist_ok=True)
    parent_folder = folder_with_nc.rsplit('/', 1)[0]
    dataset_name = parent_folder.rsplit('/', 1)[-1]
    return f'{results_dir}/{dataset_name}.nc'

In [None]:
def merge_all_nc(filepath: str, output_path:str) -> str:
    """Merge every ``*.nc`` file in a folder into one file.

    Args:
        filepath: Folder whose ``*.nc`` files are each loaded as a cube.
        output_path: Path the merged cube is saved to.

    Returns:
        ``output_path``.
    """
    loaded = [iris.load_cube(str(nc_file)) for nc_file in Path(filepath).glob('*.nc')]
    merged_cube = merge_cubes(iris.cube.CubeList(loaded))
    iris.save(merged_cube, output_path)
    print (f'saved merged nc file in {output_path}')
    return output_path

In [None]:
from shutil import rmtree
def cleanup_folder(folder_path: str) -> None:
    """Recursively delete ``folder_path`` and everything below it."""
    rmtree(folder_path)
    return None

In [None]:
from typing import Tuple
def get_aws_config() -> Tuple[str, str, str, str]:
    """Read the S3 settings from the environment.

    Returns:
        ``(endpoint_url, access_key, secret_access_key, bucket)`` taken from
        ``S3_URL``, ``S3_ID``, ``S3_KEY`` and ``S3_BUCKET``. A missing
        ``S3_URL`` falls back to the built-in endpoint; the other three fall
        back to the empty string.
    """
    settings = (
        ('S3_URL', 'http://s3-uk-1.sa-catapult.co.uk'),
        ('S3_ID', ''),
        ('S3_KEY', ''),
        ('S3_BUCKET', ''),
    )
    endpoint_url, access_key, secret_access_key, s3_bucket = (
        os.getenv(name, default) for name, default in settings
    )
    return endpoint_url, access_key, secret_access_key, s3_bucket

In [None]:
import boto3
from boto3.s3.transfer import TransferConfig
from time import sleep

def upload_to_s3(src_path: str, dest_path: str) -> None:
    """Upload a local file to the configured S3 bucket.

    Args:
        src_path: Local file to upload.
        dest_path: Object key to store the file under.

    Endpoint, credentials and bucket come from ``get_aws_config()``.
    """
    endpoint, key_id, secret_key, bucket = get_aws_config()
    print (f'starting uploading {src_path}')
    client_options = {
        'endpoint_url': endpoint,
        'aws_access_key_id': key_id,
        'aws_secret_access_key': secret_key,
    }
    s3_client = boto3.client('s3', **client_options)
    s3_client.upload_file(src_path, bucket, dest_path)
    print (f'Uploaded {src_path} to S3 as {dest_path}')

In [None]:
def uncompress_downloaded_tar(downloaded_file_path: str, filename: str) -> str:
    """Unpack a downloaded archive and gunzip the files it contains.

    Args:
        downloaded_file_path: Path of the downloaded ``.tar.gz`` archive.
        filename: Archive file name, used to name the unpack folder.

    Returns:
        Path of the single child folder of the unpack folder; each ``*.gz``
        file in it has been decompressed and the compressed copy deleted.
    """
    dest_unpacked = make_unpacked_folder(filename)
    unpack_file(downloaded_file_path, dest_unpacked)
    child_unpacked = get_child_unpacked_folder(dest_unpacked)
    for nc_file in get_zipped_nc_file(child_unpacked):
        zipped_path = str(nc_file)
        unzip_nc_file(zipped_path)
        delete_compressed_nc_file(zipped_path)
    print(f'nc files unzipped in folder {dest_unpacked}')
    return child_unpacked

In [None]:
def make_s3_destination_filename(url: str, merged_nc_filepath: str) -> str:
    """Build the S3 object key ``<last URL segment>/<merged file name>``.

    Args:
        url: Index URL; its last path segment becomes the key prefix.
        merged_nc_filepath: Path of the merged file; only its final
            ``/``-separated component is used.

    Returns:
        The object key, e.g. ``30year/ERA5_30year_2mTemp.nc``.
    """
    # Drop trailing slashes first (as make_url does); otherwise the last
    # segment is empty and the key would start with "/".
    data_recurrence = url.rstrip('/').rsplit('/', 1)[-1]
    merged_nc_filename = merged_nc_filepath.rsplit('/', 1)[-1]
    return f'{data_recurrence}/{merged_nc_filename}'

In [None]:
def check_if_file_exsists_in_results(file_path: str) -> bool:
    """Placeholder that is not implemented yet.

    TODO: the body is empty, so ``file_path`` is ignored and the call
    returns ``None`` rather than the annotated ``bool``.
    """
    return None

In [None]:
# Lookup from a cube's long_name to the standard_name assigned to it by
# add_standard_name_to_cube. A long_name missing from this table makes that
# function raise KeyError.
std_names = {
    '2 metre temperature': 'surface_temperature',
    'Total precipitation': 'precipitation_flux',
    'Sea surface temperature': 'sea_surface_temperature',
    'Soil temperature level 1': 'soil_temperature',
    'Volumetric soil water layer 1': 'volume_fraction_of_condensed_water_in_soil'
}
def add_standard_name_to_cube(path_to_file: str, std_name_dict: dict) -> str:
    """Give the cube stored in ``path_to_file`` a standard name if it has none.

    Args:
        path_to_file: NetCDF file holding a single cube; rewritten in place
            when a standard name is added.
        std_name_dict: Lookup from the cube's ``long_name`` to the standard
            name to assign.

    Returns:
        ``path_to_file``.

    Raises:
        KeyError: If the cube has no standard name and its ``long_name`` is
            not a key of ``std_name_dict``.
    """
    cube = iris.load_cube(path_to_file)
    print(cube)
    if cube.standard_name is None:
        std_name = std_name_dict[cube.long_name]
        # Save under a sibling name first, then swap it in for the original.
        temp_out = '/'.join([path_to_file.rsplit('/', 1)[0], std_name + '.nc'])
        print(temp_out)
        cube.standard_name = std_name
        iris.save(cube, temp_out)
        Path(path_to_file).unlink()
        Path(temp_out).rename(path_to_file)
        print(f'added std name {std_name} to {path_to_file}')
    return path_to_file


In [None]:
import os
import csv
import tarfile
def main(url: str) -> None:
    """Run the whole pipeline for every archive listed at ``url``.

    For each listed archive: download it, unpack and gunzip its contents,
    merge the NetCDF files into one, add a standard name, upload the result
    to S3 and delete the local copies. An archive whose download fails is
    skipped. The working folders are removed at the end.

    Args:
        url: Index page that lists the ``ERA5*`` archives.
    """
    data_list = get_html_data_list(site=url)
    print (f'there are {len(data_list)} file to process')
    for count, data in enumerate(data_list, start=1):
        site = make_url(url, data)
        dest_path = make_dest_path(filename=data)
        print(f'start downloading {data}')
        downloaded_file = download_file(site=site, dest_path=dest_path)
        if downloaded_file is None:
            # download_file signals failure with None; passing that on would
            # only fail later with an unrelated error while opening the tar.
            print(f'skipping {data}: download failed')
            continue
        print (f'downloaded file {downloaded_file}')
        folder_with_nc = uncompress_downloaded_tar(downloaded_file_path=downloaded_file, filename=data)
        wait_for_decompress(folder_with_nc)
        merged_nc_filepath = make_output_nc_filepath(folder_with_nc=folder_with_nc)
        merge_all_nc(filepath=folder_with_nc,output_path=merged_nc_filepath)
        add_standard_name_to_cube(path_to_file=merged_nc_filepath, std_name_dict=std_names)
        # Remove the per-archive unpack folder (the parent of the nc folder).
        cleanup_folder(folder_path=str(Path(folder_with_nc).parent))
        print(f'processed {count} of {len(data_list)}')
        s3_destination = make_s3_destination_filename(url=url, merged_nc_filepath=merged_nc_filepath)
        print(f'S3 destination: {s3_destination}')
        upload_to_s3(src_path=merged_nc_filepath, dest_path=s3_destination)
        print(f'uploaded {count} files of {len(data_list)} | file uploaded to destination {s3_destination}')
        Path(downloaded_file).unlink()
        Path(merged_nc_filepath).unlink()
    for folder_name in ('temp', 'unpacked_temp', 'results'):
        folder = Path.cwd() / folder_name
        # These folders are only created while processing an archive, so
        # they may not exist (e.g. empty listing or every download failed).
        if folder.exists():
            cleanup_folder(str(folder))


In [None]:
# Run the full download / merge / upload pipeline for the 30-year listing.
main(url='http://37.128.186.209/LAURA/ERA5/30year')

In [None]:
import iris
from pathlib import Path
cube = iris.load_cube(str(Path.cwd() / 'results' / 'ERA5_30year_2mTemp_std_name.nc'))

In [None]:
# NOTE(review): unlink is called on the Path class without an instance, so
# this cell raises TypeError. The file meant to be removed cannot be
# determined from here -- replace with Path(<file>).unlink() or drop the cell.
Path.unlink()

In [None]:
print(cube.standard_name)

In [None]:
import iris
from pathlib import Path
cube = iris.load_cube(str(Path.cwd() / 'results' / 'ERA5_30year_2mTemp.nc'))

In [None]:
fn = str(Path.cwd() / 'results' / 'ERA5_30year_2mTemp.nc')
add_standard_name_to_cube(path_to_file=fn, std_name_dict=std_names)

In [None]:
# Step-by-step run of the pipeline for a single archive (no S3 upload).
file = 'ERA5_30year_2mTemp.tar.gz'
state_file = make_state_file('yearly')
site = make_url('http://37.128.186.209/LAURA/ERA5/30year', 'ERA5_30year_2mTemp.tar.gz')
dest_path = make_dest_path(filename=file)
downloaded_file = download_file(site=site, dest_path=dest_path)
print (f'downloaded file {downloaded_file}')
dest_unpacked = make_unpacked_folder(filename=file)
print(f'dest unpacked: {dest_unpacked}')
unpack_file(compressed_file=downloaded_file, destination=dest_unpacked)
child_unpacked = get_child_unpacked_folder(unpacked_folder=dest_unpacked)
print (child_unpacked)
for nc_file in get_zipped_nc_file(child_unpacked):
    output_filepath = unzip_nc_file(str(nc_file))
    _ = delete_compressed_nc_file(str(nc_file))
wait_for_decompress(child_unpacked)
# make_output_nc_filepath takes only `folder_with_nc` and names the output
# after that folder's parent, i.e. after the last component of dest_unpacked.
output_nc_path = make_output_nc_filepath(folder_with_nc=child_unpacked)
saved_file = merge_all_nc(child_unpacked,output_path=output_nc_path)

In [None]:
dest_unpacked.rsplit('/',1)[-1]+'.nc'

In [None]:
p = Path('/home/jovyan/work/unpacked_temp/ERA5_30year_2mTemp/2mTemp/')
[f for f in p.iterdir()]

In [None]:
# NOTE(review): the merge_all_nc defined earlier in the notebook also requires
# `output_path`; this call only matches the one-argument redefinition that
# appears further down, so it fails when cells are run top to bottom.
merge_all_nc(filepath='/home/jovyan/work/unpacked_temp/ERA5_30year_2mTemp/2mTemp/')

In [None]:
# Load two of the unpacked daily files and try to combine them.
# NOTE(review): merge_2_cubes is not defined in the visible part of this
# notebook -- unless it is defined elsewhere, this raises NameError.
cube_1 = iris.load_cube('/home/jovyan/work/unpacked_temp/ERA5_30year_2mTemp/2mTemp/2mTemp_ERA5_SouthPacific_30_year_av_0101.nc')
cube_2 = iris.load_cube('/home/jovyan/work/unpacked_temp/ERA5_30year_2mTemp/2mTemp/2mTemp_ERA5_SouthPacific_30_year_av_0102.nc')
cube_list = merge_2_cubes(cube_1=cube_1, cube_2=cube_2)

In [None]:
def concatenate_2_cubes(cube_1, cube_2):
    """Concatenate two cubes along a new leading ``time`` dimension.

    Args:
        cube_1: First cube.
        cube_2: Second cube.

    Returns:
        The single cube produced by ``CubeList.concatenate_cube()``.

    NOTE(review): assumes ``time`` is a scalar coordinate on both cubes --
    TODO confirm against the data files.
    """
    from iris.util import new_axis

    print(cube_1.coords(dim_coords=False))
    # Cube.add_dim_coord expects a coordinate object, not a coordinate name.
    # new_axis returns a copy with the named scalar coordinate promoted to a
    # new leading dimension, which is what concatenation needs.
    cube_list = iris.cube.CubeList([new_axis(cube_1, 'time'), new_axis(cube_2, 'time')])
    return cube_list.concatenate_cube()

In [None]:
cube_1 = iris.load_cube('/home/jovyan/work/unpacked_temp/ERA5_30year_2mTemp/2mTemp/2mTemp_ERA5_SouthPacific_30_year_av_0101.nc')
cube_2 = iris.load_cube('/home/jovyan/work/unpacked_temp/ERA5_30year_2mTemp/2mTemp/2mTemp_ERA5_SouthPacific_30_year_av_0102.nc')
cube_list = concatenate_2_cubes(cube_1=cube_1, cube_2=cube_2)

In [None]:
print(cube_list)

In [None]:
def merge_all_nc(filepath: str):
    """Load every ``*.nc`` file in ``filepath`` as a cube and print the list.

    NOTE(review): this redefinition shadows the earlier two-argument
    ``merge_all_nc(filepath, output_path)``; once this cell has run, calls
    that pass ``output_path`` (as ``main`` does) raise TypeError.
    """
    cube_list = [iris.load_cube(str(nc)) for nc in Path(filepath).glob('*.nc')]
    print(cube_list)

In [None]:
from iris.experimental.equalise_cubes import equalise_attributes
equalise_attributes(cube_list)
res = cube_list.merge_cube()

In [None]:
print(res)

In [None]:
res.derived_coords

In [None]:
import iris

cube = iris.load_cube('/home/jovyan/work/unpacked_temp/ERA5_30year_2mTemp/2mTemp/2mTemp_ERA5_SouthPacific_30_year_av_0317.nc')
for coord in cube.coords():
    print(coord.name())

In [None]:
def extract_nc_files_in_folder(nc_folder: str) -> str:
    """Placeholder that is not implemented yet.

    TODO: get list of files. For now ``nc_folder`` is ignored and the call
    returns ``None`` rather than the annotated ``str``.
    """
    return None

In [None]:
from bs4 import BeautifulSoup

# def get_list  -- unfinished function header; left as a comment because the
# bare `def get_list` line made this whole cell a syntax error.
# NOTE(review): `html` must be a response object with a `.text` attribute
# here, but an earlier cell binds `html` to the list returned by
# get_html_data_list -- confirm where the page text should come from.
soup = BeautifulSoup(html.text, 'html.parser')

In [None]:
print(soup.prettify())

In [None]:
for link in soup.find_all('a'):
    if link.get('href').startswith('ERA5'):
        print(link.get('href'))

