## Generating a data model for CLIWOC

In [1]:
from __future__ import annotations

import json
import os
import shutil

import pandas as pd

from cdm_reader_mapper import mdf_reader, test_data

try:
    from importlib.resources import files as get_files
except ImportError:
    from importlib_resources import files as get_files

from tempfile import TemporaryDirectory

2024-06-06 10:48:27,524 - root - INFO - init basic configure of logging success
  from .autonotebook import tqdm as notebook_tqdm
2024-06-06 10:48:30,630 - root - INFO - init basic configure of logging success


In [2]:
# Name of the built-in data model used to parse the file.
schema = "imma1"
# Source path of the sample file, looked up in the test_data entry test_063_714.
data_file_path = test_data.test_063_714["source"]
# Parse the file with the built-in model; later cells access the result via data_raw.data.
data_raw = mdf_reader.read(data_file_path, data_model=schema)

2024-06-06 10:48:30,755 - root - INFO - Attempting to fetch remote file: imma1_714/input/063-714_2010-07-01_subset.imma.md5
2024-06-06 10:48:31,193 - root - INFO - READING DATA MODEL SCHEMA FILE...
2024-06-06 10:48:31,199 - root - INFO - EXTRACTING DATA FROM MODEL: imma1
2024-06-06 10:48:31,200 - root - INFO - Getting data string from source...
2024-06-06 10:48:31,516 - root - INFO - CREATING OUTPUT DATA ATTRIBUTES FROM DATA MODEL


In [3]:
# Preview the first rows of the "c99" column, which the output shows as raw strings.
data_raw.data["c99"].head()

0    99 0 48683,20100701,0000,  88.3790,  43.2100,1...
1    99 0 25629,20100701,0000,  87.8090, -54.8550,1...
2    99 0 25595,20100701,0000,  87.5610,  41.7610,1...
3    99 0 26558,20100701,0000,  87.4040,  17.9720,1...
4    99 0 26559,20100701,0000,  87.4030,  17.9730,1...
Name: c99, dtype: object

In [4]:
# Take the "c99" entry at position 3 (the fourth row) and display it in full.
line = data_raw.data["c99"].iloc[3]
line

'99 0 26558,20100701,0000,  87.4040,  17.9720,1,        ,    ,       ,       , ,   1.60,1,1013.60,1,       , ,       , ,       ,'

## Custom Schema

To use a custom schema we need to use the `data_model_path` argument in `mdf_reader.read`. The structure of the directory is:

```
name_of_model/
    name_of_model.json
    code_tables/
        ...
```

The `code_tables` sub-directory contains the code tables that map the key columns in the data to their values.

In this demonstration we will create a new model in a temporary directory. It will be a copy of the `imma1_d730` schema and code tables.

In [5]:
# Build the custom model directory <tmp>/<model_name>/ inside a fresh temporary directory.
# tmp_dir is kept in a variable so the TemporaryDirectory object stays referenced.
tmp_dir = TemporaryDirectory()
model_name = "imma1_d730"
my_model_path = os.path.join(tmp_dir.name, model_name)
os.mkdir(my_model_path)

# Read the schema by name and serialise it as indented JSON.
schema = mdf_reader.schemas.read_schema(model_name)
json_object = json.dumps(schema, indent=2)

# Write the schema to <model_name>.json inside the model directory.
schema_file = os.path.join(my_model_path, f"{model_name}.json")
with open(schema_file, "w") as outfile:
    outfile.write(json_object)

# Locate the packaged "imma1" code tables and copy them into code_tables/.
# NOTE(review): only the base "imma1" tables are copied; the log output of the
# read in the next cell reports code table files not found in this directory.
# Confirm whether model-specific tables should be copied as well.
code_tables_package = f"{mdf_reader.properties._base}.code_tables.imma1"
code_tables_path = get_files(code_tables_package)
shutil.copytree(code_tables_path, os.path.join(my_model_path, "code_tables"))

'/var/folders/vf/pskk3w4j38l8kk7bc9xm07j00000gp/T/tmpude1dlw9/imma1_d730/code_tables'

In [6]:
# Source path of the sample file listed under test_data.test_133_730.
data_file_path = test_data.test_133_730["source"]
# Read it with the custom model directory (data_model_path) instead of a built-in data_model name.
data = mdf_reader.read(data_file_path, data_model_path=my_model_path)

2024-06-06 10:48:31,813 - root - INFO - Attempting to fetch remote file: imma1_730/input/133-730_1776-10-01_subset.imma.md5
2024-06-06 10:48:31,930 - root - INFO - READING DATA MODEL SCHEMA FILE...
2024-06-06 10:48:31,942 - root - INFO - EXTRACTING DATA FROM MODEL: /var/folders/vf/pskk3w4j38l8kk7bc9xm07j00000gp/T/tmpude1dlw9/imma1_d730
2024-06-06 10:48:31,944 - root - INFO - Getting data string from source...
2024-06-06 10:48:32,503 - root - ERROR - Error validating coded element ('c99_logbook', 'Illustr'):
2024-06-06 10:48:32,503 - root - ERROR - Code table file /var/folders/vf/pskk3w4j38l8kk7bc9xm07j00000gp/T/tmpude1dlw9/imma1_d730/code_tables/ICOADS.C99.ILLUSTRATIONI.json not found
2024-06-06 10:48:32,504 - root - ERROR - Error validating coded element ('c99_voyage', 'LatInd'):
2024-06-06 10:48:32,505 - root - ERROR - Code table file /var/folders/vf/pskk3w4j38l8kk7bc9xm07j00000gp/T/tmpude1dlw9/imma1_d730/code_tables/ICOADS.C99.COORDI.json not found
2024-06-06 10:48:32,506 - root - E

In [7]:
# First rows of the "c99_sentinal" section; the list selector keeps the result a DataFrame.
data.data[["c99_sentinal"]].head()

Unnamed: 0_level_0,c99_sentinal,c99_sentinal,c99_sentinal
Unnamed: 0_level_1,ATTI,ATTL,BLK
0,99,0,
1,99,0,
2,99,0,
3,99,0,
4,99,0,


In [8]:
# Summarise every column of the "c99_logbook" section; include="all" also covers non-numeric columns.
data.data[["c99_logbook"]].c99_logbook.describe(include="all")

Unnamed: 0,InstAbbr,InsName,InsPlace,InsLand,No_data_entry,NameArchiveSet,ArchivePart,Specification,Logbook_id,Logbook_language,Image_No,Illustr
count,5,5,5,5,1,0.0,1,1,5,5,1,5
unique,4,4,4,4,1,0.0,1,1,5,4,1,1
top,AGI,ARCHIVO GENERAL DE INDIAS,SEVILLE,SPAIN,20,,MCC,1391,"CORREOS, 275A R11",SPANISH,MCC_20_1391_0032,0
freq,2,2,2,2,1,,1,1,1,2,1,5


In [9]:
# Lift pandas' column display limit so the wide summary below is not truncated.
# This changes the option globally, so it also applies to later cells.
pd.set_option("display.max_columns", None)
# Summarise every column of the "c99_voyage" section, including non-numeric ones.
data.data[["c99_voyage"]].c99_voyage.describe(include="all")

Unnamed: 0,drLatDeg,drLatMin,drLatSec,drLatHem,drLonDeg,drLonMin,drLonSec,drLonHem,LatDeg,LatMin,LatSec,LatHem,LonDeg,LonMin,LonSec,LonHem,LatInd,LonInd,ZeroMeridian,LMname1,LMdirection1,LMdistance1,LMname2,LMdirection2,LMdistance2,LMname3,LMdirection3,LMdistance4,PosCoastal,Calendar_type,logbook_date,TimeOB,Day_of_the_week,PartDay,Watch,Glasses,Start_day,ShipName,Nationality,Ship_type,Company,Name1,Rank1,Name2,Rank2,Name3,Rank3,voyage_from,voyage_to,Anchored_ind,AnchorPlace,DASno,VoyageIni,Course_ship,Ship_speed,Distance,EncName,EncNat
count,4.0,4.0,4.0,4,2.0,2.0,2.0,2,2.0,2.0,2.0,2,3.0,3.0,3.0,3,5.0,5.0,5,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0,5.0,5.0,1,1.0,1,1.0,5,5,5,4,2,4,4,1,1,0.0,0.0,5,5,5.0,0.0,0.0,5.0,2,0.0,4.0,0.0,0.0
unique,,,,2,,,,1,,,,2,,,,2,2.0,3.0,4,1,1,,0.0,0.0,,0.0,0.0,,1.0,1.0,1.0,,1,1.0,1,,2,5,4,4,2,4,3,1,1,0.0,0.0,5,4,1.0,0.0,0.0,5.0,2,0.0,4.0,0.0,0.0
top,,,,N,,,,E,,,,N,,,,E,1.0,2.0,TENERIFE,LIZARD,N87:17E,,,,,,,,0.0,2.0,17711001.0,,TUESDAY,3.0,VM,,UNKNOWN,EL COLON,SPANISH,PAQUEBOTE,MCC,THOMAS D'ORVES,CAPITAN,CHARLES WARREN,2ND OFFICER/LIEUTENANT,,,LA HABANA,LA CORUÑA,0.0,,,17710819.0,WTZ,,175.0,,
freq,,,,3,,,,2,,,,1,,,,2,3.0,2.0,2,1,1,,,,,,,,5.0,5.0,5.0,,1,1.0,1,,4,1,2,1,1,1,2,1,1,,,1,2,5.0,,,1.0,1,,1.0,,
mean,27.25,24.25,0.0,,26.5,36.0,0.0,,22.0,9.5,0.0,,121.666667,42.666667,0.0,,,,,,,230.0,,,,,,,,,,12.0,,,,8.0,,,,,,,,,,,,,,,,,,,,,,
std,21.884165,16.879475,0.0,,19.091883,19.79899,0.0,,29.698485,13.435029,0.0,,195.208436,11.23981,0.0,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,
min,1.0,5.0,0.0,,13.0,22.0,0.0,,1.0,0.0,0.0,,4.0,33.0,0.0,,,,,,,230.0,,,,,,,,,,12.0,,,,8.0,,,,,,,,,,,,,,,,,,,,,,
25%,13.75,17.0,0.0,,19.75,29.0,0.0,,11.5,4.75,0.0,,9.0,36.5,0.0,,,,,,,230.0,,,,,,,,,,12.0,,,,8.0,,,,,,,,,,,,,,,,,,,,,,
50%,29.5,23.0,0.0,,26.5,36.0,0.0,,22.0,9.5,0.0,,14.0,40.0,0.0,,,,,,,230.0,,,,,,,,,,12.0,,,,8.0,,,,,,,,,,,,,,,,,,,,,,
75%,43.0,30.25,0.0,,33.25,43.0,0.0,,32.5,14.25,0.0,,180.5,47.5,0.0,,,,,,,230.0,,,,,,,,,,12.0,,,,8.0,,,,,,,,,,,,,,,,,,,,,,


In [10]:
# Preview the ZeroMeridian column of the "c99_voyage" section.
data.data[["c99_voyage"]].c99_voyage.ZeroMeridian.head()

0     TENERIFE
1    GREENWICH
2      NL_0_01
3      BERMUDA
4     TENERIFE
Name: ZeroMeridian, dtype: object

For example, the ship types on this deck are given in many different languages. There is no code table for this variable on the CLIWOC website.

In [11]:
# Preview the Ship_type values of the "c99_voyage" section, skipping missing entries.
data.data[["c99_voyage"]].c99_voyage.Ship_type.dropna().head()

0    PAQUEBOTE
2        SNAUW
3     5TH RATE
4     PAQUEBOT
Name: Ship_type, dtype: object

In [12]:
# Summarise every column of the "c99_data" section; include="all" also covers non-numeric columns.
data.data[["c99_data"]].c99_data.describe(include="all")

Unnamed: 0,AT_reading_units,SST_reading_units,AP_reading_units,BART_reading_units,ReferenceCourse,ReferenceWindDirection,Decl,Distance_units,Distance_units_to_landmark,Distance_units_travelled,Longitude_units,units_of_measurement,humidity_units,water_at_pump_units,wind_scale,BARO_type,BARO_brand,API,Humidity_method,compas_error,compas_correction,AT_outside,SST,AP,wind_dir,current_dir,current_speed,attached_tem,pump_water,Humidity,wind_force,weather,prcp_descriptor,sea_state,shape_coulds,dir_coulds,Clearness,cloud_fraction,gusts,Rain,Fog,Snow,Thunder,Hail,Sea_ice,Trivial_correction,Release
count,0.0,0.0,0.0,0.0,2,5,5.0,0.0,1,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,0.0,0.0,0.0,0.0,0.0,5,2,0.0,4,0.0,0.0,0.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5
unique,0.0,0.0,0.0,0.0,1,1,5.0,0.0,1,4,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,5,0.0,0.0,,0.0,0.0,5,2,0.0,4,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,2
top,,,,,UNKNOWN,UNKNOWN,-20.0,,LEAGUES,MILLAS,360 DEGREES,,,,,,,,,,,,,,S,,,,,,EN REFREGONES FUERTES Y DESPUES BONANCIBLE,"MUY MALOS CARICES. AGUACEROS, RELAMPAGOS Y TRU...",,GRANDE DEL O Y DEL ENE,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,CLIWOC VERSION 2.0
freq,,,,,2,5,1.0,,1,1,3,,,,,,,,,,,,,,1,,,,,,1,1,,1,,,,,5.0,4.0,5.0,5.0,4.0,5.0,5.0,5.0,4
mean,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
std,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
min,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
50%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
75%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


What about the different scales for the wind force, given different languages?

In [13]:
# Preview the wind_force column of the "c99_data" section.
data.data[["c99_data"]].c99_data.wind_force.head()

0    EN REFREGONES FUERTES Y DESPUES BONANCIBLE
1                                        FOIBLE
2               STIJVE GEREEFDE MARSZEILSKOELTE
3                       FRESH GALES AND SQUALLY
4                                    BONANCIBLE
Name: wind_force, dtype: object