In [0]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate, ParameterGrid

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix

# sklearn metrics
from sklearn.metrics import average_precision_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve

# Objetivo

A partir dos dados fornecidos, queremos que responda, preferencialmente com gráficos, às seguintes questões:

1. Qual é a temperatura mais baixa de cada uma das boias (buoys)?
  Em que mês geralmente ocorre?
  
2. Onde (lat/long) temos o maior nível de água?
  Em que mês geralmente ocorre?
  
3. Como os comprimentos de onda se correlacionam com a temperatura do mar?
  É possível prever com precisão o comprimento de onda com base na temperatura do mar e na localização da boia (buoy)?

## Entendendo os dados

Uma análise sobre Open Ocean Data do Digital Ocean Institute da Irlanda. 


**Intervalo de tempo:** \
Análise feita num intervalo de 1 ano, do dia 21/09/2021 a 21/09/2022.

## Coleta de dados

**Tides** \
Medidas sobre Marés coletadas por várias bóias no mar da Irlanda

Time - UTC \
Station ID \
Latitude - degrees_north \
Longitude - degrees_east \
Water Level (LAT) - meters \
Water Level (OD Malin) - meters \
Quality Control Flag

In [0]:
# Tide-gauge CSV export, written as a DBFS path (the same form the Spark reader
# in this notebook is given).
file_location_td = "/FileStore/tables/IrishNationalTideGaugeNetwork_615f_3972_7c4e.csv"

# pandas reads through the driver's local file API, where DBFS is mounted under
# /dbfs; the bare DBFS path is not visible there and raises FileNotFoundError.
# NOTE(review): assumes a Databricks workspace (paths under /FileStore, `spark`
# and `display` globals) — confirm before running elsewhere.
df = pd.read_csv(f"/dbfs{file_location_td}")

# NOTE(review): the rows displayed for this file show a units line right after
# the header (UTC, degrees_north, metres, ...), so numeric columns probably load
# as strings — consider skiprows=[1]; TODO confirm against downstream cells.
df

In [0]:
# Where the tide-gauge CSV export lives
file_location_td = "/FileStore/tables/IrishNationalTideGaugeNetwork_615f_3972_7c4e.csv"

# Read it with Spark's CSV source: the first line is the header, fields are
# comma-separated, and column types are inferred from the data. These reader
# options only apply to CSV input.
df_tides = (
    spark.read.format('csv')
    .options(inferSchema='true', header='true', sep=',')
    .load(file_location_td)
)

display(df_tides)

time,station_id,latitude,longitude,Water_Level_LAT,Water_Level_OD_Malin,QC_Flag
UTC,,degrees_north,degrees_east,metres,metres,
2021-09-14T20:55:00Z,Aranmore Island - Leabgarrow,54.9905,-8.4955,2.448,0.245,1.0
2021-09-14T21:00:00Z,Aranmore Island - Leabgarrow,54.9905,-8.4955,2.4,0.197,1.0
2021-09-14T21:05:00Z,Aranmore Island - Leabgarrow,54.9905,-8.4955,2.51,0.307,1.0
2021-09-14T21:10:00Z,Aranmore Island - Leabgarrow,54.9905,-8.4955,2.484,0.281,1.0
2021-09-14T21:15:00Z,Aranmore Island - Leabgarrow,54.9905,-8.4955,2.37,0.167,1.0
2021-09-14T21:20:00Z,Aranmore Island - Leabgarrow,54.9905,-8.4955,2.428,0.225,1.0
2021-09-14T21:25:00Z,Aranmore Island - Leabgarrow,54.9905,-8.4955,2.631,0.428,1.0
2021-09-14T21:30:00Z,Aranmore Island - Leabgarrow,54.9905,-8.4955,2.551,0.348,1.0
2021-09-14T21:35:00Z,Aranmore Island - Leabgarrow,54.9905,-8.4955,2.6,0.397,1.0


**Waves** \
Medidas sobre Ondas coletadas por várias bóias no mar da Irlanda


Time - UTC\
Station_id \
Latitude - degrees_north \
Longitude - degrees_east \
Peak Period - s\
Peak Direction - degrees_true\
Upcross Period - s\
Significant Wave Height - cm\
Maximum Wave Height - cm\
Sea Temperature - degree_C\
Current Speed - m/s\
Current Direction - degrees_true

In [0]:
# Where the wave-buoy CSV export lives
file_location_wv = "/FileStore/tables/IWaveBNetwork30Min_834a_dfe2_58ae.csv"

# Read it with Spark's CSV source (header line, comma-separated, inferred
# types; these reader options only apply to CSV input), then keep only the
# columns listed below.
df_waves = (
    spark.read.format('csv')
    .options(inferSchema='true', header='true', sep=',')
    .load(file_location_wv)
    .select(
        'time', 'station_id', 'latitude', 'longitude',
        'PeakPeriod', 'PeakDirection', 'UpcrossPeriod',
        'SignificantWaveHeight', 'Hmax', 'SeaTemperature',
        'MeanCurSpeed', 'MeanCurDirTo',
    )
)

display(df_waves)

time,station_id,latitude,longitude,PeakPeriod,PeakDirection,UpcrossPeriod,SignificantWaveHeight,Hmax,SeaTemperature,MeanCurSpeed,MeanCurDirTo
UTC,,degrees_north,degrees_east,s,degrees_true,s,cm,cm,degree_C,m/s,degrees_true
2021-09-14T21:22:00Z,AMETS Berth A Wave Buoy,54.2753,-10.29737,14.29,299.5,11.111,435.0,,16.25,,
2021-09-14T21:52:00Z,AMETS Berth A Wave Buoy,54.2753,-10.29737,14.29,286.9,11.111,421.0,,16.2,,
2021-09-14T22:22:00Z,AMETS Berth A Wave Buoy,54.2753,-10.29737,15.38,291.1,10.526,384.0,,16.2,,
2021-09-14T22:52:00Z,AMETS Berth A Wave Buoy,54.2753,-10.29737,15.38,299.5,11.429,426.0,,16.2,,
2021-09-14T23:22:00Z,AMETS Berth A Wave Buoy,54.2753,-10.29737,13.33,299.5,10.811,429.0,,16.2,,
2021-09-14T23:52:00Z,AMETS Berth A Wave Buoy,54.2753,-10.29737,13.33,286.9,10.526,406.0,,16.25,,
2021-09-15T00:22:00Z,AMETS Berth A Wave Buoy,54.2753,-10.29737,15.38,300.9,10.526,417.0,,16.25,,
2021-09-15T00:52:00Z,AMETS Berth A Wave Buoy,54.2753,-10.29737,16.67,293.9,10.526,429.0,,16.25,,
2021-09-15T01:22:00Z,AMETS Berth A Wave Buoy,54.2753,-10.29737,14.29,288.3,10.0,416.0,,16.2,,


## Limpeza dos dados

In [0]:
def just_check_nan(df):
    """Print which columns of ``df`` hold missing values, and how many.

    Parameters
    ----------
    df : pandas.DataFrame or Spark DataFrame
        Table to inspect. An object that has no ``isna()`` but offers a
        ``toPandas()`` method (such as the result of ``spark.read...load()``)
        is converted to pandas before counting.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    # Spark DataFrames expose no isna(); without this conversion the count
    # below raises AttributeError. toPandas() collects every row onto the
    # driver, which is acceptable for a quick data-quality check.
    if not hasattr(df, 'isna') and callable(getattr(df, 'toPandas', None)):
        df = df.toPandas()

    # Number of missing entries per column
    missing_val_count_by_column = df.isna().sum()

    # Keep only the columns where something is actually missing
    columns_with_nan = missing_val_count_by_column[missing_val_count_by_column > 0]
    print('Dados com NaN:')
    print(columns_with_nan)
    print('')
    print('columns_with_nan.shape', columns_with_nan.shape)
    print('')

In [0]:
# Report which columns of the tide-gauge table contain missing values
just_check_nan(df_tides)

In [0]:
# Report which columns of the wave-buoy table contain missing values
just_check_nan(df_waves)