# Procedimientos para el pronóstico estadístico de reportes METAR

In [3]:
import pandas as pd

In [4]:
data = pd.read_csv("files/metar_data.csv")
data.head(10)

Unnamed: 0,ANIO,MES,DIA,HORA,MINUTO,DIR,MAG,RAF,VIS,RA,...,CONVECTIVA1,CAPA2,ALTURA2,CONVECTIVA2,CAPA3,ALTURA3,CONVECTIVA3,CAPA4,ALTURA4,CONVECTIVA4
0,2005,1,1,0,0,110.0,11.0,0.0,9999.0,0,...,0,,,0,,,0,,,0
1,2005,1,1,1,0,100.0,7.0,0.0,9999.0,0,...,0,,,0,,,0,,,0
2,2005,1,1,2,0,999.0,3.0,0.0,9999.0,0,...,0,,,0,,,0,,,0
3,2005,1,1,3,0,60.0,8.0,0.0,9999.0,0,...,0,,,0,,,0,,,0
4,2005,1,1,4,0,80.0,4.0,0.0,9999.0,0,...,0,,,0,,,0,,,0
5,2005,1,1,5,0,70.0,3.0,0.0,9999.0,0,...,0,,,0,,,0,,,0
6,2005,1,1,6,0,110.0,14.0,0.0,9999.0,0,...,0,,,0,,,0,,,0
7,2005,1,1,10,0,60.0,6.0,0.0,9999.0,0,...,0,,,0,,,0,,,0
8,2005,1,1,11,0,80.0,3.0,0.0,9999.0,0,...,0,,,0,,,0,,,0
9,2005,1,1,12,0,30.0,6.0,0.0,9999.0,0,...,0,,,0,,,0,,,0


In [5]:
data.shape

(121230, 31)

In [6]:
colnames = data.columns.values.tolist()
colnames

['ANIO',
 'MES',
 'DIA',
 'HORA',
 'MINUTO',
 'DIR',
 'MAG',
 'RAF',
 'VIS',
 'RA',
 'SHRA',
 'TSRA',
 'BCFG',
 'BR',
 'FG',
 'CAVOK',
 'TEMP',
 'DPTEMP',
 'QNH',
 'CAPA1',
 'ALTURA1',
 'CONVECTIVA1',
 'CAPA2',
 'ALTURA2',
 'CONVECTIVA2',
 'CAPA3',
 'ALTURA3',
 'CONVECTIVA3',
 'CAPA4',
 'ALTURA4',
 'CONVECTIVA4']

In [5]:
metar = 'MROC 110600Z 07004KT 040V100 CAVOK 21/19 A2999 NOSIG'

In [15]:
predictors = ['MES', 'HORA', 'MAG', 'TEMP']
target = ['QNH']
data = data.dropna()
X = data[predictors]
Y = data[target]

In [16]:
from sklearn.tree import DecisionTreeRegressor

In [17]:
regtree = DecisionTreeRegressor(min_samples_split=30, min_samples_leaf=10, random_state=0)
regtree.fit(X, Y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=10,
                      min_samples_split=30, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

In [18]:
preds = regtree.predict(data[predictors])

In [19]:
data["preds"] = preds

In [20]:
data[["preds", "QNH"]]

Unnamed: 0,preds,QNH
2519,29.968947,30.02
3562,29.977647,30.01
3930,29.941538,29.98
14092,29.925926,29.84
19362,30.011429,29.96
...,...,...
117939,29.977647,30.02
118565,29.964000,29.97
119021,29.964000,29.94
119644,29.925926,29.90


In [22]:
from sklearn.model_selection import KFold, cross_val_score
import numpy as np

In [24]:
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(regtree, X, Y, cv=cv, n_jobs=1)
print(scores)
score = np.mean(scores)
print(score)

[-0.11882996 -0.72502235 -0.00487913  0.07187645 -0.06436965 -0.1679333
  0.35747491 -0.39914211  0.26933356  0.40279411]
-0.03786974576506652


In [25]:
list(zip(predictors, regtree.feature_importances_))

[('ANIO', 0.0),
 ('MES', 0.4761885564648634),
 ('DIA', 0.0),
 ('HORA', 0.4293989080314313),
 ('MINUTO', 0.0),
 ('DIR', 0.0),
 ('MAG', 0.013567952212435877),
 ('TEMP', 0.0808445832912694),
 ('DPTEMP', 0.0)]