In [1]:
!pip install lightgbm==3.1.1

Collecting lightgbm==3.1.1
  Downloading lightgbm-3.1.1-py2.py3-none-manylinux1_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lightgbm
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 4.1.0
    Uninstalling lightgbm-4.1.0:
      Successfully uninstalled lightgbm-4.1.0
Successfully installed lightgbm-3.1.1


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#@title Load station and meteorological model. Save fusion file

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time
import seaborn as sns
import pickle


def transform_value_l(val):
    first_decimal = int(str(val)[2]) if len(str(val)) > 2 else 0
    return f'CL{first_decimal}'

def transform_value_m(val):
    first_decimal = int(str(val)[2]) if len(str(val)) > 2 else 0
    return f'CM{first_decimal}'

def transform_to_nearest_odd(arr):
    # Make a copy of the input array
    arr_copy = arr.copy()

    # Find even numbers
    even_indices = arr_copy % 2 == 0

    # Increment even numbers to the nearest odd number
    arr_copy[even_indices] += 1

    return arr_copy.astype(str)

def custom_round(value):
    if value > 9001:
        return str(9999)
    elif value < 1000:
        return str(500)
    else:
        return str(round(value / 1000) * 1000)



pd.options.display.max_rows = 999

station_raw = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/airport_ml/LEVX/input_files/LEVXY2018Y2022.csv",
                          parse_dates=["time"]).dropna().set_index("time")

#Load
mody2018 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/airport_ml/LEVX/input_files/lat42.22lon-8.63p2R1KmD0Y2018.csv",parse_dates=["time"])
mody2019 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/airport_ml/LEVX/input_files/lat42.22lon-8.63p2R1KmD0Y2019.csv",parse_dates=["time"])
mody2020 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/airport_ml/LEVX/input_files/lat42.22lon-8.63p2R1KmD0Y2020.csv",parse_dates=["time"])
mody2021 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/airport_ml/LEVX/input_files/lat42.22lon-8.63p2R1KmD0Y2021.csv",parse_dates=["time"])
mody2022 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/airport_ml/LEVX/input_files/lat42.22lon-8.63p2R1KmD0Y2022.csv",parse_dates=["time"])
df_all = pd.concat([mody2018,mody2019,mody2020,mody2021,mody2022]).drop(columns=["Unnamed: 0"]).set_index("time")

#add time variables
df_all["hour"] = df_all.index.hour
df_all["month"] = df_all.index.month
df_all["dayofyear"] = df_all.index.dayofyear
df_all["weekofyear"] = df_all.index.isocalendar().week.astype(int)

#algorithms directory
algo_dir = "/content/drive/MyDrive/Colab Notebooks/airport_ml/LEVX/algorithms/"

#wind direction to words
df_all["dir0_l"] = result = ["0" + c if len(c) == 2 else ("00" + c if len(c) == 1 else c) for c in (round(df_all["dir0"],-1).astype(int).astype(str))]
all = round(df_all["mod0"] * 1.94384, 0).astype(int).values
list_mod0 = ["0" + str(c) if len(c) == 1 else str(c) for c in transform_to_nearest_odd(all)]
df_all["wind"]=df_all["dir0_l"]+list_mod0+"kt"

#visibility words
"""
interval = pd.IntervalIndex.from_tuples([(-1.5, 20000),(20000,30000)])
labels = ['vi20d', 'vi20u']
df_all["visibility0_l"] = pd.cut(df_all["visibility0"], bins=interval,retbins=False,
                        labels=labels).map({a:b for a,b in zip(interval,labels)}).astype("str")
df_all[["visibility0_l","visibility0"]].sample(100)

# Define the old and new min and max values
old_min = 26.731182
old_max = 24235.0
new_min = 50
new_max = 9999

# Calculate the scale (a) and shift (b) factors
a = (new_max - new_min) / (old_max - old_min)
b = new_min - a * old_min

df_all['visibility0_l'] = ((a * df_all['visibility0'] + b).round().astype(int)).apply(custom_round)
"""
alg = pickle.load(open(algo_dir+"llmvis_LEVX_d0.al","rb"))
model_x_var = df_all[alg["x_var"]]
df_all["vis_ml"] =  alg["pipe"].predict(model_x_var)


#rh in words
interval = pd.IntervalIndex.from_tuples([(-1.5, .80),(.80,100)])
labels = ['rhd80', 'rhu80']
df_all["rh0_l"] = pd.cut(df_all["rh0"], bins=interval,retbins=False,
                        labels=labels).map({a:b for a,b in zip(interval,labels)}).astype("str")
df_all[["rh0_l","rh0"]].sample(100)


#wx words
alg = pickle.load(open(algo_dir+"llmwx_LEVX_d0.al","rb"))
model_x_var = df_all[alg["x_var"]]
df_all["wx_ml"] =  alg["pipe"].predict(model_x_var)

"""
alg = pickle.load(open(algo_dir+"llmprec_LECO_d0.al","rb"))
model_x_var = df_all[alg["x_var"]]
df_all["prec_ml"] =  alg["pipe"].predict(model_x_var)

interval = pd.IntervalIndex.from_tuples([(-1.5, 0.1),(.1,180)])
labels = ['prec0n', 'prec0y']
df_all["prec0_l"] = pd.cut(df_all["prec0"], bins=interval,retbins=False,
                        labels=labels).map({a:b for a,b in zip(interval,labels)}).astype("str")
df_all[["prec0_l","prec0"]].sample(100)
"""

#cfl words
#df_all['cfl0_l'] = round(df_all["cfl0"],1).apply(transform_value_l)
alg = pickle.load(open(algo_dir+"llmskyc1_LEVX_d0.al","rb"))
model_x_var = df_all[alg["x_var"]]
df_all["cfl_ml"] =  alg["pipe"].predict(model_x_var)

#clouds height
alg = pickle.load(open(algo_dir+"llmskyl1_LEVX_d0.al","rb"))
model_x_var = df_all[alg["x_var"]]
df_all["cfll1_ml"] =  alg["pipe"].predict(model_x_var)


#cfm words
#df_all['cfm0_l'] = round(df_all["cfm0"],1).apply(transform_value_m)
alg = pickle.load(open(algo_dir+"llmskyc2_LEVX_d0.al","rb"))
model_x_var = df_all[alg["x_var"]]
df_all["cfm_ml"] =  alg["pipe"].predict(model_x_var)

#temp words
alg = pickle.load(open(algo_dir+"temp_LEVX_d0.al","rb"))
model_x_var = df_all[alg["x_var"]]
df_all["temp"] =  alg["pipe"].predict(model_x_var)
df_all["temp_ml"] = ["0" + c if len(c) == 1 else c for c in round(df_all["temp"]-273.16,0).astype(int).astype(str)]
df_all["temp0_l"] = ["0" + c if len(c) == 1 else c for c in round(df_all["temp0"]-273.16,0).astype(int).astype(str)]

#temp dew words
alg = pickle.load(open(algo_dir+"llmtempd_LEVX_d0.al","rb"))
model_x_var = df_all[alg["x_var"]]
df_all["tempd"] =  alg["pipe"].predict(model_x_var)
df_all["tempd_ml"] = ["0" + c if len(c) == 1 else c for c in round(df_all["tempd"]-273.16,0).astype(int).astype(str)]


#mslp words
alg = pickle.load(open(algo_dir+"pres_LEVX_d0.al","rb"))
model_x_var = df_all[alg["x_var"]]
df_all["pres"] =  alg["pipe"].predict(model_x_var)
pres = round(df_all["pres"],0).astype(int).astype(str)
df_all["mslp_ml"] =[ "q"+p if len(p)==4 else "q0"+p for p in pres]

#time variables
df_all['hour'] = 'H' + df_all.index.hour.astype(str)
df_all['month'] = 'M' + df_all.index.month.astype(str)

#select variable met model
df_all["model_seed"]= df_all["wind"]+" "+df_all["vis_ml"]+" "+df_all["wx_ml"]+" "+df_all["cfl_ml"]+df_all["cfll1_ml"]+" "+df_all["cfm_ml"]+" "+df_all["temp_ml"]+ " "+df_all["tempd_ml"]+" "+df_all["mslp_ml"]


station_raw["metar_o"] = station_raw["metar_o"].astype(str)
station_raw["metar_o"] = station_raw["metar_o"].str.split().apply(lambda x: ' '.join(x[2:]) if len(x) > 2 else '')
station_clean_auto = [station_raw["metar_o"][i].replace("AUTO"+ " ", "") for i in range(0,len(station_raw["metar_o"]))]
station_raw["metar_o"] =station_clean_auto

result = pd.concat([station_raw,df_all["model_seed"]],axis=1).dropna()
result["fusion"]= result["model_seed"]+" "+result["metar_o"]

#save fusion
#path_result = "/content/drive/MyDrive/Colab Notebooks/gpt/i+d/fusionml.csv"
path_result = "/content/drive/MyDrive/Colab Notebooks/airport_ml/LEVX/notebooks/LEVXfusionml.csv"
result["fusion"].to_csv(path_result)
result["fusion"].sample(100).values

array(['19005kt 9999 WM FEW020 SCT 14 09 q1010 20003KT 140V240 9999 BKN043 14/10 Q1010 NOSIG',
       '16001kt 9999 WM MNClD M 19 16 q1017 VRB01KT CAVOK 19/17 Q1017 NOSIG',
       '28011kt 9999 WM FEW015 M 20 14 q1023 02004KT 350V060 9999 SCT028 19/15 Q1023 NOSIG',
       '01011kt 9999 WM MNClD M 19 10 q1022 36003KT 290V040 CAVOK 21/13 Q1023 NOSIG',
       '16005kt 9999 WM MNClD M 13 08 q1025 20005KT 160V230 CAVOK 13/09 Q1025 NOSIG',
       '29005kt 9999 WM MNClD M 14 05 q1032 VRB01KT CAVOK 14/07 Q1032 NOSIG',
       '35011kt 9999 WM SCT030 SCT 09 02 q1020 35006KT 300V030 9999 SCT040 SCT050 10/03 Q1020 NOSIG',
       '12003kt 9999 WM FEW030 M 18 10 q1019 04005KT 340V110 CAVOK 19/10 Q1019 NOSIG',
       '33003kt 9999 WM FEWNClD M 12 10 q1023 30004KT CAVOK 12/09 Q1023 NOSIG',
       '17005kt 9999 WM M040 M 16 12 q1017 VRB02KT CAVOK 17/12 Q1017 NOSIG',
       '01003kt 9999 WM NSCNClD M 06 04 q1033 21006KT 180V250 CAVOK 06/05 Q1033 NOSIG',
       '17007kt 9999 DZ BKN015 M 07 07 q1027 21006

In [4]:
#@title Pressure
df_p = pd.concat([df_all[["mslp0","pres"]].astype(int),station_raw["mslp_o"].astype(int)],axis=1).dropna()
df_p["dif_metmodel"]=df_p["mslp0"]/100-df_p["mslp_o"]
df_p["dif_ml"]=df_p["pres"]-df_p["mslp_o"]
df_p.describe()



Unnamed: 0,mslp0,pres,mslp_o,dif_metmodel,dif_ml
count,41257.0,41257.0,41257.0,41257.0,41257.0
mean,101833.800834,1017.528007,1018.025159,0.312849,-0.497152
std,703.841975,6.786746,6.82273,0.908467,0.707517
min,97196.0,976.0,976.0,-17.46,-13.0
25%,101482.0,1014.0,1015.0,-0.25,-1.0
50%,101839.0,1018.0,1018.0,0.27,0.0
75%,102254.0,1022.0,1022.0,0.84,0.0
max,103768.0,1034.0,1036.0,8.59,6.0


In [5]:
#@title Temperature

df_t = pd.concat([df_all[["temp0_l","temp_ml"]].astype(int),round(station_raw["temp_o"]-273.16,0).astype(int)],axis=1).dropna()
df_t["dif_metmodel"]=df_t["temp0_l"]-df_t["temp_o"]
df_t["dif_ml"]=df_t["temp_ml"]-df_t["temp_o"]
df_t.describe()

Unnamed: 0,temp0_l,temp_ml,temp_o,dif_metmodel,dif_ml
count,41257.0,41257.0,41257.0,41257.0,41257.0
mean,13.736772,14.411494,14.568243,-0.831471,-0.156749
std,5.414399,5.659911,5.78582,1.732269,1.105359
min,0.0,-1.0,-3.0,-11.0,-6.0
25%,10.0,10.0,11.0,-2.0,-1.0
50%,13.0,14.0,14.0,-1.0,0.0
75%,17.0,18.0,18.0,0.0,1.0
max,35.0,36.0,38.0,9.0,6.0


In [6]:
#@title dew Temperature

df_tw = pd.concat([df_all["tempd_ml"].astype(int),round(station_raw["tempd_o"]-273.16,0).astype(int)],axis=1).dropna()

df_tw["dif_ml"]=df_tw["tempd_ml"]-df_tw["tempd_o"]
df_tw.describe()

Unnamed: 0,tempd_ml,tempd_o,dif_ml
count,41257.0,41257.0,41257.0
mean,10.531958,10.690428,-0.15847
std,4.494999,4.66849,1.169965
min,-10.0,-13.0,-7.0
25%,7.0,7.0,-1.0
50%,11.0,11.0,0.0
75%,14.0,14.0,1.0
max,23.0,25.0,9.0
