In [12]:
import pandas as pd
import holidays
import joblib as jl



In [13]:
# Load the raw test set; the file is decoded as Latin-1 so that the accented
# Portuguese weather labels are read as text.
dados_teste = pd.read_csv(
    "dados/test_data.csv",
    encoding="latin1",
)

In [14]:
# Preview the first five rows of the raw test set.
dados_teste.head(n=5)

Unnamed: 0,city_name,record_date,AVERAGE_FREE_FLOW_SPEED,AVERAGE_TIME_DIFF,AVERAGE_FREE_FLOW_TIME,LUMINOSITY,AVERAGE_TEMPERATURE,AVERAGE_ATMOSP_PRESSURE,AVERAGE_HUMIDITY,AVERAGE_WIND_SPEED,AVERAGE_CLOUDINESS,AVERAGE_PRECIPITATION,AVERAGE_RAIN
0,Porto,2019-02-13 23:00:00,39.2,0.0,91.0,DARK,8.0,1026.0,71.0,1.0,céu claro,0.0,
1,Porto,2018-11-28 20:00:00,42.5,12.2,76.8,DARK,11.0,1020.0,93.0,4.0,nuvens dispersas,0.0,
2,Porto,2018-08-14 05:00:00,45.9,0.0,86.3,DARK,14.0,1017.0,93.0,0.0,,0.0,
3,Porto,2019-07-06 17:00:00,33.2,51.7,89.9,LIGHT,22.0,1016.0,77.0,4.0,céu pouco nublado,0.0,
4,Porto,2018-10-15 06:00:00,44.0,3.5,85.5,DARK,12.0,1004.0,100.0,9.0,,0.0,chuva fraca


In [17]:
# Count missing values per column.
# NOTE(review): the stored output still lists columns that the treatment cell
# drops, so it comes from an earlier run - re-run from a fresh kernel.
dados_teste.isna().sum(axis=0)

city_name                     0
record_date                   0
AVERAGE_FREE_FLOW_SPEED       0
AVERAGE_TIME_DIFF             0
AVERAGE_FREE_FLOW_TIME        0
LUMINOSITY                    0
AVERAGE_TEMPERATURE           0
AVERAGE_ATMOSP_PRESSURE       0
AVERAGE_HUMIDITY              0
AVERAGE_WIND_SPEED            0
AVERAGE_CLOUDINESS          599
AVERAGE_PRECIPITATION         0
AVERAGE_RAIN               1360
dtype: int64

### Tratamento

In [15]:
# Drop the columns that are not used as model inputs. Reassigning the frame
# (instead of inplace=True) together with errors="ignore" keeps this cell safe
# to re-run: the in-place drop raised KeyError on a second execution because
# the columns were already gone.
dados_teste = dados_teste.drop(
    columns=['city_name', 'AVERAGE_RAIN', 'AVERAGE_PRECIPITATION'],
    errors="ignore",
)

# Parse the timestamps once; every calendar feature below derives from them.
dados_teste["record_date"] = pd.to_datetime(dados_teste["record_date"])

dados_teste["year"] = dados_teste["record_date"].dt.year
dados_teste["month"] = dados_teste["record_date"].dt.month
dados_teste["day"] = dados_teste["record_date"].dt.day
dados_teste["hour"] = dados_teste["record_date"].dt.hour
dados_teste["weekday"] = dados_teste["record_date"].dt.dayofweek  # 0=Monday, 6=Sunday
# Saturday (5) and Sunday (6) are flagged as weekend.
dados_teste["is_weekend"] = (dados_teste["weekday"] >= 5).astype(int)

# holidays.Portugal() created without `years` starts empty and only fills
# itself when a date is looked up with `in`. Series.isin() iterates over the
# (still empty) calendar instead, so the previous expression could not flag
# any row as a holiday. Pre-populate the years present in the data and test
# membership date by date.
# NOTE(review): confirm the training pipeline builds is_holiday the same way.
anos_presentes = dados_teste["record_date"].dt.year.unique().tolist()
pt_holidays = holidays.Portugal(years=anos_presentes)
dados_teste["is_holiday"] = (
    dados_teste["record_date"].dt.date
    .map(lambda dia: dia in pt_holidays)
    .astype(int)
)

def season(month, day):
    """Return the season name for a calendar date given as month and day.

    Boundaries used (inclusive start dates):
    Spring from 20 March, Summer from 21 June, Autumn from 23 September,
    Winter from 21 December (running through 19 March).

    Parameters
    ----------
    month : int
        Calendar month, 1-12.
    day : int
        Day of the month.

    Returns
    -------
    str
        One of "Winter", "Spring", "Summer", "Autumn".
    """
    # Comparing (month, day) tuples orders dates within a year, so each
    # season reduces to a simple range check against its start date.
    date_key = (month, day)

    if date_key < (3, 20) or date_key >= (12, 21):
        return "Winter"
    if date_key < (6, 21):
        return "Spring"
    if date_key < (9, 23):
        return "Summer"
    return "Autumn"

# Label every row with its season, derived from the month/day columns.
dados_teste["season"] = dados_teste.apply(
    lambda linha: season(linha["month"], linha["day"]),
    axis="columns",
)


# Ordinal encoding of the luminosity labels (darkest = 0, brightest = 2).
# Labels missing from the mapping become NaN.
map_dict = dict(DARK=0, LOW_LIGHT=1, LIGHT=2)

dados_teste["LUMINOSITY_N"] = dados_teste["LUMINOSITY"].map(map_dict)

# Ordinal encoding of the cloudiness descriptions. Each tuple lists the labels
# that share one rank; the rank is the tuple's position in the list, so both
# spellings of "nuvens quebradas/quebrados" map to 5.
map_cloudiness = {
    rotulo: nivel
    for nivel, rotulos in enumerate([
        ("céu limpo",),
        ("céu claro",),
        ("céu pouco nublado",),
        ("algumas nuvens",),
        ("nuvens dispersas",),
        ("nuvens quebradas", "nuvens quebrados"),
        ("tempo nublado",),
        ("nublado",),
    ])
    for rotulo in rotulos
}

# Missing or unknown descriptions stay NaN here.
dados_teste["AVERAGE_CLOUDINESS_N"] = dados_teste["AVERAGE_CLOUDINESS"].map(map_cloudiness)

# An hour counts as "peak" when its mean AVERAGE_TIME_DIFF is above the median
# of the 24 hourly means.
# NOTE(review): the threshold is computed from this test set itself - confirm
# it yields the same peak-hour set that was used when the model was trained.
hourly_avg = dados_teste.groupby('hour')['AVERAGE_TIME_DIFF'].mean()
threshold = hourly_avg.median()
peak_hours = hourly_avg.loc[hourly_avg.gt(threshold)].index.tolist()
dados_teste['is_peak_hour'] = dados_teste['hour'].isin(peak_hours).astype(int)

# Poor visibility: anything darker than LIGHT (code 2) or cloudiness rank
# above 3. NaN cloudiness compares as False, so it never triggers the flag.
pouca_luz = dados_teste['LUMINOSITY_N'].lt(2)
muito_nublado = dados_teste['AVERAGE_CLOUDINESS_N'].gt(3)
dados_teste['poor_visibility'] = (pouca_luz | muito_nublado).astype(int)

# Hour of the week: 0 = Monday 00h ... 167 = Sunday 23h.
dados_teste['hour_weekday'] = dados_teste['hour'] + 24 * dados_teste['weekday']

# Delay relative to the free-flow travel time.
dados_teste['time_ratio'] = dados_teste['AVERAGE_TIME_DIFF'].div(dados_teste['AVERAGE_FREE_FLOW_TIME'])


### Específico para C1

In [16]:
def preencher_cloudiness(row):
    """Return the cloudiness rank for a row, imputing it when missing.

    An existing (non-null) ``AVERAGE_CLOUDINESS_N`` is returned unchanged.
    Otherwise the value is chosen from ``LUMINOSITY_N``:

    * 2 (light)     -> 1 ("céu claro")
    * 0 (dark)      -> 7 ("nublado")
    * 1 (low light) -> 3 ("algumas nuvens")

    Any other luminosity value (including NaN) yields ``None``.

    Parameters
    ----------
    row : mapping
        Must provide the keys "AVERAGE_CLOUDINESS_N" and "LUMINOSITY_N"
        (for example a DataFrame row).
    """
    atual = row["AVERAGE_CLOUDINESS_N"]
    if not pd.isna(atual):
        return atual

    luminosidade = row["LUMINOSITY_N"]
    # (luminosity code, imputed cloudiness rank), checked in this order.
    for codigo, preenchimento in ((2, 1), (0, 7), (1, 3)):
        if luminosidade == codigo:
            return preenchimento

    return None

# Fill the missing cloudiness ranks row by row using the luminosity rule.
dados_teste["AVERAGE_CLOUDINESS_N"] = dados_teste.apply(
    preencher_cloudiness,
    axis="columns",
)

### Aplicar

In [17]:
# Load the fitted pipeline. joblib.load unpickles the file, which can execute
# arbitrary code - only load model files that come from a trusted source.
modelo = jl.load("modelos/lgbm_c1.pkl")
preprocessor = modelo.named_steps['preprocessor']

# Each fitted transformer entry is (name, transformer, columns); read the
# column lists of the first two entries.
# NOTE(review): assumes entry 0 is the numeric branch and entry 1 the
# categorical one - confirm against the training notebook.
colunas_numericas, colunas_categoricas = (
    entrada[2] for entrada in preprocessor.transformers_[:2]
)

colunas_esperadas = colunas_numericas + colunas_categoricas
print("Colunas esperadas - ordem:", colunas_esperadas)

Colunas esperadas - ordem: ['AVERAGE_FREE_FLOW_SPEED', 'AVERAGE_TIME_DIFF', 'AVERAGE_FREE_FLOW_TIME', 'AVERAGE_TEMPERATURE', 'AVERAGE_ATMOSP_PRESSURE', 'AVERAGE_HUMIDITY', 'AVERAGE_WIND_SPEED', 'year', 'month', 'day', 'hour', 'weekday', 'is_weekend', 'is_holiday', 'LUMINOSITY_N', 'AVERAGE_CLOUDINESS_N', 'is_peak_hour', 'poor_visibility', 'hour_weekday', 'time_ratio', 'season']


In [18]:
# Build the model input once, using the column order reported by the fitted
# preprocessor (colunas_esperadas) instead of repeating the 21-name list by
# hand for each call, so the input cannot drift from what the pipeline expects.
X_c1 = dados_teste[colunas_esperadas]

# NOTE(review): [:, 1] keeps only the second column of predict_proba, i.e. the
# probability of a single class - confirm this is the intended class, since
# 'previsao' takes more than two distinct values.
dados_teste['probabilidade'] = modelo.predict_proba(X_c1)[:, 1]
dados_teste['previsao'] = modelo.predict(X_c1)



#### REDE

In [63]:
# NOTE(review): this cell carries execution count 63, i.e. it was run in a
# different order than it appears. It reuses the name `modelo` and overwrites
# 'previsao'; the argmax over axis 1 presumes predict() returns one score per
# class for each row - confirm which model is meant to be loaded here before
# running the notebook top to bottom.
features = [
    'AVERAGE_FREE_FLOW_SPEED',
    'AVERAGE_TIME_DIFF',
    'AVERAGE_FREE_FLOW_TIME',
    'AVERAGE_TEMPERATURE',
    'hour',
    'LUMINOSITY_N',
    'is_peak_hour',
    'hour_weekday',
    'time_ratio',
]

X_teste_df = dados_teste.loc[:, features].copy()

# Predicted class = index of the highest score in each row.
previsoes = modelo.predict(X_teste_df).argmax(axis=1)

dados_teste['previsao'] = previsoes


[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step




In [19]:
# Class label -> integer code used by the model.
# NOTE: this rebinds `map_dict`, which the treatment cell uses for luminosity.
map_dict = {"Low": 1, "Medium": 2, "High": 3, "Very_High": 4, "None": 0}

# Invert the mapping (code -> label) to decode the predictions; codes that
# are not in the mapping become NaN.
codigo_para_rotulo = dict(zip(map_dict.values(), map_dict.keys()))
dados_teste["Speed_Diff"] = dados_teste["previsao"].map(codigo_para_rotulo)


In [20]:
# Display the frame with the engineered features and the decoded predictions.
dados_teste

Unnamed: 0,record_date,AVERAGE_FREE_FLOW_SPEED,AVERAGE_TIME_DIFF,AVERAGE_FREE_FLOW_TIME,LUMINOSITY,AVERAGE_TEMPERATURE,AVERAGE_ATMOSP_PRESSURE,AVERAGE_HUMIDITY,AVERAGE_WIND_SPEED,AVERAGE_CLOUDINESS,...,season,LUMINOSITY_N,AVERAGE_CLOUDINESS_N,is_peak_hour,poor_visibility,hour_weekday,time_ratio,probabilidade,previsao,Speed_Diff
0,2019-02-13 23:00:00,39.2,0.0,91.0,DARK,8.0,1026.0,71.0,1.0,céu claro,...,Winter,0,1.0,0,1,71,0.000000,0.006743,0,
1,2018-11-28 20:00:00,42.5,12.2,76.8,DARK,11.0,1020.0,93.0,4.0,nuvens dispersas,...,Autumn,0,4.0,0,1,68,0.158854,0.456717,2,Medium
2,2018-08-14 05:00:00,45.9,0.0,86.3,DARK,14.0,1017.0,93.0,0.0,,...,Summer,0,7.0,0,1,29,0.000000,0.006987,0,
3,2019-07-06 17:00:00,33.2,51.7,89.9,LIGHT,22.0,1016.0,77.0,4.0,céu pouco nublado,...,Summer,2,2.0,1,0,137,0.575083,0.008771,2,Medium
4,2018-10-15 06:00:00,44.0,3.5,85.5,DARK,12.0,1004.0,100.0,9.0,,...,Autumn,0,7.0,0,1,6,0.040936,0.786573,1,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,2018-11-24 05:00:00,50.2,1.1,80.5,DARK,11.0,1013.0,100.0,5.0,,...,Autumn,0,7.0,0,1,125,0.013665,0.738099,1,Low
1496,2019-03-01 02:00:00,38.0,4.6,85.6,DARK,9.0,1028.0,100.0,0.0,,...,Winter,0,7.0,0,1,98,0.053738,0.230845,0,
1497,2018-09-05 01:00:00,43.8,0.0,80.4,DARK,16.0,1013.0,100.0,1.0,,...,Summer,0,7.0,0,1,49,0.000000,0.009312,0,
1498,2018-11-05 08:00:00,46.0,83.2,83.2,LIGHT,10.0,1004.0,87.0,5.0,,...,Autumn,2,1.0,1,0,8,1.000000,0.000104,4,Very_High


In [21]:
# Number the rows 1..N in their current order and keep only the two columns
# written to the submission file.
total_linhas = dados_teste.shape[0]
dados_teste["RowId"] = range(1, total_linhas + 1)
filtro = dados_teste.loc[:, ["RowId", "Speed_Diff"]]

In [22]:
# Write the submission file without the DataFrame index.
filtro.to_csv(
    "dados/sub_c1_lgbm_02_12.csv",
    index=False,
)