In [25]:
import warnings
warnings.filterwarnings('ignore')

import warnings
import numpy as np
np.random.seed(7)
import pandas as pd
import vtreat
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from fast_ml.model_development import train_valid_test_split

import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [26]:
TRAIN_PATH = "../../datasets/train.csv"
TEST_PATH = "../../datasets/test.csv"
SAMPLE_SUBMISSION_PATH = "../../datasets/sample_submission.csv"

# Read the CSV file into a DataFrame
train_dat = pd.read_csv(TRAIN_PATH)
test_dat = pd.read_csv(TEST_PATH)

# Data preparation I

In [27]:
# temp
def prep_temp(temp):
    if isinstance(temp, str) :
        temp = temp.replace("Celcius", "")
        temp = temp.replace("C", "")
        temp = temp.replace("°", "")
        temp = temp.strip()
    return temp

def categorize_hour(hour) :
    if hour in [4,5]:
        return "dawn"
    elif hour in [6,7]:
        return "early morning"
    elif hour in [8,9,10]:
        return "late morning"
    elif hour in [11,12,13]:
        return "noon"
    elif hour in [14,15,16]:
        return "afternoon"
    elif hour in [17, 18,19]:
        return "evening"
    elif hour in [20, 21, 22]:
        return "night"
    elif hour in [23,0,1,2,3]:
        return "midnight"

# datetime -> dihapus karena redundant
train_dat.drop(columns=["datetime"], inplace=True)
test_dat.drop(columns=["datetime"], inplace=True)

# Convert the 'timestamp' column to a datetime object
train_dat['datetime_iso'] = pd.to_datetime(train_dat['datetime_iso'])
test_dat['datetime_iso'] = pd.to_datetime(test_dat['datetime_iso'])

# Extract year, month, day, and hour
# train_dat['year'] = train_dat['datetime_iso'].dt.year.astype(float)
train_dat['month'] = train_dat['datetime_iso'].dt.month.astype(float)
train_dat['day'] = train_dat['datetime_iso'].dt.day.astype
train_dat['hour'] = train_dat['datetime_iso'].dt.hour
train_dat['hour_cat'] = train_dat['hour'].apply(categorize_hour)

# test_dat['year'] = test_dat['datetime_iso'].dt.year.astype(float)
test_dat['month'] = test_dat['datetime_iso'].dt.month.astype(float)
test_dat['day'] = test_dat['datetime_iso'].dt.day.astype
test_dat['hour'] = test_dat['datetime_iso'].dt.hour
test_dat['hour_cat'] = test_dat['hour'].apply(categorize_hour)

train_dat.drop(columns=["datetime_iso"], inplace=True)
test_dat.drop(columns=["datetime_iso"], inplace=True)

# time-zone -> dihapus karena data bukan unique
train_dat.drop(columns=["time-zone"], inplace=True)
test_dat.drop(columns=["time-zone"], inplace=True)

# prssr
def prep_prssr(prssr):
    if isinstance(prssr, str):
        prssr = prssr.replace("hPa.", "")
        prssr = prssr.replace("hPa", "")
        prssr = prssr.strip()
    return prssr

# hum
def prep_hum(hum):
    if isinstance(hum, str):
        hum = hum.replace("%", "")
        hum = hum.strip()
    return hum

# wind_spd & wind_deg
def prep_wind(wind):
    if isinstance(wind, str):
        wind = wind.replace("m/s", "")
        wind = wind.replace("°", "")
        wind = wind.strip()
    return wind

# rain_1h
def prep_rain_1h(rain):
    if isinstance(rain, str):
        rain = rain.replace("mm", "")
        rain = rain.replace("zero", "0")
        rani = rain.replace(" ", "0")
        rain = rain.strip()
        try:
            float(rain)
            return rain
        except:
            new_rain = 0.0
            return new_rain
    return rain

# rain_3h -> 0 artinya ga hujan
def prep_rain_3h(rain):
    if isinstance(rain, str):
        rain = rain.replace("milimeter", "")
        rain = rain.replace("mm", "")
        rain = rain.replace("no-rain", "0")
        rain = rain.replace("volume:0", "0")
        rain = rain.replace("nol", "0")
        rain = rain.replace("no_rain", "0")
        rain = rain.replace("volume:zero", "0")
        rain = rain.replace("zero", "0")
        rain = rain.strip()
    return rain

# snow_1h
def prep_snow(snow):
    if isinstance(snow, str):
        snow = snow.replace("milimeter", "")
        snow = snow.replace("mm", "")
        snow = snow.replace("no-snow", "0")
        snow = snow.replace("volume:0", "0")
        snow = snow.replace("nol", "0")
        snow = snow.replace("no_snow", "0")
        snow = snow.replace("volume:zero", "0")
        snow = snow.replace("zero", "0")
        snow = snow.strip()
    return snow

# clouds


def prep_clouds(clouds):
    if isinstance(clouds, str):
        clouds = clouds.replace("%", "")
        clouds = clouds.strip()
    return clouds

preped_train = train_dat.copy()
preped_test = test_dat.copy()

# Prepping temp column --> including temp, d_point, feels, min_temp, max_temp
for column in ['temp','d_point','feels','min_temp','max_temp']:
    preped_train[column] = preped_train[column].apply(lambda x: prep_temp(x))
    preped_test[column] = preped_test[column].apply(lambda x: prep_temp(x))

# Prepping prssr column
for column in ['prssr']:
    preped_train[column] = preped_train[column].apply(lambda x:prep_prssr(x))
    preped_test[column] = preped_test[column].apply(lambda x:prep_prssr(x))

# Prepping hum column
for column in ['hum']:
    preped_train[column] = preped_train[column].apply(lambda x:prep_hum(x))
    preped_test[column] = preped_test[column].apply(lambda x:prep_hum(x))

# Prepping wind column --> including wind_spd, wind_deg
for column in ['wind_spd', 'wind_deg']:
    preped_train[column] = preped_train[column].apply(lambda x:prep_wind(x))
    preped_test[column] = preped_test[column].apply(lambda x:prep_wind(x))

# Prepping rain_1h
for column in ['rain_1h']:
    preped_train[column] = preped_train[column].apply(lambda x:prep_rain_1h(x))

# Prepping rain_3h column
for column in ['rain_3h']:
    preped_train[column] = preped_train[column].apply(lambda x:prep_rain_3h(x))
    preped_test[column] = preped_test[column].apply(lambda x:prep_rain_3h(x))

# Prepping snow column --> including snow_1h, snow_3h
for column in ['snow_1h', 'snow_3h']:
    preped_train[column] = preped_train[column].apply(lambda x:prep_snow(x))
    preped_test[column] = preped_test[column].apply(lambda x:prep_snow(x))

# Prepping clunds column
for column in ['clouds']:
    preped_train[column] = preped_train[column].apply(lambda x:prep_clouds(x))
    preped_test[column] = preped_test[column].apply(lambda x:prep_clouds(x))

preped_train['month']= np.sin(0.6 * (preped_train['month'] - 1)) + 1
preped_test['month']= np.sin(0.6 * (preped_test['month'] - 1)) + 1

# preped_train['day_of_year']= 55 * (np.sin(0.015 * (preped_train['day_of_year'] - 60)))
# preped_test['day_of_year']= 55 * (np.sin(0.015 * (preped_test['day_of_year'] - 60)))

In [28]:
preped_train['rain_1h'] = preped_train['rain_1h'].astype(float)

In [29]:
# plt.figure(figsize=(12,10))
# sns.lineplot(x='year', y='rain_1h', data=preped_train)

In [30]:
# Changing to `category` data type for normal categorical data

# cat_cols = ['visibility','sea_level', 'grnd_level', 'year', 'day', 'hour']

# no year
cat_cols = ['visibility','sea_level', 'grnd_level', 'day', 'hour']

preped_train[cat_cols] = preped_train[cat_cols].astype('category')
preped_test[cat_cols] = preped_train[cat_cols].astype('category')

# di num_cols_test ga ada rain_1h
num_cols_train = ['temp', 'd_point', 'feels', 'min_temp', 'max_temp', 'prssr', 'hum', 'wind_spd', 'wind_deg', 'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h', 'clouds', 'month']
num_cols_test = ['temp', 'd_point', 'feels', 'min_temp', 'max_temp', 'prssr', 'hum', 'wind_spd', 'wind_deg', 'rain_3h', 'snow_1h', 'snow_3h', 'clouds', 'month']

preped_train[num_cols_train] = preped_train[num_cols_train].astype('float')
preped_test[num_cols_test] = preped_test[num_cols_test].astype('float')

##### Feature Engineering

In [31]:
# feature engineering : cloudy => 1 if yes (>70), 0 if no (<=70) 
preped_train['cloudy'] = preped_train['clouds'].apply(lambda x: 1 if x > 70 else 0)
preped_test['cloudy'] = preped_test['clouds'].apply(lambda x: 1 if x > 70 else 0)

# feature engineering : temp_range => max_temp - min_temp
preped_train['temp_range'] = preped_train['max_temp'] - preped_train['min_temp']
preped_test['temp_range'] = preped_test['max_temp'] - preped_test['min_temp']

# feature engineering : hot_day => 1 if yes (>25), 0 if no (<=25) 
preped_train['hot_day'] = preped_train['temp'].apply(lambda x: 1 if x > 25 else 0)
preped_test['hot_day'] = preped_test['temp'].apply(lambda x: 1 if x > 25 else 0)

# Data preparation II

## vtreat

In [32]:
transform = vtreat.NumericOutcomeTreatment(
    outcome_name="rain_1h",
    params=vtreat.vtreat_parameters({
        "missingness_imputation": np.mean,
        "filter_to_recommended": False
    })
)

transformed_train = transform.fit_transform(preped_train)
transformed_test = transform.transform(preped_test)
transform.score_frame_

Unnamed: 0,variable,orig_variable,treatment,y_aware,has_range,PearsonR,R2,significance,vcount,default_threshold,recommended
0,sea_level_is_bad,sea_level,missing_indicator,False,True,-0.000632,3.993124e-07,0.7117695,6.0,0.027778,False
1,rain_3h_is_bad,rain_3h,missing_indicator,False,True,0.001743,3.036715e-06,0.3082437,6.0,0.027778,False
2,snow_3h_is_bad,snow_3h,missing_indicator,False,True,-0.001694,2.869954e-06,0.3219091,6.0,0.027778,False
3,visibility_is_bad,visibility,missing_indicator,False,True,0.000579,3.347306e-07,0.7351483,6.0,0.027778,False
4,snow_1h_is_bad,snow_1h,missing_indicator,False,True,0.001202,1.444011e-06,0.4822924,6.0,0.027778,False
5,grnd_level_is_bad,grnd_level,missing_indicator,False,True,0.001847,3.411539e-06,0.2801565,6.0,0.027778,False
6,temp_range,temp_range,clean_copy,False,True,-0.001554,2.414684e-06,0.3635691,16.0,0.010417,False
7,hour,hour,clean_copy,False,True,-0.142188,0.02021756,0.0,16.0,0.010417,True
8,hot_day,hot_day,clean_copy,False,True,0.087,0.007568985,0.0,16.0,0.010417,True
9,day,day,clean_copy,False,True,-0.008735,7.630048e-05,3.264731e-07,16.0,0.010417,True


In [33]:
# babai rain_1h < 0
transformed_train = transformed_train[transformed_train["rain_1h"] >= 0]

In [34]:
num_col_aneh = ["prssr", "wind_deg", "max_temp", "feels", "hum", "d_point", "temp", "min_temp", "wind_spd"]
transformed_train[num_col_aneh].describe().T[["min", "max"]]

Unnamed: 0,min,max
prssr,1001.0,5115.51
wind_deg,0.0,1810.8
max_temp,22.37,187.17
feels,22.35,202.71
hum,29.0,503.0
d_point,14.49,141.59
temp,21.55,177.26
min_temp,14.12,162.92
wind_spd,0.0,25.0


In [35]:
transformed_test[num_col_aneh].describe().T[["min", "max"]]

Unnamed: 0,min,max
prssr,-100.0,1016.0
wind_deg,0.0,360.0
max_temp,-273.15,35.68
feels,-10005.49,41.2
hum,0.0,100.0
d_point,19.51,28.67
temp,-9998.49,34.46
min_temp,-9998.97,33.0
wind_spd,0.0,9999.0


## Nganu outlier pake anu

In [36]:
MAX_TEMP = 50
MAX_HUM = 150
MAX_MAX_TEMP = 50
MAX_FEELS = 60
MAX_PRESSURE  = 1200
MAX_MIN_TENP = 60

print(transformed_train.shape)
transformed_train = transformed_train[transformed_train["max_temp"] < MAX_MAX_TEMP]
transformed_train = transformed_train[transformed_train["hum"] < MAX_HUM]
transformed_train = transformed_train[transformed_train["feels"] < MAX_FEELS]
transformed_train = transformed_train[transformed_train["prssr"] < MAX_PRESSURE]
transformed_train = transformed_train[transformed_train["min_temp"] < MAX_MIN_TENP]
transformed_train = transformed_train[transformed_train["temp"] < MAX_TEMP]
print(transformed_train.shape)

(336438, 48)
(307319, 48)


In [37]:
# win deg nya di anu pake modulus
transformed_train["wind_deg"] = transformed_train["wind_deg"] \
                                .apply(lambda x: x % 360 if x > 360 else x)

In [38]:
transformed_train.columns

Index(['rain_1h', 'sea_level_is_bad', 'rain_3h_is_bad', 'snow_3h_is_bad',
       'visibility_is_bad', 'snow_1h_is_bad', 'grnd_level_is_bad',
       'temp_range', 'hour', 'hot_day', 'day', 'month', 'cloudy', 'wind_deg',
       'min_temp', 'wind_spd', 'd_point', 'clouds', 'prssr', 'max_temp',
       'feels', 'temp', 'hum', 'sea_level_impact_code',
       'sea_level_deviation_code', 'sea_level_prevalence_code',
       'sea_level_lev_undetermined', 'sea_level_lev__NA_',
       'hour_cat_impact_code', 'hour_cat_deviation_code',
       'hour_cat_prevalence_code', 'hour_cat_lev_midnight',
       'hour_cat_lev_late_morning', 'hour_cat_lev_noon',
       'hour_cat_lev_afternoon', 'hour_cat_lev_evening', 'hour_cat_lev_night',
       'hour_cat_lev_dawn', 'hour_cat_lev_early_morning',
       'visibility_impact_code', 'visibility_deviation_code',
       'visibility_prevalence_code', 'visibility_lev__NA_',
       'grnd_level_impact_code', 'grnd_level_deviation_code',
       'grnd_level_prevalence_cod

## Train-Test-Validation Split

In [39]:
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(
                                        transformed_train,
                                        target = 'rain_1h',
                                        train_size=0.7, valid_size=0.15, test_size=0.15)

# Machine Learning I

In [40]:
params = {
        'iterations': 35000,
        'early_stopping_rounds' : 10,
        'eval_metric': "RMSE",
        'task_type': "GPU",
        'verbose':True,
        'od_type':'Iter'}

cat_regressor = CatBoostRegressor(**params)
xgb_regressor = XGBRegressor()
lgbm_regressor = LGBMRegressor()

In [41]:
full_train = transformed_train.drop('rain_1h', axis=1)
full_target = transformed_train['rain_1h']

In [42]:
cat_regressor.fit(X_train, y_train, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.009774
0:	learn: 0.9413200	total: 12.1ms	remaining: 7m 3s
1:	learn: 0.9395877	total: 22.7ms	remaining: 6m 36s
2:	learn: 0.9378195	total: 35.2ms	remaining: 6m 50s
3:	learn: 0.9361205	total: 45.9ms	remaining: 6m 41s
4:	learn: 0.9344449	total: 56.2ms	remaining: 6m 33s
5:	learn: 0.9327601	total: 66.4ms	remaining: 6m 27s
6:	learn: 0.9311053	total: 77.2ms	remaining: 6m 26s
7:	learn: 0.9294866	total: 87.5ms	remaining: 6m 22s
8:	learn: 0.9279409	total: 98.5ms	remaining: 6m 22s
9:	learn: 0.9263649	total: 109ms	remaining: 6m 20s
10:	learn: 0.9248399	total: 119ms	remaining: 6m 19s
11:	learn: 0.9233245	total: 129ms	remaining: 6m 16s
12:	learn: 0.9218346	total: 139ms	remaining: 6m 15s
13:	learn: 0.9203797	total: 150ms	remaining: 6m 14s
14:	learn: 0.9189501	total: 163ms	remaining: 6m 19s
15:	learn: 0.9175375	total: 174ms	remaining: 6m 20s
16:	learn: 0.9161427	total: 184ms	remaining: 6m 18s
17:	learn: 0.9147698	total: 194ms	remaining: 6m 17s
18:	learn: 0.9134204	total: 204ms	re

<catboost.core.CatBoostRegressor at 0x19d1156d820>

In [43]:
xgb_regressor.fit(X_train, y_train)

In [44]:
lgbm_regressor.fit(X_train, y_train)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2762
[LightGBM] [Info] Number of data points in the train set: 215123, number of used features: 47
[LightGBM] [Info] Start training from score 0.359470


In [45]:
cat_y_pred = cat_regressor.predict(X_test)
xgb_y_pred = xgb_regressor.predict(X_test)
lgbm_y_pred = lgbm_regressor.predict(X_test)

ensembled_y_pred = (
   ( 0.6 * cat_y_pred ) +
   ( 0.1 * xgb_y_pred ) +
   ( 0.3 * lgbm_y_pred ) 
)

print('RMSE:', np.sqrt(mean_squared_error(y_test, ensembled_y_pred)))

RMSE: 0.7906317962459903


last RMSE: 0.71274170007604

# Submission

In [46]:
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)

## Export

In [44]:
transformed_train.to_csv('../cleaned/submission_3/train_outlier_-1_radif.csv', index=False)
transformed_test.to_csv('../cleaned/submission_3/test_outlier_-1_radif.csv', index=False)

In [49]:
import pickle

pickle.dump(cat_regressor, open('cat_regressor.pkl', 'wb'))
pickle.dump(xgb_regressor, open('xgb_regressor.pkl', 'wb'))
pickle.dump(lgbm_regressor, open('lgbm_regressor.pkl', 'wb'))