In [90]:
import warnings
warnings.filterwarnings('ignore')

import warnings
import numpy as np
np.random.seed(7)
import pandas as pd
import vtreat
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from fast_ml.model_development import train_valid_test_split

import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [91]:
# Input file locations, written relative to the notebook's working directory.
TRAIN_PATH = "../../datasets/train.csv"
TEST_PATH = "../../datasets/test.csv"
SAMPLE_SUBMISSION_PATH = "../../datasets/sample_submission.csv"

# Load the raw training and test tables into DataFrames.
train_dat, test_dat = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH)
)

# Data preparation I

In [92]:
# temp
def prep_temp(temp):
    """Strip temperature unit markers from a raw cell value.

    For string input, the substrings "Celcius", "C" and "°" are removed (in
    that order) and surrounding whitespace is trimmed. The result is still a
    string; no numeric conversion happens here. Any non-string input is
    returned unchanged.
    """
    if not isinstance(temp, str):
        return temp
    for marker in ("Celcius", "C", "°"):
        temp = temp.replace(marker, "")
    return temp.strip()

# The same date handling is applied to both frames; each is modified in place.
for frame in (train_dat, test_dat):
    # datetime -> dropped because it is redundant
    frame.drop(columns=["datetime"], inplace=True)

    # Convert the 'datetime_iso' column to datetime values
    frame['datetime_iso'] = pd.to_datetime(frame['datetime_iso'])

    # Extract year, month, day, and hour as separate columns
    parsed = frame['datetime_iso'].dt
    for part in ('year', 'month', 'day', 'hour'):
        frame[part] = getattr(parsed, part)

    # The parsed source column is no longer needed once its parts are extracted
    frame.drop(columns=["datetime_iso"], inplace=True)

    # time-zone -> dropped because the data is not unique
    frame.drop(columns=["time-zone"], inplace=True)

# prssr
def prep_prssr(prssr):
    """Strip the pressure unit from a raw cell value.

    For string input, "hPa." is removed first and then "hPa", after which
    surrounding whitespace is trimmed; the cleaned string is returned.
    Non-string input is returned unchanged.
    """
    if not isinstance(prssr, str):
        return prssr
    return prssr.replace("hPa.", "").replace("hPa", "").strip()

# hum
def prep_hum(hum):
    """Strip the percent sign from a raw humidity value.

    String input has every "%" removed and surrounding whitespace trimmed;
    anything that is not a string is returned unchanged.
    """
    return hum.replace("%", "").strip() if isinstance(hum, str) else hum

# wind_spd & wind_deg
def prep_wind(wind):
    """Strip unit markers from a raw wind speed or wind direction value.

    For string input, "m/s" and "°" are removed and surrounding whitespace is
    trimmed; the cleaned string is returned. Non-string input is returned
    unchanged.
    """
    if not isinstance(wind, str):
        return wind
    for marker in ("m/s", "°"):
        wind = wind.replace(marker, "")
    return wind.strip()

# rain_1h
def prep_rain_1h(rain):
    """Clean a raw 1-hour rainfall value.

    For string input, the unit "mm" is removed, the word "zero" is replaced
    with "0" and surrounding whitespace is trimmed. If the remaining text
    parses as a float, the cleaned string is returned; otherwise the value is
    treated as no rain and the integer 0 is returned. Non-string input is
    returned unchanged.

    Interior spaces are deliberately left alone: substituting "0" for a space
    would turn "1 mm" into "10".
    """
    if not isinstance(rain, str):
        return rain

    rain = rain.replace("mm", "").replace("zero", "0").strip()
    try:
        float(rain)
    except ValueError:
        # Text that is not a number (including the empty string) means "no rain".
        return 0
    return rain

# rain_3h -> 0 artinya ga hujan
def prep_rain_3h(rain):
    """Normalise a raw 3-hour rainfall value to bare text.

    For string input, unit words are removed and the various "no rain"
    spellings are replaced with "0". The substitutions run in the listed
    order, then surrounding whitespace is trimmed. The result is a string;
    non-string input is returned unchanged.
    """
    if not isinstance(rain, str):
        return rain

    # Order matters: e.g. "volume:0" must be handled before the bare "zero".
    substitutions = (
        ("milimeter", ""),
        ("mm", ""),
        ("no-rain", "0"),
        ("volume:0", "0"),
        ("nol", "0"),
        ("no_rain", "0"),
        ("volume:zero", "0"),
        ("zero", "0"),
    )
    for old, new in substitutions:
        rain = rain.replace(old, new)
    return rain.strip()

# snow_1h
def prep_snow(snow):
    """Normalise a raw snowfall value to bare text.

    For string input, unit words are removed and the various "no snow"
    spellings are replaced with "0". The substitutions run in the listed
    order, then surrounding whitespace is trimmed. The result is a string;
    non-string input is returned unchanged.
    """
    if not isinstance(snow, str):
        return snow

    # Order matters: e.g. "volume:0" must be handled before the bare "zero".
    substitutions = (
        ("milimeter", ""),
        ("mm", ""),
        ("no-snow", "0"),
        ("volume:0", "0"),
        ("nol", "0"),
        ("no_snow", "0"),
        ("volume:zero", "0"),
        ("zero", "0"),
    )
    for old, new in substitutions:
        snow = snow.replace(old, new)
    return snow.strip()

# clouds


def prep_clouds(clouds):
    """Strip the percent sign from a raw cloud-cover value.

    String input has every "%" removed and surrounding whitespace trimmed;
    anything that is not a string is returned unchanged.
    """
    if not isinstance(clouds, str):
        return clouds
    return clouds.replace("%", "").strip()

# Work on copies so the frames produced by the date handling stay untouched.
preped_train = train_dat.copy()
preped_test = test_dat.copy()

# Each cleaner is paired with the columns it applies to; these columns exist
# in both the training and the test frame.
column_cleaners = [
    # temp-style columns: temp, d_point, feels, min_temp, max_temp
    (prep_temp, ['temp', 'd_point', 'feels', 'min_temp', 'max_temp']),
    (prep_prssr, ['prssr']),
    (prep_hum, ['hum']),
    # wind columns: wind_spd, wind_deg
    (prep_wind, ['wind_spd', 'wind_deg']),
    (prep_rain_3h, ['rain_3h']),
    # snow columns: snow_1h, snow_3h
    (prep_snow, ['snow_1h', 'snow_3h']),
    (prep_clouds, ['clouds']),
]
for cleaner, columns in column_cleaners:
    for column in columns:
        preped_train[column] = preped_train[column].apply(cleaner)
        preped_test[column] = preped_test[column].apply(cleaner)

# rain_1h is cleaned on the training frame only.
preped_train['rain_1h'] = preped_train['rain_1h'].apply(prep_rain_1h)

# Replace the month number with sin(-0.449 * (month - 3)) in both frames.
for frame in (preped_train, preped_test):
    frame['month'] = np.sin(-0.449 * (frame['month'] - 3))

In [93]:
# Cast the ordinary categorical columns to the `category` dtype.
cat_cols = ['visibility','sea_level', 'grnd_level', 'year', 'day', 'hour']

preped_train[cat_cols] = preped_train[cat_cols].astype('category')
# The test frame must be cast from its OWN columns. Assigning the training
# frame's columns here would index-align training values into the test rows
# and silently overwrite the test features.
preped_test[cat_cols] = preped_test[cat_cols].astype('category')

# num_cols_test has no rain_1h
num_cols_train = ['temp', 'd_point', 'feels', 'min_temp', 'max_temp', 'prssr', 'hum', 'wind_spd', 'wind_deg', 'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h', 'clouds', 'month']
num_cols_test = ['temp', 'd_point', 'feels', 'min_temp', 'max_temp', 'prssr', 'hum', 'wind_spd', 'wind_deg', 'rain_3h', 'snow_1h', 'snow_3h', 'clouds', 'month']

# The cleaned numeric columns still hold strings at this point; convert them to floats.
preped_train[num_cols_train] = preped_train[num_cols_train].astype('float')
preped_test[num_cols_test] = preped_test[num_cols_test].astype('float')

##### Feature Engineering

In [94]:
# The same three derived features are added to both frames.
for frame in (preped_train, preped_test):
    # cloudy => 1 if clouds > 70, otherwise 0 (missing values give 0)
    frame['cloudy'] = (frame['clouds'] > 70).astype('int64')

    # temp_range => max_temp - min_temp
    frame['temp_range'] = frame['max_temp'] - frame['min_temp']

    # hot_day => 1 if temp > 25, otherwise 0 (missing values give 0)
    frame['hot_day'] = (frame['temp'] > 25).astype('int64')

# Data preparation II

## vtreat

In [95]:
# vtreat settings: impute missing values with the mean and keep every derived
# variable rather than only the recommended ones.
treatment_params = vtreat.vtreat_parameters({
    "missingness_imputation": np.mean,
    "filter_to_recommended": False,
})

# Treatment plan for the numeric outcome 'rain_1h'.
transform = vtreat.NumericOutcomeTreatment(
    outcome_name="rain_1h",
    params=treatment_params,
)

# Fit on the training frame, then apply the fitted plan to the test frame.
transformed_train = transform.fit_transform(preped_train)
transformed_test = transform.transform(preped_test)

# Display the per-variable score table.
transform.score_frame_

Unnamed: 0,variable,orig_variable,treatment,y_aware,has_range,PearsonR,R2,significance,vcount,default_threshold,recommended
0,snow_3h_is_bad,snow_3h,missing_indicator,False,True,-0.001694,2.869954e-06,0.3219091,6.0,0.027778,False
1,snow_1h_is_bad,snow_1h,missing_indicator,False,True,0.001202,1.444011e-06,0.4822924,6.0,0.027778,False
2,visibility_is_bad,visibility,missing_indicator,False,True,0.000579,3.347306e-07,0.7351483,6.0,0.027778,False
3,rain_3h_is_bad,rain_3h,missing_indicator,False,True,0.001743,3.036715e-06,0.3082437,6.0,0.027778,False
4,grnd_level_is_bad,grnd_level,missing_indicator,False,True,0.001847,3.411539e-06,0.2801565,6.0,0.027778,False
5,sea_level_is_bad,sea_level,missing_indicator,False,True,-0.000632,3.993124e-07,0.7117695,6.0,0.027778,False
6,d_point,d_point,clean_copy,False,True,0.015154,0.0002296314,7.940011999999999e-19,17.0,0.009804,True
7,max_temp,max_temp,clean_copy,False,True,0.024703,0.0006102212,2.659845e-47,17.0,0.009804,True
8,hour,hour,clean_copy,False,True,-0.142188,0.02021756,0.0,17.0,0.009804,True
9,wind_deg,wind_deg,clean_copy,False,True,-0.021913,0.0004801828,1.365802e-37,17.0,0.009804,True


In [96]:
# Drop rows whose target rain_1h is negative (only rows with rain_1h >= 0 are kept).
non_negative_rain = transformed_train["rain_1h"] >= 0
transformed_train = transformed_train[non_negative_rain]

In [97]:
# Numeric columns whose value ranges look suspicious ("aneh" = odd).
num_col_aneh = [
    "prssr", "wind_deg", "max_temp", "feels", "hum",
    "d_point", "temp", "min_temp", "wind_spd",
]

# One row per column showing its minimum and maximum in the training frame.
transformed_train[num_col_aneh].describe().loc[["min", "max"]].T

Unnamed: 0,min,max
prssr,1001.0,5115.51
wind_deg,0.0,1810.8
max_temp,22.37,187.17
feels,22.35,202.71
hum,29.0,503.0
d_point,14.49,141.59
temp,21.55,177.26
min_temp,14.12,162.92
wind_spd,0.0,25.0


In [98]:
# Same minimum/maximum summary for the test frame.
transformed_test[num_col_aneh].describe().loc[["min", "max"]].T

Unnamed: 0,min,max
prssr,-100.0,1016.0
wind_deg,0.0,360.0
max_temp,-273.15,35.68
feels,-10005.49,41.2
hum,0.0,100.0
d_point,19.51,28.67
temp,-9998.49,34.46
min_temp,-9998.97,33.0
wind_spd,0.0,9999.0


## Handling outliers (threshold filtering and wind-direction wrap-around)

In [99]:
# Exclusive upper limits used to discard implausible training rows.
MAX_TEMP = 50
MAX_HUM = 150
MAX_MAX_TEMP = 50
MAX_FEELS = 60
MAX_PRESSURE  = 1200
MAX_MIN_TENP = 60  # limit for min_temp (name kept as spelled)

# (column, limit) pairs, applied one after another; a row survives only if
# every listed column is strictly below its limit.
upper_limits = [
    ("max_temp", MAX_MAX_TEMP),
    ("hum", MAX_HUM),
    ("feels", MAX_FEELS),
    ("prssr", MAX_PRESSURE),
    ("min_temp", MAX_MIN_TENP),
    ("temp", MAX_TEMP),
]

print(transformed_train.shape)
for bounded_col, limit in upper_limits:
    transformed_train = transformed_train[transformed_train[bounded_col] < limit]
print(transformed_train.shape)

(336438, 38)
(307319, 38)


In [100]:
# Wind directions above 360 are wrapped with modulo 360; values up to and
# including 360 are left as they are.
raw_wind_deg = transformed_train["wind_deg"]
transformed_train["wind_deg"] = raw_wind_deg.mask(raw_wind_deg > 360, raw_wind_deg % 360)

In [101]:
# Display the column labels of the filtered training frame.
transformed_train.columns

Index(['rain_1h', 'snow_3h_is_bad', 'snow_1h_is_bad', 'visibility_is_bad',
       'rain_3h_is_bad', 'grnd_level_is_bad', 'sea_level_is_bad', 'd_point',
       'max_temp', 'hour', 'wind_deg', 'min_temp', 'temp_range', 'hum',
       'feels', 'month', 'clouds', 'prssr', 'wind_spd', 'temp', 'hot_day',
       'year', 'day', 'cloudy', 'visibility_impact_code',
       'visibility_deviation_code', 'visibility_prevalence_code',
       'visibility_lev__NA_', 'grnd_level_impact_code',
       'grnd_level_deviation_code', 'grnd_level_prevalence_code',
       'grnd_level_lev__NA_', 'grnd_level_lev_undetermined',
       'sea_level_impact_code', 'sea_level_deviation_code',
       'sea_level_prevalence_code', 'sea_level_lev_undetermined',
       'sea_level_lev__NA_'],
      dtype='object')

## Train-Test-Validation Split

In [102]:
# Split the filtered training frame into train / validation / test parts
# (70% / 15% / 15%) with 'rain_1h' as the target.
# NOTE(review): the regressors further down are fit on the whole of
# transformed_train rather than on X_train, so X_test is not a true hold-out
# for them — confirm the intended use of this split.
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(
                                        transformed_train,
                                        target = 'rain_1h',
                                        train_size=0.7, valid_size=0.15, test_size=0.15)

# Machine Learning I

In [103]:
# CatBoost settings: 35000 boosting iterations, RMSE as the evaluation metric,
# GPU training, verbose logging and the 'Iter' overfitting-detector type.
params = dict(
    iterations=35000,
    eval_metric="RMSE",
    task_type="GPU",
    verbose=True,
    od_type='Iter',
)

# CatBoost uses the settings above; XGBoost and LightGBM keep library defaults.
cat_regressor = CatBoostRegressor(**params)
xgb_regressor = XGBRegressor()
lgbm_regressor = LGBMRegressor()

In [104]:
# Target vector and feature matrix built from every row of the filtered training frame.
full_target = transformed_train['rain_1h']
full_train = transformed_train.drop(columns='rain_1h')

In [105]:
# Fit CatBoost on the full training data; plot=True asks CatBoost to render
# its training-metric visualiser alongside the log.
cat_regressor.fit(full_train, full_target, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.010242
0:	learn: 0.9433272	total: 13.1ms	remaining: 7m 38s
1:	learn: 0.9415058	total: 25.4ms	remaining: 7m 25s
2:	learn: 0.9397112	total: 38.1ms	remaining: 7m 24s
3:	learn: 0.9379620	total: 50ms	remaining: 7m 17s
4:	learn: 0.9362322	total: 62.3ms	remaining: 7m 16s
5:	learn: 0.9345230	total: 75.1ms	remaining: 7m 18s
6:	learn: 0.9328494	total: 89.9ms	remaining: 7m 29s
7:	learn: 0.9312070	total: 102ms	remaining: 7m 27s
8:	learn: 0.9295921	total: 115ms	remaining: 7m 25s
9:	learn: 0.9280023	total: 127ms	remaining: 7m 24s
10:	learn: 0.9264416	total: 140ms	remaining: 7m 25s
11:	learn: 0.9248987	total: 153ms	remaining: 7m 26s
12:	learn: 0.9233953	total: 166ms	remaining: 7m 26s
13:	learn: 0.9219328	total: 178ms	remaining: 7m 25s
14:	learn: 0.9205170	total: 191ms	remaining: 7m 24s
15:	learn: 0.9190765	total: 203ms	remaining: 7m 23s
16:	learn: 0.9176737	total: 215ms	remaining: 7m 22s
17:	learn: 0.9163114	total: 227ms	remaining: 7m 21s
18:	learn: 0.9149745	total: 239ms	remai

<catboost.core.CatBoostRegressor at 0x226cbdbd130>

In [106]:
# Fit the default-configured XGBoost regressor on the full training data.
xgb_regressor.fit(full_train, full_target)

In [107]:
# Fit the default-configured LightGBM regressor on the full training data.
lgbm_regressor.fit(full_train, full_target)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2704
[LightGBM] [Info] Number of data points in the train set: 307319, number of used features: 37
[LightGBM] [Info] Start training from score 0.359662


In [108]:
# Predict the X_test split with each fitted model.
# NOTE(review): all three models were fit on full_train, which is built from
# the same transformed_train frame that X_test was split from. The RMSE below
# is therefore measured on rows the models have already seen.
cat_y_pred = cat_regressor.predict(X_test)
xgb_y_pred = xgb_regressor.predict(X_test)
lgbm_y_pred = lgbm_regressor.predict(X_test)

# Weighted blend: 60% CatBoost, 10% XGBoost, 30% LightGBM.
ensembled_y_pred = 0.6 * cat_y_pred + 0.1 * xgb_y_pred + 0.3 * lgbm_y_pred

ensemble_mse = mean_squared_error(y_test, ensembled_y_pred)
print('RMSE:', np.sqrt(ensemble_mse))

RMSE: 0.71274170007604


last RMSE: 0.787459131129465

# Submission

In [110]:
# Load the sample submission file; its 'rain_1h' column is filled in below.
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)

In [None]:
# Predict the treated test frame with each fitted model.
cat_y_pred = cat_regressor.predict(transformed_test)
xgb_y_pred = xgb_regressor.predict(transformed_test)
lgbm_y_pred = lgbm_regressor.predict(transformed_test)

# Weighted blend: 60% CatBoost, 10% XGBoost, 30% LightGBM.
ensembled_y_pred = 0.6 * cat_y_pred + 0.1 * xgb_y_pred + 0.3 * lgbm_y_pred

submission['rain_1h'] = ensembled_y_pred
# Negative predictions are replaced with 0.
submission['rain_1h'] = submission['rain_1h'].mask(submission['rain_1h'] < 0, 0.)
submission.to_csv('../predictions/submission3_rang.csv', index=False)

## Export

In [44]:
# Write the treated and filtered training frame and the treated test frame to CSV.
for export_frame, export_path in (
    (transformed_train, '../cleaned/submission_3/train_outlier_-1_radif.csv'),
    (transformed_test, '../cleaned/submission_3/test_outlier_-1_radif.csv'),
):
    export_frame.to_csv(export_path, index=False)

In [46]:
import pickle

# Serialise the fitted CatBoost model. The context manager guarantees the file
# is flushed and closed even if pickling raises.
# Only cat_regressor is saved; the XGBoost and LightGBM models used in the
# blend are not written here.
with open('../model/submission_3.pkl', 'wb') as model_file:
    pickle.dump(cat_regressor, model_file)