In [2]:
import warnings
warnings.filterwarnings('ignore')

import warnings
import numpy as np
np.random.seed(7)
import pandas as pd
import vtreat
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from fast_ml.model_development import train_valid_test_split

In [4]:
TRAIN_PATH = "../../datasets/train.csv"
TEST_PATH = "../../datasets/test.csv"
SAMPLE_SUBMISSION_PATH = "../../datasets/sample_submission.csv"

# Read the CSV file into a DataFrame
train_dat = pd.read_csv(TRAIN_PATH)
test_dat = pd.read_csv(TEST_PATH)

# Data preparation I

In [5]:
# temp
def prep_temp(temp):
    if isinstance(temp, str) :
        temp = temp.replace("Celcius", "")
        temp = temp.replace("C", "")
        temp = temp.replace("°", "")
        temp = temp.strip()
    return temp

# datetime -> dihapus karena redundant
train_dat.drop(columns=["datetime"], inplace=True)
test_dat.drop(columns=["datetime"], inplace=True)

# Convert the 'timestamp' column to a datetime object
train_dat['datetime_iso'] = pd.to_datetime(train_dat['datetime_iso'])
test_dat['datetime_iso'] = pd.to_datetime(test_dat['datetime_iso'])

# Extract year, month, day, and hour
train_dat['year'] = train_dat['datetime_iso'].dt.year
train_dat['month'] = train_dat['datetime_iso'].dt.month
train_dat['day'] = train_dat['datetime_iso'].dt.day
train_dat['hour'] = train_dat['datetime_iso'].dt.hour

test_dat['year'] = test_dat['datetime_iso'].dt.year
test_dat['month'] = test_dat['datetime_iso'].dt.month
test_dat['day'] = test_dat['datetime_iso'].dt.day
test_dat['hour'] = test_dat['datetime_iso'].dt.hour

train_dat.drop(columns=["datetime_iso"], inplace=True)
test_dat.drop(columns=["datetime_iso"], inplace=True)

# time-zone -> dihapus karena data bukan unique
train_dat.drop(columns=["time-zone"], inplace=True)
test_dat.drop(columns=["time-zone"], inplace=True)

# prssr
def prep_prssr(prssr):
    if isinstance(prssr, str):
        prssr = prssr.replace("hPa.", "")
        prssr = prssr.replace("hPa", "")
        prssr = prssr.strip()
    return prssr

# hum
def prep_hum(hum):
    if isinstance(hum, str):
        hum = hum.replace("%", "")
        hum = hum.strip()
    return hum

# wind_spd & wind_deg
def prep_wind(wind):
    if isinstance(wind, str):
        wind = wind.replace("m/s", "")
        wind = wind.replace("°", "")
        wind = wind.strip()
    return wind

# rain_1h
def prep_rain_1h(rain):
    if isinstance(rain, str):
        rain = rain.replace("mm", "")
        rain = rain.replace("zero", "0")
        rani = rain.replace(" ", "0")
        rain = rain.strip()
        try:
            float(rain)
            return rain
        except:
            new_rain = 0
            return new_rain
    return rain

# rain_3h -> 0 artinya ga hujan
def prep_rain_3h(rain):
    if isinstance(rain, str):
        rain = rain.replace("milimeter", "")
        rain = rain.replace("mm", "")
        rain = rain.replace("no-rain", "0")
        rain = rain.replace("volume:0", "0")
        rain = rain.replace("nol", "0")
        rain = rain.replace("no_rain", "0")
        rain = rain.replace("volume:zero", "0")
        rain = rain.replace("zero", "0")
        rain = rain.strip()
    return rain

# snow_1h
def prep_snow(snow):
    if isinstance(snow, str):
        snow = snow.replace("milimeter", "")
        snow = snow.replace("mm", "")
        snow = snow.replace("no-snow", "0")
        snow = snow.replace("volume:0", "0")
        snow = snow.replace("nol", "0")
        snow = snow.replace("no_snow", "0")
        snow = snow.replace("volume:zero", "0")
        snow = snow.replace("zero", "0")
        snow = snow.strip()
    return snow

# clouds


def prep_clouds(clouds):
    if isinstance(clouds, str):
        clouds = clouds.replace("%", "")
        clouds = clouds.strip()
    return clouds

preped_train = train_dat.copy()
preped_test = test_dat.copy()

# Prepping temp column --> including temp, d_point, feels, min_temp, max_temp
for column in ['temp','d_point','feels','min_temp','max_temp']:
    preped_train[column] = preped_train[column].apply(lambda x: prep_temp(x))
    preped_test[column] = preped_test[column].apply(lambda x: prep_temp(x))

# Prepping prssr column
for column in ['prssr']:
    preped_train[column] = preped_train[column].apply(lambda x:prep_prssr(x))
    preped_test[column] = preped_test[column].apply(lambda x:prep_prssr(x))

# Prepping hum column
for column in ['hum']:
    preped_train[column] = preped_train[column].apply(lambda x:prep_hum(x))
    preped_test[column] = preped_test[column].apply(lambda x:prep_hum(x))

# Prepping wind column --> including wind_spd, wind_deg
for column in ['wind_spd', 'wind_deg']:
    preped_train[column] = preped_train[column].apply(lambda x:prep_wind(x))
    preped_test[column] = preped_test[column].apply(lambda x:prep_wind(x))

# Prepping rain_1h
for column in ['rain_1h']:
    preped_train[column] = preped_train[column].apply(lambda x:prep_rain_1h(x))

# Prepping rain_3h column
for column in ['rain_3h']:
    preped_train[column] = preped_train[column].apply(lambda x:prep_rain_3h(x))
    preped_test[column] = preped_test[column].apply(lambda x:prep_rain_3h(x))

# Prepping snow column --> including snow_1h, snow_3h
for column in ['snow_1h', 'snow_3h']:
    preped_train[column] = preped_train[column].apply(lambda x:prep_snow(x))
    preped_test[column] = preped_test[column].apply(lambda x:prep_snow(x))

# Prepping clunds column
for column in ['clouds']:
    preped_train[column] = preped_train[column].apply(lambda x:prep_clouds(x))
    preped_test[column] = preped_test[column].apply(lambda x:prep_clouds(x))

In [6]:
# Changing to `category` data type for normal categorical data

cat_cols = ['visibility','sea_level', 'grnd_level', 'year', 'month', 'day', 'hour']

preped_train[cat_cols] = preped_train[cat_cols].astype('category')
preped_test[cat_cols] = preped_train[cat_cols].astype('category')

# di num_cols_test ga ada rain_1h
num_cols_train = ['temp', 'd_point', 'feels', 'min_temp', 'max_temp', 'prssr', 'hum', 'wind_spd', 'wind_deg', 'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h', 'clouds']
num_cols_test = ['temp', 'd_point', 'feels', 'min_temp', 'max_temp', 'prssr', 'hum', 'wind_spd', 'wind_deg', 'rain_3h', 'snow_1h', 'snow_3h', 'clouds']

preped_train[num_cols_train] = preped_train[num_cols_train].astype('float')
preped_test[num_cols_test] = preped_test[num_cols_test].astype('float')

# Data preparation II

## vtreat

In [None]:
transform = vtreat.NumericOutcomeTreatment(
    outcome_name="rain_1h",
    params=vtreat.vtreat_parameters({
        "missingness_imputation": np.mean,
        "filter_to_recommended": False
    })
)

transformed_train = transform.fit_transform(preped_train)
transformed_test = transform.transform(preped_test)
transform.score_frame_

In [14]:
# babai rain_1h < 0
transformed_train = transformed_train[transformed_train["rain_1h"] >= 0]

In [22]:
num_col_aneh = ["prssr", "wind_deg", "max_temp", "feels", "hum", "d_point", "temp", "min_temp", "wind_spd"]
transformed_train[num_col_aneh].describe().T[["min", "max"]]

Unnamed: 0,min,max
prssr,1001.0,5115.51
wind_deg,0.0,1810.8
max_temp,22.37,187.17
feels,22.35,202.71
hum,29.0,503.0
d_point,14.49,141.59
temp,21.55,177.26
min_temp,14.12,162.92
wind_spd,0.0,25.0


In [23]:
transformed_test[num_col_aneh].describe().T[["min", "max"]]

Unnamed: 0,min,max
prssr,-100.0,1016.0
wind_deg,0.0,360.0
max_temp,-273.15,35.68
feels,-10005.49,41.2
hum,0.0,100.0
d_point,19.51,28.67
temp,-9998.49,34.46
min_temp,-9998.97,33.0
wind_spd,0.0,9999.0


## Nganu outlier pake anu

In [25]:
MAX_TEMP = 50
MAX_HUM = 150
MAX_MAX_TEMP = 50
MAX_FEELS = 60
MAX_PRESSURE  = 1200
MAX_MIN_TENP = 60

print(transformed_train.shape)
transformed_train = transformed_train[transformed_train["max_temp"] < MAX_MAX_TEMP]
transformed_train = transformed_train[transformed_train["hum"] < MAX_HUM]
transformed_train = transformed_train[transformed_train["feels"] < MAX_FEELS]
transformed_train = transformed_train[transformed_train["prssr"] < MAX_PRESSURE]
transformed_train = transformed_train[transformed_train["min_temp"] < MAX_MIN_TENP]
transformed_train = transformed_train[transformed_train["temp"] < MAX_TEMP]
print(transformed_train.shape)

(336438, 35)
(307319, 35)


In [30]:
# win deg nya di anu pake modulus
transformed_train["wind_deg"] = transformed_train["wind_deg"] \
                                .apply(lambda x: x % 360 if x > 360 else x)

## Train-Test-Validation Split

In [32]:
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(
                                        transformed_train,
                                        target = 'rain_1h',
                                        train_size=0.7, valid_size=0.15, test_size=0.15)

# Machine Learning I

In [33]:
params = {
        'iterations': 35000,
        'eval_metric': "RMSE",
        'task_type': "GPU",
        'verbose':True,
        'od_type':'Iter'}

cat_regressor = CatBoostRegressor(**params)

In [34]:
cat_regressor.fit(X_train, y_train, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.009774
0:	learn: 0.9413435	total: 31.3ms	remaining: 18m 16s
1:	learn: 0.9396111	total: 38.1ms	remaining: 11m 7s
2:	learn: 0.9378955	total: 43.3ms	remaining: 8m 25s
3:	learn: 0.9362225	total: 49.4ms	remaining: 7m 12s
4:	learn: 0.9345633	total: 54.3ms	remaining: 6m 20s
5:	learn: 0.9329774	total: 59.2ms	remaining: 5m 45s
6:	learn: 0.9313894	total: 64.9ms	remaining: 5m 24s
7:	learn: 0.9298360	total: 69.7ms	remaining: 5m 4s
8:	learn: 0.9282951	total: 75.7ms	remaining: 4m 54s
9:	learn: 0.9267686	total: 80.8ms	remaining: 4m 42s
10:	learn: 0.9252903	total: 85.7ms	remaining: 4m 32s
11:	learn: 0.9238729	total: 91.4ms	remaining: 4m 26s
12:	learn: 0.9224390	total: 96.1ms	remaining: 4m 18s
13:	learn: 0.9210450	total: 102ms	remaining: 4m 14s
14:	learn: 0.9196960	total: 107ms	remaining: 4m 9s
15:	learn: 0.9183200	total: 112ms	remaining: 4m 4s
16:	learn: 0.9169607	total: 118ms	remaining: 4m 1s
17:	learn: 0.9156181	total: 123ms	remaining: 3m 58s
18:	learn: 0.9143435	total: 129ms	

<catboost.core.CatBoostRegressor at 0x2500d42f3d0>

In [35]:
y_pred = cat_regressor.predict(X_test)
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))

RMSE: 0.787459131129465


# Submission

In [36]:
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)

In [43]:
submission['rain_1h'] = cat_regressor.predict(transformed_test)
submission['rain_1h'] = submission['rain_1h'].apply(lambda x: 0. if x < 0 else x)
submission.to_csv('../submission/submission_3.csv', index=False)

## Export

In [44]:
transformed_train.to_csv('../cleaned/submission_3/train_outlier_-1_radif.csv', index=False)
transformed_test.to_csv('../cleaned/submission_3/test_outlier_-1_radif.csv', index=False)

In [46]:
import pickle 

pickle.dump(cat_regressor, open('../model/submission_3.pkl', 'wb'))