# Machine Learning Model Training: Predict Whether Today Has Rain or Drizzle

In [3]:
import pandas as pd
# Read the preprocessed feature table. The `date` column is removed up front,
# so every remaining column except `target` is vectorized as a model input
# in the split cell below.
df_ready_to_used = pd.read_csv(
    './data/gsod_jakarta_eda_and_preprocessed.csv'
).drop('date', axis=1)
df_ready_to_used

Unnamed: 0,flag_precipitation_prev_1_day,flag_precipitation_prev_2_day,flag_precipitation_prev_3_day,flag_precipitation_prev_4_day,flag_precipitation_prev_5_day,flag_precipitation_prev_6_day,flag_precipitation_prev_7_day,avg_temp_prev_1_day,avg_temp_prev_2_day,avg_temp_prev_3_day,...,avg_wind_speed_prev_6_day,avg_wind_speed_prev_7_day,total_precipitation_prev_1_day,total_precipitation_prev_2_day,total_precipitation_prev_3_day,total_precipitation_prev_4_day,total_precipitation_prev_5_day,total_precipitation_prev_6_day,total_precipitation_prev_7_day,target
0,I,I,A,I,A,B,I,86.3,84.3,83.6,...,0.5,1.3,0.00,0.00,0.08,0.00,0.31,0.26,0.00,0.0
1,I,I,OTHER,I,I,OTHER,I,84.7,84.5,83.9,...,1.0,3.9,0.00,0.00,99.99,0.00,0.00,99.99,0.00,0.0
2,B,C,I,A,OTHER,OTHER,B,83.9,85.7,85.4,...,2.7,3.9,0.00,0.00,0.00,0.08,99.99,99.99,0.02,0.0
3,I,OTHER,OTHER,I,I,I,OTHER,81.5,78.1,82.5,...,0.0,2.1,0.00,99.99,99.99,0.00,0.00,0.00,99.99,0.0
4,I,A,I,I,I,I,I,85.1,84.4,87.5,...,2.0,2.7,0.00,0.79,0.00,0.00,0.00,0.00,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2851,I,I,I,B,B,I,A,85.2,82.7,84.4,...,1.9,3.1,0.00,0.00,0.00,0.71,1.06,0.00,0.12,1.0
2852,A,I,I,I,B,B,I,85.3,85.2,82.7,...,2.6,1.9,0.47,0.00,0.00,0.00,0.71,1.06,0.00,1.0
2853,OTHER,A,I,I,I,B,B,83.0,85.3,85.2,...,0.9,2.6,99.99,0.47,0.00,0.00,0.00,0.71,1.06,1.0
2854,I,A,OTHER,A,I,I,I,85.0,79.9,83.0,...,2.5,2.5,0.00,0.04,99.99,0.47,0.00,0.00,0.00,1.0


## Split Train and Test

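We split the data into training/validation/test sets at a ratio of roughly **64/16/20**: 20% of the rows are held out as the test set, and 20% of the remaining 80% is then used as the validation set.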

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

# Two-stage random split with a fixed seed: 20% of all rows become the test
# set, then 20% of the remaining 80% becomes the validation set.
# NOTE(review): this yields 64/16/20 overall; the second split would need
# test_size=0.25 to produce an exact 60/20/20 partition.
df_full_train, df_test = train_test_split(df_ready_to_used, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.2, random_state=42)

# Give every partition a fresh 0..n-1 index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Pull the label out of each frame; `pop` returns the column and removes it,
# so the frames hold features only from here on.
y_train = df_train.pop('target').values
y_val = df_val.pop('target').values
y_test = df_test.pop('target').values

# One-hot encode string-valued columns and pass numeric ones through.
# The vectorizer is fitted on the training rows only and reused unchanged
# for the validation and test rows.
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')
test_dict = df_test.to_dict(orient='records')

X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)

## Train Untuned Version of SGDClassifier

In [91]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score

# Baseline: linear model with logistic loss trained by SGD, all other
# hyperparameters left at the library defaults. `fit` returns the estimator
# itself, so the chained call binds the fitted model.
model = SGDClassifier(random_state=42, loss='log_loss').fit(X_train, y_train)
model


In [92]:
# Show every hyperparameter of the baseline SGDClassifier, defaults included.
model.get_params()

{'alpha': 0.0001,
 'average': False,
 'class_weight': None,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.0,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'optimal',
 'loss': 'log_loss',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'n_jobs': None,
 'penalty': 'l2',
 'power_t': 0.5,
 'random_state': 42,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [93]:
# ROC-AUC of the baseline SGDClassifier on the training rows, scored on the
# second column of predict_proba (the probability of the positive class).
y_pred = model.predict_proba(X_train)[:, 1]
print(f"Score on training set: {roc_auc_score(y_train, y_pred)}")

Score on training set: 0.5155613345477207


In [94]:
# Same metric for the baseline SGDClassifier on the validation rows.
y_pred = model.predict_proba(X_val)[:, 1]
print(f"Score on validation set: {roc_auc_score(y_val, y_pred)}")

Score on validation set: 0.5254302249817945


In [95]:
# Same metric for the baseline SGDClassifier on the held-out test rows.
y_pred = model.predict_proba(X_test)[:, 1]
print(f"Score on test set: {roc_auc_score(y_test, y_pred)}")

Score on test set: 0.5034231921266581


## Train a Tuned Version of SGDClassifier

In [48]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyperparameter search for the SGD logistic model: 10 sampled
# configurations, each scored by 5-fold cross-validated ROC-AUC on X_train.
param_grid = dict(
    alpha=[1e-5, 1e-4, 1e-3, 1e-2],
    tol=[1e-5, 1e-4, 1e-3, 1e-2],
    penalty=['l2', 'l1', 'elasticnet', None],
    eta0=[1e-5, 1e-4, 1e-3, 1e-2],
    learning_rate=['constant', 'optimal', 'invscaling', 'adaptive'],
    max_iter=[100, 1000],
)
grid_search = RandomizedSearchCV(
    estimator=SGDClassifier(loss='log_loss', random_state=42),
    param_distributions=param_grid,
    n_iter=10,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=3,
    random_state=42,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END alpha=0.001, eta0=1e-05, learning_rate=adaptive, max_iter=100, penalty=l1, tol=0.001;, score=0.665 total time=   0.0s
[CV 3/5] END alpha=0.001, eta0=1e-05, learning_rate=adaptive, max_iter=100, penalty=l1, tol=0.001;, score=0.649 total time=   0.1s
[CV 1/5] END alpha=0.001, eta0=0.01, learning_rate=optimal, max_iter=1000, penalty=l2, tol=0.01;, score=0.666 total time=   0.0s
[CV 2/5] END alpha=0.001, eta0=1e-05, learning_rate=adaptive, max_iter=100, penalty=l1, tol=0.001;, score=0.647 total time=   0.1s
[CV 4/5] END alpha=0.001, eta0=1e-05, learning_rate=adaptive, max_iter=100, penalty=l1, tol=0.001;, score=0.684 total time=   0.1s
[CV 2/5] END alpha=0.001, eta0=0.01, learning_rate=optimal, max_iter=1000, penalty=l2, tol=0.01;, score=0.653 total time=   0.0s
[CV 3/5] END alpha=0.001, eta0=0.01, learning_rate=optimal, max_iter=1000, penalty=l2, tol=0.01;, score=0.635 total time=   0.0s
[CV 5/5] END alpha=0.001, et

[CV 5/5] END alpha=0.001, eta0=1e-05, learning_rate=adaptive, max_iter=100, penalty=elasticnet, tol=0.001;, score=0.656 total time=   0.1s
[CV 5/5] END alpha=0.01, eta0=0.0001, learning_rate=optimal, max_iter=1000, penalty=None, tol=1e-05;, score=0.648 total time=   0.0s
[CV 2/5] END alpha=0.001, eta0=1e-05, learning_rate=invscaling, max_iter=100, penalty=l1, tol=0.01;, score=0.649 total time=   0.1s
[CV 3/5] END alpha=0.01, eta0=0.0001, learning_rate=optimal, max_iter=1000, penalty=None, tol=1e-05;, score=0.667 total time=   0.0s
[CV 3/5] END alpha=0.001, eta0=1e-05, learning_rate=adaptive, max_iter=100, penalty=elasticnet, tol=0.001;, score=0.656 total time=   0.1s
[CV 1/5] END alpha=0.001, eta0=1e-05, learning_rate=constant, max_iter=1000, penalty=l1, tol=1e-05;, score=0.634 total time=   0.0s
[CV 5/5] END alpha=0.001, eta0=1e-05, learning_rate=invscaling, max_iter=100, penalty=l1, tol=0.01;, score=0.639 total time=   0.0s
[CV 3/5] END alpha=0.001, eta0=1e-05, learning_rate=invscali



[CV 1/5] END alpha=0.01, eta0=0.0001, learning_rate=optimal, max_iter=1000, penalty=None, tol=1e-05;, score=0.620 total time=   0.5s
[CV 2/5] END alpha=0.01, eta0=0.0001, learning_rate=optimal, max_iter=1000, penalty=None, tol=1e-05;, score=0.524 total time=   0.4s




In [49]:
# Keep the estimator selected by the SGD search and display it.
final_model = grid_search.best_estimator_
final_model

We get the best model with parameters alpha=0.01, eta0=0.00001, learning_rate=adaptive, max_iter=100, and penalty=l1.

In [50]:
# ROC-AUC of the tuned SGDClassifier on the training rows
# (positive-class probability from predict_proba).
y_pred = final_model.predict_proba(X_train)[:, 1]
print(f"Score on training set: {roc_auc_score(y_train, y_pred)}")

Score on training set: 0.6905403623987345


In [51]:
# Same metric for the tuned SGDClassifier on the validation rows.
y_pred = final_model.predict_proba(X_val)[:, 1]
print(f"Score on validation set: {roc_auc_score(y_val, y_pred)}")

Score on validation set: 0.6943198804185352


In [52]:
# Same metric for the tuned SGDClassifier on the held-out test rows.
y_pred = final_model.predict_proba(X_test)[:, 1]
print(f"Score on test set: {roc_auc_score(y_test, y_pred)}")

Score on test set: 0.6492756280946268


  > Surprisingly, the validation set's ROC AUC (0.694) is slightly higher than the training set's (0.691), although the difference is small (about 0.004).

## Trying Alternate Model: RandomForestClassifier

### Untuned version of RandomForestClassifier

In [96]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: library defaults apart from the log-loss split criterion
# and a fixed seed. `fit` returns the estimator itself, so the chained call
# binds the fitted model (this rebinds `model`, replacing the SGD baseline).
model = RandomForestClassifier(random_state=42, criterion='log_loss').fit(X_train, y_train)
model

In [97]:
# Show every hyperparameter of the baseline RandomForestClassifier, defaults included.
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'log_loss',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [98]:
# ROC-AUC of the baseline RandomForestClassifier on the training rows
# (positive-class probability from predict_proba).
y_pred = model.predict_proba(X_train)[:, 1]
print(f"Score on training set: {roc_auc_score(y_train, y_pred)}")

Score on training set: 1.0


In [99]:
# Same metric for the baseline RandomForestClassifier on the validation rows.
y_pred = model.predict_proba(X_val)[:, 1]
print(f"Score on validation set: {roc_auc_score(y_val, y_pred)}")

Score on validation set: 0.7628300946686597


In [100]:
# Same metric for the baseline RandomForestClassifier on the held-out test rows.
y_pred = model.predict_proba(X_test)[:, 1]
print(f"Score on test set: {roc_auc_score(y_test, y_pred)}")

Score on test set: 0.7431016565804756


### Tuned version of RandomForestClassifier

In [61]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyperparameter search for the random forest: 10 sampled
# configurations, each scored by 5-fold cross-validated ROC-AUC on X_train.
# All six hyperparameters draw from the same ladder of integer values.
# NOTE(review): integer `max_samples` values are absolute row counts per tree,
# and the CV log shows candidates such as max_samples=10 or
# min_samples_leaf=500 scoring exactly 0.500 -- consider narrowing the ranges
# so the search budget is not spent on trees that cannot split.
param_grid = dict(
    n_estimators=[5, 10, 50, 100, 250, 500],
    max_depth=[5, 10, 50, 100, 250, 500],
    min_samples_split=[5, 10, 50, 100, 250, 500],
    min_samples_leaf=[5, 10, 50, 100, 250, 500],
    max_leaf_nodes=[5, 10, 50, 100, 250, 500],
    max_samples=[5, 10, 50, 100, 250, 500],
)
grid_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_grid,
    n_iter=10,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=3,
    random_state=42,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 2/5] END max_depth=5, max_leaf_nodes=5, max_samples=100, min_samples_leaf=500, min_samples_split=500, n_estimators=50;, score=0.500 total time=   0.1s
[CV 1/5] END max_depth=5, max_leaf_nodes=5, max_samples=100, min_samples_leaf=500, min_samples_split=500, n_estimators=50;, score=0.500 total time=   0.2s
[CV 3/5] END max_depth=5, max_leaf_nodes=5, max_samples=100, min_samples_leaf=500, min_samples_split=500, n_estimators=50;, score=0.500 total time=   0.1s
[CV 2/5] END max_depth=50, max_leaf_nodes=5, max_samples=10, min_samples_leaf=5, min_samples_split=250, n_estimators=100;, score=0.500 total time=   0.3s
[CV 4/5] END max_depth=5, max_leaf_nodes=5, max_samples=100, min_samples_leaf=500, min_samples_split=500, n_estimators=50;, score=0.500 total time=   0.2s
[CV 4/5] END max_depth=50, max_leaf_nodes=5, max_samples=10, min_samples_leaf=5, min_samples_split=250, n_estimators=100;, score=0.500 total time=   0.3s
[CV 5/5] EN

In [62]:
# Keep the estimator selected by the random-forest search and display it.
final_model = grid_search.best_estimator_
final_model

We get the best RandomForestClassifier with parameters: max_depth=10, max_leaf_nodes=50, max_samples=250, min_samples_leaf=10, min_samples_split=50, and n_estimators=250.

In [63]:
# ROC-AUC of the tuned RandomForestClassifier on the training rows
# (positive-class probability from predict_proba).
y_pred = final_model.predict_proba(X_train)[:, 1]
print(f"Score on training set: {roc_auc_score(y_train, y_pred)}")

Score on training set: 0.7856538516849623


In [64]:
# Same metric for the tuned RandomForestClassifier on the validation rows.
y_pred = final_model.predict_proba(X_val)[:, 1]
print(f"Score on validation set: {roc_auc_score(y_val, y_pred)}")

Score on validation set: 0.7741750028745544


In [65]:
# Same metric for the tuned RandomForestClassifier on the held-out test rows.
y_pred = final_model.predict_proba(X_test)[:, 1]
print(f"Score on test set: {roc_auc_score(y_test, y_pred)}")

Score on test set: 0.7396662387676509


## Trying Alternate Model: XGBoost

In [66]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.0.2-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.2-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m672.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:10[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.2


In [86]:
from xgboost import XGBClassifier

# Search space for the boosted-tree model; `eta` is XGBoost's alias for the
# learning rate.
# NOTE(review): eta=0 is part of the space -- confirm it is intentional, since
# a zero learning rate should leave the model unable to improve.
xgb_params = dict(
    min_child_weight=[1, 5, 10],
    max_depth=list(range(3, 11)),
    gamma=list(range(0, 30, 5)),
    max_delta_step=list(range(11)),
    eta=[0, 0.1, 0.3, 0.6, 0.9],
    tree_method=['auto', 'exact', 'approx', 'hist'],
)

# NOTE(review): the booster asks for 8 threads while the search runs 4 workers
# in parallel -- verify this does not oversubscribe the machine.
xgb = XGBClassifier(verbosity=1, seed=1, nthread=8, objective='binary:logistic')

# 30 sampled configurations, each scored by 5-fold cross-validated ROC-AUC.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=xgb_params,
    n_iter=30,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
    verbose=3,
    random_state=42,
)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[CV 1/5] END eta=0.6, gamma=20, max_delta_step=4, max_depth=6, min_child_weight=1, tree_method=approx;, score=0.741 total time=   2.0s
[CV 2/5] END eta=0.6, gamma=20, max_delta_step=4, max_depth=6, min_child_weight=1, tree_method=approx;, score=0.733 total time=   2.2s
[CV 4/5] END eta=0.6, gamma=20, max_delta_step=4, max_depth=6, min_child_weight=1, tree_method=approx;, score=0.728 total time=   2.4s
[CV 3/5] END eta=0.6, gamma=20, max_delta_step=4, max_depth=6, min_child_weight=1, tree_method=approx;, score=0.719 total time=   2.7s
[CV 1/5] END eta=0.3, gamma=10, max_delta_step=10, max_depth=7, min_child_weight=1, tree_method=hist;, score=0.745 total time=   0.8s
[CV 2/5] END eta=0.3, gamma=10, max_delta_step=10, max_depth=7, min_child_weight=1, tree_method=hist;, score=0.732 total time=   0.7s
[CV 3/5] END eta=0.3, gamma=10, max_delta_step=10, max_depth=7, min_child_weight=1, tree_method=hist;, score=0.723 total time=   0.6s
[CV 4/5] END eta=0.3, gamma=10, max_delta_step=10, max_dep

In [87]:
# Keep the estimator selected by the XGBoost search, then display only the
# hyperparameters that were part of the search space.
final_model = random_search.best_estimator_
{
    name: value
    for name, value in final_model.get_params().items()
    if name in xgb_params
}

{'gamma': 25,
 'max_delta_step': 3,
 'max_depth': 8,
 'min_child_weight': 1,
 'tree_method': 'auto',
 'eta': 0.1}

In [88]:
# ROC-AUC of the tuned XGBClassifier on the training rows
# (positive-class probability from predict_proba).
y_pred = final_model.predict_proba(X_train)[:, 1]
print(f"Score on training set: {roc_auc_score(y_train, y_pred)}")

Score on training set: 0.7765537366377451


In [89]:
# Same metric for the tuned XGBClassifier on the validation rows.
y_pred = final_model.predict_proba(X_val)[:, 1]
print(f"Score on validation set: {roc_auc_score(y_val, y_pred)}")

Score on validation set: 0.748888505614963


In [90]:
# Same metric for the tuned XGBClassifier on the held-out test rows.
y_pred = final_model.predict_proba(X_test)[:, 1]
print(f"Score on test set: {roc_auc_score(y_test, y_pred)}")

Score on test set: 0.7249831896815209


## Result of the Experiment

In [101]:
# Side-by-side ROC-AUC of every experiment, one row per training approach.
# The numbers are transcribed by hand from the score cells above, so they
# must be refreshed whenever those cells are re-run.
res = pd.DataFrame({
    "Training Approach": [
        "Untuned SGDClassifier",
        "Tuned SGDClassifier",
        "Untuned RandomForestClassifier",
        "Tuned RandomForestClassifier",  # label spelling fixed (was "Classfier")
        "Tuned XGBoost"
    ],
    "ROC-AUC on Training Set": [
        0.5155613345477207,
        0.6905403623987345,
        1.0,
        0.7856538516849623,
        0.7765537366377451
    ],
    "ROC-AUC on Validation Set": [
        0.5254302249817945,
        0.6943198804185352,
        0.7628300946686597,
        0.7741750028745544,
        0.748888505614963
    ],
    "ROC-AUC on Test Set": [
        0.5034231921266581,
        0.6492756280946268,
        0.7431016565804756,
        0.7396662387676509,
        0.7249831896815209
    ],
})
res

Unnamed: 0,Training Approach,ROC-AUC on Training Set,ROC-AUC on Validation Set,ROC-AUC on Test Set
0,Untuned SGDClassifier,0.515561,0.52543,0.503423
1,Tuned SGDClassifier,0.69054,0.69432,0.649276
2,Untuned RandomForestClassifier,1.0,0.76283,0.743102
3,Tuned RandomForestClassfier,0.785654,0.774175,0.739666
4,Tuned XGBoost,0.776554,0.748889,0.724983


  > Summary: **Untuned RandomForestClassifier** gives the best ROC-AUC on the test set, while **Tuned RandomForestClassifier** gives the best ROC-AUC on the validation set. However, the **Untuned RandomForestClassifier** overfits: its ROC-AUC is 1.0 on the training set but only 0.763 on the validation set.
  
  > So we will use the Tuned RandomForestClassifier as our model, since it shows little overfitting and has the best performance on the validation set.




The chosen model is:

```python
model = RandomForestClassifier(
  max_depth=10, max_leaf_nodes=50, max_samples=250,
  min_samples_leaf=10, min_samples_split=50,
  n_estimators=250, random_state=42
)
```