In [394]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm as tqdm
from IPython.display import display
from geopy import Nominatim
import regex
import json

# Enable Intel(R) Extension for Scikit-learn *before* importing anything from
# sklearn: names imported before patch_sklearn() runs stay bound to the stock
# (unpatched) implementations.
from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [395]:
# Fraction of the training rows held out for validation.
# 0 disables the hold-out split (every `if test_size > 0` branch is skipped).
test_size = 0.0


# Input

## Read inputs

In [396]:
# Load the raw train and test tables from the local `inputs/` folder.
train_csv_path = 'inputs/train.csv'
test_csv_path = 'inputs/test.csv'

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)


## Preprocessing

### Concatenate train and test sets

In [397]:
# Remember where the rows of 'train.csv' end so the combined frame can be
# split back into train / test later.
train_end_idx = len(df_train)

# The test set has no label; add a placeholder 'Weather' column of zeros so
# both frames share the same columns.
df_test['Weather'] = np.zeros(len(df_test))

# Stack train on top of test so every encoding step below sees both at once.
df: pd.DataFrame = pd.concat([df_train, df_test], sort=False)


### Encode datetime to year, month, day 

In [398]:
# Split 'Date' into numeric 'Year', 'Month' and 'Day' columns (missing dates
# stay NaN), then drop the original column.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
if 'Date' in df.columns:
    # Parse the column once and reuse it instead of re-parsing for each part.
    parsed_dates = pd.to_datetime(df['Date'])
    df['Year'] = parsed_dates.dt.year
    df['Month'] = parsed_dates.dt.month
    df['Day'] = parsed_dates.dt.day
    del df['Date']


### Encode location names to latitude and longitude

In [399]:
# if 'Loc' in df.columns:
#     encoder = LabelEncoder()
#     df['Loc'] = encoder.fit_transform(df['Loc'])

# Replace the location name column with its latitude / longitude, geocoding
# each distinct name once and caching the result on disk.
col = 'Loc'
if col in df.columns:
    cache_path = 'cache/loc_map.json'

    try:
        # Reuse previously geocoded coordinates when a valid cache exists.
        with open(cache_path, 'r') as cache_file:
            loc_map = json.load(cache_file)
    except (FileNotFoundError, json.JSONDecodeError):
        # No cache yet, or an empty/corrupt file left behind by an interrupted
        # run: geocode every distinct location name.
        geolocator = Nominatim(user_agent="weather-prediction")
        loc_map = {}
        for loc in df[col].dropna().unique():
            # Convert camel case into separate words, e.g. 'NorahHead' -> 'Norah Head'
            sepstr_loc = regex.sub(r"([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))", r"\1 ", loc)
            location = geolocator.geocode(sepstr_loc, country_codes='AU')
            if location is None:
                # geocode() yields no result for unknown places; fail with a
                # message naming the offending location.
                raise ValueError(f"Could not geocode location {loc!r} (queried as {sepstr_loc!r})")
            loc_map[loc] = [location.latitude, location.longitude]

        # Write the cache only after every lookup succeeded, so a failed run
        # never leaves a partial or empty cache file behind.
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        with open(cache_path, 'w') as cache_file:
            json.dump(loc_map, cache_file)

    # Rows with a missing location map to missing coordinates.
    loc_map[np.nan] = [np.nan, np.nan]
    print(loc_map)

    # One [latitude, longitude] pair per row -> (n_rows, 2) array.
    coords = np.vstack(df[col].map(loc_map).tolist())
    df[f'{col}_latitude'] = coords[:, 0]
    # NOTE: the 'longtitude' spelling is kept so the produced column name is unchanged.
    df[f'{col}_longtitude'] = coords[:, 1]
    del df[col]


{'Canberra': [-35.2975906, 149.1012676], 'Woomera': [-31.1999142, 136.8253532], 'Tuggeranong': [-35.4209771, 149.0921341], 'Hobart': [-42.8825088, 147.3281233], 'NorahHead': [-33.2816667, 151.5677778], 'Nuriootpa': [-34.4693354, 138.9939006], 'SydneyAirport': [-33.9337758, 151.1799269], 'Walpole': [-34.9776796, 116.7310063], 'Williamtown': [-32.815, 151.8427778], 'PerthAirport': [-31.9406095, 115.96760765137932], 'Wollongong': [-34.4278083, 150.893054], 'Cobar': [-31.4983333, 145.8344444], 'Albany': [-35.0247822, 117.883608], 'Albury': [-36.0737304, 146.9135396], 'Mildura': [-34.1847265, 142.1624972], 'Moree': [-29.4617202, 149.8407153], 'Uluru': [-25.3455545, 131.03696147470208], 'Brisbane': [-27.4689682, 153.0234991], 'Sale': [-38.1050358, 147.0647902], 'Sydney': [-33.8698439, 151.2082848], 'Perth': [-31.9558964, 115.8605801], 'BadgerysCreek': [-33.8816671, 150.7441627], 'Penrith': [-33.7511954, 150.6941711], 'PearceRAAF': [-31.6739604, 116.01754351808195], 'CoffsHarbour': [-30.29859

### Encode wind directions to vectors

In [400]:
# dir_map = {
#     'N': {'N': 4},
#     'NNE': {'N': 3, 'E': 1},
#     'NE': {'N': 2, 'E': 2},
#     'ENE': {'N': 1, 'E': 3},
#     'E': {'E': 4},
#     'ESE': {'E': 3, 'S': 1},
#     'SE': {'E': 2, 'S': 2},
#     'SSE': {'E': 1, 'S': 3},
#     'S': {'S': 4},
#     'SSW': {'S': 3, 'W': 1},
#     'SW': {'S': 2, 'W': 2},
#     'WSW': {'S': 1, 'W': 3},
#     'W': {'W': 4},
#     'WNW': {'W': 3, 'N': 1},
#     'NW': {'W': 2, 'N': 2},
#     'NNW': {'W': 1, 'N': 3},
# }
# for key, val in zip(dir_map.keys(), dir_map.values()):
#     lst = [val.get('N', 0), val.get('S', 0), val.get('W', 0), val.get('E', 0)]
#     dir_map[key] = lst
# dir_map[np.nan] = [np.nan for _ in range(4)]

# for col in ['WindDir','DayWindDir','NightWindDir']:
#     if col in df.columns:
#         tmp = np.vstack(df[col].map(dir_map))
#         df[f'{col}_N'] = tmp[:, 0]
#         df[f'{col}_S'] = tmp[:, 1]
#         df[f'{col}_W'] = tmp[:, 2]
#         df[f'{col}_E'] = tmp[:, 3]
#         del df[col]


### Encode remaining categorical columns

In [401]:
# One-hot encode the columns that pd.get_dummies converts by default; in the
# column listing printed below these are the wind-direction columns.
df = pd.get_dummies(data=df)

# # label encoding
# obj_columns = [col for col in df.columns if df[col].dtype == object]
# for col in obj_columns:
#     encoder = LabelEncoder()
#     df[col] = encoder.fit_transform(df[col])

In [402]:
# Show the final feature/label column names after all encoding steps.
print(df.columns.tolist())


['TempLow', 'TempHigh', 'Steaming', 'Sun', 'WindSpeed', 'DayWindSpeed', 'NightWindSpeed', 'DayHumidity', 'NightHumidity', 'DayPressure', 'NightPressure', 'DayCloud', 'NightCloud', 'DayTemp', 'NightTemp', 'Weather', 'RISK_MM', 'Year', 'Month', 'Day', 'Loc_latitude', 'Loc_longtitude', 'WindDir_E', 'WindDir_ENE', 'WindDir_ESE', 'WindDir_N', 'WindDir_NE', 'WindDir_NNE', 'WindDir_NNW', 'WindDir_NW', 'WindDir_S', 'WindDir_SE', 'WindDir_SSE', 'WindDir_SSW', 'WindDir_SW', 'WindDir_W', 'WindDir_WNW', 'WindDir_WSW', 'DayWindDir_E', 'DayWindDir_ENE', 'DayWindDir_ESE', 'DayWindDir_N', 'DayWindDir_NE', 'DayWindDir_NNE', 'DayWindDir_NNW', 'DayWindDir_NW', 'DayWindDir_S', 'DayWindDir_SE', 'DayWindDir_SSE', 'DayWindDir_SSW', 'DayWindDir_SW', 'DayWindDir_W', 'DayWindDir_WNW', 'DayWindDir_WSW', 'NightWindDir_E', 'NightWindDir_ENE', 'NightWindDir_ESE', 'NightWindDir_N', 'NightWindDir_NE', 'NightWindDir_NNE', 'NightWindDir_NNW', 'NightWindDir_NW', 'NightWindDir_S', 'NightWindDir_SE', 'NightWindDir_SSE', '

In [403]:
# columns_na = pd.isna(df).sum()/df.shape[0]
# print(columns_na)

# rows_na = pd.isna(df).sum(axis=1)/df.shape[1]
# print(rows_na)

# for col, rate in zip(df.columns, naVariable):
#     if rate >= 0.23 and col != 'Weather':
#         del df[col]


### Split train and test sets

In [404]:
# Feature matrix for all rows (train followed by test), without the label.
feature_matrix = df.drop(columns=['Weather']).values
label_vector = df['Weather'].values

# Rows before `train_end_idx` came from 'train.csv', the rest from 'test.csv'.
X_train = feature_matrix[:train_end_idx, :]
y_train = label_vector[:train_end_idx]
X_test = feature_matrix[train_end_idx:, :]

# Validation arrays stay empty unless a hold-out fraction is configured.
X_val, y_val = np.array([]), np.array([])

if test_size > 0:
    X_train, X_val, y_train, y_val = train_test_split(
        X_train,
        y_train,
        test_size=test_size,
        random_state=3,
    )



### Remove outliers
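Result: removing outliers gave worse results, so this step is disabled (the code below is commented out).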

In [405]:
# X_train_tmp = X_train
# Q1 = np.nanquantile(X_train_tmp, 0.25, axis=0, keepdims=True)
# Q3 = np.nanquantile(X_train_tmp, 0.75, axis=0, keepdims=True)
# IQR = Q3 - Q1
# low_X_train = X_train_tmp < (Q1 - 1.5 * IQR)
# high_X_train = X_train_tmp > (Q3 + 1.5 * IQR)

# X_train[(low_X_train | high_X_train)] = np.nan

### Fill missing values

In [406]:
from sklearn.impute import SimpleImputer, KNNImputer

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge, Ridge, ARDRegression


# Alternatives tried earlier (kept for reference):
#   SimpleImputer(missing_values=np.nan, strategy='median')
#   KNNImputer(missing_values=np.nan, n_neighbors=8)
#   KNNImputer(missing_values=np.nan, n_neighbors=8, weights='distance')
#   IterativeImputer(estimator=LinearRegression(), missing_values=np.nan, max_iter=100,
#                    imputation_order='roman', add_indicator=False)
#   IterativeImputer with BayesianRidge() or ARDRegression() as the estimator

# Regressor used to predict each feature from the others.
est = Ridge()  # author's recorded result for this choice: 0.492591

imputer_options = dict(
    estimator=est,
    missing_values=np.nan,
    max_iter=10,
    imputation_order='roman',
    add_indicator=True,  # append missing-value indicator columns to the output
    verbose=2,
)
imputer = IterativeImputer(**imputer_options)

# Fit on the training rows only, then apply the fitted imputer to the
# validation (if any) and test rows.
X_train = imputer.fit_transform(X_train)
if test_size > 0:
    X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)


[IterativeImputer] Completing matrix with shape (35156, 69)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.78
[IterativeImputer] Change: 138.44451843650018, scaled tolerance: 2.017 
[IterativeImputer] Ending imputation round 2/10, elapsed time 1.52
[IterativeImputer] Change: 57.12304520724993, scaled tolerance: 2.017 
[IterativeImputer] Ending imputation round 3/10, elapsed time 2.28
[IterativeImputer] Change: 18.74611631450226, scaled tolerance: 2.017 
[IterativeImputer] Ending imputation round 4/10, elapsed time 3.04
[IterativeImputer] Change: 8.229495342745423, scaled tolerance: 2.017 
[IterativeImputer] Ending imputation round 5/10, elapsed time 3.81
[IterativeImputer] Change: 7.761176425201272, scaled tolerance: 2.017 
[IterativeImputer] Ending imputation round 6/10, elapsed time 4.54
[IterativeImputer] Change: 6.7284971628111165, scaled tolerance: 2.017 
[IterativeImputer] Ending imputation round 7/10, elapsed time 5.30
[IterativeImputer] Change: 5.50221893380818



[IterativeImputer] Ending imputation round 2/10, elapsed time 0.19
[IterativeImputer] Ending imputation round 3/10, elapsed time 0.28
[IterativeImputer] Ending imputation round 4/10, elapsed time 0.38
[IterativeImputer] Ending imputation round 5/10, elapsed time 0.47
[IterativeImputer] Ending imputation round 6/10, elapsed time 0.57
[IterativeImputer] Ending imputation round 7/10, elapsed time 0.66
[IterativeImputer] Ending imputation round 8/10, elapsed time 0.77
[IterativeImputer] Ending imputation round 9/10, elapsed time 0.87
[IterativeImputer] Ending imputation round 10/10, elapsed time 0.96


In [407]:
# Keep untouched copies of the imputed arrays so later cells can be re-run
# without repeating the imputation.
X_train_imputed, X_test_imputed = X_train.copy(), X_test.copy()
y_train_splitted = y_train.copy()
if test_size > 0:
    X_val_imputed = X_val.copy()


### Checkpoint

In [408]:
# Restore the working arrays from the imputed snapshots (re-run this cell to
# restart the resampling / normalisation / training steps from clean data).
X_train, X_test = X_train_imputed.copy(), X_test_imputed.copy()
y_train = y_train_splitted.copy()
if test_size > 0:
    X_val = X_val_imputed.copy()


### Resampling

In [409]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE, KMeansSMOTE, SVMSMOTE
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, EditedNearestNeighbours, TomekLinks
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import Pipeline

# Seed for the stochastic over-samplers so a fresh "Restart & Run All"
# reproduces the same resampled training set (same value as the split seed).
RESAMPLE_SEED = 3

# Alternatives tried earlier (kept for reference):
# resampler = RandomOverSampler()
# resampler = SMOTENC(categorical_features=[0, 1])
# resampler = RandomUnderSampler()
# resampler = SMOTEENN()

# over = BorderlineSMOTE(sampling_strategy=0.5, k_neighbors=8)
# under = EditedNearestNeighbours(sampling_strategy="all", n_neighbors=3)
# steps = [('o', over), ('u', under)]
# resampler = Pipeline(steps=steps)

# over = BorderlineSMOTE(sampling_strategy=0.5, k_neighbors=6, kind='borderline-1')
# under = EditedNearestNeighbours(sampling_strategy="all", n_neighbors=2)
# steps = [('o', over), ('u', under)]
# resampler = Pipeline(steps=steps)

# over1 = SMOTE(sampling_strategy=0.3, k_neighbors=8)
# over2 = BorderlineSMOTE(sampling_strategy=0.5, k_neighbors=8)
# under = EditedNearestNeighbours(sampling_strategy="all", n_neighbors=2)
# steps = [('o1', over1), ('o2', over2), ('u', under)]
# resampler = Pipeline(steps=steps)

# Active configuration: over-sample the minority class in two stages
# (ratio 0.3, then 0.5), then clean with Edited Nearest Neighbours.
over1 = BorderlineSMOTE(sampling_strategy=0.3, k_neighbors=8, random_state=RESAMPLE_SEED)
over2 = BorderlineSMOTE(sampling_strategy=0.5, k_neighbors=8, random_state=RESAMPLE_SEED)
under = EditedNearestNeighbours(sampling_strategy="auto", n_neighbors=2)
steps = [('o1', over1), ('o2', over2), ('u', under)]
resampler = Pipeline(steps=steps)

# over1 = BorderlineSMOTE(sampling_strategy=0.2, k_neighbors=8)
# over2 = BorderlineSMOTE(sampling_strategy=0.3, k_neighbors=8)
# over3 = BorderlineSMOTE(sampling_strategy=0.5, k_neighbors=8)
# under = EditedNearestNeighbours(sampling_strategy="auto", n_neighbors=2)
# steps = [('o1', over1), ('o2', over2), ('o3', over3), ('u', under)]
# resampler = Pipeline(steps=steps)

# Only the training rows are resampled; validation and test rows are left as-is.
X_train, y_train = resampler.fit_resample(X_train, y_train)


### Normalize

In [410]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler

# Standardise every feature; the scaler statistics come from the training rows
# only and are then reused for the validation (if any) and test rows.
normalizer = StandardScaler()
X_train = normalizer.fit_transform(X_train)
X_test = normalizer.transform(X_test)
if test_size > 0:
    X_val = normalizer.transform(X_val)


In [411]:
# Per-model predictions are appended to these lists by the training cells and
# combined by the voting cell at the end of the notebook.
y_val_pred_list, y_test_pred_list = [], []


# Training

### CatBoost

In [412]:
from catboost import CatBoostClassifier, metrics


# Earlier depth experiments, as noted by the author:
#   depth=10  # 10000: 458
#   depth=7   # 10000: 470
#   depth=9   # 1000: 465
#   depth=8   # 1000: 467
#   depth=7   # 1000: 470
#   depth=8   # 2000: 471
#   depth=7   # 2000: 465
#   depth=7   # 10000, 0.01: 470
cbm_params = dict(
    iterations=2100,
    depth=8,
    learning_rate=0.03,
    loss_function='Logloss',
    verbose=True,
    eval_metric='F1',
    early_stopping_rounds=1000,
)
cbm = CatBoostClassifier(**cbm_params)

if test_size > 0:
    # With a hold-out set: train against it and report validation metrics.
    cbm.fit(X_train, y_train, eval_set=(X_val, y_val))

    y_val_pred = cbm.predict(X_val) >= 0.5
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred)
    print('Accuracy: %f' % val_accuracy)
    print('f1-score: %f' % val_f1)
    y_val_pred_list.append(y_val_pred)
else:
    # No hold-out set: train on all training rows.
    cbm.fit(X_train, y_train)

# Predict the test rows and remember the result for the final vote.
y_test_pred = cbm.predict(X_test) >= 0.5
y_test_pred_list.append(y_test_pred)

# Export this model's predictions on their own.
test_labels = y_test_pred.astype(int)
df_sap = pd.DataFrame(test_labels, columns=['Weather'])
df_sap.to_csv('outputs/prediction_cbm.csv',  index_label='Id')


0:	learn: 0.8142140	total: 23.9ms	remaining: 50.2s
1:	learn: 0.8185803	total: 52.8ms	remaining: 55.4s
2:	learn: 0.8291706	total: 93ms	remaining: 1m 4s
3:	learn: 0.8312820	total: 161ms	remaining: 1m 24s
4:	learn: 0.8320367	total: 187ms	remaining: 1m 18s
5:	learn: 0.8288190	total: 257ms	remaining: 1m 29s
6:	learn: 0.8324634	total: 284ms	remaining: 1m 24s
7:	learn: 0.8336929	total: 352ms	remaining: 1m 32s
8:	learn: 0.8359212	total: 378ms	remaining: 1m 27s
9:	learn: 0.8378318	total: 442ms	remaining: 1m 32s
10:	learn: 0.8435729	total: 469ms	remaining: 1m 29s
11:	learn: 0.8494213	total: 540ms	remaining: 1m 33s
12:	learn: 0.8507662	total: 567ms	remaining: 1m 31s
13:	learn: 0.8506311	total: 597ms	remaining: 1m 28s
14:	learn: 0.8531201	total: 661ms	remaining: 1m 31s
15:	learn: 0.8531923	total: 687ms	remaining: 1m 29s
16:	learn: 0.8555679	total: 755ms	remaining: 1m 32s
17:	learn: 0.8578536	total: 781ms	remaining: 1m 30s
18:	learn: 0.8595932	total: 850ms	remaining: 1m 33s
19:	learn: 0.8624888	tot

### XGBoost

In [413]:
import xgboost as xgb

# Earlier configuration: n_estimators=10000, learning_rate=0.05 (other settings identical).
xgb_params = dict(
    tree_method="gpu_hist",
    n_estimators=600,
    learning_rate=0.15,
    objective='binary:logistic',
    n_jobs=os.cpu_count(),
)
xgbm = xgb.XGBClassifier(**xgb_params)
xgbm.fit(X_train, y_train)

if test_size > 0:
    # Report metrics on the hold-out rows and keep the predictions for voting.
    y_val_pred = xgbm.predict(X_val) >= 0.5
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred)
    print('Accuracy: %f' % val_accuracy)
    print('f1-score: %f' % val_f1)
    y_val_pred_list.append(y_val_pred)

# Predict the test rows and remember the result for the final vote.
y_test_pred = xgbm.predict(X_test) >= 0.5
y_test_pred_list.append(y_test_pred)

# Export this model's predictions on their own.
test_labels = y_test_pred.astype(int)
df_sap = pd.DataFrame(test_labels, columns=['Weather'])
df_sap.to_csv('outputs/prediction_xgbm.csv',  index_label='Id')


### LightGBM

In [414]:
import lightgbm as lgb

# Other configurations tried by the author:
#   boosting_type='gbdt', learning_rate=0.1, n_estimators=1000
#   boosting_type='gbdt', num_leaves=31, learning_rate=0.05, min_split_gain=0.01, n_estimators=500
lgbm_params = dict(
    boosting_type='gbdt',
    num_leaves=100,
    learning_rate=0.05,
    n_estimators=1000,
    n_jobs=os.cpu_count(),
)
lgbm = lgb.sklearn.LGBMClassifier(**lgbm_params)
lgbm.fit(X_train, y_train)

if test_size > 0:
    # Report metrics on the hold-out rows and keep the predictions for voting.
    y_val_pred = lgbm.predict(X_val, num_iteration=lgbm.best_iteration_) >= 0.5
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred)
    print('Accuracy: %f' % val_accuracy)
    print('f1-score: %f' % val_f1)
    y_val_pred_list.append(y_val_pred)

# Predict the test rows and remember the result for the final vote.
y_test_pred = lgbm.predict(X_test, num_iteration=lgbm.best_iteration_) >= 0.5
y_test_pred_list.append(y_test_pred)

# Export this model's predictions on their own.
test_labels = y_test_pred.astype(int)
df_sap = pd.DataFrame(test_labels, columns=['Weather'])
df_sap.to_csv('outputs/prediction_lgbm.csv',  index_label='Id')


In [415]:
# from sklearn.ensemble import RandomForestClassifier

# rfm = RandomForestClassifier(criterion='entropy', n_estimators=600, n_jobs=os.cpu_count(), max_features='sqrt')

# rfm.fit(X_train, y_train)
# if test_size > 0:
#     y_val_pred = rfm.predict(X_val) >= 0.5
#     print('Accuracy: %f' % accuracy_score(y_val, y_val_pred))
#     print('f1-score: %f' % f1_score(y_val, y_val_pred))
#     y_val_pred_list.append(y_val_pred)

# y_test_pred = rfm.predict(X_test) >= 0.5
# y_test_pred_list.append(y_test_pred)


## sklearn

In [416]:
# from sklearn.linear_model import RidgeCV, LassoCV, LinearRegression, RidgeClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# from sklearn.svm import SVC
# from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier, StackingClassifier

# # model = LinearRegression(n_jobs=os.cpu_count())

# # model = GradientBoostingClassifier(loss='log_loss')

# model = RandomForestClassifier(criterion='log_loss', n_estimators=500, n_jobs=os.cpu_count(), max_features='sqrt')

# # model = DecisionTreeClassifier(criterion='entropy')
# # model = AdaBoostClassifier(estimator=model, n_estimators=100)
# # model = BaggingClassifier(estimator=model, n_estimators=50, n_jobs=os.cpu_count())

# # model = RandomForestClassifier(criterion='entropy', n_estimators=200, max_features='log2')
# # model = AdaBoostClassifier(estimator=model, n_estimators=50)
# # model = BaggingClassifier(estimator=model, n_estimators=20, n_jobs=os.cpu_count())

# model.fit(X_train, y_train)
# if test_size > 0:
#     y_val_pred = model.predict(X_val) >= 0.5
#     print('Accuracy: %f' % accuracy_score(y_val, y_val_pred))
#     print('f1-score: %f' % f1_score(y_val, y_val_pred))

# y_test_pred = model.predict(X_test) >= 0.5


In [417]:
def vote(list, weights=(0.43, 0.27, 0.3)):
    """Combine per-model binary predictions with a weighted vote.

    Parameters
    ----------
    list : sequence of array-like
        One prediction array per model, all of the same shape. (The name
        shadows the builtin; it is kept so existing calls keep working.)
    weights : sequence of float, optional
        Weight of each model, in the same order as ``list``. The default
        reproduces the original three hard-coded weights.

    Returns
    -------
    numpy.ndarray of bool
        ``True`` where the weighted sum of the predictions is >= 0.5.

    Raises
    ------
    ValueError
        If ``list`` is empty or its length differs from ``len(weights)``.
        A mismatch usually means a model cell was skipped or run twice, so
        the predictions would be incomplete or stale.
    """
    if len(list) == 0:
        raise ValueError("vote() needs at least one prediction array")
    if len(list) != len(weights):
        raise ValueError(
            f"vote() got {len(list)} prediction arrays but {len(weights)} weights"
        )

    y_voted = np.zeros_like(list[0], dtype=np.float64)
    # Accumulate in list order so the floating-point sum matches the original.
    for y_pred, weight in zip(list, weights):
        y_voted += np.asarray(y_pred, dtype=np.float64) * weight
    return y_voted >= 0.5


# Score the ensemble on the hold-out rows when a validation split exists.
if test_size > 0:
    y_val_pred_voted = vote(y_val_pred_list)
    voted_accuracy = accuracy_score(y_val, y_val_pred_voted)
    voted_f1 = f1_score(y_val, y_val_pred_voted)
    print('Accuracy: %f' % voted_accuracy)
    print('f1-score: %f' % voted_f1)

# Combine the per-model test predictions into the final answer.
y_test_pred_voted = vote(y_test_pred_list)
y_test_pred = y_test_pred_voted

# Export the voted predictions.
voted_labels = y_test_pred.astype(int)
df_sap = pd.DataFrame(voted_labels, columns=['Weather'])
df_sap.to_csv('outputs/prediction_voted.csv',  index_label='Id')
