**LightGBM Model**

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
import lightgbm as lgbm

In [None]:
# Path of the CSV file loaded into `df`; change it here if the file lives elsewhere.
IGRA_CSV_PATH = "/content/IGRA_derived.csv"
df = pd.read_csv(filepath_or_buffer=IGRA_CSV_PATH)

In [None]:
# The CSV's unnamed first column holds the observation date: give it a proper
# name, parse it to datetime, and derive the calendar month (appended as the
# last column) from it.
df = (
    df.rename(columns={'Unnamed: 0': 'date'})
      .assign(date=lambda frame: pd.to_datetime(frame['date']))
      .assign(month=lambda frame: frame['date'].dt.month)
)

In [None]:
# Target: the 'rained' column.
y = df['rained']

# Features: every column except the first one (positional slice) and the target.
X = df.iloc[:, 1:].drop(columns=['rained'])

In [None]:
# Splitting data
# Hold out 15% of the rows for testing. Stratifying on the (rained, month)
# pair keeps each combination represented proportionally in both splits.
stratify_frame = df[['rained', 'month']]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=123,
    stratify=stratify_frame,
)

In [None]:
# Transforming features using Min Max Scaler
from sklearn.preprocessing import MinMaxScaler

# Fit on the training split only, so the test rows' min/max never leak into
# the preprocessing step.
scaler = MinMaxScaler().fit(X_train)

# Scale the splits that the model actually consumes. The train/test split has
# already been made at this point, so transforming only `X` (as this cell used
# to do) left X_train / X_test unscaled. Wrapping the result back into a
# DataFrame preserves column names and the row index.
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Kept for backward compatibility with any later cell that reads `X`:
# as before, `X` is re-bound to a scaled ndarray (now using training-set statistics).
X = scaler.transform(X)

In [None]:
# One hot encoding
# Replace the 'month' column of `df` with one indicator column per distinct
# month value.
# NOTE(review): X_train / X_test were split off before this cell, so this
# encoding changes `df` only and does not reach the model inputs. Confirm
# whether that is intended.
one_hot_encoded_data = pd.get_dummies(data=df, columns=['month'])
df = pd.DataFrame(one_hot_encoded_data)

In [None]:
# Preview the first three rows of `df` to check the columns it now contains.
df.head(3)

Unnamed: 0,date,Seasonality,Lower level humidity,Mid level humidity,U-wind at lower and mid levels,V-wind at lower and mid levels,Convective Condensation Level - thermodynamics,Total Precipitable Water - thermodynamics,rained,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,2008-01-07,175.0,86.499196,57.931008,-6.902578,-0.314079,938.737374,51.698735,1,1,0,0,0,0,0,0,0,0,0,0,0
1,2008-01-09,173.0,86.168358,54.514297,-3.278,0.458702,926.737374,51.07854,0,1,0,0,0,0,0,0,0,0,0,0,0
2,2008-01-10,172.0,79.143281,57.851667,-3.922668,-0.298095,939.977719,50.557384,0,1,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# (rows, columns) of `df`.
df.shape

(2086, 21)

In [None]:
# Shapes of the training features and labels; their row counts should match.
X_train.shape, y_train.shape

((1773, 8), (1773,))

**Model Building**

In [None]:
# Gradient-boosted decision-tree classifier for the binary target.
# n_estimators is only an upper bound: early stopping (below) decides how many
# boosting rounds are actually kept.
lgbm_clf = lgbm.LGBMClassifier(
    objective="binary",
    n_estimators=1000,
    boosting_type="gbdt",  # explicit scikit-learn-API name instead of the `boosting` alias
    random_state=123,
)

# NOTE(review): the test split is used as the early-stopping validation set,
# so metrics later reported on X_test are optimistically biased. A separate
# validation split carved out of the training data would avoid this.
lgbm_clf.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    # 'accuracy' is not a LightGBM metric name (the training log only ever
    # showed auc and binary_logloss); 'binary_error' is the built-in
    # equivalent and equals 1 - accuracy.
    eval_metric=["auc", "binary_error"],
    # Early stopping is configured through a callback; the
    # `early_stopping_rounds=` fit keyword is no longer accepted by
    # LightGBM 4.x.
    callbacks=[lgbm.early_stopping(stopping_rounds=100)],
)

[1]	valid_0's auc: 0.835185	valid_0's binary_logloss: 0.617826
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's auc: 0.858393	valid_0's binary_logloss: 0.587035
[3]	valid_0's auc: 0.866526	valid_0's binary_logloss: 0.560977
[4]	valid_0's auc: 0.865997	valid_0's binary_logloss: 0.539338
[5]	valid_0's auc: 0.864322	valid_0's binary_logloss: 0.524247
[6]	valid_0's auc: 0.865335	valid_0's binary_logloss: 0.510195
[7]	valid_0's auc: 0.869832	valid_0's binary_logloss: 0.497752
[8]	valid_0's auc: 0.871242	valid_0's binary_logloss: 0.486936
[9]	valid_0's auc: 0.871507	valid_0's binary_logloss: 0.478341
[10]	valid_0's auc: 0.872653	valid_0's binary_logloss: 0.470162
[11]	valid_0's auc: 0.87208	valid_0's binary_logloss: 0.465133
[12]	valid_0's auc: 0.872146	valid_0's binary_logloss: 0.460417
[13]	valid_0's auc: 0.872124	valid_0's binary_logloss: 0.457487
[14]	valid_0's auc: 0.871154	valid_0's binary_logloss: 0.454191
[15]	valid_0's auc: 0.870493	valid_0's binary_loglo

LGBMClassifier(boosting='gbdt', n_estimators=1000, objective='binary',
               random_state=123)

In [None]:
# Accuracy of the fitted model on the held-out test split.
preds = lgbm_clf.predict(X_test)
# accuracy_score's documented argument order is (y_true, y_pred). The value is
# the same either way, but the conventional order keeps this call consistent
# with the other metric calls in this notebook.
test_acc = accuracy_score(y_test, preds)

In [None]:
# Hard class labels for both splits (kept under their original names).
train_y_pred = lgbm_clf.predict(X_train)
test_y_pred = lgbm_clf.predict(X_test)

# ROC AUC measures how well the model *ranks* samples, so it must be computed
# from continuous scores. Feeding it hard 0/1 labels collapses the ROC curve to
# a single operating point and understates the AUC. Column 1 of predict_proba
# is the probability of the second entry in lgbm_clf.classes_ (the positive
# class, assuming the labels are 0/1).
train_y_proba = lgbm_clf.predict_proba(X_train)[:, 1]
test_y_proba = lgbm_clf.predict_proba(X_test)[:, 1]
print("AUC Train :{:.4f}\nAUC Test: {:.4f}".format(roc_auc_score(y_train, train_y_proba),
                                                    roc_auc_score(y_test, test_y_proba)))

AUC Train :0.8235
AUC Test: 0.7461


In [None]:
# Report the test-set accuracy to six decimal places.
test_accuracy_template = 'The test accuracy is :{:.6f}'
print(test_accuracy_template.format(accuracy_score(y_test, lgbm_clf.predict(X_test))))

The test accuracy is :0.789137


In [None]:
# Manual cross-check of the accuracy: number of predictions equal to the true
# label, divided by the number of test rows.
n_correct = (preds == y_test).sum()
accuracy = float(n_correct) / len(y_test)
print("accuracy : %f" % accuracy)

accuracy : 0.789137
