**Catboost model on derived dataset**

In [1]:
! pip install catboost

Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 11 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.3


In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from catboost import CatBoostClassifier
from catboost import cv


In [3]:
df = pd.read_csv("/content/IGRA_derived.csv")

In [4]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,Seasonality,Lower level humidity,Mid level humidity,U-wind at lower and mid levels,V-wind at lower and mid levels,Convective Condensation Level - thermodynamics,Total Precipitable Water - thermodynamics,rained
0,2008-01-07,175.0,86.499196,57.931008,-6.902578,-0.314079,938.737374,51.698735,1
1,2008-01-09,173.0,86.168358,54.514297,-3.278,0.458702,926.737374,51.07854,0
2,2008-01-10,172.0,79.143281,57.851667,-3.922668,-0.298095,939.977719,50.557384,0


In [5]:
df = df.rename(columns={'Unnamed: 0':'date'})
df['date'] = pd.to_datetime(df.date)
df['month']= df.date.dt.month

In [6]:
df.head(3)

Unnamed: 0,date,Seasonality,Lower level humidity,Mid level humidity,U-wind at lower and mid levels,V-wind at lower and mid levels,Convective Condensation Level - thermodynamics,Total Precipitable Water - thermodynamics,rained,month
0,2008-01-07,175.0,86.499196,57.931008,-6.902578,-0.314079,938.737374,51.698735,1,1
1,2008-01-09,173.0,86.168358,54.514297,-3.278,0.458702,926.737374,51.07854,0,1
2,2008-01-10,172.0,79.143281,57.851667,-3.922668,-0.298095,939.977719,50.557384,0,1


In [7]:
df.shape

(2086, 10)

In [9]:
y=df['rained']
X = df.iloc[:,1:]
X=X.drop(columns=['rained'])

In [10]:
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=df[['rained', 'month']],random_state=123)

In [11]:
# Transforming features using Min Max Scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)

In [12]:
# One hot encoding
one_hot_encoded_data = pd.get_dummies(df, columns = ['month'])
df=pd.DataFrame(data=one_hot_encoded_data)


In [13]:
df.head(3)

Unnamed: 0,date,Seasonality,Lower level humidity,Mid level humidity,U-wind at lower and mid levels,V-wind at lower and mid levels,Convective Condensation Level - thermodynamics,Total Precipitable Water - thermodynamics,rained,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,2008-01-07,175.0,86.499196,57.931008,-6.902578,-0.314079,938.737374,51.698735,1,1,0,0,0,0,0,0,0,0,0,0,0
1,2008-01-09,173.0,86.168358,54.514297,-3.278,0.458702,926.737374,51.07854,0,1,0,0,0,0,0,0,0,0,0,0,0
2,2008-01-10,172.0,79.143281,57.851667,-3.922668,-0.298095,939.977719,50.557384,0,1,0,0,0,0,0,0,0,0,0,0,0


In [14]:
df.shape

(2086, 21)

In [15]:
X_train.shape, y_train.shape

((1564, 8), (1564,))

In [18]:
X_test.shape, y_test.shape

((522, 8), (522,))

In [16]:
## Building a model 

cat_model = CatBoostClassifier(
    custom_loss=['Accuracy'],
    random_seed=123)

## Set the metric for evaluation
cat_model = CatBoostClassifier(eval_metric='Accuracy',
use_best_model=True,  random_seed=123)

cat_model.fit(X_train, y_train, eval_set=(X_test, y_test))

Learning rate set to 0.03539
0:	learn: 0.7992327	test: 0.8065134	best: 0.8065134 (0)	total: 51ms	remaining: 50.9s
1:	learn: 0.8056266	test: 0.8199234	best: 0.8199234 (1)	total: 54.1ms	remaining: 27s
2:	learn: 0.8081841	test: 0.8122605	best: 0.8199234 (1)	total: 57.4ms	remaining: 19.1s
3:	learn: 0.8075448	test: 0.8160920	best: 0.8199234 (1)	total: 60.2ms	remaining: 15s
4:	learn: 0.8126598	test: 0.8160920	best: 0.8199234 (1)	total: 62.9ms	remaining: 12.5s
5:	learn: 0.8139386	test: 0.8084291	best: 0.8199234 (1)	total: 65.7ms	remaining: 10.9s
6:	learn: 0.8132992	test: 0.8103448	best: 0.8199234 (1)	total: 68.3ms	remaining: 9.69s
7:	learn: 0.8145780	test: 0.8103448	best: 0.8199234 (1)	total: 71.1ms	remaining: 8.81s
8:	learn: 0.8132992	test: 0.8065134	best: 0.8199234 (1)	total: 73.7ms	remaining: 8.12s
9:	learn: 0.8177749	test: 0.8026820	best: 0.8199234 (1)	total: 76.3ms	remaining: 7.56s
10:	learn: 0.8177749	test: 0.8065134	best: 0.8199234 (1)	total: 80.2ms	remaining: 7.21s
11:	learn: 0.820971

<catboost.core.CatBoostClassifier at 0x7ff669b54490>

In [19]:
print('The test accuracy is :{:.6f}'.format(accuracy_score(y_test,cat_model.predict(X_test))))

The test accuracy is :0.819923
