# Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm 

# config setup for fast experiment
from src.config import Config

# some utilities
from src.utils import show_samples, process_im, save_model

# boosting lib 
import catboost 


# Set up experiment pipeline

In [2]:
seed_val = 2021
np.random.seed(seed = seed_val)

# Loading data

In [3]:
# Split the data
train_sampled = pd.read_csv(os.path.join(Config.data_dir, 'train_sampled.csv'))
X, y  = train_sampled[train_sampled.columns[:-2]], train_sampled['Yield']
X_train, X_val, y_train, y_val = train_test_split(X, y)

# Modeling

In [4]:
model = catboost.CatBoostRegressor(
    # depth = Config.max_depth,
    learning_rate=Config.lr, 
    random_seed=seed_val,
    # od_wait = 30
    
)

In [5]:
# Fit the model
model.fit(X_train, y_train)

0:	learn: 1.7644879	total: 65.1ms	remaining: 1m 5s
1:	learn: 1.7644555	total: 79.3ms	remaining: 39.6s
2:	learn: 1.7644219	total: 90.8ms	remaining: 30.2s
3:	learn: 1.7643915	total: 104ms	remaining: 25.9s
4:	learn: 1.7643579	total: 116ms	remaining: 23s
5:	learn: 1.7643232	total: 129ms	remaining: 21.4s
6:	learn: 1.7642892	total: 140ms	remaining: 19.9s
7:	learn: 1.7642550	total: 154ms	remaining: 19.1s
8:	learn: 1.7642212	total: 166ms	remaining: 18.3s
9:	learn: 1.7641895	total: 182ms	remaining: 18.1s
10:	learn: 1.7641539	total: 198ms	remaining: 17.8s
11:	learn: 1.7641213	total: 212ms	remaining: 17.4s
12:	learn: 1.7640925	total: 224ms	remaining: 17s
13:	learn: 1.7640591	total: 238ms	remaining: 16.7s
14:	learn: 1.7640259	total: 248ms	remaining: 16.3s
15:	learn: 1.7639930	total: 266ms	remaining: 16.4s
16:	learn: 1.7639623	total: 283ms	remaining: 16.4s
17:	learn: 1.7639266	total: 298ms	remaining: 16.3s
18:	learn: 1.7638960	total: 311ms	remaining: 16s
19:	learn: 1.7638624	total: 324ms	remaining:

<catboost.core.CatBoostRegressor at 0x1da40fa9e50>

In [6]:
# Score with RMSE
print('Score:', mean_squared_error(y_val, model.predict(X_val), squared=False))

Score: 1.646815091488399


In [7]:
# save model
model_path = save_model(
    model_name = 'catboost_yield_predictor.cbm', 
    model=model
)

[INFO] model saved as D:\ZINDI\Computer_vision\CGIAR\data\../models\catboost_yield_predictor.cbm


# Make predictions

In [8]:
# Load the sample submission file
ss = pd.read_csv(os.path.join(Config.data_dir, 'SampleSubmission.csv'))

# Prep the data, using the same method we did for train
test_sampled = pd.read_csv(os.path.join(Config.data_dir, 'test_sampled.csv'))

trained_model = model.load_model(
    fname=model_path, 
)
# Get model predictions
preds = trained_model.predict(test_sampled)


# Store them in the submission dataframe and save
ss['Yield'] = preds
filename = f'{Config.base_model}_lr_{Config.lr}.csv'
ss.to_csv(os.path.join(Config.submissions_dir, filename), index=False)
ss.head()

Unnamed: 0,Field_ID,Yield
0,E9UZCEA,3.21206
1,1WGGS1Q,3.21329
2,EG2KXE2,3.218059
3,HC3GQXF,3.209284
4,7AK6GFK,3.208269
