# Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# config setup for fast experiment
from src.config import Config

# some utilities
from src.utils import show_samples, process_im, save_model

# boosting lib 
import catboost 

import warnings
warnings.filterwarnings(action='ignore')

# Set up experiment pipeline

In [2]:
seed_val = 2021
np.random.seed(seed = seed_val)

# Loading data

In [3]:
# Split the data
train_sampled = pd.read_csv(os.path.join(Config.data_dir, 'train_sampled.csv'))
X, y  = train_sampled[train_sampled.columns[:-2]], train_sampled['Yield']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.1)

# Modeling

In [4]:
model = catboost.CatBoostRegressor(
    learning_rate = Config.lr,
    random_seed=seed_val,
    
)

# Fit the model


In [5]:
model = model.fit(
    X=X_train,
    y=y_train,
    eval_set=[(X_val, y_val)], 
    # eval_metric='rmse',
    early_stopping_rounds=300
)

0:	learn: 1.7537332	test: 1.5958961	best: 1.5958961 (0)	total: 95ms	remaining: 1m 34s
1:	learn: 1.7503631	test: 1.5933322	best: 1.5933322 (1)	total: 141ms	remaining: 1m 10s
2:	learn: 1.7470656	test: 1.5909022	best: 1.5909022 (2)	total: 173ms	remaining: 57.5s
3:	learn: 1.7438924	test: 1.5885705	best: 1.5885705 (3)	total: 190ms	remaining: 47.3s
4:	learn: 1.7407412	test: 1.5863020	best: 1.5863020 (4)	total: 199ms	remaining: 39.6s
5:	learn: 1.7377405	test: 1.5841533	best: 1.5841533 (5)	total: 207ms	remaining: 34.3s
6:	learn: 1.7347702	test: 1.5821055	best: 1.5821055 (6)	total: 221ms	remaining: 31.4s
7:	learn: 1.7316114	test: 1.5800282	best: 1.5800282 (7)	total: 227ms	remaining: 28.1s
8:	learn: 1.7286306	test: 1.5776143	best: 1.5776143 (8)	total: 232ms	remaining: 25.6s
9:	learn: 1.7259309	test: 1.5758186	best: 1.5758186 (9)	total: 241ms	remaining: 23.9s
10:	learn: 1.7230263	test: 1.5735213	best: 1.5735213 (10)	total: 252ms	remaining: 22.7s
11:	learn: 1.7202071	test: 1.5710170	best: 1.571017

In [6]:
print('Score:', mean_squared_error(y_val, model.predict(X_val), squared=False))

Score: 1.4672392266543182


In [7]:
save_model(
    model_name=os.path.join(Config.models_dir, 'yield_predictor.cbm'),
    model = model
)

[INFO] model saved as /home/zeusdric/Dric/Zindi2020/COMPETITIONS/Computer_vision/CGIAR/data/../models/yield_predictor.cbm


'/home/zeusdric/Dric/Zindi2020/COMPETITIONS/Computer_vision/CGIAR/data/../models/yield_predictor.cbm'

# Make predictions

In [8]:
# Load the sample submission file
ss = pd.read_csv(os.path.join(Config.data_dir, 'SampleSubmission.csv'))

# Prep the data, using the same method we did for train
test_sampled = pd.read_csv(os.path.join(Config.data_dir, 'test_sampled.csv'))

trained_model = model.load_model(
    fname=os.path.join(Config.models_dir, 'yield_predictor.cbm') 
)
# Get model predictions
preds = trained_model.predict(test_sampled)


# Store them in the submission dataframe and save
ss['Yield'] = preds
filename = f'{Config.base_model}_lr_{Config.lr}.csv'
ss.to_csv(os.path.join(Config.submissions_dir, filename), index=False)
ss.head()

Unnamed: 0,Field_ID,Yield
0,E9UZCEA,3.419373
1,1WGGS1Q,3.424853
2,EG2KXE2,3.479351
3,HC3GQXF,3.359853
4,7AK6GFK,3.37921


In [9]:
ss.shape

(1055, 2)