<a href="https://colab.research.google.com/github/dric2018/zindi-yield-pred/blob/main/notebooks/colab_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so later cells
# can copy the competition files from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
! mkdir  models submissions data src

In [11]:
# Recursively copy everything in the Drive competition folder into /content/data/.
! cp -r /content/drive/MyDrive/zindi-CGIAR-wheat-challenge/* /content/data/

In [None]:
! unzip -a /content/data/image_arrays_train.zip -d /content/data/
! unzip -a /content/data/image_arrays_test.zip -d /content/data/

In [5]:
%%capture
! pip install -r /content/data/requirements.txt

# Import libraries

In [8]:
%%writefile /content/src/config.py

import os


class Config:
    """Paths and hyper-parameters shared by the notebook and its helper modules.

    All attributes are class-level, so they are read as ``Config.<name>`` without
    creating an instance. Note that ``bandnames.txt`` is read when the class body
    runs, i.e. at import time.
    """

    # Root folder holding the competition files.
    data_dir = os.path.abspath('/content/data')
    # Folders with the per-field .npy arrays.
    train_data_dir = os.path.join(data_dir, 'image_arrays_train')
    test_data_dir = os.path.join(data_dir, 'image_arrays_test')
    # Output folders, siblings of data_dir.
    submissions_dir = os.path.join(data_dir, '../', 'submissions')
    models_dir = os.path.join(data_dir, '../', 'models')
    # Band identifiers sampled by the feature-extraction helper.
    bands_of_interest = ['S2_B5', 'S2_B4', 'S2_B3', 'S2_B2', 'CLIM_pr', 'CLIM_soil']
    # One band name per line of bandnames.txt, surrounding whitespace removed.
    # The context manager closes the file as soon as the list is built.
    with open(os.path.join(data_dir, 'bandnames.txt'), 'r') as _band_file:
        band_names = [line.strip() for line in _band_file]
    del _band_file  # keep the temporary file handle out of the class namespace
    # Name of the model family; used to label outputs.
    base_model = 'catboost'
    # Learning rate.
    lr = 1e-2



Writing /content/src/config.py


In [9]:
%%writefile /content/src/utils.py



# Acknowledgement: the code base is from the starter notebook released by Johno W.
# Starter code link:
# https://zindi.africa/competitions/cgiar-crop-yield-prediction-challenge/data/Starter_Notebook_CGIAR_Yield_Estimation.ipynb

import numpy as np
import os
import pandas as pd
from matplotlib import pyplot as plt
from .config import Config


def process_im(fid, folder=Config.train_data_dir):
    """Sample the bands of interest at one fixed pixel for each of 12 months.

    Parameters
    ----------
    fid : identifier of the field; the array is loaded from ``{folder}/{fid}.npy``.
    folder : directory containing the ``.npy`` files (defaults to the training folder).

    Returns
    -------
    dict mapping ``'<month>_<band>'`` (month in 0..11, band taken from
    ``Config.bands_of_interest``) to the array value at pixel (20, 20) of the
    matching band. Names that do not appear in ``Config.band_names`` are left
    out of the result.

    Notes
    -----
    Each name is resolved to its own index in ``Config.band_names``, so a value
    is always stored under the band it was read from, whatever order the bands
    appear in ``bandnames.txt``.
    """
    arr = np.load(f'{folder}/{fid}.npy')

    # Band name -> position along the first axis of the array (first occurrence wins).
    index_of = {}
    for position, name in enumerate(Config.band_names):
        index_of.setdefault(name, position)

    values = {}
    for month in range(12):
        for band in Config.bands_of_interest:
            bn = str(month) + '_' + band
            position = index_of.get(bn)
            if position is None:
                # Unknown band name: skip it rather than shifting other values.
                continue
            # Pixel (20, 20) is described as the image center in the starter code.
            values[bn] = arr[position, 20, 20]
    return values


def show_samples(df: pd.DataFrame, data_dir: str):
    """Load the array of one randomly drawn field, preview it, and return it.

    Parameters
    ----------
    df : frame with a ``Field_ID`` column; one id is drawn with ``.sample()``.
    data_dir : folder holding ``<Field_ID>.npy`` files.

    Returns
    -------
    The array loaded from disk. As a side effect the file name and array shape
    are printed and an image built from bands 4, 3 and 2 is drawn with matplotlib.
    """
    # Pick one field at random and build the path of its array file.
    field_id = df['Field_ID'].sample().values[0]
    array_path = os.path.join(data_dir, f'{field_id}.npy')
    print(f'Loading {array_path} as an array')

    bands = np.load(array_path)
    # Per the starter notebook: 360 bands, images 40 or 41px a side.
    print('Array shape:', bands.shape)

    # Bands 4, 3 and 2 become the channels (last axis) of the preview image.
    preview = np.stack([bands[channel] for channel in (4, 3, 2)], axis=-1)
    # Dividing by the overall maximum puts the largest value at 1 for display.
    plt.imshow(preview / np.max(preview))

    return bands


def save_model(model_name, model, params: dict = None):
    """Save ``model`` in ``cbm`` format under ``Config.models_dir``.

    Parameters
    ----------
    model_name : file name, joined onto ``Config.models_dir``.
    model : object exposing ``save_model(fname=..., format=..., export_parameters=...)``.
    params : forwarded unchanged as ``export_parameters``.

    Returns
    -------
    The path the model was written to, or ``None`` when saving raised; in that
    case the error is printed instead of propagated.
    """
    destination = os.path.join(Config.models_dir, model_name)
    try:
        model.save_model(fname=destination, format='cbm', export_parameters=params)
    except Exception as ex:
        # Best effort: report the problem and fall through (implicit None).
        print(f"[ERROR] {ex}")
    else:
        return destination


Writing /content/src/utils.py


In [10]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# config setup for fast experiment
from src.config import Config

# some utilities
from src.utils import show_samples, process_im, save_model

# boosting lib 
import catboost 

import warnings
# Ignore every warning for the rest of the session. This keeps cell output short
# but also hides deprecation notices from the libraries used below.
warnings.filterwarnings(action='ignore')

# Set up experiment pipeline

In [14]:
# Single seed shared by NumPy's global generator and the model defined below.
seed_val = 2021
np.random.seed(seed_val)

# Loading data

In [12]:
# Load the pre-sampled training table and split it into train/validation parts.
train_sampled = pd.read_csv(os.path.join(Config.data_dir, 'train_sampled.csv'))
# Features are every column except the last two; the target is the 'Yield' column.
# NOTE(review): presumably the two trailing columns are the id/target - confirm against the CSV.
X, y  = train_sampled[train_sampled.columns[:-2]], train_sampled['Yield']
# A fixed random_state makes the split reproducible regardless of the order in
# which cells were executed.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=seed_val)

# Modeling

In [15]:
# CatBoost regressor configured with the shared learning rate and the notebook seed.
model = catboost.CatBoostRegressor(learning_rate=Config.lr, random_seed=seed_val)

# Fit the model


In [16]:
# Train on the training split and monitor the validation split;
# early_stopping_rounds=300 is passed so training can stop before the last iteration.
model = model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=300,
)

0:	learn: 1.7389510	test: 1.7366534	best: 1.7366534 (0)	total: 71ms	remaining: 1m 10s
1:	learn: 1.7357680	test: 1.7338661	best: 1.7338661 (1)	total: 90.3ms	remaining: 45.1s
2:	learn: 1.7327203	test: 1.7312178	best: 1.7312178 (2)	total: 110ms	remaining: 36.4s
3:	learn: 1.7300730	test: 1.7286677	best: 1.7286677 (3)	total: 129ms	remaining: 32.1s
4:	learn: 1.7270629	test: 1.7260635	best: 1.7260635 (4)	total: 148ms	remaining: 29.5s
5:	learn: 1.7239779	test: 1.7233148	best: 1.7233148 (5)	total: 167ms	remaining: 27.6s
6:	learn: 1.7213502	test: 1.7206643	best: 1.7206643 (6)	total: 186ms	remaining: 26.4s
7:	learn: 1.7184437	test: 1.7181791	best: 1.7181791 (7)	total: 206ms	remaining: 25.5s
8:	learn: 1.7156424	test: 1.7155281	best: 1.7155281 (8)	total: 225ms	remaining: 24.8s
9:	learn: 1.7126808	test: 1.7130896	best: 1.7130896 (9)	total: 245ms	remaining: 24.3s
10:	learn: 1.7101051	test: 1.7106245	best: 1.7106245 (10)	total: 264ms	remaining: 23.8s
11:	learn: 1.7073462	test: 1.7082825	best: 1.708282

In [17]:
# Validation RMSE. The square root is taken explicitly because the `squared`
# argument of mean_squared_error is deprecated and absent from recent
# scikit-learn releases.
print('Score:', np.sqrt(mean_squared_error(y_val, model.predict(X_val))))

Score: 1.545606954476209


In [18]:
# save_model joins the file name onto Config.models_dir itself, so only the bare
# file name is passed; the resulting path is returned (None if saving failed).
model_path = save_model(
    model_name='yield_predictor.cbm',
    model=model
)

# Make predictions

In [19]:
# Reload the weights saved above; the value returned by load_model is used as the predictor.
# NOTE(review): presumably load_model returns the model object itself - confirm in the CatBoost docs.
trained_model = model.load_model(fname=model_path)

# Pre-sampled test features, prepared the same way as the training table.
test_sampled = pd.read_csv(os.path.join(Config.data_dir, 'test_sampled.csv'))
preds = trained_model.predict(test_sampled)

# Put the predictions into the sample submission frame.
ss = pd.read_csv(os.path.join(Config.data_dir, 'SampleSubmission.csv'))
ss['Yield'] = preds

# Name the file after the model family and learning rate, then write it out.
filename = f'{Config.base_model}_lr_{Config.lr}.csv'
ss.to_csv(os.path.join(Config.submissions_dir, filename), index=False)
ss.head()

Unnamed: 0,Field_ID,Yield
0,E9UZCEA,3.520924
1,1WGGS1Q,3.317669
2,EG2KXE2,3.496233
3,HC3GQXF,3.226672
4,7AK6GFK,3.315762


In [20]:
# Show the (rows, columns) shape of the submission frame as a final sanity check.
ss.shape

(1055, 2)