<a href="https://colab.research.google.com/github/dric2018/zindi-yield-pred/blob/main/notebooks/colab_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so files stored under MyDrive can be
# read from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Create the working folders; -p keeps the cell re-runnable when they exist.
! mkdir -p models submissions data src

In [3]:
# Copy everything in the challenge folder on the mounted Drive into /content/data/.
! cp -r /content/drive/MyDrive/zindi-CGIAR-wheat-challenge/* /content/data/

In [7]:
# Extract the train and test image-array archives into /content/data/
# (-q suppresses unzip's per-file output).
! unzip -q /content/data/image_arrays_train.zip -d /content/data/
! unzip -q /content/data/image_arrays_test.zip -d /content/data/

In [9]:
%%capture
# Install the packages listed in the requirements file copied into /content/data/.
! pip install -r /content/data/requirements.txt

# Import libraries

In [10]:
%%writefile /content/src/config.py

import os


def _read_band_names(path):
    """Return every line of the text file at ``path`` with whitespace stripped."""
    # The context manager closes the file as soon as the lines are read.
    with open(path, 'r') as handle:
        return [line.strip() for line in handle]


class Config:
    """Filesystem locations and model settings used across the project.

    Note: the class body reads ``bandnames.txt`` from ``data_dir`` when this
    module is imported, so that file must exist beforehand.
    """

    # Root folder holding the competition data.
    data_dir = os.path.abspath('/content/data')
    # Folders of per-field .npy arrays for the train and test sets.
    train_data_dir = os.path.join(data_dir, 'image_arrays_train')
    test_data_dir = os.path.join(data_dir, 'image_arrays_test')
    # Output folders, located next to data_dir.
    submissions_dir = os.path.join(data_dir, '../', 'submissions')
    models_dir = os.path.join(data_dir, '../', 'models')
    # Band suffixes to sample; combined with a month prefix as '<month>_<band>'.
    bands_of_interest = ['S2_B5', 'S2_B4', 'S2_B3', 'S2_B2', 'CLIM_pr', 'CLIM_soil']
    # One name per line of bandnames.txt; presumably entry i names channel i
    # of each image array -- TODO confirm against the data description.
    band_names = _read_band_names(os.path.join(data_dir, 'bandnames.txt'))
    # Model family label (used in submission file names) and learning rate.
    base_model = 'catboost'
    lr = 1e-2



Writing /content/src/config.py


In [11]:
%%writefile /content/src/utils.py



# Acknowledgement: this code is based on the starter notebook released by Johno W.
# Starter notebook link:
# https://zindi.africa/competitions/cgiar-crop-yield-prediction-challenge/data/Starter_Notebook_CGIAR_Yield_Estimation.ipynb

import numpy as np
import os
import pandas as pd
from matplotlib import pyplot as plt
from .config import Config


def process_im(fid, folder=Config.train_data_dir):
    """Sample the bands of interest at one pixel of a field's image array.

    Loads ``<folder>/<fid>.npy`` and, for each month index 0-11 and each entry
    of ``Config.bands_of_interest``, reads the value at pixel (20, 20) of the
    channel whose name in ``Config.band_names`` is ``'<month>_<band>'``.

    Args:
        fid: Field identifier; also the stem of the ``.npy`` file name.
        folder: Directory holding the arrays. Defaults to the training folder.

    Returns:
        dict mapping ``'<month>_<band>'`` to the sampled value, ordered by
        month and then by ``Config.bands_of_interest``. Names that do not
        appear in ``Config.band_names`` are skipped.
    """
    arr = np.load(os.path.join(folder, f'{fid}.npy'))
    # Resolve channels by name so each value is stored under its own band
    # name, whatever order the bands are listed in within band_names.
    channel_of = {name: idx for idx, name in enumerate(Config.band_names)}
    values = {}
    for month in range(12):
        for band in Config.bands_of_interest:
            bn = f'{month}_{band}'
            idx = channel_of.get(bn)
            if idx is not None:
                # (20, 20) is the pixel the original author calls the center.
                values[bn] = arr[idx, 20, 20]
    return values


def show_samples(df: pd.DataFrame, data_dir: str):
    """Display one randomly drawn field as an image and return its raw array.

    Args:
        df: Frame with a ``Field_ID`` column.
        data_dir: Folder containing ``<Field_ID>.npy`` files.

    Returns:
        The array loaded for the drawn field.
    """
    # Draw a single random row; its Field_ID doubles as the file stem.
    drawn = df['Field_ID'].sample()
    fid = drawn.values[0]
    fn = os.path.join(data_dir, f'{fid}.npy')
    print(f'Loading {fn} as an array')

    arr = np.load(fn)
    # Per the original author's note: 360 bands, images 40 or 41px a side.
    print('Array shape:', arr.shape)

    # Channels 4, 3 and 2 become the three planes of the last axis.
    planes = [arr[channel] for channel in (4, 3, 2)]
    composite = np.stack(planes, axis=-1)
    # Divide by the largest value so the brightest pixel maps to 1 for display.
    peak = np.max(composite)
    plt.imshow(composite / peak)

    return arr


def save_model(model_name, model, params: dict = None):
    """Write ``model`` under ``Config.models_dir`` in the ``cbm`` format.

    Args:
        model_name: File name of the saved model. It is joined onto
            ``Config.models_dir`` with ``os.path.join``, so an absolute path
            is used as given.
        model: Object exposing
            ``save_model(fname=..., format=..., export_parameters=...)``.
        params: Forwarded unchanged as ``export_parameters``.

    Returns:
        The path the model was written to, or ``None`` if saving failed (the
        error is printed rather than raised).
    """
    path = os.path.join(Config.models_dir, model_name)
    try:
        # Make sure the destination folder exists before writing into it.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        model.save_model(fname=path, format='cbm', export_parameters=params)
    except Exception as ex:
        # Best-effort save: report the problem and signal failure explicitly.
        print(f"[ERROR] {ex}")
        return None
    return path


Writing /content/src/utils.py


In [12]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# config setup for fast experiment
from src.config import Config

# some utilities
from src.utils import show_samples, process_im, save_model

# boosting lib 
import catboost 

import warnings
warnings.filterwarnings(action='ignore')

In [13]:
# Train.csv lists the Field_IDs that name the per-field .npy files
train_csv_path = os.path.join(Config.data_dir, 'Train.csv')
train = pd.read_csv(train_csv_path)
print(train.shape)
train.head()

(2977, 4)


Unnamed: 0,Field_ID,Year,Quality,Yield
0,MH2O0YH,2019,3,3.686
1,O9TURWL,2019,2,5.657
2,35AFSDD,2019,3,3.082
3,PM05EG9,2019,2,2.707
4,V7PZBCG,2019,2,2.679


In [None]:
# Sample every training field; each process_im() result becomes one row
field_ids = train['Field_ID'].values
sampled_rows = [process_im(fid) for fid in field_ids]
train_sampled = pd.DataFrame(sampled_rows)

# Append the identifier and the target as the last two columns
train_sampled['Field_ID'] = field_ids
train_sampled['Yield'] = train['Yield'].values

# Persist the sampled table so later cells can reload it from disk
train_sampled_path = os.path.join(Config.data_dir, 'train_sampled.csv')
train_sampled.to_csv(train_sampled_path, index=False)

# Preview the first rows
train_sampled.head()


In [None]:
# The sample submission lists the Field_IDs of the test fields
ss = pd.read_csv(os.path.join(Config.data_dir, 'SampleSubmission.csv'))

# Sample each test field from the configured test folder, then save the table
test_sampled = pd.DataFrame(
    [process_im(fid, folder=Config.test_data_dir) for fid in ss['Field_ID'].values]
)
test_sampled.to_csv(os.path.join(Config.data_dir, 'test_sampled.csv'), index=False)



# Set up experiment pipeline

In [None]:
# Single seed value reused by the cells below
seed_val = 2021
# Seed NumPy's global random generator
np.random.seed(seed_val)

# Loading data

In [None]:
# Reload the sampled training table
train_sampled = pd.read_csv(os.path.join(Config.data_dir, 'train_sampled.csv'))

# Features are every column except the identifier and the target; selecting
# by name keeps this correct even if the column order in the CSV changes
feature_cols = [c for c in train_sampled.columns if c not in ('Field_ID', 'Yield')]
X, y = train_sampled[feature_cols], train_sampled['Yield']

# Pass the seed explicitly so the split does not depend on which cells ran
# before this one
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=seed_val)

# Modeling

In [None]:
# Regressor settings: learning rate from the shared config, fixed random seed
regressor_kwargs = {
    'learning_rate': Config.lr,
    'random_seed': seed_val,
}
model = catboost.CatBoostRegressor(**regressor_kwargs)

# Fit the model


In [None]:
# Evaluate on the held-out split during training, with early stopping
# configured at 300 rounds
validation_sets = [(X_val, y_val)]
model = model.fit(
    X=X_train,
    y=y_train,
    eval_set=validation_sets,
    early_stopping_rounds=300,
)

In [None]:
# Root-mean-squared error on the validation split. The square root is taken
# explicitly because the `squared=` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
print('Score:', np.sqrt(mean_squared_error(y_val, model.predict(X_val))))

In [None]:
# save_model() already places the file under Config.models_dir, so only the
# file name is passed; it returns the full path (or None if saving failed)
model_path = save_model(
    model_name='yield_predictor.cbm',
    model=model,
)

# Make predictions

In [None]:
# Load the sample submission file
ss = pd.read_csv(os.path.join(Config.data_dir, 'SampleSubmission.csv'))

# Load the test features that were sampled the same way as the training set
test_sampled = pd.read_csv(os.path.join(Config.data_dir, 'test_sampled.csv'))

# Restore the saved model into a fresh regressor so the predictions come from
# the file on disk and do not depend on load_model()'s return value
trained_model = catboost.CatBoostRegressor()
trained_model.load_model(fname=model_path)

# Get model predictions
preds = trained_model.predict(test_sampled)

# Store them in the submission dataframe and save
ss['Yield'] = preds
filename = f'{Config.base_model}_lr_{Config.lr}.csv'
ss.to_csv(os.path.join(Config.submissions_dir, filename), index=False)
ss.head()

In [None]:
# Show the (rows, columns) shape of the submission frame
ss.shape